xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 10948:c686aa11575c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/modctl.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/debug.h>
49 #include <sys/dklabel.h>
50 #include <vm/hat.h>
51 #include <sys/lvm/mdvar.h>
52 #include <sys/lvm/md_mirror.h>
53 #include <sys/lvm/md_convert.h>
54 #include <sys/lvm/md_mddb.h>
55 #include <sys/esunddi.h>
56 
57 #include <sys/sysevent/eventdefs.h>
58 #include <sys/sysevent/svm.h>
59 #include <sys/lvm/mdmn_commd.h>
60 #include <sys/avl.h>
61 
62 md_ops_t		mirror_md_ops;
63 #ifndef	lint
64 char			_depends_on[] = "drv/md";
65 md_ops_t		*md_interface_ops = &mirror_md_ops;
66 #endif
67 
68 extern mdq_anchor_t	md_done_daemon;
69 extern mdq_anchor_t	md_mstr_daemon;
70 extern mdq_anchor_t	md_mirror_daemon;
71 extern mdq_anchor_t	md_mirror_io_daemon;
72 extern mdq_anchor_t	md_mirror_rs_daemon;
73 extern mdq_anchor_t	md_mhs_daemon;
74 
75 extern unit_t		md_nunits;
76 extern set_t		md_nsets;
77 extern md_set_t		md_set[];
78 
79 extern int		md_status;
80 extern clock_t		md_hz;
81 
82 extern md_krwlock_t	md_unit_array_rw;
83 extern kmutex_t		md_mx;
84 extern kcondvar_t	md_cv;
85 extern int		md_mtioctl_cnt;
86 
87 daemon_request_t	mirror_timeout;
88 static daemon_request_t	hotspare_request;
89 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
90 
91 int	md_mirror_mcs_buf_off;
92 
93 /* Flags for mdmn_ksend_message to allow debugging */
94 int	md_mirror_msg_flags;
95 
96 #ifdef DEBUG
97 /* Flag to switch on debug messages */
98 int	mirror_debug_flag = 0;
99 #endif
100 
101 /*
102  * Struct used to hold count of DMR reads and the timestamp of last DMR read
103  * It is used to verify, using a debugger, that the DMR read ioctl has been
104  * executed.
105  */
106 dmr_stats_t	mirror_dmr_stats = {0, 0};
107 
108 /*
109  * Mutex protecting list of non-failfast drivers.
110  */
111 static kmutex_t	non_ff_drv_mutex;
112 extern char	**non_ff_drivers;
113 
114 extern major_t	md_major;
115 
116 /*
117  * Write-On-Write memory pool.
118  */
119 static void		copy_write_cont(wowhdr_t *wowhdr);
120 static kmem_cache_t	*mirror_wowblk_cache = NULL;
121 static int		md_wowbuf_size = 16384;
122 static size_t		md_wowblk_size;
123 
124 /*
125  * This is a flag that allows:
126  *	- disabling the write-on-write mechanism.
127  *	- logging occurrences of write-on-write
128  *	- switching wow handling procedure processing
129  * Counter for occurences of WOW.
130  */
131 static uint_t	md_mirror_wow_flg = 0;
132 static int	md_mirror_wow_cnt = 0;
133 
134 /*
135  * Tunable to enable/disable dirty region
136  * processing when closing down a mirror.
137  */
138 static int	new_resync = 1;
139 kmem_cache_t	*mirror_parent_cache = NULL;
140 kmem_cache_t	*mirror_child_cache = NULL;
141 
142 extern int	md_ff_disable;		/* disable failfast */
143 
144 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
145 static void	mirror_read_strategy(buf_t *, int, void *);
146 static void	mirror_write_strategy(buf_t *, int, void *);
147 static void	become_owner(daemon_queue_t *);
148 static int	mirror_done(struct buf *cb);
149 static int	mirror_done_common(struct buf *cb);
150 static void	clear_retry_error(struct buf *cb);
151 
152 /*
153  * patchables
154  */
155 int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
156 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
157 
158 /*
159  * patchable to change delay before rescheduling mirror ownership request.
160  * Value is clock ticks, default 0.5 seconds
161  */
162 clock_t	md_mirror_owner_to = 500000;
163 
164 /*ARGSUSED1*/
165 static int
166 mirror_parent_constructor(void *p, void *d1, int d2)
167 {
168 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
169 	return (0);
170 }
171 
172 static void
173 mirror_parent_init(md_mps_t *ps)
174 {
175 	bzero(ps, offsetof(md_mps_t, ps_mx));
176 	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
177 }
178 
179 /*ARGSUSED1*/
180 static void
181 mirror_parent_destructor(void *p, void *d)
182 {
183 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
184 }
185 
186 /*ARGSUSED1*/
187 static int
188 mirror_child_constructor(void *p, void *d1, int d2)
189 {
190 	bioinit(&((md_mcs_t *)p)->cs_buf);
191 	return (0);
192 }
193 
194 void
195 mirror_child_init(md_mcs_t *cs)
196 {
197 	cs->cs_ps = NULL;
198 	cs->cs_mdunit = 0;
199 	md_bioreset(&cs->cs_buf);
200 }
201 
202 /*ARGSUSED1*/
203 static void
204 mirror_child_destructor(void *p, void *d)
205 {
206 	biofini(&((md_mcs_t *)p)->cs_buf);
207 }
208 
209 static void
210 mirror_wowblk_init(wowhdr_t *p)
211 {
212 	bzero(p, md_wowblk_size);
213 }
214 
215 static void
216 send_poke_hotspares_msg(daemon_request_t *drq)
217 {
218 	int			rval;
219 	md_mn_msg_pokehsp_t	pokehsp;
220 	md_mn_kresult_t		*kresult;
221 	set_t			setno = (set_t)drq->dq.qlen;
222 
223 	pokehsp.pokehsp_setno = setno;
224 
225 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
226 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
227 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
228 	    sizeof (pokehsp), kresult);
229 
230 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
231 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
232 		/* If we're shutting down already, pause things here. */
233 		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
234 			while (!md_mn_is_commd_present()) {
235 				delay(md_hz);
236 			}
237 		}
238 		cmn_err(CE_PANIC,
239 		    "ksend_message failure: POKE_HOTSPARES");
240 	}
241 	kmem_free(kresult, sizeof (md_mn_kresult_t));
242 
243 	/* Allow further requests to use this set's queue structure */
244 	mutex_enter(&drq->dr_mx);
245 	drq->dr_pending = 0;
246 	mutex_exit(&drq->dr_mx);
247 }
248 
249 /*
250  * Send a poke_hotspares message to the master node. To avoid swamping the
251  * commd handler with requests we only send a message if there is not one
252  * already outstanding. We punt the request to a separate thread context as
253  * cannot afford to block waiting on the request to be serviced. This is
254  * essential when a reconfig cycle is in progress as any open() of a multinode
255  * metadevice may result in a livelock.
256  */
257 static void
258 send_poke_hotspares(set_t setno)
259 {
260 	daemon_request_t	*drq = &mn_hs_request[setno];
261 
262 	mutex_enter(&drq->dr_mx);
263 	if (drq->dr_pending == 0) {
264 		drq->dr_pending = 1;
265 		drq->dq.qlen = (int)setno;
266 		daemon_request(&md_mhs_daemon,
267 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
268 	}
269 	mutex_exit(&drq->dr_mx);
270 }
271 
272 void
273 mirror_set_sm_state(
274 	mm_submirror_t		*sm,
275 	mm_submirror_ic_t	*smic,
276 	sm_state_t		newstate,
277 	int			force)
278 {
279 	int			compcnt;
280 	int			i;
281 	int			errcnt;
282 	sm_state_t		origstate;
283 	md_m_shared_t		*shared;
284 
285 	if (force) {
286 		sm->sm_state = newstate;
287 		uniqtime32(&sm->sm_timestamp);
288 		return;
289 	}
290 
291 	origstate = newstate;
292 
293 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
294 	for (i = 0, errcnt = 0; i < compcnt; i++) {
295 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
296 		    (sm->sm_dev, sm, i);
297 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
298 			newstate |= SMS_COMP_ERRED;
299 		if (shared->ms_state & (CS_RESYNC))
300 			newstate |= SMS_COMP_RESYNC;
301 		if (shared->ms_state & CS_ERRED)
302 			errcnt++;
303 	}
304 
305 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
306 		newstate &= ~origstate;
307 
308 	if (errcnt == compcnt)
309 		newstate |= SMS_ALL_ERRED;
310 	else
311 		newstate &= ~SMS_ALL_ERRED;
312 
313 	sm->sm_state = newstate;
314 	uniqtime32(&sm->sm_timestamp);
315 }
316 
317 static int
318 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
319 							int frm_probe)
320 {
321 	mm_submirror_t		*sm;
322 	mm_submirror_ic_t	*smic;
323 	md_m_shared_t		*shared;
324 	int			ci;
325 	int			i;
326 	int			compcnt;
327 	int			open_comp; /* flag for open component */
328 
329 	for (i = *smi; i < NMIRROR; i++) {
330 		sm = &un->un_sm[i];
331 		smic = &un->un_smic[i];
332 
333 		if (!SMS_IS(sm, SMS_INUSE))
334 			continue;
335 
336 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
337 		for (ci = *cip; ci < compcnt; ci++) {
338 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
339 			    (sm->sm_dev, sm, ci);
340 			/*
341 			 * if called from any routine but probe, we check for
342 			 * MDM_S_ISOPEN flag. Since probe does a pseduo open,
343 			 * it sets MDM_S_PROBEOPEN flag and we test for this
344 			 * flag. They are both exclusive tests.
345 			 */
346 			open_comp = (frm_probe) ?
347 			    (shared->ms_flags & MDM_S_PROBEOPEN):
348 			    (shared->ms_flags & MDM_S_ISOPEN);
349 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
350 			    ((shared->ms_state == CS_OKAY) ||
351 			    (shared->ms_state == CS_RESYNC))) {
352 				if (clr_error) {
353 					shared->ms_flags &= ~MDM_S_IOERR;
354 				}
355 				*cip = ci;
356 				*smi = i;
357 				return (1);
358 			}
359 
360 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
361 				shared->ms_flags &= ~MDM_S_IOERR;
362 			}
363 		}
364 
365 		*cip = 0;
366 	}
367 	return (0);
368 }
369 
370 /*ARGSUSED*/
371 static void
372 mirror_run_queue(void *d)
373 {
374 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
375 		md_daemon(1, &md_done_daemon);
376 }
377 /*
378  * check_comp_4_hotspares
379  *
380  * This function attempts to allocate a hotspare for this component if the
381  * component is in error. In a MN set, the function can be called in 2 modes.
382  * It can be called either when a component error has been detected or when a
383  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
384  * in flags and the request is sent to all nodes.
385  * The handler on each of the nodes then calls this function with
386  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
387  *
388  * For non-MN sets the function simply attempts to allocate a hotspare.
389  *
390  * On entry, the following locks are held
391  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
392  *	md_unit_writerlock
393  *
394  * Returns	0 if ok
395  *		1 if the unit containing the component has been cleared while
396  *		  the mdmn_ksend_message() was being executed
397  */
398 extern int
399 check_comp_4_hotspares(
400 	mm_unit_t	*un,
401 	int		smi,
402 	int		ci,
403 	uint_t		flags,
404 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
405 	IOLOCK		*lockp	/* can be NULL */
406 )
407 {
408 	mm_submirror_t		*sm;
409 	mm_submirror_ic_t	*smic;
410 	md_m_shared_t		*shared;
411 	mddb_recid_t		recids[6];
412 	minor_t			mnum;
413 	intptr_t		(*hs_dev)();
414 	void			(*hs_done)();
415 	void			*hs_data;
416 	md_error_t		mde = mdnullerror;
417 	set_t			setno;
418 	md_mn_msg_allochsp_t	allochspmsg;
419 	md_mn_kresult_t		*kresult;
420 	mm_unit_t		*new_un;
421 	int			rval;
422 
423 	mnum = MD_SID(un);
424 	setno = MD_UN2SET(un);
425 	sm = &un->un_sm[smi];
426 	smic = &un->un_smic[smi];
427 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
428 	    (sm->sm_dev, sm, ci);
429 
430 	if (shared->ms_state != CS_ERRED)
431 		return (0);
432 
433 	/* Don't start a new component resync if a resync is already running. */
434 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
435 		return (0);
436 
437 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
438 		uint_t		msgflags;
439 		md_mn_msgtype_t	msgtype;
440 
441 		/* Send allocate hotspare message to all nodes */
442 
443 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
444 		allochspmsg.msg_allochsp_sm = smi;
445 		allochspmsg.msg_allochsp_comp = ci;
446 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
447 
448 		/*
449 		 * Before calling mdmn_ksend_message(), release locks
450 		 * Can never be in the context of an ioctl.
451 		 */
452 		md_unit_writerexit(MDI_UNIT(mnum));
453 		if (flags & MD_HOTSPARE_LINKHELD)
454 			rw_exit(&mirror_md_ops.md_link_rw.lock);
455 #ifdef DEBUG
456 		if (mirror_debug_flag)
457 			printf("send alloc hotspare, flags="
458 			    "0x%x %x, %x, %x, %x\n", flags,
459 			    allochspmsg.msg_allochsp_mnum,
460 			    allochspmsg.msg_allochsp_sm,
461 			    allochspmsg.msg_allochsp_comp,
462 			    allochspmsg.msg_allochsp_hs_id);
463 #endif
464 		if (flags & MD_HOTSPARE_WMUPDATE) {
465 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
466 			/*
467 			 * When coming from an update of watermarks, there
468 			 * must already be a message logged that triggered
469 			 * this action. So, no need to log this message, too.
470 			 */
471 			msgflags = MD_MSGF_NO_LOG;
472 		} else {
473 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
474 			msgflags = MD_MSGF_DEFAULT_FLAGS;
475 		}
476 
477 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
478 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
479 		    (char *)&allochspmsg, sizeof (allochspmsg),
480 		    kresult);
481 
482 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
483 #ifdef DEBUG
484 			if (mirror_debug_flag)
485 				mdmn_ksend_show_error(rval, kresult,
486 				    "ALLOCATE HOTSPARE");
487 #endif
488 			/*
489 			 * If message is sent ok but exitval indicates an error
490 			 * it must be because the mirror has been cleared. In
491 			 * this case re-obtain lock and return an error
492 			 */
493 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
494 				if (flags & MD_HOTSPARE_LINKHELD) {
495 					rw_enter(&mirror_md_ops.md_link_rw.lock,
496 					    RW_READER);
497 				}
498 				kmem_free(kresult, sizeof (md_mn_kresult_t));
499 				return (1);
500 			}
501 			/* If we're shutting down already, pause things here. */
502 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
503 				while (!md_mn_is_commd_present()) {
504 					delay(md_hz);
505 				}
506 			}
507 			cmn_err(CE_PANIC,
508 			    "ksend_message failure: ALLOCATE_HOTSPARE");
509 		}
510 		kmem_free(kresult, sizeof (md_mn_kresult_t));
511 
512 		/*
513 		 * re-obtain the locks
514 		 */
515 		if (flags & MD_HOTSPARE_LINKHELD)
516 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
517 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
518 
519 		/*
520 		 * As we had to release the locks in order to send the
521 		 * message to all nodes, we need to check to see if the
522 		 * unit has changed. If it has we release the writerlock
523 		 * and return fail.
524 		 */
525 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
526 			md_unit_writerexit(MDI_UNIT(mnum));
527 			return (1);
528 		}
529 	} else {
530 		if (MD_MNSET_SETNO(setno)) {
531 			/*
532 			 * If 2 or more nodes simultaneously see a
533 			 * component failure, these nodes will each
534 			 * send an ALLOCATE_HOTSPARE[2] message.
535 			 * The first message will allocate the hotspare
536 			 * and the subsequent messages should do nothing.
537 			 *
538 			 * If a slave node doesn't have a hotspare allocated
539 			 * at the time the message is initiated, then the
540 			 * passed in hs_id will be 0.  If the node
541 			 * executing this routine has a component shared
542 			 * ms_hs_id of non-zero, but the message shows a
543 			 * hs_id of 0, then just return since a hotspare
544 			 * has already been allocated for this failing
545 			 * component.  When the slave node returns from
546 			 * the ksend_message the hotspare will have
547 			 * already been allocated.
548 			 *
549 			 * If the slave node does send an hs_id of non-zero,
550 			 * and the slave node's hs_id matches this node's
551 			 * ms_hs_id, then the hotspare has error'd and
552 			 * should be replaced.
553 			 *
554 			 * If the slave node sends an hs_id of non-zero and
555 			 * this node has a different shared ms_hs_id, then
556 			 * just return since this hotspare has already
557 			 * been hotspared.
558 			 */
559 			if (shared->ms_hs_id != 0) {
560 				if (hs_id == 0) {
561 #ifdef DEBUG
562 					if (mirror_debug_flag) {
563 						printf("check_comp_4_hotspares"
564 						    "(NOXMIT), short circuit "
565 						    "hs_id=0x%x, "
566 						    "ms_hs_id=0x%x\n",
567 						    hs_id, shared->ms_hs_id);
568 					}
569 #endif
570 					return (0);
571 				}
572 				if (hs_id != shared->ms_hs_id) {
573 #ifdef DEBUG
574 					if (mirror_debug_flag) {
575 						printf("check_comp_4_hotspares"
576 						    "(NOXMIT), short circuit2 "
577 						    "hs_id=0x%x, "
578 						    "ms_hs_id=0x%x\n",
579 						    hs_id, shared->ms_hs_id);
580 					}
581 #endif
582 					return (0);
583 				}
584 			}
585 		}
586 
587 		sm = &un->un_sm[smi];
588 		hs_dev = md_get_named_service(sm->sm_dev, 0,
589 		    "hotspare device", 0);
590 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
591 		    &hs_data) != 0)
592 			return (0);
593 
594 		/*
595 		 * set_sm_comp_state() commits the modified records.
596 		 * As we don't transmit the changes, no need to drop the lock.
597 		 */
598 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
599 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
600 
601 		(*hs_done)(sm->sm_dev, hs_data);
602 
603 		mirror_check_failfast(mnum);
604 
605 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
606 		    setno, MD_SID(un));
607 
608 		/*
609 		 * For a multi-node set we need to reset the un_rs_type,
610 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
611 		 * hot-spare resync must copy all applicable data.
612 		 */
613 		if (MD_MNSET_SETNO(setno)) {
614 			un->un_rs_type = MD_RS_NONE;
615 			un->un_rs_resync_done = 0;
616 			un->un_rs_resync_2_do = 0;
617 		}
618 
619 		/*
620 		 * Must drop writer lock since mirror_resync_unit will
621 		 * open devices and must be able to grab readerlock.
622 		 * Don't need to drop IOLOCK since any descendent routines
623 		 * calling ksend_messages will drop the IOLOCK as needed.
624 		 *
625 		 */
626 		if (lockp) {
627 			md_ioctl_writerexit(lockp);
628 		} else {
629 			md_unit_writerexit(MDI_UNIT(mnum));
630 		}
631 
632 		/* start resync */
633 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
634 
635 		if (lockp) {
636 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
637 		} else {
638 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
639 		}
640 	}
641 	return (0);
642 }
643 
644 /*
645  * check_unit_4_hotspares
646  *
647  * For a given mirror, allocate hotspares, if available for any components
648  * that are in error
649  *
650  * Returns	0 if ok
651  *		1 if check_comp_4_hotspares returns non-zero. This will only
652  *		  happen for a MN unit where the unit has been cleared while
653  *		  the allocate hotspare message is sent to all nodes.
654  */
655 static int
656 check_unit_4_hotspares(mm_unit_t *un, int flags)
657 {
658 	mm_submirror_t		*sm;
659 	mm_submirror_ic_t	*smic;
660 	int			ci;
661 	int			i;
662 	int			compcnt;
663 
664 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
665 		return (0);
666 
667 	for (i = 0; i < NMIRROR; i++) {
668 		sm = &un->un_sm[i];
669 		smic = &un->un_smic[i];
670 		if (!SMS_IS(sm, SMS_INUSE))
671 			continue;
672 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
673 		for (ci = 0; ci < compcnt; ci++) {
674 			md_m_shared_t		*shared;
675 
676 			shared = (md_m_shared_t *)
677 			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
678 			/*
679 			 * Never called from ioctl context, so pass in
680 			 * (IOLOCK *)NULL.  Pass through flags from calling
681 			 * routine, also setting XMIT flag.
682 			 */
683 			if (check_comp_4_hotspares(un, i, ci,
684 			    (MD_HOTSPARE_XMIT | flags),
685 			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
686 				return (1);
687 		}
688 	}
689 	return (0);
690 }
691 
692 static void
693 check_4_hotspares(daemon_request_t *drq)
694 {
695 	mdi_unit_t	*ui;
696 	mm_unit_t	*un;
697 	md_link_t	*next;
698 	int		x;
699 
700 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
701 	drq->dr_pending = 0;		/* again in low level routine if */
702 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
703 
704 	/*
705 	 * Used to have a problem here. The disksets weren't marked as being
706 	 * MNHOLD. This opened a window where we could be searching for
707 	 * hotspares and have the disk set unloaded (released) from under
708 	 * us causing a panic in stripe_component_count().
709 	 * The way to prevent that is to mark the set MNHOLD which prevents
710 	 * any diskset from being released while we are scanning the mirrors,
711 	 * submirrors and components.
712 	 */
713 
714 	for (x = 0; x < md_nsets; x++)
715 		md_holdset_enter(x);
716 
717 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
718 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
719 		ui = MDI_UNIT(next->ln_id);
720 
721 		un = (mm_unit_t *)md_unit_readerlock(ui);
722 
723 		/*
724 		 * Only check the unit if we are the master for this set
725 		 * For an MN set, poke_hotspares() is only effective on the
726 		 * master
727 		 */
728 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
729 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
730 			md_unit_readerexit(ui);
731 			continue;
732 		}
733 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
734 			md_unit_readerexit(ui);
735 			continue;
736 		}
737 		md_unit_readerexit(ui);
738 
739 		un = (mm_unit_t *)md_unit_writerlock(ui);
740 		/*
741 		 * check_unit_4_hotspares will exit 1 if the unit has been
742 		 * removed during the process of allocating the hotspare.
743 		 * This can only happen for a MN metadevice. If unit no longer
744 		 * exists, no need to release writerlock
745 		 */
746 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
747 			md_unit_writerexit(ui);
748 		else {
749 			/*
750 			 * If check_unit_4_hotspares failed, queue another
751 			 * request and break out of this one
752 			 */
753 			(void) poke_hotspares();
754 			break;
755 		}
756 	}
757 	rw_exit(&mirror_md_ops.md_link_rw.lock);
758 
759 	for (x = 0; x < md_nsets; x++)
760 		md_holdset_exit(x);
761 }
762 
763 /*
764  * poke_hotspares
765  *
766  * If there is not a pending poke_hotspares request pending, queue a requent
767  * to call check_4_hotspares(). This will scan all mirrors and attempt to
768  * allocate hotspares for all components in error.
769  */
770 int
771 poke_hotspares()
772 {
773 	mutex_enter(&hotspare_request.dr_mx);
774 	if (hotspare_request.dr_pending == 0) {
775 		hotspare_request.dr_pending = 1;
776 		daemon_request(&md_mhs_daemon,
777 		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
778 		    REQ_OLD);
779 	}
780 	mutex_exit(&hotspare_request.dr_mx);
781 	return (0);
782 }
783 
784 static void
785 free_all_ecomps(err_comp_t *ecomp)
786 {
787 	err_comp_t	*d;
788 
789 	while (ecomp != NULL) {
790 		d = ecomp;
791 		ecomp = ecomp->ec_next;
792 		kmem_free(d, sizeof (err_comp_t));
793 	}
794 }
795 
796 /*
797  * NAME: mirror_openfail_console_info
798  *
799  * DESCRIPTION: Prints a informative message to the console when mirror
800  *		cannot be opened.
801  *
802  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
803  *	       int		smi - submirror index
804  *	       int		ci - component index
805  */
806 
807 void
808 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
809 {
810 	void (*get_dev)();
811 	ms_cd_info_t cd;
812 	md_dev64_t tmpdev;
813 
814 	tmpdev = un->un_sm[smi].sm_dev;
815 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
816 	if (get_dev != NULL) {
817 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
818 		cmn_err(CE_WARN, "md %s: open error on %s",
819 		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
820 		    cd.cd_dev, NULL, 0));
821 	} else {
822 		cmn_err(CE_WARN, "md %s: open error",
823 		    md_shortname(MD_SID(un)));
824 	}
825 }
826 
827 static int
828 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
829 {
830 	int i;
831 	md_dev64_t dev;
832 
833 	for (i = 0; i < NMIRROR; i++) {
834 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
835 			continue;
836 		dev = un->un_sm[i].sm_dev;
837 		md_layered_close(dev, md_cflags);
838 	}
839 	return (0);
840 }
841 
842 /*
843  * Keep track of drivers that don't support failfast.  We use this so that
844  * we only log one diagnostic message for each of these drivers, no matter
845  * how many times we run the mirror_check_failfast function.
846  * Return 1 if this is a new driver that does not support failfast,
847  * return 0 if we have already seen this non-failfast driver.
848  */
849 static int
850 new_non_ff_driver(const char *s)
851 {
852 	mutex_enter(&non_ff_drv_mutex);
853 	if (non_ff_drivers == NULL) {
854 		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
855 		    KM_NOSLEEP);
856 		if (non_ff_drivers == NULL) {
857 			mutex_exit(&non_ff_drv_mutex);
858 			return (1);
859 		}
860 
861 		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
862 		    KM_NOSLEEP);
863 		if (non_ff_drivers[0] == NULL) {
864 			kmem_free(non_ff_drivers, 2 * sizeof (char *));
865 			non_ff_drivers = NULL;
866 			mutex_exit(&non_ff_drv_mutex);
867 			return (1);
868 		}
869 
870 		(void) strcpy(non_ff_drivers[0], s);
871 		non_ff_drivers[1] = NULL;
872 
873 	} else {
874 		int i;
875 		char **tnames;
876 		char **tmp;
877 
878 		for (i = 0; non_ff_drivers[i] != NULL; i++) {
879 			if (strcmp(s, non_ff_drivers[i]) == 0) {
880 				mutex_exit(&non_ff_drv_mutex);
881 				return (0);
882 			}
883 		}
884 
885 		/* allow for new element and null */
886 		i += 2;
887 		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
888 		if (tnames == NULL) {
889 			mutex_exit(&non_ff_drv_mutex);
890 			return (1);
891 		}
892 
893 		for (i = 0; non_ff_drivers[i] != NULL; i++)
894 			tnames[i] = non_ff_drivers[i];
895 
896 		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
897 		if (tnames[i] == NULL) {
898 			/* adjust i so that it is the right count to free */
899 			kmem_free(tnames, (i + 2) * sizeof (char *));
900 			mutex_exit(&non_ff_drv_mutex);
901 			return (1);
902 		}
903 
904 		(void) strcpy(tnames[i++], s);
905 		tnames[i] = NULL;
906 
907 		tmp = non_ff_drivers;
908 		non_ff_drivers = tnames;
909 		/* i now represents the count we previously alloced */
910 		kmem_free(tmp, i * sizeof (char *));
911 	}
912 	mutex_exit(&non_ff_drv_mutex);
913 
914 	return (1);
915 }
916 
917 /*
918  * Check for the "ddi-failfast-supported" devtree property on each submirror
919  * component to indicate if we should do I/O to that submirror with the
920  * B_FAILFAST flag set or not.  This check is made at various state transitions
921  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
922  * only need to check one drive (e.g. hotspare) but since the check is
923  * fast and infrequent and sometimes needs to be done on all components we
924  * just check all components on each call.
925  */
926 void
927 mirror_check_failfast(minor_t mnum)
928 {
929 	int		i;
930 	mm_unit_t	*un;
931 
932 	if (md_ff_disable)
933 		return;
934 
935 	un = MD_UNIT(mnum);
936 
937 	for (i = 0; i < NMIRROR; i++) {
938 		int			ci;
939 		int			cnt;
940 		int			ff = 1;
941 		mm_submirror_t		*sm;
942 		mm_submirror_ic_t	*smic;
943 		void			(*get_dev)();
944 
945 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
946 			continue;
947 
948 		sm = &un->un_sm[i];
949 		smic = &un->un_smic[i];
950 
951 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
952 		    "get device", 0);
953 
954 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
955 		for (ci = 0; ci < cnt; ci++) {
956 			int		found = 0;
957 			dev_t		ci_dev;
958 			major_t		major;
959 			dev_info_t	*devi;
960 			ms_cd_info_t	cd;
961 
962 			/*
963 			 * this already returns the hs
964 			 * dev if the device is spared
965 			 */
966 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
967 
968 			ci_dev = md_dev64_to_dev(cd.cd_dev);
969 			major = getmajor(ci_dev);
970 
971 			if (major == md_major) {
972 				/*
973 				 * this component must be a soft
974 				 * partition; get the real dev
975 				 */
976 				minor_t	dev_mnum;
977 				mdi_unit_t	*ui;
978 				mp_unit_t	*un;
979 				set_t	setno;
980 				side_t	side;
981 				md_dev64_t	tmpdev;
982 
983 				ui = MDI_UNIT(getminor(ci_dev));
984 
985 				/* grab necessary lock */
986 				un = (mp_unit_t *)md_unit_readerlock(ui);
987 
988 				dev_mnum = MD_SID(un);
989 				setno = MD_MIN2SET(dev_mnum);
990 				side = mddb_getsidenum(setno);
991 
992 				tmpdev = un->un_dev;
993 
994 				/* Get dev by device id */
995 				if (md_devid_found(setno, side,
996 				    un->un_key) == 1) {
997 					tmpdev = md_resolve_bydevid(dev_mnum,
998 					    tmpdev, un->un_key);
999 				}
1000 
1001 				md_unit_readerexit(ui);
1002 
1003 				ci_dev = md_dev64_to_dev(tmpdev);
1004 				major = getmajor(ci_dev);
1005 			}
1006 
1007 			if (ci_dev != NODEV32 &&
1008 			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1009 			    != NULL) {
1010 				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
1011 				int		propvalue = 0;
1012 				int		proplength = sizeof (int);
1013 				int		error;
1014 				struct cb_ops	*cb;
1015 
1016 				if ((cb = devopsp[major]->devo_cb_ops) !=
1017 				    NULL) {
1018 					error = (*cb->cb_prop_op)
1019 					    (DDI_DEV_T_ANY, devi, prop_op,
1020 					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1021 					    "ddi-failfast-supported",
1022 					    (caddr_t)&propvalue, &proplength);
1023 
1024 					if (error == DDI_PROP_SUCCESS)
1025 						found = 1;
1026 				}
1027 
1028 				if (!found && new_non_ff_driver(
1029 				    ddi_driver_name(devi))) {
1030 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
1031 					    "disabled on %s",
1032 					    ddi_driver_name(devi));
1033 				}
1034 
1035 				ddi_release_devi(devi);
1036 			}
1037 
1038 			/*
1039 			 * All components must support
1040 			 * failfast in the submirror.
1041 			 */
1042 			if (!found) {
1043 				ff = 0;
1044 				break;
1045 			}
1046 		}
1047 
1048 		if (ff) {
1049 			sm->sm_flags |= MD_SM_FAILFAST;
1050 		} else {
1051 			sm->sm_flags &= ~MD_SM_FAILFAST;
1052 		}
1053 	}
1054 }
1055 
1056 /*
1057  * Return true if the submirror is unavailable.
1058  * If any of the submirror components are opened then the submirror cannot
1059  * be unavailable (MD_INACCESSIBLE).
1060  * If any of the components are already in the errored state, then the submirror
1061  * cannot be unavailable (MD_INACCESSIBLE).
1062  */
1063 static bool_t
1064 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1065 {
1066 	mm_submirror_t		*sm;
1067 	mm_submirror_ic_t	*smic;
1068 	md_m_shared_t		*shared;
1069 	int			ci;
1070 	int			compcnt;
1071 
1072 	sm = &un->un_sm[smi];
1073 	smic = &un->un_smic[smi];
1074 
1075 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1076 	for (ci = 0; ci < compcnt; ci++) {
1077 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1078 		    (sm->sm_dev, sm, ci);
1079 		if (from_probe) {
1080 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1081 				return (B_FALSE);
1082 		} else {
1083 			if (shared->ms_flags & MDM_S_ISOPEN)
1084 				return (B_FALSE);
1085 		}
1086 		if (shared->ms_state == CS_ERRED ||
1087 		    shared->ms_state == CS_LAST_ERRED)
1088 			return (B_FALSE);
1089 	}
1090 
1091 	return (B_TRUE);
1092 }
1093 
1094 static int
1095 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1096 {
1097 	int		i;
1098 	mm_unit_t	*un;
1099 	mdi_unit_t	*ui;
1100 	int		err;
1101 	int		smi;
1102 	int		ci;
1103 	err_comp_t	*c;
1104 	err_comp_t	*ecomps = NULL;
1105 	int		smmask = 0;
1106 	set_t		setno;
1107 	int		sm_cnt;
1108 	int		sm_unavail_cnt;
1109 
1110 	mirror_check_failfast(mnum);
1111 
1112 	un = MD_UNIT(mnum);
1113 	ui = MDI_UNIT(mnum);
1114 	setno = MD_UN2SET(un);
1115 
1116 	for (i = 0; i < NMIRROR; i++) {
1117 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1118 
1119 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1120 			continue;
1121 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1122 			smmask |= SMI2BIT(i);
1123 		un->un_sm[i].sm_dev = tmpdev;
1124 	}
1125 
1126 	/*
1127 	 * If smmask is clear, all submirrors are accessible. Clear the
1128 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1129 	 * mirror device.   If smmask is set, we have to determine which of the
1130 	 * submirrors are in error. If no submirror is accessible we mark the
1131 	 * whole mirror as MD_INACCESSIBLE.
1132 	 */
1133 	if (smmask == 0) {
1134 		if (lockp) {
1135 			md_ioctl_readerexit(lockp);
1136 			(void) md_ioctl_writerlock(lockp, ui);
1137 		} else {
1138 			md_unit_readerexit(ui);
1139 			(void) md_unit_writerlock(ui);
1140 		}
1141 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1142 		if (lockp) {
1143 			md_ioctl_writerexit(lockp);
1144 			(void) md_ioctl_readerlock(lockp, ui);
1145 		} else {
1146 			md_unit_writerexit(ui);
1147 			(void) md_unit_readerlock(ui);
1148 		}
1149 
1150 		for (i = 0; i < NMIRROR; i++) {
1151 			md_dev64_t	tmpdev;
1152 			mdi_unit_t	*sm_ui;
1153 
1154 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1155 				continue;
1156 
1157 			tmpdev = un->un_sm[i].sm_dev;
1158 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1159 			(void) md_unit_writerlock(sm_ui);
1160 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1161 			md_unit_writerexit(sm_ui);
1162 		}
1163 
1164 		return (0);
1165 	}
1166 
1167 	for (i = 0; i < NMIRROR; i++) {
1168 		md_dev64_t tmpdev;
1169 
1170 		if (!(smmask & SMI2BIT(i)))
1171 			continue;
1172 
1173 		tmpdev = un->un_sm[i].sm_dev;
1174 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1175 		un->un_sm[i].sm_dev = tmpdev;
1176 		ASSERT(err == 0);
1177 	}
1178 
1179 	if (lockp) {
1180 		md_ioctl_readerexit(lockp);
1181 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1182 	} else {
1183 		md_unit_readerexit(ui);
1184 		un = (mm_unit_t *)md_unit_writerlock(ui);
1185 	}
1186 
1187 	/*
1188 	 * We want to make sure the unavailable flag is not masking a real
1189 	 * error on the submirror.
1190 	 * For each submirror,
1191 	 *    if all of the submirror components couldn't be opened and there
1192 	 *    are no errors on the submirror, then set the unavailable flag
1193 	 *    otherwise, clear unavailable.
1194 	 */
1195 	sm_cnt = 0;
1196 	sm_unavail_cnt = 0;
1197 	for (i = 0; i < NMIRROR; i++) {
1198 		md_dev64_t	tmpdev;
1199 		mdi_unit_t	*sm_ui;
1200 
1201 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1202 			continue;
1203 
1204 		sm_cnt++;
1205 		tmpdev = un->un_sm[i].sm_dev;
1206 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1207 
1208 		(void) md_unit_writerlock(sm_ui);
1209 		if (submirror_unavailable(un, i, 0)) {
1210 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1211 			sm_unavail_cnt++;
1212 		} else {
1213 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1214 		}
1215 		md_unit_writerexit(sm_ui);
1216 	}
1217 
1218 	/*
1219 	 * If all of the submirrors are unavailable, the mirror is also
1220 	 * unavailable.
1221 	 */
1222 	if (sm_cnt == sm_unavail_cnt) {
1223 		ui->ui_tstate |= MD_INACCESSIBLE;
1224 	} else {
1225 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1226 	}
1227 
1228 	smi = 0;
1229 	ci = 0;
1230 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1231 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1232 
1233 			free_all_ecomps(ecomps);
1234 			(void) mirror_close_all_devs(un, md_oflags);
1235 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1236 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1237 			mirror_openfail_console_info(un, smi, ci);
1238 			if (lockp) {
1239 				md_ioctl_writerexit(lockp);
1240 				(void) md_ioctl_readerlock(lockp, ui);
1241 			} else {
1242 				md_unit_writerexit(ui);
1243 				(void) md_unit_readerlock(ui);
1244 			}
1245 			return (ENXIO);
1246 		}
1247 
1248 		/* track all component states that need changing */
1249 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1250 		c->ec_next = ecomps;
1251 		c->ec_smi = smi;
1252 		c->ec_ci = ci;
1253 		ecomps = c;
1254 		ci++;
1255 	}
1256 
1257 	/* Make all state changes and commit them */
1258 	for (c = ecomps; c != NULL; c = c->ec_next) {
1259 		/*
1260 		 * If lockp is set, then entering kernel through ioctl.
1261 		 * For a MN set, the only ioctl path is via a commd message
1262 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1263 		 * being sent to each node.
1264 		 * In this case, set NO_XMIT so that set_sm_comp_state
1265 		 * won't attempt to send a message on a message.
1266 		 *
1267 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1268 		 * which flag is passed.
1269 		 */
1270 		if (lockp) {
1271 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1272 			    MD_STATE_NO_XMIT, lockp);
1273 		} else {
1274 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1275 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1276 		}
1277 		/*
1278 		 * For a MN set, the NOTIFY is done when the state change is
1279 		 * processed on each node
1280 		 */
1281 		if (!MD_MNSET_SETNO(setno)) {
1282 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1283 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1284 		}
1285 	}
1286 
1287 	if (lockp) {
1288 		md_ioctl_writerexit(lockp);
1289 		(void) md_ioctl_readerlock(lockp, ui);
1290 	} else {
1291 		md_unit_writerexit(ui);
1292 		(void) md_unit_readerlock(ui);
1293 	}
1294 
1295 	free_all_ecomps(ecomps);
1296 
1297 	/* allocate hotspares for all errored components */
1298 	if (MD_MNSET_SETNO(setno)) {
1299 		/*
1300 		 * If we're called from an ioctl (lockp set) then we cannot
1301 		 * directly call send_poke_hotspares as this will block until
1302 		 * the message gets despatched to all nodes. If the cluster is
1303 		 * going through a reconfig cycle then the message will block
1304 		 * until the cycle is complete, and as we originate from a
1305 		 * service call from commd we will livelock.
1306 		 */
1307 		if (lockp == NULL) {
1308 			md_unit_readerexit(ui);
1309 			send_poke_hotspares(setno);
1310 			(void) md_unit_readerlock(ui);
1311 		}
1312 	} else {
1313 		(void) poke_hotspares();
1314 	}
1315 	return (0);
1316 }
1317 
1318 void
1319 mirror_overlap_tree_remove(md_mps_t *ps)
1320 {
1321 	mm_unit_t	*un;
1322 
1323 	if (panicstr)
1324 		return;
1325 
1326 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1327 	un = ps->ps_un;
1328 
1329 	mutex_enter(&un->un_overlap_tree_mx);
1330 	avl_remove(&un->un_overlap_root, ps);
1331 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1332 	if (un->un_overlap_tree_flag != 0) {
1333 		un->un_overlap_tree_flag = 0;
1334 		cv_broadcast(&un->un_overlap_tree_cv);
1335 	}
1336 	mutex_exit(&un->un_overlap_tree_mx);
1337 }
1338 
1339 
1340 /*
1341  * wait_for_overlaps:
1342  * -----------------
1343  * Check that given i/o request does not cause an overlap with already pending
1344  * i/o. If it does, block until the overlapped i/o completes.
1345  *
1346  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1347  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1348  * it must not already be in the tree.
1349  */
1350 static void
1351 wait_for_overlaps(md_mps_t *ps, int flags)
1352 {
1353 	mm_unit_t	*un;
1354 	avl_index_t	where;
1355 	md_mps_t	*ps1;
1356 
1357 	if (panicstr)
1358 		return;
1359 
1360 	un = ps->ps_un;
1361 	mutex_enter(&un->un_overlap_tree_mx);
1362 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1363 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1364 		mutex_exit(&un->un_overlap_tree_mx);
1365 		return;
1366 	}
1367 
1368 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1369 
1370 	do {
1371 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
1372 		if (ps1 == NULL) {
1373 			/*
1374 			 * The candidate range does not overlap with any
1375 			 * range in the tree.  Insert it and be done.
1376 			 */
1377 			avl_insert(&un->un_overlap_root, ps, where);
1378 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
1379 		} else {
1380 			/*
1381 			 * The candidate range would overlap.  Set the flag
1382 			 * indicating we need to be woken up, and sleep
1383 			 * until another thread removes a range.  If upon
1384 			 * waking up we find this mps was put on the tree
1385 			 * by another thread, the loop terminates.
1386 			 */
1387 			un->un_overlap_tree_flag = 1;
1388 			cv_wait(&un->un_overlap_tree_cv,
1389 			    &un->un_overlap_tree_mx);
1390 		}
1391 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1392 	mutex_exit(&un->un_overlap_tree_mx);
1393 }
1394 
1395 /*
1396  * This function is called from mirror_done to check whether any pages have
1397  * been modified while a mirrored write was in progress.  Returns 0 if
1398  * all pages associated with bp are clean, 1 otherwise.
1399  */
1400 static int
1401 any_pages_dirty(struct buf *bp)
1402 {
1403 	int	rval;
1404 
1405 	rval = biomodified(bp);
1406 	if (rval == -1)
1407 		rval = 0;
1408 
1409 	return (rval);
1410 }
1411 
1412 #define	MAX_EXTRAS 10
1413 
1414 void
1415 mirror_commit(
1416 	mm_unit_t	*un,
1417 	int		smmask,
1418 	mddb_recid_t	*extras
1419 )
1420 {
1421 	mm_submirror_t		*sm;
1422 	md_unit_t		*su;
1423 	int			i;
1424 
1425 	/* 2=mirror,null id */
1426 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1427 
1428 	int			ri = 0;
1429 
1430 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1431 		return;
1432 
1433 	/* Add two, this includes the mirror unit and the null recid */
1434 	if (extras != NULL) {
1435 		int	nrecids = 0;
1436 		while (extras[nrecids] != 0) {
1437 			nrecids++;
1438 		}
1439 		ASSERT(nrecids <= MAX_EXTRAS);
1440 	}
1441 
1442 	if (un != NULL)
1443 		recids[ri++] = un->c.un_record_id;
1444 	for (i = 0;  i < NMIRROR; i++) {
1445 		if (!(smmask & SMI2BIT(i)))
1446 			continue;
1447 		sm = &un->un_sm[i];
1448 		if (!SMS_IS(sm, SMS_INUSE))
1449 			continue;
1450 		if (md_getmajor(sm->sm_dev) != md_major)
1451 			continue;
1452 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1453 		recids[ri++] = su->c.un_record_id;
1454 	}
1455 
1456 	if (extras != NULL)
1457 		while (*extras != 0) {
1458 			recids[ri++] = *extras;
1459 			extras++;
1460 		}
1461 
1462 	if (ri == 0)
1463 		return;
1464 	recids[ri] = 0;
1465 
1466 	/*
1467 	 * Ok to hold ioctl lock across record commit to mddb as
1468 	 * long as the record(s) being committed aren't resync records.
1469 	 */
1470 	mddb_commitrecs_wrapper(recids);
1471 }
1472 
1473 
1474 /*
1475  * This routine is used to set a bit in the writable_bm bitmap
1476  * which represents each submirror in a metamirror which
1477  * is writable. The first writable submirror index is assigned
1478  * to the sm_index.  The number of writable submirrors are returned in nunits.
1479  *
1480  * This routine returns the submirror's unit number.
1481  */
1482 
1483 static void
1484 select_write_units(struct mm_unit *un, md_mps_t *ps)
1485 {
1486 
1487 	int		i;
1488 	unsigned	writable_bm = 0;
1489 	unsigned	nunits = 0;
1490 
1491 	for (i = 0; i < NMIRROR; i++) {
1492 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1493 			/* set bit of all writable units */
1494 			writable_bm |= SMI2BIT(i);
1495 			nunits++;
1496 		}
1497 	}
1498 	ps->ps_writable_sm = writable_bm;
1499 	ps->ps_active_cnt = nunits;
1500 	ps->ps_current_sm = 0;
1501 }
1502 
1503 static
1504 unsigned
1505 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1506 {
1507 
1508 	int		i;
1509 	unsigned	writable_bm = 0;
1510 	unsigned	nunits = 0;
1511 
1512 	for (i = 0; i < NMIRROR; i++) {
1513 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1514 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1515 			writable_bm |= SMI2BIT(i);
1516 			nunits++;
1517 		}
1518 	}
1519 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1520 		writable_bm &= ~ps->ps_allfrom_sm;
1521 		nunits--;
1522 	}
1523 	ps->ps_writable_sm = writable_bm;
1524 	ps->ps_active_cnt = nunits;
1525 	ps->ps_current_sm = 0;
1526 	return (nunits);
1527 }
1528 
1529 static md_dev64_t
1530 select_read_unit(
1531 	mm_unit_t	*un,
1532 	diskaddr_t	blkno,
1533 	u_longlong_t	reqcount,
1534 	u_longlong_t	*cando,
1535 	int		must_be_opened,
1536 	md_m_shared_t	**shared,
1537 	md_mcs_t	*cs)
1538 {
1539 	int			i;
1540 	md_m_shared_t		*s;
1541 	uint_t			lasterrcnt = 0;
1542 	md_dev64_t		dev = 0;
1543 	u_longlong_t		cnt;
1544 	u_longlong_t		mincnt;
1545 	mm_submirror_t		*sm;
1546 	mm_submirror_ic_t	*smic;
1547 	mdi_unit_t		*ui;
1548 
1549 	mincnt = reqcount;
1550 	for (i = 0; i < NMIRROR; i++) {
1551 		if (!SUBMIRROR_IS_READABLE(un, i))
1552 			continue;
1553 		sm = &un->un_sm[i];
1554 		smic = &un->un_smic[i];
1555 		cnt = reqcount;
1556 
1557 		/*
1558 		 * If the current submirror is marked as inaccessible, do not
1559 		 * try to access it.
1560 		 */
1561 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1562 		(void) md_unit_readerlock(ui);
1563 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1564 			md_unit_readerexit(ui);
1565 			continue;
1566 		}
1567 		md_unit_readerexit(ui);
1568 
1569 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1570 		    (sm->sm_dev, sm, blkno, &cnt);
1571 
1572 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1573 			continue;
1574 		if (s->ms_state == CS_OKAY) {
1575 			*cando = cnt;
1576 			if (shared != NULL)
1577 				*shared = s;
1578 
1579 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1580 			    cs != NULL) {
1581 				cs->cs_buf.b_flags |= B_FAILFAST;
1582 			}
1583 
1584 			return (un->un_sm[i].sm_dev);
1585 		}
1586 		if (s->ms_state != CS_LAST_ERRED)
1587 			continue;
1588 
1589 		/* don't use B_FAILFAST since we're Last Erred */
1590 
1591 		if (mincnt > cnt)
1592 			mincnt = cnt;
1593 		if (s->ms_lasterrcnt > lasterrcnt) {
1594 			lasterrcnt = s->ms_lasterrcnt;
1595 			if (shared != NULL)
1596 				*shared = s;
1597 			dev = un->un_sm[i].sm_dev;
1598 		}
1599 	}
1600 	*cando = mincnt;
1601 	return (dev);
1602 }
1603 
1604 /*
1605  * Given a 32-bit bitmap, this routine will return the bit number
1606  * of the nth bit set.	The nth bit set is passed via the index integer.
1607  *
1608  * This routine is used to run through the writable submirror bitmap
1609  * and starting all of the writes.  See the value returned is the
1610  * index to appropriate submirror structure, in the md_sm
1611  * array for metamirrors.
1612  */
1613 static int
1614 md_find_nth_unit(uint_t mask, int index)
1615 {
1616 	int	bit, nfound;
1617 
1618 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1619 		ASSERT(mask != 0);
1620 		nfound += (mask & 1);
1621 		mask >>= 1;
1622 	}
1623 	return (bit);
1624 }
1625 
1626 static int
1627 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1628 {
1629 	mm_unit_t	*un;
1630 	buf_t		*bp;
1631 	int		i;
1632 	unsigned	nunits = 0;
1633 	int		iunit;
1634 	uint_t		running_bm = 0;
1635 	uint_t		sm_index;
1636 
1637 	bp = &cs->cs_buf;
1638 	un = ps->ps_un;
1639 
1640 	for (i = 0; i < NMIRROR; i++) {
1641 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1642 			continue;
1643 		running_bm |= SMI2BIT(i);
1644 		nunits++;
1645 	}
1646 	if (nunits == 0)
1647 		return (1);
1648 
1649 	/*
1650 	 * For directed mirror read (DMR) we only use the specified side and
1651 	 * do not compute the source of the read.
1652 	 * If we're running with MD_MPS_DIRTY_RD set we always return the
1653 	 * first mirror side (this prevents unnecessary ownership switching).
1654 	 * Otherwise we return the submirror according to the mirror read option
1655 	 */
1656 	if (ps->ps_flags & MD_MPS_DMR) {
1657 		sm_index = un->un_dmr_last_read;
1658 	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1659 		sm_index = md_find_nth_unit(running_bm, 0);
1660 	} else {
1661 		/* Normal (non-DMR) operation */
1662 		switch (un->un_read_option) {
1663 		case RD_GEOMETRY:
1664 			iunit = (int)(bp->b_lblkno /
1665 			    howmany(un->c.un_total_blocks, nunits));
1666 			sm_index = md_find_nth_unit(running_bm, iunit);
1667 			break;
1668 		case RD_FIRST:
1669 			sm_index = md_find_nth_unit(running_bm, 0);
1670 			break;
1671 		case RD_LOAD_BAL:
1672 			/* this is intentional to fall into the default */
1673 		default:
1674 			un->un_last_read = (un->un_last_read + 1) % nunits;
1675 			sm_index = md_find_nth_unit(running_bm,
1676 			    un->un_last_read);
1677 			break;
1678 		}
1679 	}
1680 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1681 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1682 
1683 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1684 		bp->b_flags |= B_FAILFAST;
1685 	}
1686 
1687 	return (0);
1688 }
1689 
1690 static
1691 int
1692 mirror_are_submirrors_available(mm_unit_t *un)
1693 {
1694 	int i;
1695 	for (i = 0; i < NMIRROR; i++) {
1696 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1697 
1698 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1699 		    md_getmajor(tmpdev) != md_major)
1700 			continue;
1701 
1702 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1703 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1704 			return (0);
1705 
1706 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1707 			return (0);
1708 	}
1709 	return (1);
1710 }
1711 
1712 void
1713 build_submirror(mm_unit_t *un, int i, int snarfing)
1714 {
1715 	struct mm_submirror	*sm;
1716 	struct mm_submirror_ic	*smic;
1717 	md_unit_t		*su;
1718 	set_t			setno;
1719 
1720 	sm = &un->un_sm[i];
1721 	smic = &un->un_smic[i];
1722 
1723 	sm->sm_flags = 0; /* sometime we may need to do more here */
1724 
1725 	setno = MD_UN2SET(un);
1726 
1727 	if (!SMS_IS(sm, SMS_INUSE))
1728 		return;
1729 	if (snarfing) {
1730 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1731 		    sm->sm_key, MD_NOTRUST_DEVT);
1732 	} else {
1733 		if (md_getmajor(sm->sm_dev) == md_major) {
1734 			su = MD_UNIT(md_getminor(sm->sm_dev));
1735 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1736 			/* submirror can no longer be soft partitioned */
1737 			MD_CAPAB(su) &= (~MD_CAN_SP);
1738 		}
1739 	}
1740 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1741 	    0, "shared by blk", 0);
1742 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1743 	    0, "shared by indx", 0);
1744 	smic->sm_get_component_count = (int (*)())md_get_named_service(
1745 	    sm->sm_dev, 0, "get component count", 0);
1746 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1747 	    "get block count skip size", 0);
1748 	sm->sm_state &= ~SMS_IGNORE;
1749 	if (SMS_IS(sm, SMS_OFFLINE))
1750 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1751 	md_set_parent(sm->sm_dev, MD_SID(un));
1752 }
1753 
1754 static void
1755 mirror_cleanup(mm_unit_t *un)
1756 {
1757 	mddb_recid_t	recid;
1758 	int		smi;
1759 	sv_dev_t	sv[NMIRROR];
1760 	int		nsv = 0;
1761 
1762 	/*
1763 	 * If a MN diskset and this node is not the master, do
1764 	 * not delete any records on snarf of the mirror records.
1765 	 */
1766 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1767 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1768 		return;
1769 	}
1770 
1771 	for (smi = 0; smi < NMIRROR; smi++) {
1772 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1773 			continue;
1774 		sv[nsv].setno = MD_UN2SET(un);
1775 		sv[nsv++].key = un->un_sm[smi].sm_key;
1776 	}
1777 
1778 	recid = un->un_rr_dirty_recid;
1779 	mddb_deleterec_wrapper(un->c.un_record_id);
1780 	if (recid > 0)
1781 		mddb_deleterec_wrapper(recid);
1782 
1783 	md_rem_names(sv, nsv);
1784 }
1785 
1786 /*
1787  * Comparison function for the avl tree which tracks
1788  * outstanding writes on submirrors.
1789  *
1790  * Returns:
1791  *	-1: ps1 < ps2
1792  *	 0: ps1 and ps2 overlap
1793  *	 1: ps1 > ps2
1794  */
1795 static int
1796 mirror_overlap_compare(const void *p1, const void *p2)
1797 {
1798 	const md_mps_t *ps1 = (md_mps_t *)p1;
1799 	const md_mps_t *ps2 = (md_mps_t *)p2;
1800 
1801 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
1802 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
1803 			return (0);
1804 		return (-1);
1805 	}
1806 
1807 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
1808 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
1809 			return (0);
1810 		return (1);
1811 	}
1812 
1813 	return (0);
1814 }
1815 
1816 /*
1817  * Collapse any sparse submirror entries snarfed from the on-disk replica.
1818  * Only the in-core entries are updated. The replica will be updated on-disk
1819  * when the in-core replica is committed on shutdown of the SVM subsystem.
1820  */
1821 static void
1822 collapse_submirrors(mm_unit_t *un)
1823 {
1824 	int			smi, nremovals, smiremove;
1825 	mm_submirror_t		*sm, *new_sm, *old_sm;
1826 	mm_submirror_ic_t	*smic;
1827 	int			nsmidx = un->un_nsm - 1;
1828 
1829 rescan:
1830 	nremovals = 0;
1831 	smiremove = -1;
1832 
1833 	for (smi = 0; smi <= nsmidx; smi++) {
1834 		sm = &un->un_sm[smi];
1835 
1836 		/*
1837 		 * Check to see if this submirror is marked as in-use.
1838 		 * If it isn't then it is a potential sparse entry and
1839 		 * may need to be cleared from the configuration.
1840 		 * The records should _already_ have been cleared by the
1841 		 * original mirror_detach() code, but we need to shuffle
1842 		 * any NULL entries in un_sm[] to the end of the array.
1843 		 * Any NULL un_smic[] entries need to be reset to the underlying
1844 		 * submirror/slice accessor functions.
1845 		 */
1846 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1847 			nremovals++;
1848 			smiremove = smi;
1849 			break;
1850 		}
1851 	}
1852 
1853 	if (nremovals == 0) {
1854 		/*
1855 		 * Ensure that we have a matching contiguous set of un_smic[]
1856 		 * entries for the corresponding un_sm[] entries
1857 		 */
1858 		for (smi = 0; smi <= nsmidx; smi++) {
1859 			smic = &un->un_smic[smi];
1860 			sm = &un->un_sm[smi];
1861 
1862 			smic->sm_shared_by_blk =
1863 			    md_get_named_service(sm->sm_dev, 0,
1864 			    "shared by_blk", 0);
1865 			smic->sm_shared_by_indx =
1866 			    md_get_named_service(sm->sm_dev, 0,
1867 			    "shared by indx", 0);
1868 			smic->sm_get_component_count =
1869 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
1870 			    "get component count", 0);
1871 			smic->sm_get_bcss =
1872 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
1873 			    "get block count skip size", 0);
1874 		}
1875 		return;
1876 	}
1877 
1878 	/*
1879 	 * Reshuffle the submirror devices so that we do not have a dead record
1880 	 * in the middle of the array. Once we've done this we need to rescan
1881 	 * the mirror to check for any other holes.
1882 	 */
1883 	for (smi = 0; smi < NMIRROR; smi++) {
1884 		if (smi < smiremove)
1885 			continue;
1886 		if (smi > smiremove) {
1887 			old_sm = &un->un_sm[smi];
1888 			new_sm = &un->un_sm[smi - 1];
1889 			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
1890 			bzero(old_sm, sizeof (mm_submirror_t));
1891 		}
1892 	}
1893 
1894 	/*
1895 	 * Now we need to rescan the array to find the next potential dead
1896 	 * entry.
1897 	 */
1898 	goto rescan;
1899 }
1900 
1901 /* Return a -1 if optimized record unavailable and set should be released */
1902 int
1903 mirror_build_incore(mm_unit_t *un, int snarfing)
1904 {
1905 	int		i;
1906 
1907 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1908 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1909 		return (1);
1910 	}
1911 
1912 	if (mirror_are_submirrors_available(un) == 0)
1913 		return (1);
1914 
1915 	if (MD_UNIT(MD_SID(un)) != NULL)
1916 		return (0);
1917 
1918 	MD_STATUS(un) = 0;
1919 
1920 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1921 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1922 
1923 	un->un_overlap_tree_flag = 0;
1924 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
1925 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1926 
1927 	/*
1928 	 * We need to collapse any sparse submirror entries into a non-sparse
1929 	 * array. This is to cover the case where we have an old replica image
1930 	 * which has not been updated (i.e. snarfed) since being modified.
1931 	 * The new code expects all submirror access to be sequential (i.e.
1932 	 * both the un_sm[] and un_smic[] entries correspond to non-empty
1933 	 * submirrors.
1934 	 */
1935 
1936 	collapse_submirrors(un);
1937 
1938 	for (i = 0; i < NMIRROR; i++)
1939 		build_submirror(un, i, snarfing);
1940 
1941 	if (unit_setup_resync(un, snarfing) != 0) {
1942 		if (snarfing) {
1943 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1944 			/*
1945 			 * If a MN set and set is not stale, then return -1
1946 			 * which will force the caller to unload the set.
1947 			 * The MN diskset nodes will return failure if
1948 			 * unit_setup_resync fails so that nodes won't
1949 			 * get out of sync.
1950 			 *
1951 			 * If set is STALE, the master node can't allocate
1952 			 * a resync record (if needed), but node needs to
1953 			 * join the set so that user can delete broken mddbs.
1954 			 * So, if set is STALE, just continue on.
1955 			 */
1956 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1957 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1958 				return (-1);
1959 			}
1960 		} else
1961 			return (1);
1962 	}
1963 
1964 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1965 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1966 
1967 	un->un_suspend_wr_flag = 0;
1968 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1969 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1970 
1971 	/*
1972 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1973 	 * All references to the owner message state field must be guarded
1974 	 * by this mutex.
1975 	 */
1976 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1977 
1978 	/*
1979 	 * Allocate mutex and condvar for resync thread manipulation. These
1980 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1981 	 */
1982 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1983 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1984 
1985 	/*
1986 	 * Allocate mutex and condvar for resync progress thread manipulation.
1987 	 * This allows resyncs to be continued across an intervening reboot.
1988 	 */
1989 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1990 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1991 
1992 	/*
1993 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1994 	 * provides synchronization between a user-ioctl and the resulting
1995 	 * strategy() call that performs the read().
1996 	 */
1997 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1998 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1999 
2000 	/*
2001 	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
2002 	 */
2003 	for (i = 0; i < MD_MNMAXSIDES; i++) {
2004 		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
2005 	}
2006 
2007 	/* place various information in the in-core data structures */
2008 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
2009 	MD_UNIT(MD_SID(un)) = un;
2010 
2011 	return (0);
2012 }
2013 
2014 
2015 void
2016 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
2017 {
2018 	mddb_recid_t	recid, vtoc_id;
2019 	size_t		bitcnt;
2020 	size_t		shortcnt;
2021 	int		smi;
2022 	sv_dev_t	sv[NMIRROR];
2023 	int		nsv = 0;
2024 	uint_t		bits = 0;
2025 	minor_t		selfid;
2026 	md_unit_t	*su;
2027 	int		i;
2028 
2029 	md_destroy_unit_incore(mnum, &mirror_md_ops);
2030 
2031 	shortcnt = un->un_rrd_num * sizeof (short);
2032 	bitcnt = howmany(un->un_rrd_num, NBBY);
2033 
2034 	if (un->un_outstanding_writes)
2035 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
2036 	if (un->un_goingclean_bm)
2037 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
2038 	if (un->un_goingdirty_bm)
2039 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
2040 	if (un->un_resync_bm)
2041 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
2042 	if (un->un_pernode_dirty_sum)
2043 		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
2044 
2045 	/*
2046 	 * Destroy the taskq for deferred processing of DRL clean requests.
2047 	 * This taskq will only be present for Multi Owner mirrors.
2048 	 */
2049 	if (un->un_drl_task != NULL)
2050 		ddi_taskq_destroy(un->un_drl_task);
2051 
2052 	md_nblocks_set(mnum, -1ULL);
2053 	MD_UNIT(mnum) = NULL;
2054 
2055 	/*
2056 	 * Attempt release of its minor node
2057 	 */
2058 	md_remove_minor_node(mnum);
2059 
2060 	if (!removing)
2061 		return;
2062 
2063 	for (smi = 0; smi < NMIRROR; smi++) {
2064 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
2065 			continue;
2066 		/* reallow soft partitioning of submirror and reset parent */
2067 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
2068 		MD_CAPAB(su) |= MD_CAN_SP;
2069 		md_reset_parent(un->un_sm[smi].sm_dev);
2070 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
2071 
2072 		sv[nsv].setno = MD_MIN2SET(mnum);
2073 		sv[nsv++].key = un->un_sm[smi].sm_key;
2074 		bits |= SMI2BIT(smi);
2075 	}
2076 
2077 	MD_STATUS(un) |= MD_UN_BEING_RESET;
2078 	recid = un->un_rr_dirty_recid;
2079 	vtoc_id = un->c.un_vtoc_id;
2080 	selfid = MD_SID(un);
2081 
2082 	mirror_commit(un, bits, 0);
2083 
2084 	avl_destroy(&un->un_overlap_root);
2085 
2086 	/* Destroy all mutexes and condvars before returning. */
2087 	mutex_destroy(&un->un_suspend_wr_mx);
2088 	cv_destroy(&un->un_suspend_wr_cv);
2089 	mutex_destroy(&un->un_overlap_tree_mx);
2090 	cv_destroy(&un->un_overlap_tree_cv);
2091 	mutex_destroy(&un->un_owner_mx);
2092 	mutex_destroy(&un->un_rs_thread_mx);
2093 	cv_destroy(&un->un_rs_thread_cv);
2094 	mutex_destroy(&un->un_rs_progress_mx);
2095 	cv_destroy(&un->un_rs_progress_cv);
2096 	mutex_destroy(&un->un_dmr_mx);
2097 	cv_destroy(&un->un_dmr_cv);
2098 
2099 	for (i = 0; i < MD_MNMAXSIDES; i++) {
2100 		rw_destroy(&un->un_pernode_dirty_mx[i]);
2101 		if (un->un_pernode_dirty_bm[i])
2102 			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2103 	}
2104 
2105 	/*
2106 	 * Remove self from the namespace
2107 	 */
2108 	if (un->c.un_revision & MD_FN_META_DEV) {
2109 		(void) md_rem_selfname(un->c.un_self_id);
2110 	}
2111 
2112 	/* This frees the unit structure. */
2113 	mddb_deleterec_wrapper(un->c.un_record_id);
2114 
2115 	if (recid != 0)
2116 		mddb_deleterec_wrapper(recid);
2117 
2118 	/* Remove the vtoc, if present */
2119 	if (vtoc_id)
2120 		mddb_deleterec_wrapper(vtoc_id);
2121 
2122 	md_rem_names(sv, nsv);
2123 
2124 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2125 	    MD_MIN2SET(selfid), selfid);
2126 }
2127 
2128 int
2129 mirror_internal_open(
2130 	minor_t		mnum,
2131 	int		flag,
2132 	int		otyp,
2133 	int		md_oflags,
2134 	IOLOCK		*lockp		/* can be NULL */
2135 )
2136 {
2137 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2138 	int		err = 0;
2139 
2140 tryagain:
2141 	/* single thread */
2142 	if (lockp) {
2143 		/*
2144 		 * If ioctl lock is held, use openclose_enter
2145 		 * routine that will set the ioctl flag when
2146 		 * grabbing the readerlock.
2147 		 */
2148 		(void) md_ioctl_openclose_enter(lockp, ui);
2149 	} else {
2150 		(void) md_unit_openclose_enter(ui);
2151 	}
2152 
2153 	/*
2154 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2155 	 * message in a MN diskset and this requires that the openclose
2156 	 * lock is dropped in order to send this message.  So, another
2157 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2158 	 * attempting an open while this thread has an open in progress.
2159 	 * Call the *_lh version of the lock exit routines since the ui_mx
2160 	 * mutex must be held from checking for OPENINPROGRESS until
2161 	 * after the cv_wait call.
2162 	 */
2163 	mutex_enter(&ui->ui_mx);
2164 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2165 		if (lockp) {
2166 			(void) md_ioctl_openclose_exit_lh(lockp);
2167 		} else {
2168 			md_unit_openclose_exit_lh(ui);
2169 		}
2170 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2171 		mutex_exit(&ui->ui_mx);
2172 		goto tryagain;
2173 	}
2174 
2175 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2176 	mutex_exit(&ui->ui_mx);
2177 
2178 	/* open devices, if necessary */
2179 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2180 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2181 			goto out;
2182 	}
2183 
2184 	/* count open */
2185 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2186 		goto out;
2187 
2188 	/* unlock, return success */
2189 out:
2190 	mutex_enter(&ui->ui_mx);
2191 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2192 	mutex_exit(&ui->ui_mx);
2193 
2194 	if (lockp) {
2195 		/*
2196 		 * If ioctl lock is held, use openclose_exit
2197 		 * routine that will clear the lockp reader flag.
2198 		 */
2199 		(void) md_ioctl_openclose_exit(lockp);
2200 	} else {
2201 		md_unit_openclose_exit(ui);
2202 	}
2203 	return (err);
2204 }
2205 
2206 int
2207 mirror_internal_close(
2208 	minor_t		mnum,
2209 	int		otyp,
2210 	int		md_cflags,
2211 	IOLOCK		*lockp		/* can be NULL */
2212 )
2213 {
2214 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2215 	mm_unit_t	*un;
2216 	int		err = 0;
2217 
2218 	/* single thread */
2219 	if (lockp) {
2220 		/*
2221 		 * If ioctl lock is held, use openclose_enter
2222 		 * routine that will set the ioctl flag when
2223 		 * grabbing the readerlock.
2224 		 */
2225 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2226 	} else {
2227 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2228 	}
2229 
2230 	/* count closed */
2231 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2232 		goto out;
2233 
2234 	/* close devices, if necessary */
2235 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2236 		/*
2237 		 * Clean up dirty bitmap for this unit. Do this
2238 		 * before closing the underlying devices to avoid
2239 		 * race conditions with reset_mirror() as a
2240 		 * result of a 'metaset -r' command running in
2241 		 * parallel. This might cause deallocation of
2242 		 * dirty region bitmaps; with underlying metadevices
2243 		 * in place this can't happen.
2244 		 * Don't do this if a MN set and ABR not set
2245 		 */
2246 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2247 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2248 			    !(ui->ui_tstate & MD_ABR_CAP))
2249 				mirror_process_unit_resync(un);
2250 		}
2251 		(void) mirror_close_all_devs(un, md_cflags);
2252 
2253 		/*
2254 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2255 		 * clear these capabilities on the last open in the cluster.
2256 		 * To do this we send a message to all nodes to see of the
2257 		 * device is open.
2258 		 */
2259 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2260 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2261 			if (lockp) {
2262 				(void) md_ioctl_openclose_exit(lockp);
2263 			} else {
2264 				md_unit_openclose_exit(ui);
2265 			}
2266 
2267 			/*
2268 			 * if we are in the context of an ioctl, drop the
2269 			 * ioctl lock.
2270 			 * Otherwise, no other locks should be held.
2271 			 */
2272 			if (lockp) {
2273 				IOLOCK_RETURN_RELEASE(0, lockp);
2274 			}
2275 
2276 			mdmn_clear_all_capabilities(mnum);
2277 
2278 			/* if dropped the lock previously, regain it */
2279 			if (lockp) {
2280 				IOLOCK_RETURN_REACQUIRE(lockp);
2281 			}
2282 			return (0);
2283 		}
2284 		/* unlock and return success */
2285 	}
2286 out:
2287 	/* Call whether lockp is NULL or not. */
2288 	if (lockp) {
2289 		md_ioctl_openclose_exit(lockp);
2290 	} else {
2291 		md_unit_openclose_exit(ui);
2292 	}
2293 	return (err);
2294 }
2295 
2296 /*
2297  * When a component has completed resyncing and is now ok, check if the
2298  * corresponding component in the other submirrors is in the Last Erred
2299  * state.  If it is, we want to change that to the Erred state so we stop
2300  * using that component and start using this good component instead.
2301  *
2302  * This is called from set_sm_comp_state and recursively calls
2303  * set_sm_comp_state if it needs to change the Last Erred state.
2304  */
2305 static void
2306 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2307 	IOLOCK *lockp)
2308 {
2309 	mm_submirror_t		*sm;
2310 	mm_submirror_ic_t	*smic;
2311 	int			ci;
2312 	int			i;
2313 	int			compcnt;
2314 	int			changed = 0;
2315 
2316 	for (i = 0; i < NMIRROR; i++) {
2317 		sm = &un->un_sm[i];
2318 		smic = &un->un_smic[i];
2319 
2320 		if (!SMS_IS(sm, SMS_INUSE))
2321 			continue;
2322 
2323 		/* ignore the submirror that we just made ok */
2324 		if (i == smi)
2325 			continue;
2326 
2327 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2328 		for (ci = 0; ci < compcnt; ci++) {
2329 			md_m_shared_t	*shared;
2330 
2331 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2332 			    (sm->sm_dev, sm, ci);
2333 
2334 			if ((shared->ms_state & CS_LAST_ERRED) &&
2335 			    !mirror_other_sources(un, i, ci, 1)) {
2336 
2337 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2338 				    flags, lockp);
2339 				changed = 1;
2340 			}
2341 		}
2342 	}
2343 
2344 	/* maybe there is a hotspare for this newly erred component */
2345 	if (changed) {
2346 		set_t	setno;
2347 
2348 		setno = MD_UN2SET(un);
2349 		if (MD_MNSET_SETNO(setno)) {
2350 			send_poke_hotspares(setno);
2351 		} else {
2352 			(void) poke_hotspares();
2353 		}
2354 	}
2355 }
2356 
2357 /*
2358  * set_sm_comp_state
2359  *
2360  * Set the state of a submirror component to the specified new state.
2361  * If the mirror is in a multi-node set, send messages to all nodes to
2362  * block all writes to the mirror and then update the state and release the
2363  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2364  * MD_STATE_XMIT will be unset in 2 cases:
2365  * 1. When the state is changed to CS_RESYNC as this state change
2366  * will already have been updated on each node by the processing of the
2367  * distributed metasync command, hence no need to xmit.
2368  * 2. When the state is change to CS_OKAY after a resync has completed. Again
2369  * the resync completion will already have been processed on each node by
2370  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2371  * resync, hence no need to xmit.
2372  *
2373  * In case we are called from the updates of a watermark,
2374  * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
2375  * a metainit or similar. In this case the message that we sent to propagate
2376  * the state change must not be a class1 message as that would deadlock with
2377  * the metainit command that is still being processed.
2378  * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2379  * instead. This also makes the submessage generator to create a class2
2380  * submessage rather than a class1 (which would also block)
2381  *
2382  * On entry, unit_writerlock is held
2383  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2384  * also held.
2385  */
2386 void
2387 set_sm_comp_state(
2388 	mm_unit_t	*un,
2389 	int		smi,
2390 	int		ci,
2391 	int		newstate,
2392 	mddb_recid_t	*extras,
2393 	uint_t		flags,
2394 	IOLOCK		*lockp
2395 )
2396 {
2397 	mm_submirror_t		*sm;
2398 	mm_submirror_ic_t	*smic;
2399 	md_m_shared_t		*shared;
2400 	int			origstate;
2401 	void			(*get_dev)();
2402 	ms_cd_info_t		cd;
2403 	char			devname[MD_MAX_CTDLEN];
2404 	int			err;
2405 	set_t			setno = MD_UN2SET(un);
2406 	md_mn_msg_stch_t	stchmsg;
2407 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2408 	md_mn_kresult_t		*kresult;
2409 	int			rval;
2410 	uint_t			msgflags;
2411 	md_mn_msgtype_t		msgtype;
2412 	int			save_lock = 0;
2413 	mdi_unit_t		*ui_sm;
2414 
2415 	sm = &un->un_sm[smi];
2416 	smic = &un->un_smic[smi];
2417 
2418 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2419 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2420 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2421 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2422 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2423 	}
2424 
2425 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2426 	    (sm->sm_dev, sm, ci);
2427 	origstate = shared->ms_state;
2428 
2429 	/*
2430 	 * If the new state is an error and the old one wasn't, generate
2431 	 * a console message. We do this before we send the state to other
2432 	 * nodes in a MN set because the state change may change the component
2433 	 * name  if a hotspare is allocated.
2434 	 */
2435 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2436 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2437 
2438 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2439 		    "get device", 0);
2440 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2441 
2442 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2443 		    cd.cd_dev, devname, sizeof (devname));
2444 
2445 		if (err == ENOENT) {
2446 			(void) md_devname(setno, cd.cd_dev, devname,
2447 			    sizeof (devname));
2448 		}
2449 
2450 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2451 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2452 
2453 		if (newstate & CS_LAST_ERRED) {
2454 			cmn_err(CE_WARN, "md: %s: %s last erred",
2455 			    md_shortname(md_getminor(sm->sm_dev)),
2456 			    devname);
2457 
2458 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2459 			/*
2460 			 * Close the broken device and clear the open flag on
2461 			 * it.  Closing the device means the RCM framework will
2462 			 * be able to unconfigure the device if required.
2463 			 *
2464 			 * We have to check that the device is open, otherwise
2465 			 * the first open on it has resulted in the error that
2466 			 * is being processed and the actual cd.cd_dev will be
2467 			 * NODEV64.
2468 			 *
2469 			 * If this is a multi-node mirror, then the multinode
2470 			 * state checks following this code will cause the
2471 			 * slave nodes to close the mirror in the function
2472 			 * mirror_set_state().
2473 			 */
2474 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2475 			shared->ms_flags &= ~MDM_S_ISOPEN;
2476 		}
2477 
2478 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2479 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2480 		/*
2481 		 * Similar to logic above except no log messages since we
2482 		 * are just transitioning from Last Erred to Erred.
2483 		 */
2484 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2485 		    "get device", 0);
2486 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2487 
2488 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2489 		shared->ms_flags &= ~MDM_S_ISOPEN;
2490 	}
2491 
2492 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2493 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2494 		/*
2495 		 * For a multi-node mirror, send the state change to the
2496 		 * master, which broadcasts to all nodes, including this
2497 		 * one. Once the message is received, the state is set
2498 		 * in-core and the master commits the change to disk.
2499 		 * There is a case, comp_replace,  where this function
2500 		 * can be called from within an ioctl and therefore in this
2501 		 * case, as the ioctl will already be called on each node,
2502 		 * there is no need to xmit the state change to the master for
2503 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2504 		 * to indicate whether a xmit is required. The mirror's
2505 		 * transient state is set to MD_ERR_PENDING to avoid sending
2506 		 * multiple messages.
2507 		 */
2508 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2509 			ui->ui_tstate |= MD_ERR_PENDING;
2510 
2511 		/*
2512 		 * Send a state update message to all nodes. This message
2513 		 * will generate 2 submessages, the first one to suspend
2514 		 * all writes to the mirror and the second to update the
2515 		 * state and resume writes.
2516 		 */
2517 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2518 		stchmsg.msg_stch_sm = smi;
2519 		stchmsg.msg_stch_comp = ci;
2520 		stchmsg.msg_stch_new_state = newstate;
2521 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2522 #ifdef DEBUG
2523 		if (mirror_debug_flag)
2524 			printf("send set state, %x, %x, %x, %x, %x\n",
2525 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2526 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2527 			    stchmsg.msg_stch_hs_id);
2528 #endif
2529 		if (flags & MD_STATE_WMUPDATE) {
2530 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2531 			/*
2532 			 * When coming from an update of watermarks, there
2533 			 * must already be a message logged that triggered
2534 			 * this action. So, no need to log this message, too.
2535 			 */
2536 			msgflags = MD_MSGF_NO_LOG;
2537 		} else {
2538 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2539 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2540 		}
2541 
2542 		/*
2543 		 * If we are in the context of an ioctl, drop the ioctl lock.
2544 		 * lockp holds the list of locks held.
2545 		 *
2546 		 * Otherwise, increment the appropriate reacquire counters.
2547 		 * If openclose lock is *held, then must reacquire reader
2548 		 * lock before releasing the openclose lock.
2549 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2550 		 * to reacquire it.
2551 		 */
2552 		if (lockp) {
2553 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2554 				save_lock = MD_ARRAY_WRITER;
2555 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2556 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2557 				save_lock = MD_ARRAY_READER;
2558 				lockp->l_flags &= ~MD_ARRAY_READER;
2559 			}
2560 			IOLOCK_RETURN_RELEASE(0, lockp);
2561 		} else {
2562 			if (flags & MD_STATE_OCHELD) {
2563 				md_unit_writerexit(ui);
2564 				(void) md_unit_readerlock(ui);
2565 				md_unit_openclose_exit(ui);
2566 			} else {
2567 				md_unit_writerexit(ui);
2568 			}
2569 		}
2570 
2571 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2572 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2573 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
2574 
2575 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2576 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2577 			/* If we're shutting down already, pause things here. */
2578 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2579 				while (!md_mn_is_commd_present()) {
2580 					delay(md_hz);
2581 				}
2582 			}
2583 			cmn_err(CE_PANIC,
2584 			    "ksend_message failure: STATE_UPDATE");
2585 		}
2586 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2587 
2588 		/* if dropped the lock previously, regain it */
2589 		if (lockp) {
2590 			IOLOCK_RETURN_REACQUIRE(lockp);
2591 			lockp->l_flags |= save_lock;
2592 		} else {
2593 			/*
2594 			 * Reacquire dropped locks and update acquirecnts
2595 			 * appropriately.
2596 			 */
2597 			if (flags & MD_STATE_OCHELD) {
2598 				/*
2599 				 * openclose also grabs readerlock.
2600 				 */
2601 				(void) md_unit_openclose_enter(ui);
2602 				md_unit_readerexit(ui);
2603 				(void) md_unit_writerlock(ui);
2604 			} else {
2605 				(void) md_unit_writerlock(ui);
2606 			}
2607 		}
2608 
2609 		ui->ui_tstate &= ~MD_ERR_PENDING;
2610 	} else {
2611 		shared->ms_state = newstate;
2612 		uniqtime32(&shared->ms_timestamp);
2613 
2614 		if (newstate == CS_ERRED)
2615 			shared->ms_flags |= MDM_S_NOWRITE;
2616 		else
2617 			shared->ms_flags &= ~MDM_S_NOWRITE;
2618 
2619 		shared->ms_flags &= ~MDM_S_IOERR;
2620 		un->un_changecnt++;
2621 		shared->ms_lasterrcnt = un->un_changecnt;
2622 
2623 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2624 		mirror_commit(un, SMI2BIT(smi), extras);
2625 	}
2626 
2627 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2628 		/*
2629 		 * Resetting the Last Erred state will recursively call back
2630 		 * into this function (set_sm_comp_state) to update the state.
2631 		 */
2632 		reset_lasterred(un, smi, extras, flags, lockp);
2633 	}
2634 }
2635 
2636 static int
2637 find_another_logical(
2638 	mm_unit_t		*un,
2639 	mm_submirror_t		*esm,
2640 	diskaddr_t		blk,
2641 	u_longlong_t		cnt,
2642 	int			must_be_open,
2643 	int			state,
2644 	int			err_cnt)
2645 {
2646 	u_longlong_t	cando;
2647 	md_dev64_t	dev;
2648 	md_m_shared_t	*s;
2649 
2650 	esm->sm_state |= SMS_IGNORE;
2651 	while (cnt != 0) {
2652 		u_longlong_t	 mcnt;
2653 
2654 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2655 
2656 		dev = select_read_unit(un, blk, mcnt, &cando,
2657 		    must_be_open, &s, NULL);
2658 		if (dev == (md_dev64_t)0)
2659 			break;
2660 
2661 		if ((state == CS_LAST_ERRED) &&
2662 		    (s->ms_state == CS_LAST_ERRED) &&
2663 		    (err_cnt > s->ms_lasterrcnt))
2664 			break;
2665 
2666 		cnt -= cando;
2667 		blk += cando;
2668 	}
2669 	esm->sm_state &= ~SMS_IGNORE;
2670 	return (cnt != 0);
2671 }
2672 
2673 int
2674 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2675 {
2676 	mm_submirror_t		*sm;
2677 	mm_submirror_ic_t	*smic;
2678 	size_t			count;
2679 	diskaddr_t		block;
2680 	u_longlong_t		skip;
2681 	u_longlong_t		size;
2682 	md_dev64_t		dev;
2683 	int			cnt;
2684 	md_m_shared_t		*s;
2685 	int			not_found;
2686 
2687 	sm = &un->un_sm[smi];
2688 	smic = &un->un_smic[smi];
2689 	dev = sm->sm_dev;
2690 
2691 	/*
2692 	 * Make sure every component of the submirror
2693 	 * has other sources.
2694 	 */
2695 	if (ci < 0) {
2696 		/* Find the highest lasterrcnt */
2697 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2698 		for (ci = 0; ci < cnt; ci++) {
2699 			not_found = mirror_other_sources(un, smi, ci,
2700 			    must_be_open);
2701 			if (not_found)
2702 				return (1);
2703 		}
2704 		return (0);
2705 	}
2706 
2707 	/*
2708 	 * Make sure this component has other sources
2709 	 */
2710 	(void) (*(smic->sm_get_bcss))
2711 	    (dev, sm, ci, &block, &count, &skip, &size);
2712 
2713 	if (count == 0)
2714 		return (1);
2715 
2716 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2717 
2718 	while (count--) {
2719 		if (block >= un->c.un_total_blocks)
2720 			return (0);
2721 
2722 		if ((block + size) > un->c.un_total_blocks)
2723 			size = un->c.un_total_blocks - block;
2724 
2725 		not_found = find_another_logical(un, sm, block, size,
2726 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2727 		if (not_found)
2728 			return (1);
2729 
2730 		block += size + skip;
2731 	}
2732 	return (0);
2733 }
2734 
2735 static void
2736 finish_error(md_mps_t *ps)
2737 {
2738 	struct buf	*pb;
2739 	mm_unit_t	*un;
2740 	mdi_unit_t	*ui;
2741 	uint_t		new_str_flags;
2742 
2743 	pb = ps->ps_bp;
2744 	un = ps->ps_un;
2745 	ui = ps->ps_ui;
2746 
2747 	/*
2748 	 * Must flag any error to the resync originator if we're performing
2749 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2750 	 * target device and in this case we ought to abort the resync as there
2751 	 * is nothing that can be done to recover from this without operator
2752 	 * intervention. If we don't set the B_ERROR flag we will continue
2753 	 * reading from the mirror but won't write to the target (as it will
2754 	 * have been placed into an errored state).
2755 	 * To handle the case of multiple components within a submirror we only
2756 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2757 	 * The originator of the resync read will cause this bit to be set if
2758 	 * the underlying component count is one for a submirror resync. All
2759 	 * other resync types will have the flag set as there is no underlying
2760 	 * resync which can be performed on a contained metadevice for these
2761 	 * resync types (optimized or component).
2762 	 */
2763 
2764 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2765 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2766 			pb->b_flags |= B_ERROR;
2767 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2768 		MPS_FREE(mirror_parent_cache, ps);
2769 		md_unit_readerexit(ui);
2770 		md_biodone(pb);
2771 		return;
2772 	}
2773 	/*
2774 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2775 	 * operation therefore this I/O request has already been counted,
2776 	 * the I/O count variable will be decremented by mirror_done()'s
2777 	 * call to md_biodone().
2778 	 */
2779 	if (ps->ps_changecnt != un->un_changecnt) {
2780 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2781 		if (ps->ps_flags & MD_MPS_WOW)
2782 			new_str_flags |= MD_STR_WOW;
2783 		if (ps->ps_flags & MD_MPS_MAPPED)
2784 			new_str_flags |= MD_STR_MAPPED;
2785 		/*
2786 		 * If this I/O request was a read that was part of a resync,
2787 		 * set MD_STR_WAR for the retried read to ensure that the
2788 		 * resync write (i.e. write-after-read) will be performed
2789 		 */
2790 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2791 			new_str_flags |= MD_STR_WAR;
2792 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2793 		MPS_FREE(mirror_parent_cache, ps);
2794 		md_unit_readerexit(ui);
2795 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2796 		return;
2797 	}
2798 
2799 	pb->b_flags |= B_ERROR;
2800 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2801 	MPS_FREE(mirror_parent_cache, ps);
2802 	md_unit_readerexit(ui);
2803 	md_biodone(pb);
2804 }
2805 
2806 static void
2807 error_update_unit(md_mps_t *ps)
2808 {
2809 	mm_unit_t		*un;
2810 	mdi_unit_t		*ui;
2811 	int			smi;	/* sub mirror index */
2812 	int			ci;	/* errored component */
2813 	set_t			setno;
2814 	uint_t			flags;	/* for set_sm_comp_state() */
2815 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2816 
2817 	ui = ps->ps_ui;
2818 	un = (mm_unit_t *)md_unit_writerlock(ui);
2819 	setno = MD_UN2SET(un);
2820 
2821 	/* All of these updates have to propagated in case of MN set */
2822 	flags = MD_STATE_XMIT;
2823 	hspflags = MD_HOTSPARE_XMIT;
2824 
2825 	/* special treatment if we are called during updating watermarks */
2826 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2827 		flags |= MD_STATE_WMUPDATE;
2828 		hspflags |= MD_HOTSPARE_WMUPDATE;
2829 	}
2830 	smi = 0;
2831 	ci = 0;
2832 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2833 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2834 
2835 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2836 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2837 			    (IOLOCK *)NULL);
2838 			/*
2839 			 * For a MN set, the NOTIFY is done when the state
2840 			 * change is processed on each node
2841 			 */
2842 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2843 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2844 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2845 			}
2846 			continue;
2847 		}
2848 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2849 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2850 		    (IOLOCK *)NULL);
2851 		/*
2852 		 * For a MN set, the NOTIFY is done when the state
2853 		 * change is processed on each node
2854 		 */
2855 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2856 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2857 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2858 		}
2859 		smi = 0;
2860 		ci = 0;
2861 	}
2862 
2863 	md_unit_writerexit(ui);
2864 	if (MD_MNSET_SETNO(setno)) {
2865 		send_poke_hotspares(setno);
2866 	} else {
2867 		(void) poke_hotspares();
2868 	}
2869 	(void) md_unit_readerlock(ui);
2870 
2871 	finish_error(ps);
2872 }
2873 
2874 /*
2875  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2876  * retry the IO without B_FAILFAST set so that we try to ensure that the
2877  * component "sees" each IO.
2878  */
2879 static void
2880 last_err_retry(md_mcs_t *cs)
2881 {
2882 	struct buf	*cb;
2883 	md_mps_t	*ps;
2884 	uint_t		flags;
2885 
2886 	cb = &cs->cs_buf;
2887 	cb->b_flags &= ~B_FAILFAST;
2888 
2889 	/* if we're panicing just let this I/O error out */
2890 	if (panicstr) {
2891 		(void) mirror_done(cb);
2892 		return;
2893 	}
2894 
2895 	/* reissue the I/O */
2896 
2897 	ps = cs->cs_ps;
2898 
2899 	bioerror(cb, 0);
2900 
2901 	mutex_enter(&ps->ps_mx);
2902 
2903 	flags = MD_STR_NOTTOP;
2904 	if (ps->ps_flags & MD_MPS_MAPPED)
2905 		flags |= MD_STR_MAPPED;
2906 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2907 		flags |= MD_NOBLOCK;
2908 
2909 	mutex_exit(&ps->ps_mx);
2910 
2911 	clear_retry_error(cb);
2912 
2913 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2914 	    md_shortname(getminor(cb->b_edev)));
2915 
2916 	md_call_strategy(cb, flags, NULL);
2917 }
2918 
2919 static void
2920 mirror_error(md_mps_t *ps)
2921 {
2922 	int		smi;	/* sub mirror index */
2923 	int		ci;	/* errored component */
2924 
2925 	if (panicstr) {
2926 		finish_error(ps);
2927 		return;
2928 	}
2929 
2930 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2931 		mirror_overlap_tree_remove(ps);
2932 
2933 	smi = 0;
2934 	ci = 0;
2935 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2936 		md_unit_readerexit(ps->ps_ui);
2937 		daemon_request(&md_mstr_daemon, error_update_unit,
2938 		    (daemon_queue_t *)ps, REQ_OLD);
2939 		return;
2940 	}
2941 
2942 	finish_error(ps);
2943 }
2944 
2945 static int
2946 copy_write_done(struct buf *cb)
2947 {
2948 	md_mps_t	*ps;
2949 	buf_t		*pb;
2950 	char		*wowbuf;
2951 	wowhdr_t	*wowhdr;
2952 	ssize_t		wow_resid;
2953 
2954 	/* get wowbuf ans save structure */
2955 	wowbuf = cb->b_un.b_addr;
2956 	wowhdr = WOWBUF_HDR(wowbuf);
2957 	ps = wowhdr->wow_ps;
2958 	pb = ps->ps_bp;
2959 
2960 	/* Save error information, then free cb */
2961 	if (cb->b_flags & B_ERROR)
2962 		pb->b_flags |= B_ERROR;
2963 
2964 	if (cb->b_flags & B_REMAPPED)
2965 		bp_mapout(cb);
2966 
2967 	freerbuf(cb);
2968 
2969 	/* update residual and continue if needed */
2970 	if ((pb->b_flags & B_ERROR) == 0) {
2971 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2972 		pb->b_resid = wow_resid;
2973 		if (wow_resid > 0)  {
2974 			daemon_request(&md_mstr_daemon, copy_write_cont,
2975 			    (daemon_queue_t *)wowhdr, REQ_OLD);
2976 			return (1);
2977 		}
2978 	}
2979 
2980 	/* Write is complete, release resources. */
2981 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2982 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2983 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2984 	MPS_FREE(mirror_parent_cache, ps);
2985 	md_biodone(pb);
2986 	return (0);
2987 }
2988 
2989 static void
2990 copy_write_cont(wowhdr_t *wowhdr)
2991 {
2992 	buf_t		*pb;
2993 	buf_t		*cb;
2994 	char		*wowbuf;
2995 	int		wow_offset;
2996 	size_t		wow_resid;
2997 	diskaddr_t	wow_blkno;
2998 
2999 	wowbuf = WOWHDR_BUF(wowhdr);
3000 	pb = wowhdr->wow_ps->ps_bp;
3001 
3002 	/* get data on current location */
3003 	wow_offset = wowhdr->wow_offset;
3004 	wow_resid = pb->b_bcount - wow_offset;
3005 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
3006 
3007 	/* setup child buffer */
3008 	cb = getrbuf(KM_SLEEP);
3009 	cb->b_flags = B_WRITE;
3010 	cb->b_edev = pb->b_edev;
3011 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
3012 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
3013 	cb->b_iodone = copy_write_done;
3014 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
3015 	cb->b_lblkno = wow_blkno;
3016 
3017 	/* move offset to next section */
3018 	wowhdr->wow_offset += cb->b_bcount;
3019 
3020 	/* copy and setup write for current section */
3021 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
3022 
3023 	/* do it */
3024 	/*
3025 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
3026 	 * that handles the WOW condition. The resultant increment on the
3027 	 * I/O count variable is cleared by copy_write_done()'s call to
3028 	 * md_biodone().
3029 	 */
3030 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
3031 	    | MD_STR_MAPPED, NULL);
3032 }
3033 
3034 static void
3035 md_mirror_copy_write(md_mps_t *ps)
3036 {
3037 	wowhdr_t	*wowhdr;
3038 
3039 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
3040 	mirror_wowblk_init(wowhdr);
3041 	wowhdr->wow_ps = ps;
3042 	wowhdr->wow_offset = 0;
3043 	copy_write_cont(wowhdr);
3044 }
3045 
3046 static void
3047 handle_wow(md_mps_t *ps)
3048 {
3049 	buf_t		*pb;
3050 
3051 	pb = ps->ps_bp;
3052 
3053 	bp_mapin(pb);
3054 
3055 	md_mirror_wow_cnt++;
3056 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
3057 		cmn_err(CE_NOTE,
3058 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
3059 		    md_shortname(getminor(pb->b_edev)),
3060 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
3061 	}
3062 
3063 	/*
3064 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
3065 	 * operation therefore this I/O request has already been counted,
3066 	 * the I/O count variable will be decremented by mirror_done()'s
3067 	 * call to md_biodone().
3068 	 */
3069 	if (md_mirror_wow_flg & WOW_NOCOPY)
3070 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
3071 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
3072 	else
3073 		md_mirror_copy_write(ps);
3074 }
3075 
3076 /*
3077  * Return true if the specified submirror is either in the Last Erred
3078  * state or is transitioning into the Last Erred state.
3079  */
3080 static bool_t
3081 submirror_is_lasterred(mm_unit_t *un, int smi)
3082 {
3083 	mm_submirror_t		*sm;
3084 	mm_submirror_ic_t	*smic;
3085 	md_m_shared_t		*shared;
3086 	int			ci;
3087 	int			compcnt;
3088 
3089 	sm = &un->un_sm[smi];
3090 	smic = &un->un_smic[smi];
3091 
3092 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3093 	for (ci = 0; ci < compcnt; ci++) {
3094 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3095 		    (sm->sm_dev, sm, ci);
3096 
3097 		if (shared->ms_state == CS_LAST_ERRED)
3098 			return (B_TRUE);
3099 
3100 		/*
3101 		 * It is not currently Last Erred, check if entering Last Erred.
3102 		 */
3103 		if ((shared->ms_flags & MDM_S_IOERR) &&
3104 		    ((shared->ms_state == CS_OKAY) ||
3105 		    (shared->ms_state == CS_RESYNC))) {
3106 			if (mirror_other_sources(un, smi, ci, 0) == 1)
3107 				return (B_TRUE);
3108 		}
3109 	}
3110 
3111 	return (B_FALSE);
3112 }
3113 
3114 
3115 static int
3116 mirror_done(struct buf *cb)
3117 {
3118 	md_mps_t	*ps;
3119 	md_mcs_t	*cs;
3120 
3121 	/*LINTED*/
3122 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3123 	ps = cs->cs_ps;
3124 
3125 	mutex_enter(&ps->ps_mx);
3126 
3127 	/* check if we need to retry an errored failfast I/O */
3128 	if (cb->b_flags & B_ERROR) {
3129 		struct buf *pb = ps->ps_bp;
3130 
3131 		if (cb->b_flags & B_FAILFAST) {
3132 			int		i;
3133 			mm_unit_t	*un = ps->ps_un;
3134 
3135 			for (i = 0; i < NMIRROR; i++) {
3136 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3137 					continue;
3138 
3139 				if (cb->b_edev ==
3140 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3141 
3142 					/*
3143 					 * This is the submirror that had the
3144 					 * error.  Check if it is Last Erred.
3145 					 */
3146 					if (submirror_is_lasterred(un, i)) {
3147 						daemon_queue_t *dqp;
3148 
3149 						mutex_exit(&ps->ps_mx);
3150 						dqp = (daemon_queue_t *)cs;
3151 						dqp->dq_prev = NULL;
3152 						dqp->dq_next = NULL;
3153 						daemon_request(&md_done_daemon,
3154 						    last_err_retry, dqp,
3155 						    REQ_OLD);
3156 						return (1);
3157 					}
3158 					break;
3159 				}
3160 			}
3161 		}
3162 
3163 		/* continue to process the buf without doing a retry */
3164 		ps->ps_flags |= MD_MPS_ERROR;
3165 		pb->b_error = cb->b_error;
3166 	}
3167 
3168 	return (mirror_done_common(cb));
3169 }
3170 
3171 /*
3172  * Split from the original mirror_done function so we can handle bufs after a
3173  * retry.
3174  * ps->ps_mx is already held in the caller of this function and the cb error
3175  * has already been checked and handled in the caller.
3176  */
3177 static int
3178 mirror_done_common(struct buf *cb)
3179 {
3180 	struct buf	*pb;
3181 	mm_unit_t	*un;
3182 	mdi_unit_t	*ui;
3183 	md_mps_t	*ps;
3184 	md_mcs_t	*cs;
3185 	size_t		end_rr, start_rr, current_rr;
3186 
3187 	/*LINTED*/
3188 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3189 	ps = cs->cs_ps;
3190 	pb = ps->ps_bp;
3191 
3192 	if (cb->b_flags & B_REMAPPED)
3193 		bp_mapout(cb);
3194 
3195 	ps->ps_frags--;
3196 	if (ps->ps_frags != 0) {
3197 		mutex_exit(&ps->ps_mx);
3198 		kmem_cache_free(mirror_child_cache, cs);
3199 		return (1);
3200 	}
3201 	un = ps->ps_un;
3202 	ui = ps->ps_ui;
3203 
3204 	/*
3205 	 * Do not update outstanding_writes if we're running with ABR
3206 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3207 	 * Also a resync initiated write() has no outstanding_writes update
3208 	 * either.
3209 	 */
3210 	if (((cb->b_flags & B_READ) == 0) &&
3211 	    (un->un_nsm >= 2) &&
3212 	    (ps->ps_call == NULL) &&
3213 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3214 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3215 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3216 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3217 		mutex_enter(&un->un_resync_mx);
3218 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3219 			un->un_outstanding_writes[current_rr]--;
3220 		mutex_exit(&un->un_resync_mx);
3221 	}
3222 	kmem_cache_free(mirror_child_cache, cs);
3223 	mutex_exit(&ps->ps_mx);
3224 
3225 	if (ps->ps_call != NULL) {
3226 		daemon_request(&md_done_daemon, ps->ps_call,
3227 		    (daemon_queue_t *)ps, REQ_OLD);
3228 		return (1);
3229 	}
3230 
3231 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3232 		daemon_request(&md_done_daemon, mirror_error,
3233 		    (daemon_queue_t *)ps, REQ_OLD);
3234 		return (1);
3235 	}
3236 
3237 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3238 		mirror_overlap_tree_remove(ps);
3239 
3240 	/*
3241 	 * Handle Write-on-Write problem.
3242 	 * Skip In case of Raw and Direct I/O as they are
3243 	 * handled earlier.
3244 	 *
3245 	 */
3246 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3247 	    !(pb->b_flags & B_READ) &&
3248 	    !(ps->ps_flags & MD_MPS_WOW) &&
3249 	    !(pb->b_flags & B_PHYS) &&
3250 	    any_pages_dirty(pb)) {
3251 		md_unit_readerexit(ps->ps_ui);
3252 		daemon_request(&md_mstr_daemon, handle_wow,
3253 		    (daemon_queue_t *)ps, REQ_OLD);
3254 		return (1);
3255 	}
3256 
3257 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3258 	MPS_FREE(mirror_parent_cache, ps);
3259 	md_unit_readerexit(ui);
3260 	md_biodone(pb);
3261 	return (0);
3262 }
3263 
3264 /*
3265  * Clear error state in submirror component if the retry worked after
3266  * a failfast error.
3267  */
3268 static void
3269 clear_retry_error(struct buf *cb)
3270 {
3271 	int			smi;
3272 	md_mcs_t		*cs;
3273 	mm_unit_t		*un;
3274 	mdi_unit_t		*ui_sm;
3275 	mm_submirror_t		*sm;
3276 	mm_submirror_ic_t	*smic;
3277 	u_longlong_t		cnt;
3278 	md_m_shared_t		*shared;
3279 
3280 	/*LINTED*/
3281 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3282 	un = cs->cs_ps->ps_un;
3283 
3284 	for (smi = 0; smi < NMIRROR; smi++) {
3285 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3286 			continue;
3287 
3288 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3289 			break;
3290 	}
3291 
3292 	if (smi >= NMIRROR)
3293 		return;
3294 
3295 	sm = &un->un_sm[smi];
3296 	smic = &un->un_smic[smi];
3297 	cnt = cb->b_bcount;
3298 
3299 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3300 	(void) md_unit_writerlock(ui_sm);
3301 
3302 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3303 	    cb->b_blkno, &cnt);
3304 
3305 	if (shared->ms_flags & MDM_S_IOERR) {
3306 		shared->ms_flags &= ~MDM_S_IOERR;
3307 
3308 	} else {
3309 		/* the buf spans components and the first one is not erred */
3310 		int	cnt;
3311 		int	i;
3312 
3313 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3314 		for (i = 0; i < cnt; i++) {
3315 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3316 			    (sm->sm_dev, sm, i);
3317 
3318 			if (shared->ms_flags & MDM_S_IOERR &&
3319 			    shared->ms_state == CS_OKAY) {
3320 
3321 				shared->ms_flags &= ~MDM_S_IOERR;
3322 				break;
3323 			}
3324 		}
3325 	}
3326 
3327 	md_unit_writerexit(ui_sm);
3328 }
3329 
3330 static size_t
3331 mirror_map_read(
3332 	md_mps_t *ps,
3333 	md_mcs_t *cs,
3334 	diskaddr_t blkno,
3335 	u_longlong_t	count
3336 )
3337 {
3338 	mm_unit_t	*un;
3339 	buf_t		*bp;
3340 	u_longlong_t	cando;
3341 
3342 	bp = &cs->cs_buf;
3343 	un = ps->ps_un;
3344 
3345 	bp->b_lblkno = blkno;
3346 	if (fast_select_read_unit(ps, cs) == 0) {
3347 		bp->b_bcount = ldbtob(count);
3348 		return (0);
3349 	}
3350 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3351 	    count, &cando, 0, NULL, cs));
3352 	bp->b_bcount = ldbtob(cando);
3353 	if (count != cando)
3354 		return (cando);
3355 	return (0);
3356 }
3357 
3358 static void
3359 write_after_read(md_mps_t *ps)
3360 {
3361 	struct buf	*pb;
3362 	int		flags;
3363 
3364 	if (ps->ps_flags & MD_MPS_ERROR) {
3365 		mirror_error(ps);
3366 		return;
3367 	}
3368 
3369 	pb = ps->ps_bp;
3370 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3371 	ps->ps_call = NULL;
3372 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3373 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3374 	if (ps->ps_flags & MD_MPS_MAPPED)
3375 		flags |= MD_STR_MAPPED;
3376 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3377 		flags |= MD_NOBLOCK;
3378 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3379 		flags |= MD_STR_DIRTY_RD;
3380 	(void) mirror_write_strategy(pb, flags, ps);
3381 }
3382 
3383 static void
3384 continue_serial(md_mps_t *ps)
3385 {
3386 	md_mcs_t	*cs;
3387 	buf_t		*cb;
3388 	mm_unit_t	*un;
3389 	int		flags;
3390 
3391 	un = ps->ps_un;
3392 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3393 	mirror_child_init(cs);
3394 	cb = &cs->cs_buf;
3395 	ps->ps_call = NULL;
3396 	ps->ps_frags = 1;
3397 	(void) mirror_map_write(un, cs, ps, 0);
3398 	flags = MD_STR_NOTTOP;
3399 	if (ps->ps_flags & MD_MPS_MAPPED)
3400 		flags |= MD_STR_MAPPED;
3401 	md_call_strategy(cb, flags, NULL);
3402 }
3403 
3404 static int
3405 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3406 {
3407 	int i;
3408 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3409 	buf_t		*cb;
3410 	buf_t		*pb;
3411 	diskaddr_t	blkno;
3412 	size_t		bcount;
3413 	off_t		offset;
3414 
3415 	pb = ps->ps_bp;
3416 	cb = &cs->cs_buf;
3417 	cs->cs_ps = ps;
3418 
3419 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3420 
3421 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3422 
3423 	blkno = pb->b_lblkno;
3424 	bcount = pb->b_bcount;
3425 	offset = 0;
3426 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3427 		blkno = DK_LABEL_LOC + 1;
3428 		/*
3429 		 * This handles the case where we're requesting
3430 		 * a write to block 0 on a label partition
3431 		 * and the request size was smaller than the
3432 		 * size of the label.  If this is the case
3433 		 * then we'll return -1.  Failure to do so will
3434 		 * either cause the calling thread to hang due to
3435 		 * an ssd bug, or worse if the bcount were allowed
3436 		 * to go negative (ie large).
3437 		 */
3438 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3439 			return (-1);
3440 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3441 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3442 	}
3443 
3444 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3445 	    cb, KM_NOSLEEP);
3446 	if (war)
3447 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3448 
3449 	/*
3450 	 * If the submirror is in the erred stated, check if any component is
3451 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3452 	 * flag on the IO.
3453 	 *
3454 	 * Provide a fast path for the non-erred case (which should be the
3455 	 * normal case).
3456 	 */
3457 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3458 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3459 			mm_submirror_t		*sm;
3460 			mm_submirror_ic_t	*smic;
3461 			int			ci;
3462 			int			compcnt;
3463 
3464 			sm = &un->un_sm[i];
3465 			smic = &un->un_smic[i];
3466 
3467 			compcnt = (*(smic->sm_get_component_count))
3468 			    (sm->sm_dev, un);
3469 			for (ci = 0; ci < compcnt; ci++) {
3470 				md_m_shared_t	*shared;
3471 
3472 				shared = (md_m_shared_t *)
3473 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3474 				    sm, ci);
3475 
3476 				if (shared->ms_state == CS_LAST_ERRED)
3477 					break;
3478 			}
3479 			if (ci >= compcnt)
3480 				cb->b_flags |= B_FAILFAST;
3481 
3482 		} else {
3483 			cb->b_flags |= B_FAILFAST;
3484 		}
3485 	}
3486 
3487 	ps->ps_current_sm++;
3488 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3489 		if (un->un_write_option == WR_SERIAL) {
3490 			ps->ps_call = continue_serial;
3491 			return (0);
3492 		}
3493 		return (1);
3494 	}
3495 	return (0);
3496 }
3497 
3498 /*
3499  * directed_read_done:
3500  * ------------------
3501  * Completion routine called when a DMR request has been returned from the
3502  * underlying driver. Wake-up the original ioctl() and return the data to
3503  * the user.
3504  */
3505 static void
3506 directed_read_done(md_mps_t *ps)
3507 {
3508 	mm_unit_t	*un;
3509 	mdi_unit_t	*ui;
3510 
3511 	un = ps->ps_un;
3512 	ui = ps->ps_ui;
3513 
3514 	md_unit_readerexit(ui);
3515 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3516 	ps->ps_call = NULL;
3517 
3518 	mutex_enter(&un->un_dmr_mx);
3519 	cv_signal(&un->un_dmr_cv);
3520 	mutex_exit(&un->un_dmr_mx);
3521 
3522 	/* release the parent structure */
3523 	kmem_cache_free(mirror_parent_cache, ps);
3524 }
3525 
3526 /*
3527  * daemon_io:
3528  * ------------
3529  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3530  * call from a blockable context. NOTE: no mutex can be held on entry to this
3531  * routine
3532  */
3533 static void
3534 daemon_io(daemon_queue_t *dq)
3535 {
3536 	md_mps_t	*ps = (md_mps_t *)dq;
3537 	int		flag = MD_STR_NOTTOP;
3538 	buf_t		*pb = ps->ps_bp;
3539 
3540 	if (ps->ps_flags & MD_MPS_MAPPED)
3541 		flag |= MD_STR_MAPPED;
3542 	if (ps->ps_flags & MD_MPS_WOW)
3543 		flag |= MD_STR_WOW;
3544 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3545 		flag |= MD_STR_WAR;
3546 	if (ps->ps_flags & MD_MPS_ABR)
3547 		flag |= MD_STR_ABR;
3548 	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3549 		flag |= MD_STR_BLOCK_OK;
3550 
3551 	/*
3552 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3553 	 * MD_STR_WAR before calling mirror_read_strategy
3554 	 */
3555 	if (pb->b_flags & B_READ) {
3556 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3557 			flag |= MD_STR_WAR;
3558 		mirror_read_strategy(pb, flag, ps);
3559 	} else
3560 		mirror_write_strategy(pb, flag, ps);
3561 }
3562 
3563 /*
3564  * update_resync:
3565  * -------------
3566  * Called to update the in-core version of the resync record with the latest
3567  * version that was committed to disk when the previous mirror owner
3568  * relinquished ownership. This call is likely to block as we must hold-off
3569  * any current resync processing that may be occurring.
3570  * On completion of the resync record update we issue the mirror_write_strategy
3571  * call to complete the i/o that first started this sequence. To remove a race
3572  * condition between a new write() request which is submitted and the resync
3573  * record update we acquire the writerlock. This will hold off all i/o to the
3574  * mirror until the resync update has completed.
3575  * NOTE: no mutex can be held on entry to this routine
3576  */
3577 static void
3578 update_resync(daemon_queue_t *dq)
3579 {
3580 	md_mps_t	*ps = (md_mps_t *)dq;
3581 	buf_t		*pb = ps->ps_bp;
3582 	mdi_unit_t	*ui = ps->ps_ui;
3583 	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
3584 	set_t		setno;
3585 	int		restart_resync;
3586 
3587 	mutex_enter(&un->un_rrp_inflight_mx);
3588 	(void) md_unit_writerlock(ui);
3589 	ps->ps_un = un;
3590 	setno = MD_MIN2SET(getminor(pb->b_edev));
3591 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3592 		/*
3593 		 * Synchronize our in-core view of what regions need to be
3594 		 * resync'd with the on-disk version.
3595 		 */
3596 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3597 		    un->un_dirty_bm);
3598 
3599 		/* Region dirty map is now up to date */
3600 	}
3601 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3602 	md_unit_writerexit(ui);
3603 	mutex_exit(&un->un_rrp_inflight_mx);
3604 
3605 	/* Restart the resync thread if it was previously blocked */
3606 	if (restart_resync) {
3607 		mutex_enter(&un->un_rs_thread_mx);
3608 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3609 		cv_signal(&un->un_rs_thread_cv);
3610 		mutex_exit(&un->un_rs_thread_mx);
3611 	}
3612 	/* Continue with original deferred i/o */
3613 	daemon_io(dq);
3614 }
3615 
3616 /*
3617  * owner_timeout:
3618  * -------------
3619  * Called if the original mdmn_ksend_message() failed and the request is to be
3620  * retried. Reattempt the original ownership change.
3621  *
3622  * NOTE: called at interrupt context (see timeout(9f)).
3623  */
3624 static void
3625 owner_timeout(void *arg)
3626 {
3627 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3628 
3629 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3630 }
3631 
3632 /*
3633  * become_owner:
3634  * ------------
3635  * Called to issue RPC request to become the owner of the mirror
3636  * associated with this i/o request. We assume that the ownership request
3637  * is synchronous, so if it succeeds we will issue the request via
3638  * mirror_write_strategy().
3639  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3640  * service thread.
3641  * NOTE: no mutex should be held on entry to this routine.
3642  */
3643 static void
3644 become_owner(daemon_queue_t *dq)
3645 {
3646 	md_mps_t	*ps = (md_mps_t *)dq;
3647 	mm_unit_t	*un = ps->ps_un;
3648 	buf_t		*pb = ps->ps_bp;
3649 	set_t		setno;
3650 	md_mn_kresult_t	*kres;
3651 	int		msg_flags = md_mirror_msg_flags;
3652 	md_mps_t	*ps1;
3653 
3654 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3655 
3656 	/*
3657 	 * If we're already the mirror owner we do not need to send a message
3658 	 * but can simply process the i/o request immediately.
3659 	 * If we've already sent the request to become owner we requeue the
3660 	 * request as we're waiting for the synchronous ownership message to
3661 	 * be processed.
3662 	 */
3663 	if (MD_MN_MIRROR_OWNER(un)) {
3664 		/*
3665 		 * As the strategy() call will potentially block we need to
3666 		 * punt this to a separate thread and complete this request
3667 		 * as quickly as possible. Note: if we're a read request
3668 		 * this must be a resync, we cannot afford to be queued
3669 		 * behind any intervening i/o requests. In this case we put the
3670 		 * request on the md_mirror_rs_daemon queue.
3671 		 */
3672 		if (pb->b_flags & B_READ) {
3673 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3674 			    REQ_OLD);
3675 		} else {
3676 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3677 			    REQ_OLD);
3678 		}
3679 	} else {
3680 		mutex_enter(&un->un_owner_mx);
3681 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3682 			md_mn_req_owner_t	*msg;
3683 			int			rval = 0;
3684 
3685 			/*
3686 			 * Check to see that we haven't exceeded the maximum
3687 			 * retry count. If we have we fail the i/o as the
3688 			 * comms mechanism has become wedged beyond recovery.
3689 			 */
3690 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3691 				mutex_exit(&un->un_owner_mx);
3692 				cmn_err(CE_WARN,
3693 				    "md_mirror: Request exhausted ownership "
3694 				    "retry limit of %d attempts", dq->qlen);
3695 				pb->b_error = EIO;
3696 				pb->b_flags |= B_ERROR;
3697 				pb->b_resid = pb->b_bcount;
3698 				kmem_cache_free(mirror_parent_cache, ps);
3699 				md_biodone(pb);
3700 				return;
3701 			}
3702 
3703 			/*
3704 			 * Issue request to change ownership. The call is
3705 			 * synchronous so when it returns we can complete the
3706 			 * i/o (if successful), or enqueue it again so that
3707 			 * the operation will be retried.
3708 			 */
3709 			un->un_owner_state |= MM_MN_OWNER_SENT;
3710 			mutex_exit(&un->un_owner_mx);
3711 
3712 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3713 			setno = MD_MIN2SET(getminor(pb->b_edev));
3714 			msg->mnum = MD_SID(un);
3715 			msg->owner = md_mn_mynode_id;
3716 			msg_flags |= MD_MSGF_NO_LOG;
3717 			/*
3718 			 * If this IO is triggered by updating a watermark,
3719 			 * it might be issued by the creation of a softpartition
3720 			 * while the commd subsystem is suspended.
3721 			 * We don't want this message to block.
3722 			 */
3723 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3724 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3725 			}
3726 
3727 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3728 			rval = mdmn_ksend_message(setno,
3729 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3730 			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
3731 
3732 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3733 
3734 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3735 				dq->qlen = 0;
3736 				/*
3737 				 * Successfully changed owner, reread the
3738 				 * resync record so that we have a valid idea of
3739 				 * any previously committed incomplete write()s.
3740 				 * NOTE: As we need to acquire the resync mutex
3741 				 * this may block, so we defer it to a separate
3742 				 * thread handler. This makes us (effectively)
3743 				 * non-blocking once the ownership message
3744 				 * handling has completed.
3745 				 */
3746 				mutex_enter(&un->un_owner_mx);
3747 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3748 					un->un_mirror_owner = md_mn_mynode_id;
3749 					/* Sets owner of un_rr_dirty record */
3750 					if (un->un_rr_dirty_recid)
3751 						(void) mddb_setowner(
3752 						    un->un_rr_dirty_recid,
3753 						    md_mn_mynode_id);
3754 					un->un_owner_state &=
3755 					    ~MM_MN_BECOME_OWNER;
3756 					/*
3757 					 * Release the block on the current
3758 					 * resync region if it is blocked
3759 					 */
3760 					ps1 = un->un_rs_prev_overlap;
3761 					if ((ps1 != NULL) &&
3762 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3763 						mirror_overlap_tree_remove(ps1);
3764 					mutex_exit(&un->un_owner_mx);
3765 
3766 					/*
3767 					 * If we're a read, this must be a
3768 					 * resync request, issue
3769 					 * the i/o request on the
3770 					 * md_mirror_rs_daemon queue. This is
3771 					 * to avoid a deadlock between the
3772 					 * resync_unit thread and
3773 					 * subsequent i/o requests that may
3774 					 * block on the resync region.
3775 					 */
3776 					if (pb->b_flags & B_READ) {
3777 						daemon_request(
3778 						    &md_mirror_rs_daemon,
3779 						    update_resync, dq, REQ_OLD);
3780 					} else {
3781 						daemon_request(
3782 						    &md_mirror_io_daemon,
3783 						    update_resync, dq, REQ_OLD);
3784 					}
3785 					kmem_free(kres,
3786 					    sizeof (md_mn_kresult_t));
3787 					return;
3788 				} else {
3789 					/*
3790 					 * Some other node has beaten us to
3791 					 * obtain ownership. We need to
3792 					 * reschedule our ownership request
3793 					 */
3794 					mutex_exit(&un->un_owner_mx);
3795 				}
3796 			} else {
3797 				mdmn_ksend_show_error(rval, kres,
3798 				    "MD_MN_MSG_REQUIRE_OWNER");
3799 				/*
3800 				 * Message transport failure is handled by the
3801 				 * comms layer. If the ownership change request
3802 				 * does not succeed we need to flag the error to
3803 				 * the initiator of the i/o. This is handled by
3804 				 * the retry logic above. As the request failed
3805 				 * we do not know _who_ the owner of the mirror
3806 				 * currently is. We reset our idea of the owner
3807 				 * to None so that any further write()s will
3808 				 * attempt to become the owner again. This stops
3809 				 * multiple nodes writing to the same mirror
3810 				 * simultaneously.
3811 				 */
3812 				mutex_enter(&un->un_owner_mx);
3813 				un->un_owner_state &=
3814 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3815 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3816 				mutex_exit(&un->un_owner_mx);
3817 			}
3818 			kmem_free(kres, sizeof (md_mn_kresult_t));
3819 		} else
3820 			mutex_exit(&un->un_owner_mx);
3821 
3822 		/*
3823 		 * Re-enqueue this request on the deferred i/o list. Delay the
3824 		 * request for md_mirror_owner_to usecs to stop thrashing.
3825 		 */
3826 		(void) timeout(owner_timeout, dq,
3827 		    drv_usectohz(md_mirror_owner_to));
3828 	}
3829 }
3830 
3831 static void
3832 mirror_write_strategy(buf_t *pb, int flag, void *private)
3833 {
3834 	md_mps_t	*ps;
3835 	md_mcs_t	*cs;
3836 	int		more;
3837 	mm_unit_t	*un;
3838 	mdi_unit_t	*ui;
3839 	buf_t		*cb;		/* child buf pointer */
3840 	set_t		setno;
3841 	int		rs_on_overlap = 0;
3842 
3843 	ui = MDI_UNIT(getminor(pb->b_edev));
3844 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3845 
3846 
3847 	md_kstat_waitq_enter(ui);
3848 
3849 	/*
3850 	 * If a state change is in progress for this mirror in a MN set,
3851 	 * suspend all non-resync writes until the state change is complete.
3852 	 * The objective of this suspend is to ensure that it is not
3853 	 * possible for one node to read data from a submirror that another node
3854 	 * has not written to because of the state change. Therefore we
3855 	 * suspend all writes until the state change has been made. As it is
3856 	 * not possible to read from the target of a resync, there is no need
3857 	 * to suspend resync writes.
3858 	 * Note that we only block here if the caller can handle a busy-wait.
3859 	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3860 	 */
3861 
3862 	if (!(flag & MD_STR_WAR)) {
3863 		if (flag & MD_STR_BLOCK_OK) {
3864 			mutex_enter(&un->un_suspend_wr_mx);
3865 			while (un->un_suspend_wr_flag) {
3866 				cv_wait(&un->un_suspend_wr_cv,
3867 				    &un->un_suspend_wr_mx);
3868 			}
3869 			mutex_exit(&un->un_suspend_wr_mx);
3870 		}
3871 		(void) md_unit_readerlock(ui);
3872 	}
3873 
3874 	if (!(flag & MD_STR_NOTTOP)) {
3875 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3876 			md_kstat_waitq_exit(ui);
3877 			return;
3878 		}
3879 	}
3880 
3881 	setno = MD_MIN2SET(getminor(pb->b_edev));
3882 
3883 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3884 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3885 		flag |= MD_STR_ABR;
3886 
3887 	if (private == NULL) {
3888 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3889 		mirror_parent_init(ps);
3890 	} else {
3891 		ps = private;
3892 		private = NULL;
3893 	}
3894 	if (flag & MD_STR_MAPPED)
3895 		ps->ps_flags |= MD_MPS_MAPPED;
3896 
3897 	if (flag & MD_STR_WOW)
3898 		ps->ps_flags |= MD_MPS_WOW;
3899 
3900 	if (flag & MD_STR_ABR)
3901 		ps->ps_flags |= MD_MPS_ABR;
3902 
3903 	if (flag & MD_STR_WMUPDATE)
3904 		ps->ps_flags |= MD_MPS_WMUPDATE;
3905 
3906 	/*
3907 	 * Save essential information from the original buffhdr
3908 	 * in the md_save structure.
3909 	 */
3910 	ps->ps_un = un;
3911 	ps->ps_ui = ui;
3912 	ps->ps_bp = pb;
3913 	ps->ps_addr = pb->b_un.b_addr;
3914 	ps->ps_firstblk = pb->b_lblkno;
3915 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3916 	ps->ps_changecnt = un->un_changecnt;
3917 
3918 	/*
3919 	 * Check for suspended writes here. This is where we can defer the
3920 	 * write request to the daemon_io queue which will then call us with
3921 	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3922 	 * the top of this routine.
3923 	 */
3924 	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3925 		mutex_enter(&un->un_suspend_wr_mx);
3926 		if (un->un_suspend_wr_flag) {
3927 			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3928 			mutex_exit(&un->un_suspend_wr_mx);
3929 			md_unit_readerexit(ui);
3930 			daemon_request(&md_mirror_daemon, daemon_io,
3931 			    (daemon_queue_t *)ps, REQ_OLD);
3932 			return;
3933 		}
3934 		mutex_exit(&un->un_suspend_wr_mx);
3935 	}
3936 
3937 	/*
3938 	 * If not MN owner and this is an ABR write, make sure the current
3939 	 * resync region is in the overlaps tree
3940 	 */
3941 	mutex_enter(&un->un_owner_mx);
3942 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3943 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3944 		md_mps_t	*ps1;
3945 		/* Block the current resync region, if not already blocked */
3946 		ps1 = un->un_rs_prev_overlap;
3947 
3948 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3949 		    (ps1->ps_lastblk != 0))) {
3950 			/* Drop locks to avoid deadlock */
3951 			mutex_exit(&un->un_owner_mx);
3952 			md_unit_readerexit(ui);
3953 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3954 			rs_on_overlap = 1;
3955 			(void) md_unit_readerlock(ui);
3956 			mutex_enter(&un->un_owner_mx);
3957 			/*
3958 			 * Check to see if we have obtained ownership
3959 			 * while waiting for overlaps. If we have, remove
3960 			 * the resync_region entry from the overlap tree
3961 			 */
3962 			if (MD_MN_MIRROR_OWNER(un) &&
3963 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3964 				mirror_overlap_tree_remove(ps1);
3965 				rs_on_overlap = 0;
3966 			}
3967 		}
3968 	}
3969 	mutex_exit(&un->un_owner_mx);
3970 
3971 
3972 	/*
3973 	 * following keep write after read from writing to the
3974 	 * source in the case where it all came from one place
3975 	 */
3976 	if (flag & MD_STR_WAR) {
3977 		int	abort_write = 0;
3978 		/*
3979 		 * We are perfoming a write-after-read. This is either as a
3980 		 * result of a resync read or as a result of a read in a
3981 		 * dirty resync region when the optimized resync is not
3982 		 * complete. If in a MN set and a resync generated i/o,
3983 		 * if the current block is not in the current
3984 		 * resync region terminate the write as another node must have
3985 		 * completed this resync region
3986 		 */
3987 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3988 		    (!flag & MD_STR_DIRTY_RD)) {
3989 			if (!IN_RESYNC_REGION(un, ps))
3990 				abort_write = 1;
3991 		}
3992 		if ((select_write_after_read_units(un, ps) == 0) ||
3993 		    (abort_write)) {
3994 #ifdef DEBUG
3995 			if (mirror_debug_flag)
3996 				printf("Abort resync write on %x, block %lld\n",
3997 				    MD_SID(un), ps->ps_firstblk);
3998 #endif
3999 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4000 				mirror_overlap_tree_remove(ps);
4001 			kmem_cache_free(mirror_parent_cache, ps);
4002 			md_kstat_waitq_exit(ui);
4003 			md_unit_readerexit(ui);
4004 			md_biodone(pb);
4005 			return;
4006 		}
4007 	} else {
4008 		select_write_units(un, ps);
4009 
4010 		/* Drop readerlock to avoid deadlock */
4011 		md_unit_readerexit(ui);
4012 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4013 		un = md_unit_readerlock(ui);
4014 		/*
4015 		 * For a MN set with an ABR write, if we are now the
4016 		 * owner and we have a resync region in the overlap
4017 		 * tree, remove the entry from overlaps and retry the write.
4018 		 */
4019 
4020 		if (MD_MNSET_SETNO(setno) &&
4021 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4022 			mutex_enter(&un->un_owner_mx);
4023 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4024 				mirror_overlap_tree_remove(ps);
4025 				md_kstat_waitq_exit(ui);
4026 				mutex_exit(&un->un_owner_mx);
4027 				md_unit_readerexit(ui);
4028 				daemon_request(&md_mirror_daemon, daemon_io,
4029 				    (daemon_queue_t *)ps, REQ_OLD);
4030 				return;
4031 			}
4032 			mutex_exit(&un->un_owner_mx);
4033 		}
4034 	}
4035 
4036 	/*
4037 	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
4038 	 * we need to become the mirror owner before continuing with the
4039 	 * write(). For ABR mirrors we check that we 'own' the resync if
4040 	 * we're in write-after-read mode. We do this _after_ ensuring that
4041 	 * there are no overlaps to ensure that once we know that we are
4042 	 * the owner, the readerlock will not be released until the write is
4043 	 * complete. As a change of ownership in a MN set requires the
4044 	 * writerlock, this ensures that ownership cannot be changed until
4045 	 * the write is complete.
4046 	 */
4047 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
4048 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
4049 		if (MD_MN_NO_MIRROR_OWNER(un))  {
4050 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4051 				mirror_overlap_tree_remove(ps);
4052 			md_kstat_waitq_exit(ui);
4053 			ASSERT(!(flag & MD_STR_WAR));
4054 			md_unit_readerexit(ui);
4055 			daemon_request(&md_mirror_daemon, become_owner,
4056 			    (daemon_queue_t *)ps, REQ_OLD);
4057 			return;
4058 		}
4059 	}
4060 
4061 	/*
4062 	 * Mark resync region if mirror has a Resync Region _and_ we are not
4063 	 * a resync initiated write(). Don't mark region if we're flagged as
4064 	 * an ABR write.
4065 	 */
4066 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
4067 	    !(flag & MD_STR_WAR)) {
4068 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
4069 		    ps->ps_lastblk, md_mn_mynode_id)) {
4070 			pb->b_flags |= B_ERROR;
4071 			pb->b_resid = pb->b_bcount;
4072 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4073 				mirror_overlap_tree_remove(ps);
4074 			kmem_cache_free(mirror_parent_cache, ps);
4075 			md_kstat_waitq_exit(ui);
4076 			md_unit_readerexit(ui);
4077 			md_biodone(pb);
4078 			return;
4079 		}
4080 	}
4081 
4082 	ps->ps_childbflags = pb->b_flags | B_WRITE;
4083 	ps->ps_childbflags &= ~B_READ;
4084 	if (flag & MD_STR_MAPPED)
4085 		ps->ps_childbflags &= ~B_PAGEIO;
4086 
4087 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4088 		/* Disable WOW and don't free ps */
4089 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
4090 
4091 	md_kstat_waitq_to_runq(ui);
4092 
4093 	/*
4094 	 * Treat Raw and Direct I/O as Write-on-Write always
4095 	 */
4096 
4097 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4098 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4099 	    (pb->b_flags & B_PHYS) &&
4100 	    !(ps->ps_flags & MD_MPS_WOW)) {
4101 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4102 			mirror_overlap_tree_remove(ps);
4103 		md_unit_readerexit(ui);
4104 		daemon_request(&md_mstr_daemon, handle_wow,
4105 		    (daemon_queue_t *)ps, REQ_OLD);
4106 		return;
4107 	}
4108 
4109 	ps->ps_frags = 1;
4110 	do {
4111 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4112 		mirror_child_init(cs);
4113 		cb = &cs->cs_buf;
4114 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4115 
4116 		/*
4117 		 * This handles the case where we're requesting
4118 		 * a write to block 0 on a label partition.  (more < 0)
4119 		 * means that the request size was smaller than the
4120 		 * size of the label.  If so this request is done.
4121 		 */
4122 		if (more < 0) {
4123 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4124 				mirror_overlap_tree_remove(ps);
4125 			md_kstat_runq_exit(ui);
4126 			kmem_cache_free(mirror_child_cache, cs);
4127 			kmem_cache_free(mirror_parent_cache, ps);
4128 			md_unit_readerexit(ui);
4129 			md_biodone(pb);
4130 			return;
4131 		}
4132 		if (more) {
4133 			mutex_enter(&ps->ps_mx);
4134 			ps->ps_frags++;
4135 			mutex_exit(&ps->ps_mx);
4136 		}
4137 		md_call_strategy(cb, flag, private);
4138 	} while (more);
4139 
4140 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4141 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4142 			md_daemon(1, &md_done_daemon);
4143 			drv_usecwait(10);
4144 		}
4145 		kmem_cache_free(mirror_parent_cache, ps);
4146 	}
4147 }
4148 
4149 static void
4150 mirror_read_strategy(buf_t *pb, int flag, void *private)
4151 {
4152 	md_mps_t	*ps;
4153 	md_mcs_t	*cs;
4154 	size_t		more;
4155 	mm_unit_t	*un;
4156 	mdi_unit_t	*ui;
4157 	size_t		current_count;
4158 	diskaddr_t	current_blkno;
4159 	off_t		current_offset;
4160 	buf_t		*cb;		/* child buf pointer */
4161 	set_t		setno;
4162 
4163 	ui = MDI_UNIT(getminor(pb->b_edev));
4164 
4165 	md_kstat_waitq_enter(ui);
4166 
4167 	un = (mm_unit_t *)md_unit_readerlock(ui);
4168 
4169 	if (!(flag & MD_STR_NOTTOP)) {
4170 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4171 			md_kstat_waitq_exit(ui);
4172 			return;
4173 		}
4174 	}
4175 
4176 	if (private == NULL) {
4177 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4178 		mirror_parent_init(ps);
4179 	} else {
4180 		ps = private;
4181 		private = NULL;
4182 	}
4183 
4184 	if (flag & MD_STR_MAPPED)
4185 		ps->ps_flags |= MD_MPS_MAPPED;
4186 	if (flag & MD_NOBLOCK)
4187 		ps->ps_flags |= MD_MPS_NOBLOCK;
4188 	if (flag & MD_STR_WMUPDATE)
4189 		ps->ps_flags |= MD_MPS_WMUPDATE;
4190 
4191 	/*
4192 	 * Check to see if this is a DMR driven read. If so we need to use the
4193 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4194 	 */
4195 	if (flag & MD_STR_DMR)
4196 		ps->ps_flags |= MD_MPS_DMR;
4197 
4198 	/*
4199 	 * Save essential information from the original buffhdr
4200 	 * in the md_save structure.
4201 	 */
4202 	ps->ps_un = un;
4203 	ps->ps_ui = ui;
4204 	ps->ps_bp = pb;
4205 	ps->ps_addr = pb->b_un.b_addr;
4206 	ps->ps_firstblk = pb->b_lblkno;
4207 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4208 	ps->ps_changecnt = un->un_changecnt;
4209 
4210 	current_count = btodb(pb->b_bcount);
4211 	current_blkno = pb->b_lblkno;
4212 	current_offset = 0;
4213 
4214 	/*
4215 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4216 	 * resync thread which may or may not be an optimised resync.
4217 	 *
4218 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4219 	 * code has not completed; either a resync has not started since snarf,
4220 	 * or there is an optimized resync in progress.
4221 	 *
4222 	 * We need to generate a write after this read in the following two
4223 	 * cases,
4224 	 *
4225 	 * 1. Any Resync-Generated read
4226 	 *
4227 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4228 	 *    pending or in progress.
4229 	 *
4230 	 * The write after read is done in these cases to ensure that all sides
4231 	 * of the mirror are in sync with the read data and that it is not
4232 	 * possible for an application to read the same block multiple times
4233 	 * and get different data.
4234 	 *
4235 	 * This would be possible if the block was in a dirty region.
4236 	 *
4237 	 * If we're performing a directed read we don't write the data out as
4238 	 * the application is responsible for restoring the mirror to a known
4239 	 * state.
4240 	 */
4241 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4242 	    !(flag & MD_STR_DMR)) {
4243 		size_t	start_rr, i, end_rr;
4244 		int	region_dirty = 1;
4245 
4246 		/*
4247 		 * We enter here under three circumstances,
4248 		 *
4249 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4250 		 * 0			1
4251 		 * 1			0
4252 		 * 1			1
4253 		 *
4254 		 * To be optimal we only care to explicitly check for dirty
4255 		 * regions in the second case since if MD_STR_WAR is set we
4256 		 * always do the write after read.
4257 		 */
4258 		if (!(flag & MD_STR_WAR)) {
4259 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4260 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4261 
4262 			for (i = start_rr; i <= end_rr; i++)
4263 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4264 					break;
4265 		}
4266 
4267 		if ((region_dirty) &&
4268 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4269 			ps->ps_call = write_after_read;
4270 			/*
4271 			 * Mark this as a RESYNC_READ in ps_flags.
4272 			 * This is used if the read fails during a
4273 			 * resync of a 3-way mirror to ensure that
4274 			 * the retried read to the remaining
4275 			 * good submirror has MD_STR_WAR set. This
4276 			 * is needed to ensure that the resync write
4277 			 * (write-after-read) takes place.
4278 			 */
4279 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4280 
4281 			/*
4282 			 * If MD_STR_FLAG_ERR is set in the flags we
4283 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4284 			 * write (issued by write_after_read) will be flagged
4285 			 * to the biowait'ing resync thread. This allows us to
4286 			 * avoid issuing further resync requests to a device
4287 			 * that has had a write failure.
4288 			 */
4289 			if (flag & MD_STR_FLAG_ERR)
4290 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4291 
4292 			setno = MD_UN2SET(un);
4293 			/*
4294 			 * Drop the readerlock to avoid
4295 			 * deadlock
4296 			 */
4297 			md_unit_readerexit(ui);
4298 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4299 			un = md_unit_readerlock(ui);
4300 			/*
4301 			 * Ensure that we are owner
4302 			 */
4303 			if (MD_MNSET_SETNO(setno)) {
4304 				/*
4305 				 * For a non-resync read that requires a
4306 				 * write-after-read to be done, set a flag
4307 				 * in the parent structure, so that the
4308 				 * write_strategy routine can omit the
4309 				 * test that the write is still within the
4310 				 * resync region
4311 				 */
4312 				if (!(flag & MD_STR_WAR))
4313 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4314 
4315 				/*
4316 				 * Before reading the buffer, see if
4317 				 * there is an owner.
4318 				 */
4319 				if (MD_MN_NO_MIRROR_OWNER(un))  {
4320 					ps->ps_call = NULL;
4321 					mirror_overlap_tree_remove(ps);
4322 					md_kstat_waitq_exit(ui);
4323 					md_unit_readerexit(ui);
4324 					daemon_request(
4325 					    &md_mirror_daemon,
4326 					    become_owner,
4327 					    (daemon_queue_t *)ps,
4328 					    REQ_OLD);
4329 					return;
4330 				}
4331 				/*
4332 				 * For a resync read, check to see if I/O is
4333 				 * outside of the current resync region, or
4334 				 * the resync has finished. If so
4335 				 * just terminate the I/O
4336 				 */
4337 				if ((flag & MD_STR_WAR) &&
4338 				    (!(un->c.un_status & MD_UN_WAR) ||
4339 				    (!IN_RESYNC_REGION(un, ps)))) {
4340 #ifdef DEBUG
4341 					if (mirror_debug_flag)
4342 						printf("Abort resync read "
4343 						    "%x: %lld\n",
4344 						    MD_SID(un),
4345 						    ps->ps_firstblk);
4346 #endif
4347 					mirror_overlap_tree_remove(ps);
4348 					kmem_cache_free(mirror_parent_cache,
4349 					    ps);
4350 					md_kstat_waitq_exit(ui);
4351 					md_unit_readerexit(ui);
4352 					md_biodone(pb);
4353 					return;
4354 				}
4355 			}
4356 		}
4357 	}
4358 
4359 	if (flag & MD_STR_DMR) {
4360 		ps->ps_call = directed_read_done;
4361 	}
4362 
4363 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4364 		ps->ps_flags |= MD_MPS_DONTFREE;
4365 
4366 	md_kstat_waitq_to_runq(ui);
4367 
4368 	ps->ps_frags++;
4369 	do {
4370 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4371 		mirror_child_init(cs);
4372 		cb = &cs->cs_buf;
4373 		cs->cs_ps = ps;
4374 
4375 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4376 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4377 
4378 		more = mirror_map_read(ps, cs, current_blkno,
4379 		    (u_longlong_t)current_count);
4380 		if (more) {
4381 			mutex_enter(&ps->ps_mx);
4382 			ps->ps_frags++;
4383 			mutex_exit(&ps->ps_mx);
4384 		}
4385 
4386 		/*
4387 		 * Do these calculations now,
4388 		 *  so that we pickup a valid b_bcount from the chld_bp.
4389 		 */
4390 		current_count -= more;
4391 		current_offset += cb->b_bcount;
4392 		current_blkno +=  more;
4393 		md_call_strategy(cb, flag, private);
4394 	} while (more);
4395 
4396 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4397 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4398 			md_daemon(1, &md_done_daemon);
4399 			drv_usecwait(10);
4400 		}
4401 		kmem_cache_free(mirror_parent_cache, ps);
4402 	}
4403 }
4404 
4405 void
4406 md_mirror_strategy(buf_t *bp, int flag, void *private)
4407 {
4408 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4409 
4410 	/*
4411 	 * When doing IO to a multi owner meta device, check if set is halted.
4412 	 * We do this check without the needed lock held, for performance
4413 	 * reasons.
4414 	 * If an IO just slips through while the set is locked via an
4415 	 * MD_MN_SUSPEND_SET, we don't care about it.
4416 	 * Only check for suspension if we are a top-level i/o request
4417 	 * (MD_STR_NOTTOP is cleared in 'flag').
4418 	 */
4419 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4420 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4421 		if ((flag & MD_STR_NOTTOP) == 0) {
4422 			mutex_enter(&md_mx);
4423 			/* Here we loop until the set is no longer halted */
4424 			while (md_set[setno].s_status & MD_SET_HALTED) {
4425 				cv_wait(&md_cv, &md_mx);
4426 			}
4427 			mutex_exit(&md_mx);
4428 		}
4429 	}
4430 
4431 	if ((flag & MD_IO_COUNTED) == 0) {
4432 		if ((flag & MD_NOBLOCK) == 0) {
4433 			if (md_inc_iocount(setno) != 0) {
4434 				bp->b_flags |= B_ERROR;
4435 				bp->b_error = ENXIO;
4436 				bp->b_resid = bp->b_bcount;
4437 				biodone(bp);
4438 				return;
4439 			}
4440 		} else {
4441 			md_inc_iocount_noblock(setno);
4442 		}
4443 	}
4444 
4445 	if (bp->b_flags & B_READ)
4446 		mirror_read_strategy(bp, flag, private);
4447 	else
4448 		mirror_write_strategy(bp, flag, private);
4449 }
4450 
4451 /*
4452  * mirror_directed_read:
4453  * --------------------
4454  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4455  * so that the application can determine what (if any) resync needs to be
4456  * performed. The data is copied out to the user-supplied buffer.
4457  *
4458  * Parameters:
4459  *	mdev	- dev_t for the mirror device
4460  *	vdr	- directed read parameters specifying location and submirror
4461  *		  to perform the read from
4462  *	mode	- used to ddi_copyout() any resulting data from the read
4463  *
4464  * Returns:
4465  *	0	success
4466  *	!0	error code
4467  *		EINVAL - invalid request format
4468  */
4469 int
4470 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4471 {
4472 	buf_t		*bp;
4473 	minor_t		mnum = getminor(mdev);
4474 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4475 	mm_unit_t	*un;
4476 	mm_submirror_t	*sm;
4477 	char		*sm_nm;
4478 	uint_t		next_side;
4479 	void		*kbuffer;
4480 
4481 	if (ui == NULL)
4482 		return (ENXIO);
4483 
4484 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4485 		return (EINVAL);
4486 	}
4487 
4488 	/* Check for aligned block access. We disallow non-aligned requests. */
4489 	if (vdr->vdr_offset % DEV_BSIZE) {
4490 		return (EINVAL);
4491 	}
4492 
4493 	/*
4494 	 * Allocate kernel buffer for target of read(). If we had a reliable
4495 	 * (sorry functional) DDI this wouldn't be needed.
4496 	 */
4497 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4498 	if (kbuffer == NULL) {
4499 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4500 		    " bytes\n", vdr->vdr_nbytes);
4501 		return (ENOMEM);
4502 	}
4503 
4504 	bp = getrbuf(KM_SLEEP);
4505 
4506 	bp->b_un.b_addr = kbuffer;
4507 	bp->b_flags = B_READ;
4508 	bp->b_bcount = vdr->vdr_nbytes;
4509 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4510 	bp->b_edev = mdev;
4511 
4512 	un = md_unit_readerlock(ui);
4513 
4514 	/*
4515 	 * If DKV_SIDE_INIT is set we need to determine the first available
4516 	 * side to start reading from. If it isn't set we increment to the
4517 	 * next readable submirror.
4518 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4519 	 * Note: we check for a readable submirror on completion of the i/o so
4520 	 * we should _always_ have one available. If this becomes unavailable
4521 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4522 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4523 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4524 	 * The chance of this is small, but not non-existent.
4525 	 */
4526 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4527 		next_side = 0;
4528 	} else {
4529 		next_side = vdr->vdr_side + 1;
4530 	}
4531 	while ((next_side < NMIRROR) &&
4532 	    !SUBMIRROR_IS_READABLE(un, next_side))
4533 		next_side++;
4534 	if (next_side >= NMIRROR) {
4535 		vdr->vdr_flags |= DKV_DMR_ERROR;
4536 		freerbuf(bp);
4537 		vdr->vdr_bytesread = 0;
4538 		md_unit_readerexit(ui);
4539 		return (0);
4540 	}
4541 
4542 	/* Set the side to read from */
4543 	un->un_dmr_last_read = next_side;
4544 
4545 	md_unit_readerexit(ui);
4546 
4547 	/*
4548 	 * Save timestamp for verification purposes. Can be read by debugger
4549 	 * to verify that this ioctl has been executed and to find the number
4550 	 * of DMR reads and the time of the last DMR read.
4551 	 */
4552 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4553 	mirror_dmr_stats.dmr_count++;
4554 
4555 	/* Issue READ request and wait for completion */
4556 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4557 
4558 	mutex_enter(&un->un_dmr_mx);
4559 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4560 	mutex_exit(&un->un_dmr_mx);
4561 
4562 	/*
4563 	 * Check to see if we encountered an error during the read. If so we
4564 	 * can make no guarantee about any possibly returned data.
4565 	 */
4566 	if ((bp->b_flags & B_ERROR) == 0) {
4567 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4568 		if (bp->b_resid) {
4569 			vdr->vdr_flags |= DKV_DMR_SHORT;
4570 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4571 		} else {
4572 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4573 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4574 		}
4575 		/* Copy the data read back out to the user supplied buffer */
4576 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4577 		    mode)) {
4578 			kmem_free(kbuffer, vdr->vdr_nbytes);
4579 			return (EFAULT);
4580 		}
4581 
4582 	} else {
4583 		/* Error out with DKV_DMR_ERROR */
4584 		vdr->vdr_flags |= DKV_DMR_ERROR;
4585 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4586 	}
4587 	/*
4588 	 * Update the DMR parameters with the side and name of submirror that
4589 	 * we have just read from (un->un_dmr_last_read)
4590 	 */
4591 	un = md_unit_readerlock(ui);
4592 
4593 	vdr->vdr_side = un->un_dmr_last_read;
4594 	sm = &un->un_sm[un->un_dmr_last_read];
4595 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4596 
4597 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4598 
4599 	/*
4600 	 * Determine if we've completed the read cycle. This is true iff the
4601 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4602 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4603 	 * can occur if a submirror is metadetached).
4604 	 */
4605 	next_side = un->un_dmr_last_read + 1;
4606 	while ((next_side < NMIRROR) &&
4607 	    !SUBMIRROR_IS_READABLE(un, next_side))
4608 		next_side++;
4609 	if (next_side >= NMIRROR) {
4610 		/* We've finished */
4611 		vdr->vdr_flags |= DKV_DMR_DONE;
4612 	}
4613 
4614 	md_unit_readerexit(ui);
4615 	freerbuf(bp);
4616 	kmem_free(kbuffer, vdr->vdr_nbytes);
4617 
4618 	return (0);
4619 }
4620 
4621 /*
4622  * mirror_resync_message:
4623  * ---------------------
4624  * Handle the multi-node resync messages that keep all nodes within a given
4625  * disk-set in sync with their view of a mirror's resync status.
4626  *
4627  * The message types dealt with are:
4628  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4629  * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
4630  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4631  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4632  *
4633  * Returns:
4634  *	0	Success
4635  *	>0	Failure error number
4636  */
4637 int
4638 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4639 {
4640 	mdi_unit_t		*ui;
4641 	mm_unit_t		*un;
4642 	set_t			setno;
4643 	int			is_ABR;
4644 	int			smi;
4645 	int			ci;
4646 	sm_state_t		state;
4647 	int			broke_out;
4648 	mm_submirror_t		*sm;
4649 	mm_submirror_ic_t	*smic;
4650 	md_m_shared_t		*shared;
4651 	md_error_t		mde = mdnullerror;
4652 	md_mps_t		*ps;
4653 	int			rs_active;
4654 	int			rr, rr_start, rr_end;
4655 
4656 	/* Check that the given device is part of a multi-node set */
4657 	setno = MD_MIN2SET(p->mnum);
4658 	if (setno >= md_nsets) {
4659 		return (ENXIO);
4660 	}
4661 	if (!MD_MNSET_SETNO(setno)) {
4662 		return (EINVAL);
4663 	}
4664 
4665 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4666 		return (EINVAL);
4667 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4668 		return (EINVAL);
4669 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4670 
4671 	/* Obtain the current resync status */
4672 	(void) md_ioctl_readerlock(lockp, ui);
4673 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4674 	md_ioctl_readerexit(lockp);
4675 
4676 	switch ((md_mn_msgtype_t)p->msg_type) {
4677 	case MD_MN_MSG_RESYNC_STARTING:
4678 		/* Start the resync thread for the mirror */
4679 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4680 		break;
4681 
4682 	case MD_MN_MSG_RESYNC_NEXT:
4683 		/*
4684 		 * We have to release any previously marked overlap regions
4685 		 * so that i/o can resume. Then we need to block the region
4686 		 * from [rs_start..rs_start+rs_size) * so that no i/o is issued.
4687 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4688 		 */
4689 		(void) md_ioctl_readerlock(lockp, ui);
4690 		/*
4691 		 * Ignore the message if there is no active resync thread or
4692 		 * if it is for a resync type that we have already completed.
4693 		 * un_resync_completed is set to the last resync completed
4694 		 * when processing a PHASE_DONE message.
4695 		 */
4696 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4697 			break;
4698 		/*
4699 		 * If this message is for the same resync and is for an earlier
4700 		 * resync region, just ignore it. This can only occur if this
4701 		 * node has progressed on to the next resync region before
4702 		 * we receive this message. This can occur if the class for
4703 		 * this message is busy and the originator has to retry thus
4704 		 * allowing this node to move onto the next resync_region.
4705 		 */
4706 		if ((p->rs_type == un->un_rs_type) &&
4707 		    (p->rs_start < un->un_resync_startbl))
4708 			break;
4709 		ps = un->un_rs_prev_overlap;
4710 
4711 		/* Allocate previous overlap reference if needed */
4712 		if (ps == NULL) {
4713 			ps = kmem_cache_alloc(mirror_parent_cache,
4714 			    MD_ALLOCFLAGS);
4715 			ps->ps_un = un;
4716 			ps->ps_ui = ui;
4717 			ps->ps_firstblk = 0;
4718 			ps->ps_lastblk = 0;
4719 			ps->ps_flags = 0;
4720 			md_ioctl_readerexit(lockp);
4721 			(void) md_ioctl_writerlock(lockp, ui);
4722 			un->un_rs_prev_overlap = ps;
4723 			md_ioctl_writerexit(lockp);
4724 		} else
4725 			md_ioctl_readerexit(lockp);
4726 
4727 		if (p->rs_originator != md_mn_mynode_id) {
4728 			/*
4729 			 * Clear our un_resync_bm for the regions completed.
4730 			 * The owner (originator) will take care of itself.
4731 			 */
4732 			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4733 			BLK_TO_RR(rr_start, p->rs_start, un);
4734 			if (ps->ps_lastblk && rr_end < rr_start) {
4735 				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4736 				mutex_enter(&un->un_resync_mx);
4737 				/*
4738 				 * Update our resync bitmap to reflect that
4739 				 * another node has synchronized this range.
4740 				 */
4741 				for (rr = rr_start; rr <= rr_end; rr++) {
4742 					CLR_KEEPDIRTY(rr, un);
4743 				}
4744 				mutex_exit(&un->un_resync_mx);
4745 			}
4746 
4747 			/*
4748 			 * On all but the originating node, first update
4749 			 * the resync state, then unblock the previous
4750 			 * region and block the next one. No need
4751 			 * to do this if the region is already blocked.
4752 			 * Update the submirror state and flags from the
4753 			 * originator. This keeps the cluster in sync with
4754 			 * regards to the resync status.
4755 			 */
4756 
4757 			(void) md_ioctl_writerlock(lockp, ui);
4758 			un->un_rs_resync_done = p->rs_done;
4759 			un->un_rs_resync_2_do = p->rs_2_do;
4760 			un->un_rs_type = p->rs_type;
4761 			un->un_resync_startbl = p->rs_start;
4762 			md_ioctl_writerexit(lockp);
4763 			/*
4764 			 * Use un_owner_mx to ensure that an ownership change
4765 			 * cannot happen at the same time as this message
4766 			 */
4767 			mutex_enter(&un->un_owner_mx);
4768 			if (MD_MN_MIRROR_OWNER(un)) {
4769 				ps->ps_firstblk = p->rs_start;
4770 				ps->ps_lastblk = ps->ps_firstblk +
4771 				    p->rs_size - 1;
4772 			} else {
4773 				if ((ps->ps_firstblk != p->rs_start) ||
4774 				    (ps->ps_lastblk != p->rs_start +
4775 				    p->rs_size - 1)) {
4776 					/* Remove previous overlap range */
4777 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4778 						mirror_overlap_tree_remove(ps);
4779 
4780 					ps->ps_firstblk = p->rs_start;
4781 					ps->ps_lastblk = ps->ps_firstblk +
4782 					    p->rs_size - 1;
4783 
4784 					mutex_exit(&un->un_owner_mx);
4785 					/* Block this range from all i/o. */
4786 					if (ps->ps_firstblk != 0 ||
4787 					    ps->ps_lastblk != 0)
4788 						wait_for_overlaps(ps,
4789 						    MD_OVERLAP_ALLOW_REPEAT);
4790 					mutex_enter(&un->un_owner_mx);
4791 					/*
4792 					 * Check to see if we have obtained
4793 					 * ownership while waiting for
4794 					 * overlaps. If we have, remove
4795 					 * the resync_region entry from the
4796 					 * overlap tree
4797 					 */
4798 					if (MD_MN_MIRROR_OWNER(un) &&
4799 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4800 						mirror_overlap_tree_remove(ps);
4801 				}
4802 			}
4803 			mutex_exit(&un->un_owner_mx);
4804 
4805 			/*
4806 			 * If this is the first RESYNC_NEXT message (i.e.
4807 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4808 			 * issue RESYNC_START NOTIFY event
4809 			 */
4810 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4811 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4812 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4813 				    MD_SID(un));
4814 			}
4815 
4816 			/* Ensure that our local resync thread is running */
4817 			if (un->un_rs_thread == NULL) {
4818 				(void) mirror_resync_unit(p->mnum, NULL,
4819 				    &p->mde, lockp);
4820 			}
4821 		}
4822 
4823 		break;
4824 	case MD_MN_MSG_RESYNC_FINISH:
4825 		/*
4826 		 * Complete the resync by stopping the resync thread.
4827 		 * Also release the previous overlap region field.
4828 		 * Update the resync_progress_thread by cv_signal'ing it so
4829 		 * that we mark the end of the resync as soon as possible. This
4830 		 * stops an unnecessary delay should be panic after resync
4831 		 * completion.
4832 		 */
4833 #ifdef DEBUG
4834 		if (!rs_active) {
4835 			if (mirror_debug_flag)
4836 				printf("RESYNC_FINISH (mnum = %x), "
4837 				    "Resync *NOT* active",
4838 				    p->mnum);
4839 		}
4840 #endif
4841 
4842 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4843 		    (p->rs_originator != md_mn_mynode_id)) {
4844 			mutex_enter(&un->un_rs_thread_mx);
4845 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4846 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4847 			un->un_rs_thread_flags &=
4848 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4849 			cv_signal(&un->un_rs_thread_cv);
4850 			mutex_exit(&un->un_rs_thread_mx);
4851 		}
4852 		if (is_ABR) {
4853 			/* Resync finished, if ABR set owner to NULL */
4854 			mutex_enter(&un->un_owner_mx);
4855 			un->un_mirror_owner = 0;
4856 			mutex_exit(&un->un_owner_mx);
4857 		}
4858 		(void) md_ioctl_writerlock(lockp, ui);
4859 		ps = un->un_rs_prev_overlap;
4860 		if (ps != NULL) {
4861 			/* Remove previous overlap range */
4862 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4863 				mirror_overlap_tree_remove(ps);
4864 			/*
4865 			 * Release the overlap range reference
4866 			 */
4867 			un->un_rs_prev_overlap = NULL;
4868 			kmem_cache_free(mirror_parent_cache,
4869 			    ps);
4870 		}
4871 		md_ioctl_writerexit(lockp);
4872 
4873 		/* Mark the resync as complete in the metadb */
4874 		un->un_rs_resync_done = p->rs_done;
4875 		un->un_rs_resync_2_do = p->rs_2_do;
4876 		un->un_rs_type = p->rs_type;
4877 		mutex_enter(&un->un_rs_progress_mx);
4878 		cv_signal(&un->un_rs_progress_cv);
4879 		mutex_exit(&un->un_rs_progress_mx);
4880 
4881 		un = md_ioctl_writerlock(lockp, ui);
4882 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4883 		/* Deal with any pending grow_unit */
4884 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4885 			if ((mirror_grow_unit(un, &mde) != 0) ||
4886 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4887 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4888 			}
4889 		}
4890 		md_ioctl_writerexit(lockp);
4891 		break;
4892 
4893 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4894 		/*
4895 		 * A phase of the resync, optimized. component or
4896 		 * submirror is complete. Update mirror status.
4897 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4898 		 * mirror owner is peforming a resync. If we have just snarfed
4899 		 * this set, then we must clear any of the flags set at snarf
4900 		 * time by unit_setup_resync().
4901 		 * Note that unit_setup_resync() sets up these flags to
4902 		 * indicate that an optimized resync is required. These flags
4903 		 * need to be reset because if we get here,  the mirror owner
4904 		 * will have handled the optimized resync.
4905 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4906 		 * MD_UN_WAR. In addition, for each submirror,
4907 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4908 		 * set to SMS_OFFLINE.
4909 		 */
4910 #ifdef DEBUG
4911 		if (mirror_debug_flag)
4912 			printf("phase done mess received from %d, mnum=%x,"
4913 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4914 			    p->rs_type, p->rs_flags);
4915 #endif
4916 		/*
4917 		 * Ignore the message if there is no active resync thread.
4918 		 */
4919 		if (!rs_active)
4920 			break;
4921 
4922 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4923 		switch (RS_TYPE(p->rs_type)) {
4924 		case MD_RS_OPTIMIZED:
4925 			un = md_ioctl_writerlock(lockp, ui);
4926 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4927 				/* If we are originator, just clear rs_type */
4928 				if (p->rs_originator == md_mn_mynode_id) {
4929 					SET_RS_TYPE_NONE(un->un_rs_type);
4930 					md_ioctl_writerexit(lockp);
4931 					break;
4932 				}
4933 				/*
4934 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4935 				 * flags if OPT_NOT_DONE is set *and* rs_type
4936 				 * is MD_RS_NONE.
4937 				 */
4938 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4939 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4940 					/* No resync in progress */
4941 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4942 					un->c.un_status &= ~MD_UN_WAR;
4943 				} else {
4944 					/*
4945 					 * We are in the middle of an
4946 					 * optimized resync and this message
4947 					 * should be ignored.
4948 					 */
4949 					md_ioctl_writerexit(lockp);
4950 					break;
4951 				}
4952 			} else {
4953 				/*
4954 				 * This is the end of an optimized resync,
4955 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4956 				 */
4957 
4958 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4959 				if (!broke_out)
4960 					un->c.un_status &= ~MD_UN_WAR;
4961 
4962 				/*
4963 				 * Clear our un_resync_bm for the regions
4964 				 * completed.  The owner (originator) will
4965 				 * take care of itself.
4966 				 */
4967 				if (p->rs_originator != md_mn_mynode_id &&
4968 				    (ps = un->un_rs_prev_overlap) != NULL) {
4969 					BLK_TO_RR(rr_start, ps->ps_firstblk,
4970 					    un);
4971 					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4972 					mutex_enter(&un->un_resync_mx);
4973 					for (rr = rr_start; rr <= rr_end;
4974 					    rr++) {
4975 						CLR_KEEPDIRTY(rr, un);
4976 					}
4977 					mutex_exit(&un->un_resync_mx);
4978 				}
4979 			}
4980 
4981 			/*
4982 			 * Set resync_completed to last resync type and then
4983 			 * clear resync_type to indicate no resync in progress
4984 			 */
4985 			un->un_resync_completed = un->un_rs_type;
4986 			SET_RS_TYPE_NONE(un->un_rs_type);
4987 
4988 			/*
4989 			 * If resync is as a result of a submirror ONLINE,
4990 			 * reset the submirror state to SMS_RUNNING if the
4991 			 * resync was ok else set back to SMS_OFFLINE.
4992 			 */
4993 			for (smi = 0; smi < NMIRROR; smi++) {
4994 				un->un_sm[smi].sm_flags &=
4995 				    ~MD_SM_RESYNC_TARGET;
4996 				if (SMS_BY_INDEX_IS(un, smi,
4997 				    SMS_OFFLINE_RESYNC)) {
4998 					if (p->rs_flags &
4999 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
5000 						state = SMS_OFFLINE;
5001 					} else {
5002 						state = (broke_out ?
5003 						    SMS_OFFLINE : SMS_RUNNING);
5004 					}
5005 					mirror_set_sm_state(
5006 					    &un->un_sm[smi],
5007 					    &un->un_smic[smi], state,
5008 					    broke_out);
5009 					mirror_commit(un, NO_SUBMIRRORS,
5010 					    0);
5011 				}
5012 				/*
5013 				 * If we still have an offline submirror, reset
5014 				 * the OFFLINE_SM flag in the mirror status
5015 				 */
5016 				if (SMS_BY_INDEX_IS(un, smi,
5017 				    SMS_OFFLINE))
5018 					un->c.un_status |=
5019 					    MD_UN_OFFLINE_SM;
5020 			}
5021 			md_ioctl_writerexit(lockp);
5022 			break;
5023 		case MD_RS_SUBMIRROR:
5024 			un = md_ioctl_writerlock(lockp, ui);
5025 			smi = RS_SMI(p->rs_type);
5026 			sm = &un->un_sm[smi];
5027 			smic = &un->un_smic[smi];
5028 			/* Clear RESYNC target */
5029 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5030 			/*
5031 			 * Set resync_completed to last resync type and then
5032 			 * clear resync_type to indicate no resync in progress
5033 			 */
5034 			un->un_resync_completed = un->un_rs_type;
5035 			SET_RS_TYPE_NONE(un->un_rs_type);
5036 			/*
5037 			 * If the resync completed ok reset the submirror
5038 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
5039 			 */
5040 			state = (broke_out ?
5041 			    SMS_ATTACHED : SMS_RUNNING);
5042 			mirror_set_sm_state(sm, smic, state, broke_out);
5043 			un->c.un_status &= ~MD_UN_WAR;
5044 			mirror_commit(un, SMI2BIT(smi), 0);
5045 			md_ioctl_writerexit(lockp);
5046 			break;
5047 		case MD_RS_COMPONENT:
5048 			un = md_ioctl_writerlock(lockp, ui);
5049 			smi = RS_SMI(p->rs_type);
5050 			ci = RS_CI(p->rs_type);
5051 			sm = &un->un_sm[smi];
5052 			smic = &un->un_smic[smi];
5053 			shared = (md_m_shared_t *)
5054 			    (*(smic->sm_shared_by_indx))
5055 			    (sm->sm_dev, sm, ci);
5056 			un->c.un_status &= ~MD_UN_WAR;
5057 			/* Clear RESYNC target */
5058 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5059 			/*
5060 			 * Set resync_completed to last resync type and then
5061 			 * clear resync_type to indicate no resync in progress
5062 			 */
5063 			un->un_resync_completed = un->un_rs_type;
5064 			SET_RS_TYPE_NONE(un->un_rs_type);
5065 
5066 			/*
5067 			 * If the resync completed ok, set the component state
5068 			 * to CS_OKAY.
5069 			 */
5070 			if (broke_out)
5071 				shared->ms_flags |= MDM_S_RS_TRIED;
5072 			else {
5073 				/*
5074 				 * As we don't transmit the changes,
5075 				 * no need to drop the lock.
5076 				 */
5077 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
5078 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
5079 			}
5080 			md_ioctl_writerexit(lockp);
5081 		default:
5082 			break;
5083 		}
5084 		/*
5085 		 * If the purpose of this PHASE_DONE message is just to
5086 		 * indicate to all other nodes that the optimized resync
5087 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
5088 		 * no need to generate a notify event as there has not
5089 		 * actually been a resync.
5090 		 */
5091 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
5092 			if (broke_out) {
5093 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
5094 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5095 				    MD_SID(un));
5096 			} else {
5097 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5098 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5099 				    MD_SID(un));
5100 			}
5101 		}
5102 		break;
5103 
5104 	default:
5105 #ifdef DEBUG
5106 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5107 		    " %x\n", p->msg_type);
5108 #endif
5109 		return (EINVAL);
5110 	}
5111 	return (0);
5112 }
5113 
5114 /* Return a -1 if snarf of optimized record failed and set should be released */
5115 static int
5116 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5117 {
5118 	mddb_recid_t	recid;
5119 	int		gotsomething;
5120 	int		all_mirrors_gotten;
5121 	mm_unit_t	*un;
5122 	mddb_type_t	typ1;
5123 	mddb_de_ic_t    *dep;
5124 	mddb_rb32_t	*rbp;
5125 	size_t		newreqsize;
5126 	mm_unit_t	*big_un;
5127 	mm_unit32_od_t	*small_un;
5128 	int		retval;
5129 	mdi_unit_t	*ui;
5130 
5131 	if (cmd == MD_SNARF_CLEANUP) {
5132 		if (md_get_setstatus(setno) & MD_SET_STALE)
5133 			return (0);
5134 
5135 		recid = mddb_makerecid(setno, 0);
5136 		typ1 = (mddb_type_t)md_getshared_key(setno,
5137 		    mirror_md_ops.md_driver.md_drivername);
5138 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5139 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5140 				un = (mm_unit_t *)mddb_getrecaddr(recid);
5141 				mirror_cleanup(un);
5142 				recid = mddb_makerecid(setno, 0);
5143 			}
5144 		}
5145 		return (0);
5146 	}
5147 
5148 	all_mirrors_gotten = 1;
5149 	gotsomething = 0;
5150 
5151 	recid = mddb_makerecid(setno, 0);
5152 	typ1 = (mddb_type_t)md_getshared_key(setno,
5153 	    mirror_md_ops.md_driver.md_drivername);
5154 
5155 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5156 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5157 			continue;
5158 
5159 		dep = mddb_getrecdep(recid);
5160 		dep->de_flags = MDDB_F_MIRROR;
5161 		rbp = dep->de_rb;
5162 
5163 		switch (rbp->rb_revision) {
5164 		case MDDB_REV_RB:
5165 		case MDDB_REV_RBFN:
5166 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5167 				/*
5168 				 * This means, we have an old and small
5169 				 * record and this record hasn't already
5170 				 * been converted.  Before we create an
5171 				 * incore metadevice from this we have to
5172 				 * convert it to a big record.
5173 				 */
5174 				small_un =
5175 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
5176 				newreqsize = sizeof (mm_unit_t);
5177 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5178 				    KM_SLEEP);
5179 				mirror_convert((caddr_t)small_un,
5180 				    (caddr_t)big_un, SMALL_2_BIG);
5181 				kmem_free(small_un, dep->de_reqsize);
5182 
5183 				/*
5184 				 * Update userdata and incore userdata
5185 				 * incores are at the end of un
5186 				 */
5187 				dep->de_rb_userdata_ic = big_un;
5188 				dep->de_rb_userdata = big_un;
5189 				dep->de_icreqsize = newreqsize;
5190 				un = big_un;
5191 				rbp->rb_private |= MD_PRV_CONVD;
5192 			} else {
5193 				/*
5194 				 * Unit already converted, just get the
5195 				 * record address.
5196 				 */
5197 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5198 				    sizeof (*un), 0);
5199 			}
5200 			un->c.un_revision &= ~MD_64BIT_META_DEV;
5201 			break;
5202 		case MDDB_REV_RB64:
5203 		case MDDB_REV_RB64FN:
5204 			/* Big device */
5205 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5206 			    sizeof (*un), 0);
5207 			un->c.un_revision |= MD_64BIT_META_DEV;
5208 			un->c.un_flag |= MD_EFILABEL;
5209 			break;
5210 		}
5211 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5212 
5213 		/*
5214 		 * Create minor device node for snarfed entry.
5215 		 */
5216 		(void) md_create_minor_node(setno, MD_SID(un));
5217 
5218 		if (MD_UNIT(MD_SID(un)) != NULL) {
5219 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5220 			continue;
5221 		}
5222 		all_mirrors_gotten = 0;
5223 		retval = mirror_build_incore(un, 1);
5224 		if (retval == 0) {
5225 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5226 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5227 			resync_start_timeout(setno);
5228 			gotsomething = 1;
5229 		} else {
5230 			return (retval);
5231 		}
5232 		/*
5233 		 * Set flag to indicate that the mirror has not yet
5234 		 * been through a reconfig. This flag is used for MN sets
5235 		 * when determining whether to update the mirror state from
5236 		 * the Master node.
5237 		 */
5238 		if (MD_MNSET_SETNO(setno)) {
5239 			ui = MDI_UNIT(MD_SID(un));
5240 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5241 		}
5242 	}
5243 
5244 	if (!all_mirrors_gotten)
5245 		return (gotsomething);
5246 
5247 	recid = mddb_makerecid(setno, 0);
5248 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5249 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5250 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5251 
5252 	return (0);
5253 }
5254 
5255 static int
5256 mirror_halt(md_haltcmd_t cmd, set_t setno)
5257 {
5258 	unit_t		i;
5259 	mdi_unit_t	*ui;
5260 	minor_t		mnum;
5261 	int		reset_mirror_flag = 0;
5262 
5263 	if (cmd == MD_HALT_CLOSE)
5264 		return (0);
5265 
5266 	if (cmd == MD_HALT_OPEN)
5267 		return (0);
5268 
5269 	if (cmd == MD_HALT_UNLOAD)
5270 		return (0);
5271 
5272 	if (cmd == MD_HALT_CHECK) {
5273 		for (i = 0; i < md_nunits; i++) {
5274 			mnum = MD_MKMIN(setno, i);
5275 			if ((ui = MDI_UNIT(mnum)) == NULL)
5276 				continue;
5277 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5278 				continue;
5279 			if (md_unit_isopen(ui))
5280 				return (1);
5281 		}
5282 		return (0);
5283 	}
5284 
5285 	if (cmd != MD_HALT_DOIT)
5286 		return (1);
5287 
5288 	for (i = 0; i < md_nunits; i++) {
5289 		mnum = MD_MKMIN(setno, i);
5290 		if ((ui = MDI_UNIT(mnum)) == NULL)
5291 			continue;
5292 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5293 			continue;
5294 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5295 
5296 		/* Set a flag if there is at least one mirror metadevice. */
5297 		reset_mirror_flag = 1;
5298 	}
5299 
5300 	/*
5301 	 * Only wait for the global dr_timeout to finish
5302 	 *  - if there are mirror metadevices in this diskset or
5303 	 *  - if this is the local set since an unload of the md_mirror
5304 	 *    driver could follow a successful mirror halt in the local set.
5305 	 */
5306 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5307 		while ((mirror_md_ops.md_head == NULL) &&
5308 		    (mirror_timeout.dr_timeout_id != 0))
5309 			delay(md_hz);
5310 	}
5311 
5312 	return (0);
5313 }
5314 
5315 /*ARGSUSED3*/
5316 static int
5317 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5318 {
5319 	IOLOCK	lock;
5320 	minor_t		mnum = getminor(*dev);
5321 	set_t		setno;
5322 
5323 	/*
5324 	 * When doing an open of a multi owner metadevice, check to see if this
5325 	 * node is a starting node and if a reconfig cycle is underway.
5326 	 * If so, the system isn't sufficiently set up enough to handle the
5327 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5328 	 */
5329 	setno = MD_MIN2SET(mnum);
5330 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5331 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5332 			return (ENXIO);
5333 	}
5334 
5335 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5336 		/*
5337 		 * This indicates that the caller is an ioctl service routine.
5338 		 * In this case we initialise our stack-based IOLOCK and pass
5339 		 * this into the internal open routine. This allows multi-owner
5340 		 * metadevices to avoid deadlocking if an error is encountered
5341 		 * during the open() attempt. The failure case is:
5342 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5343 		 * this configuration would deadlock as the mirror code has to
5344 		 * send a state-update to the other nodes when it detects the
5345 		 * failure of the underlying submirror with an errored soft-part
5346 		 * on it. As there is a class1 message in progress (metaclear)
5347 		 * set_sm_comp_state() cannot send another class1 message;
5348 		 * instead we do not send a state_update message as the
5349 		 * metaclear is distributed and the failed submirror will be
5350 		 * cleared from the configuration by the metaclear.
5351 		 */
5352 		IOLOCK_INIT(&lock);
5353 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5354 		    md_oflags, &lock));
5355 	} else {
5356 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5357 		    md_oflags, (IOLOCK *)NULL));
5358 	}
5359 }
5360 
5361 
5362 /*ARGSUSED1*/
5363 static int
5364 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5365 {
5366 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5367 	    (IOLOCK *)NULL));
5368 }
5369 
5370 
5371 /*
5372  * This routine dumps memory to the disk.  It assumes that the memory has
5373  * already been mapped into mainbus space.  It is called at disk interrupt
5374  * priority when the system is in trouble.
5375  *
5376  */
5377 static int
5378 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5379 {
5380 	mm_unit_t	*un;
5381 	dev_t		mapdev;
5382 	int		result;
5383 	int		smi;
5384 	int		any_succeed = 0;
5385 	int		save_result = 0;
5386 
5387 	/*
5388 	 * Don't need to grab the unit lock.
5389 	 * Cause nothing else is suppose to be happenning.
5390 	 * Also dump is not suppose to sleep.
5391 	 */
5392 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5393 
5394 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5395 		return (EINVAL);
5396 
5397 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5398 		return (EINVAL);
5399 
5400 	for (smi = 0; smi < NMIRROR; smi++) {
5401 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5402 			continue;
5403 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5404 		result = bdev_dump(mapdev, addr, blkno, nblk);
5405 		if (result)
5406 			save_result = result;
5407 
5408 		if (result == 0)
5409 			any_succeed++;
5410 	}
5411 
5412 	if (any_succeed)
5413 		return (0);
5414 
5415 	return (save_result);
5416 }
5417 
5418 /*
5419  * NAME: mirror_probe_dev
5420  *
5421  * DESCRITPION: force opens every component of a mirror.
5422  *
5423  * On entry the unit writerlock is held
5424  */
5425 static int
5426 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5427 {
5428 	int		i;
5429 	int		smi;
5430 	int		ci;
5431 	mm_unit_t	*un;
5432 	int		md_devopen = 0;
5433 	set_t		setno;
5434 	int		sm_cnt;
5435 	int		sm_unavail_cnt;
5436 
5437 	if (md_unit_isopen(ui))
5438 		md_devopen++;
5439 
5440 	un = MD_UNIT(mnum);
5441 	setno = MD_UN2SET(un);
5442 
5443 	sm_cnt = 0;
5444 	sm_unavail_cnt = 0;
5445 	for (i = 0; i < NMIRROR; i++) {
5446 		md_dev64_t tmpdev;
5447 		mdi_unit_t	*sm_ui;
5448 
5449 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5450 			continue;
5451 		}
5452 
5453 		sm_cnt++;
5454 		tmpdev = un->un_sm[i].sm_dev;
5455 		(void) md_layered_open(mnum, &tmpdev,
5456 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5457 		un->un_sm[i].sm_dev = tmpdev;
5458 
5459 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5460 
5461 		/*
5462 		 * Logic similar to that in mirror_open_all_devs.  We set or
5463 		 * clear the submirror Unavailable bit.
5464 		 */
5465 		(void) md_unit_writerlock(sm_ui);
5466 		if (submirror_unavailable(un, i, 1)) {
5467 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5468 			sm_unavail_cnt++;
5469 		} else {
5470 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5471 		}
5472 		md_unit_writerexit(sm_ui);
5473 	}
5474 
5475 	/*
5476 	 * If all of the submirrors are unavailable, the mirror is also
5477 	 * unavailable.
5478 	 */
5479 	if (sm_cnt == sm_unavail_cnt) {
5480 		ui->ui_tstate |= MD_INACCESSIBLE;
5481 	} else {
5482 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5483 	}
5484 
5485 	/*
5486 	 * Start checking from probe failures. If failures occur we
5487 	 * set the appropriate erred state only if the metadevice is in
5488 	 * use. This is specifically to prevent unnecessary resyncs.
5489 	 * For instance if the disks were accidentally disconnected when
5490 	 * the system booted up then until the metadevice is accessed
5491 	 * (like file system mount) the user can shutdown, recable and
5492 	 * reboot w/o incurring a potentially huge resync.
5493 	 */
5494 
5495 	smi = 0;
5496 	ci = 0;
5497 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5498 
5499 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5500 			/*
5501 			 * Note that for a MN set, there is no need to call
5502 			 * SE_NOTIFY as that is done when processing the
5503 			 * state change
5504 			 */
5505 			if (md_devopen) {
5506 				/*
5507 				 * Never called from ioctl context,
5508 				 * so (IOLOCK *)NULL
5509 				 */
5510 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5511 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5512 				if (!MD_MNSET_SETNO(setno)) {
5513 					SE_NOTIFY(EC_SVM_STATE,
5514 					    ESC_SVM_LASTERRED,
5515 					    SVM_TAG_METADEVICE, setno,
5516 					    MD_SID(un));
5517 				}
5518 				continue;
5519 			} else {
5520 				(void) mirror_close_all_devs(un,
5521 				    MD_OFLG_PROBEDEV);
5522 				if (!MD_MNSET_SETNO(setno)) {
5523 					SE_NOTIFY(EC_SVM_STATE,
5524 					    ESC_SVM_OPEN_FAIL,
5525 					    SVM_TAG_METADEVICE, setno,
5526 					    MD_SID(un));
5527 				}
5528 				mirror_openfail_console_info(un, smi, ci);
5529 				return (ENXIO);
5530 			}
5531 		}
5532 
5533 		/*
5534 		 * Note that for a MN set, there is no need to call
5535 		 * SE_NOTIFY as that is done when processing the
5536 		 * state change
5537 		 */
5538 		if (md_devopen) {
5539 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5540 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5541 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5542 			if (!MD_MNSET_SETNO(setno)) {
5543 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5544 				    SVM_TAG_METADEVICE, setno,
5545 				    MD_SID(un));
5546 			}
5547 		}
5548 		mirror_openfail_console_info(un, smi, ci);
5549 		ci++;
5550 	}
5551 
5552 	if (MD_MNSET_SETNO(setno)) {
5553 		send_poke_hotspares(setno);
5554 	} else {
5555 		(void) poke_hotspares();
5556 	}
5557 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5558 
5559 	return (0);
5560 }
5561 
5562 
5563 static int
5564 mirror_imp_set(
5565 	set_t	setno
5566 )
5567 {
5568 
5569 	mddb_recid_t	recid;
5570 	int		gotsomething, i;
5571 	mddb_type_t	typ1;
5572 	mddb_de_ic_t	*dep;
5573 	mddb_rb32_t	*rbp;
5574 	mm_unit32_od_t	*un32;
5575 	mm_unit_t	*un64;
5576 	md_dev64_t	self_devt;
5577 	minor_t		*self_id;	/* minor needs to be updated */
5578 	md_parent_t	*parent_id;	/* parent needs to be updated */
5579 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5580 	mddb_recid_t	*optrec_id;
5581 	md_dev64_t	tmpdev;
5582 
5583 
5584 	gotsomething = 0;
5585 
5586 	typ1 = (mddb_type_t)md_getshared_key(setno,
5587 	    mirror_md_ops.md_driver.md_drivername);
5588 	recid = mddb_makerecid(setno, 0);
5589 
5590 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5591 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5592 			continue;
5593 
5594 		dep = mddb_getrecdep(recid);
5595 		rbp = dep->de_rb;
5596 
5597 		switch (rbp->rb_revision) {
5598 		case MDDB_REV_RB:
5599 		case MDDB_REV_RBFN:
5600 			/*
5601 			 * Small device
5602 			 */
5603 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5604 			self_id = &(un32->c.un_self_id);
5605 			parent_id = &(un32->c.un_parent);
5606 			record_id = &(un32->c.un_record_id);
5607 			optrec_id = &(un32->un_rr_dirty_recid);
5608 
5609 			for (i = 0; i < un32->un_nsm; i++) {
5610 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5611 				un32->un_sm[i].sm_dev = md_cmpldev
5612 				    (md_makedevice(md_major, MD_MKMIN(setno,
5613 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5614 
5615 				if (!md_update_minor(setno, mddb_getsidenum
5616 				    (setno), un32->un_sm[i].sm_key))
5617 				goto out;
5618 			}
5619 			break;
5620 		case MDDB_REV_RB64:
5621 		case MDDB_REV_RB64FN:
5622 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5623 			self_id = &(un64->c.un_self_id);
5624 			parent_id = &(un64->c.un_parent);
5625 			record_id = &(un64->c.un_record_id);
5626 			optrec_id = &(un64->un_rr_dirty_recid);
5627 
5628 			for (i = 0; i < un64->un_nsm; i++) {
5629 				tmpdev = un64->un_sm[i].sm_dev;
5630 				un64->un_sm[i].sm_dev = md_makedevice
5631 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5632 				    (md_getminor(tmpdev))));
5633 
5634 				if (!md_update_minor(setno, mddb_getsidenum
5635 				    (setno), un64->un_sm[i].sm_key))
5636 				goto out;
5637 			}
5638 			break;
5639 		}
5640 
5641 		/*
5642 		 * If this is a top level and a friendly name metadevice,
5643 		 * update its minor in the namespace.
5644 		 */
5645 		if ((*parent_id == MD_NO_PARENT) &&
5646 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5647 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5648 
5649 			self_devt = md_makedevice(md_major, *self_id);
5650 			if (!md_update_top_device_minor(setno,
5651 			    mddb_getsidenum(setno), self_devt))
5652 				goto out;
5653 		}
5654 
5655 		/*
5656 		 * Update unit with the imported setno
5657 		 *
5658 		 */
5659 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5660 
5661 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5662 		if (*parent_id != MD_NO_PARENT)
5663 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5664 		*record_id = MAKERECID(setno, DBID(*record_id));
5665 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5666 
5667 		gotsomething = 1;
5668 	}
5669 
5670 out:
5671 	return (gotsomething);
5672 }
5673 
5674 /*
5675  * NAME: mirror_check_offline
5676  *
5677  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5678  *
5679  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5680  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5681  * ioctl.
5682  */
5683 int
5684 mirror_check_offline(md_dev64_t dev, int *offline_status)
5685 {
5686 	mm_unit_t		*un;
5687 	md_error_t		mde = mdnullerror;
5688 
5689 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5690 		return (EINVAL);
5691 	*offline_status = 0;
5692 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5693 		*offline_status = 1;
5694 	return (0);
5695 }
5696 
5697 /*
5698  * NAME: mirror_inc_abr_count
5699  *
5700  * DESCRIPTION: increment the count of layered soft parts with ABR set
5701  *
5702  * Called from ioctl, so access to un_abr_count is protected by the global
5703  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5704  */
5705 int
5706 mirror_inc_abr_count(md_dev64_t dev)
5707 {
5708 	mm_unit_t		*un;
5709 	md_error_t		mde = mdnullerror;
5710 
5711 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5712 		return (EINVAL);
5713 	un->un_abr_count++;
5714 	return (0);
5715 }
5716 
5717 /*
5718  * NAME: mirror_dec_abr_count
5719  *
5720  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5721  *
5722  * Called from ioctl, so access to un_abr_count is protected by the global
5723  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5724  */
5725 int
5726 mirror_dec_abr_count(md_dev64_t dev)
5727 {
5728 	mm_unit_t		*un;
5729 	md_error_t		mde = mdnullerror;
5730 
5731 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5732 		return (EINVAL);
5733 	un->un_abr_count--;
5734 	return (0);
5735 }
5736 
5737 static md_named_services_t mirror_named_services[] = {
5738 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5739 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5740 	{mirror_rename_check,				MDRNM_CHECK	    },
5741 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5742 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5743 			MDRNM_PARENT_UPDATE_TO},
5744 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5745 			MDRNM_SELF_UPDATE_FROM_DOWN },
5746 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5747 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5748 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5749 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5750 	{ NULL,						0		    }
5751 };
5752 
5753 md_ops_t mirror_md_ops = {
5754 	mirror_open,		/* open */
5755 	mirror_close,		/* close */
5756 	md_mirror_strategy,	/* strategy */
5757 	NULL,			/* print */
5758 	mirror_dump,		/* dump */
5759 	NULL,			/* read */
5760 	NULL,			/* write */
5761 	md_mirror_ioctl,	/* mirror_ioctl, */
5762 	mirror_snarf,		/* mirror_snarf */
5763 	mirror_halt,		/* mirror_halt */
5764 	NULL,			/* aread */
5765 	NULL,			/* awrite */
5766 	mirror_imp_set,		/* import set */
5767 	mirror_named_services
5768 };
5769 
5770 /* module specific initilization */
5771 static void
5772 init_init()
5773 {
5774 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5775 
5776 	/* Initialize the parent and child save memory pools */
5777 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5778 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5779 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5780 	    0);
5781 
5782 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5783 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5784 	    mirror_child_constructor, mirror_child_destructor,
5785 	    mirror_run_queue, NULL, NULL, 0);
5786 
5787 	/*
5788 	 * Insure wowbuf_size is a multiple of DEV_BSIZE,
5789 	 * then initialize wowbuf memory pool.
5790 	 */
5791 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5792 	if (md_wowbuf_size <= 0)
5793 		md_wowbuf_size = 2 * DEV_BSIZE;
5794 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5795 		md_wowbuf_size = (32 * DEV_BSIZE);
5796 
5797 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5798 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5799 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5800 
5801 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5802 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5803 
5804 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5805 }
5806 
5807 /* module specific uninitilization (undo init_init()) */
5808 static void
5809 fini_uninit()
5810 {
5811 	kmem_cache_destroy(mirror_parent_cache);
5812 	kmem_cache_destroy(mirror_child_cache);
5813 	kmem_cache_destroy(mirror_wowblk_cache);
5814 	mirror_parent_cache = mirror_child_cache =
5815 	    mirror_wowblk_cache = NULL;
5816 
5817 	mutex_destroy(&mirror_timeout.dr_mx);
5818 	mutex_destroy(&hotspare_request.dr_mx);
5819 	mutex_destroy(&non_ff_drv_mutex);
5820 }
5821 
5822 /* define the module linkage */
5823 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5824