xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 11130:ce5c27fd996f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/modctl.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/debug.h>
49 #include <sys/dklabel.h>
50 #include <vm/hat.h>
51 #include <sys/lvm/mdvar.h>
52 #include <sys/lvm/md_mirror.h>
53 #include <sys/lvm/md_convert.h>
54 #include <sys/lvm/md_mddb.h>
55 #include <sys/esunddi.h>
56 
57 #include <sys/sysevent/eventdefs.h>
58 #include <sys/sysevent/svm.h>
59 #include <sys/lvm/mdmn_commd.h>
60 #include <sys/avl.h>
61 
62 md_ops_t		mirror_md_ops;
63 #ifndef	lint
64 char			_depends_on[] = "drv/md";
65 md_ops_t		*md_interface_ops = &mirror_md_ops;
66 #endif
67 
68 extern mdq_anchor_t	md_done_daemon;
69 extern mdq_anchor_t	md_mstr_daemon;
70 extern mdq_anchor_t	md_mirror_daemon;
71 extern mdq_anchor_t	md_mirror_io_daemon;
72 extern mdq_anchor_t	md_mirror_rs_daemon;
73 extern mdq_anchor_t	md_mhs_daemon;
74 
75 extern unit_t		md_nunits;
76 extern set_t		md_nsets;
77 extern md_set_t		md_set[];
78 
79 extern int		md_status;
80 extern clock_t		md_hz;
81 
82 extern md_krwlock_t	md_unit_array_rw;
83 extern kmutex_t		md_mx;
84 extern kcondvar_t	md_cv;
85 extern int		md_mtioctl_cnt;
86 
87 daemon_request_t	mirror_timeout;
88 static daemon_request_t	hotspare_request;
89 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
90 
91 int	md_mirror_mcs_buf_off;
92 
93 /* Flags for mdmn_ksend_message to allow debugging */
94 int	md_mirror_msg_flags;
95 
96 #ifdef DEBUG
97 /* Flag to switch on debug messages */
98 int	mirror_debug_flag = 0;
99 #endif
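/*
 * Illustrative note (assumption, not part of the original source): on a
 * DEBUG kernel the flag above can be toggled on a live system with mdb,
 * for example:
 *
 *	echo 'mirror_debug_flag/W 1' | mdb -kw
 */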
100 
101 /*
102  * Struct used to hold the count of DMR reads and the timestamp of the last
103  * DMR read. It is used to verify, using a debugger, that the DMR read ioctl
104  * has been executed.
105  */
106 dmr_stats_t	mirror_dmr_stats = {0, 0};
107 
108 /*
109  * Mutex protecting list of non-failfast drivers.
110  */
111 static kmutex_t	non_ff_drv_mutex;
112 extern char	**non_ff_drivers;
113 
114 extern major_t	md_major;
115 
116 /*
117  * Write-On-Write memory pool.
118  */
119 static void		copy_write_cont(wowhdr_t *wowhdr);
120 static kmem_cache_t	*mirror_wowblk_cache = NULL;
121 static int		md_wowbuf_size = 16384;
122 static size_t		md_wowblk_size;
123 
124 /*
125  * md_mirror_wow_flg is a flag that allows:
126  *	- disabling the write-on-write mechanism.
127  *	- logging occurrences of write-on-write.
128  *	- switching wow handling procedure processing.
129  * md_mirror_wow_cnt counts the occurrences of WOW.
130  */
131 static uint_t	md_mirror_wow_flg = 0;
132 static int	md_mirror_wow_cnt = 0;
133 
134 /*
135  * Tunable to enable/disable dirty region
136  * processing when closing down a mirror.
137  */
138 static int	new_resync = 1;
139 kmem_cache_t	*mirror_parent_cache = NULL;
140 kmem_cache_t	*mirror_child_cache = NULL;
141 
142 extern int	md_ff_disable;		/* disable failfast */
143 
144 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
145 static void	mirror_read_strategy(buf_t *, int, void *);
146 static void	mirror_write_strategy(buf_t *, int, void *);
147 static void	become_owner(daemon_queue_t *);
148 static int	mirror_done(struct buf *cb);
149 static int	mirror_done_common(struct buf *cb);
150 static void	clear_retry_error(struct buf *cb);
151 
152 /*
153  * patchables
154  */
155 int	md_min_rr_size	= 200;	/* 200 blocks, or 100k */
156 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
157 
158 /*
159  * patchable to change delay before rescheduling mirror ownership request.
160  * Value is in microseconds; default 0.5 seconds
161  */
162 clock_t	md_mirror_owner_to = 500000;
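/*
 * Illustrative note (assumption, not part of the original source): the
 * patchables above are typically adjusted through the standard /etc/system
 * mechanism, for example:
 *
 *	set md_mirror:md_def_num_rr = 2000
 *	set md_mirror:md_mirror_owner_to = 1000000
 *
 * The module name "md_mirror" is assumed here.
 */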
163 
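/*
 * kmem cache constructor/destructor and per-use initialization routines for
 * the mirror parent (md_mps_t) and child (md_mcs_t) save structures used by
 * the mirror strategy code.
 */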
164 /*ARGSUSED1*/
165 static int
166 mirror_parent_constructor(void *p, void *d1, int d2)
167 {
168 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
169 	return (0);
170 }
171 
172 static void
173 mirror_parent_init(md_mps_t *ps)
174 {
175 	bzero(ps, offsetof(md_mps_t, ps_mx));
176 	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
177 }
178 
179 /*ARGSUSED1*/
180 static void
181 mirror_parent_destructor(void *p, void *d)
182 {
183 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
184 }
185 
186 /*ARGSUSED1*/
187 static int
188 mirror_child_constructor(void *p, void *d1, int d2)
189 {
190 	bioinit(&((md_mcs_t *)p)->cs_buf);
191 	return (0);
192 }
193 
194 void
195 mirror_child_init(md_mcs_t *cs)
196 {
197 	cs->cs_ps = NULL;
198 	cs->cs_mdunit = 0;
199 	md_bioreset(&cs->cs_buf);
200 }
201 
202 /*ARGSUSED1*/
203 static void
204 mirror_child_destructor(void *p, void *d)
205 {
206 	biofini(&((md_mcs_t *)p)->cs_buf);
207 }
208 
209 static void
210 mirror_wowblk_init(wowhdr_t *p)
211 {
212 	bzero(p, md_wowblk_size);
213 }
214 
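/*
 * send_poke_hotspares_msg:
 * ------------------------
 * Daemon context routine that sends MD_MN_MSG_POKE_HOTSPARES for the set
 * encoded in drq->dq.qlen. If commd is unreachable the message is retried
 * once after commd reappears; a further failure panics, as the system is in
 * an unexpected state. On completion dr_pending is cleared so that another
 * request may be queued for this set.
 */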
215 static void
216 send_poke_hotspares_msg(daemon_request_t *drq)
217 {
218 	int			rval;
219 	int			nretries = 0;
220 	md_mn_msg_pokehsp_t	pokehsp;
221 	md_mn_kresult_t		*kresult;
222 	set_t			setno = (set_t)drq->dq.qlen;
223 
224 	pokehsp.pokehsp_setno = setno;
225 
226 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
227 
228 retry_sphmsg:
229 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
230 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
231 	    sizeof (pokehsp), kresult);
232 
233 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
234 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
235 		/* If we're shutting down already, pause things here. */
236 		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
237 			while (!md_mn_is_commd_present()) {
238 				delay(md_hz);
239 			}
240 			/*
241 			 * commd has become reachable again, so retry once.
242 			 * If this fails we'll panic as the system is in an
243 			 * unexpected state.
244 			 */
245 			if (nretries++ == 0)
246 				goto retry_sphmsg;
247 		}
248 		cmn_err(CE_PANIC,
249 		    "ksend_message failure: POKE_HOTSPARES");
250 	}
251 	kmem_free(kresult, sizeof (md_mn_kresult_t));
252 
253 	/* Allow further requests to use this set's queue structure */
254 	mutex_enter(&drq->dr_mx);
255 	drq->dr_pending = 0;
256 	mutex_exit(&drq->dr_mx);
257 }
258 
259 /*
260  * Send a poke_hotspares message to the master node. To avoid swamping the
261  * commd handler with requests, we only send a message if there is not one
262  * already outstanding. We punt the request to a separate thread context as we
263  * cannot afford to block waiting on the request to be serviced. This is
264  * essential when a reconfig cycle is in progress as any open() of a multinode
265  * metadevice may result in a livelock.
266  */
267 static void
268 send_poke_hotspares(set_t setno)
269 {
270 	daemon_request_t	*drq = &mn_hs_request[setno];
271 
272 	mutex_enter(&drq->dr_mx);
273 	if (drq->dr_pending == 0) {
274 		drq->dr_pending = 1;
275 		drq->dq.qlen = (int)setno;
276 		daemon_request(&md_mhs_daemon,
277 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
278 	}
279 	mutex_exit(&drq->dr_mx);
280 }
281 
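/*
 * mirror_set_sm_state:
 * --------------------
 * Set the state of a submirror. If 'force' is set the submirror state is
 * simply overwritten with 'newstate'; otherwise the SMS_COMP_ERRED,
 * SMS_COMP_RESYNC and SMS_ALL_ERRED qualifiers are derived from the states
 * of the individual components. The submirror timestamp is updated in
 * either case.
 */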
282 void
283 mirror_set_sm_state(
284 	mm_submirror_t		*sm,
285 	mm_submirror_ic_t	*smic,
286 	sm_state_t		newstate,
287 	int			force)
288 {
289 	int			compcnt;
290 	int			i;
291 	int			errcnt;
292 	sm_state_t		origstate;
293 	md_m_shared_t		*shared;
294 
295 	if (force) {
296 		sm->sm_state = newstate;
297 		uniqtime32(&sm->sm_timestamp);
298 		return;
299 	}
300 
301 	origstate = newstate;
302 
303 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
304 	for (i = 0, errcnt = 0; i < compcnt; i++) {
305 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
306 		    (sm->sm_dev, sm, i);
307 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
308 			newstate |= SMS_COMP_ERRED;
309 		if (shared->ms_state & (CS_RESYNC))
310 			newstate |= SMS_COMP_RESYNC;
311 		if (shared->ms_state & CS_ERRED)
312 			errcnt++;
313 	}
314 
315 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
316 		newstate &= ~origstate;
317 
318 	if (errcnt == compcnt)
319 		newstate |= SMS_ALL_ERRED;
320 	else
321 		newstate &= ~SMS_ALL_ERRED;
322 
323 	sm->sm_state = newstate;
324 	uniqtime32(&sm->sm_timestamp);
325 }
326 
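/*
 * mirror_geterror:
 * ----------------
 * Scan the submirrors and components of 'un', starting at the indices
 * passed in via *smi and *cip, for a component in the Okay or Resync state
 * that has either seen an I/O error or is not open (the probe-open flag is
 * checked instead when called from probe). Returns 1 with *smi/*cip
 * identifying the component, or 0 if none is found. If clr_error is set,
 * MDM_S_IOERR is cleared as components are examined.
 */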
327 static int
328 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
329 							int frm_probe)
330 {
331 	mm_submirror_t		*sm;
332 	mm_submirror_ic_t	*smic;
333 	md_m_shared_t		*shared;
334 	int			ci;
335 	int			i;
336 	int			compcnt;
337 	int			open_comp; /* flag for open component */
338 
339 	for (i = *smi; i < NMIRROR; i++) {
340 		sm = &un->un_sm[i];
341 		smic = &un->un_smic[i];
342 
343 		if (!SMS_IS(sm, SMS_INUSE))
344 			continue;
345 
346 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
347 		for (ci = *cip; ci < compcnt; ci++) {
348 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
349 			    (sm->sm_dev, sm, ci);
350 			/*
351 			 * If called from any routine but probe, we check for the
352 			 * MDM_S_ISOPEN flag. Since probe does a pseudo open, it
353 			 * sets the MDM_S_PROBEOPEN flag and we test for that
354 			 * flag instead; the two tests are mutually exclusive.
355 			 */
356 			open_comp = (frm_probe) ?
357 			    (shared->ms_flags & MDM_S_PROBEOPEN):
358 			    (shared->ms_flags & MDM_S_ISOPEN);
359 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
360 			    ((shared->ms_state == CS_OKAY) ||
361 			    (shared->ms_state == CS_RESYNC))) {
362 				if (clr_error) {
363 					shared->ms_flags &= ~MDM_S_IOERR;
364 				}
365 				*cip = ci;
366 				*smi = i;
367 				return (1);
368 			}
369 
370 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
371 				shared->ms_flags &= ~MDM_S_IOERR;
372 			}
373 		}
374 
375 		*cip = 0;
376 	}
377 	return (0);
378 }
379 
380 /*ARGSUSED*/
381 static void
382 mirror_run_queue(void *d)
383 {
384 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
385 		md_daemon(1, &md_done_daemon);
386 }
387 /*
388  * check_comp_4_hotspares
389  *
390  * This function attempts to allocate a hotspare for this component if the
391  * component is in error. In a MN set, the function can be called in 2 modes.
392  * It can be called either when a component error has been detected or when a
393  * new hotspare has been allocated. In both cases, MD_HOTSPARE_XMIT is set
394  * in flags and the request is sent to all nodes.
395  * The handler on each of the nodes then calls this function with
396  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
397  *
398  * For non-MN sets the function simply attempts to allocate a hotspare.
399  *
400  * On entry, the following locks are held
401  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
402  *	md_unit_writerlock
403  *
404  * Returns	0 if ok
405  *		1 if the unit containing the component has been cleared while
406  *		  the mdmn_ksend_message() was being executed
407  */
408 extern int
409 check_comp_4_hotspares(
410 	mm_unit_t	*un,
411 	int		smi,
412 	int		ci,
413 	uint_t		flags,
414 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
415 	IOLOCK		*lockp	/* can be NULL */
416 )
417 {
418 	mm_submirror_t		*sm;
419 	mm_submirror_ic_t	*smic;
420 	md_m_shared_t		*shared;
421 	mddb_recid_t		recids[6];
422 	minor_t			mnum;
423 	intptr_t		(*hs_dev)();
424 	void			(*hs_done)();
425 	void			*hs_data;
426 	md_error_t		mde = mdnullerror;
427 	set_t			setno;
428 	md_mn_msg_allochsp_t	allochspmsg;
429 	md_mn_kresult_t		*kresult;
430 	mm_unit_t		*new_un;
431 	int			rval;
432 	int			nretries = 0;
433 
434 	mnum = MD_SID(un);
435 	setno = MD_UN2SET(un);
436 	sm = &un->un_sm[smi];
437 	smic = &un->un_smic[smi];
438 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
439 	    (sm->sm_dev, sm, ci);
440 
441 	if (shared->ms_state != CS_ERRED)
442 		return (0);
443 
444 	/* Don't start a new component resync if a resync is already running. */
445 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
446 		return (0);
447 
448 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
449 		uint_t		msgflags;
450 		md_mn_msgtype_t	msgtype;
451 
452 		/* Send allocate hotspare message to all nodes */
453 
454 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
455 		allochspmsg.msg_allochsp_sm = smi;
456 		allochspmsg.msg_allochsp_comp = ci;
457 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
458 
459 		/*
460 		 * Before calling mdmn_ksend_message(), release locks
461 		 * Can never be in the context of an ioctl.
462 		 */
463 		md_unit_writerexit(MDI_UNIT(mnum));
464 		if (flags & MD_HOTSPARE_LINKHELD)
465 			rw_exit(&mirror_md_ops.md_link_rw.lock);
466 #ifdef DEBUG
467 		if (mirror_debug_flag)
468 			printf("send alloc hotspare, flags="
469 			    "0x%x %x, %x, %x, %x\n", flags,
470 			    allochspmsg.msg_allochsp_mnum,
471 			    allochspmsg.msg_allochsp_sm,
472 			    allochspmsg.msg_allochsp_comp,
473 			    allochspmsg.msg_allochsp_hs_id);
474 #endif
475 		if (flags & MD_HOTSPARE_WMUPDATE) {
476 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
477 			/*
478 			 * When coming from an update of watermarks, there
479 			 * must already be a message logged that triggered
480 			 * this action. So, no need to log this message, too.
481 			 */
482 			msgflags = MD_MSGF_NO_LOG;
483 		} else {
484 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
485 			msgflags = MD_MSGF_DEFAULT_FLAGS;
486 		}
487 
488 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
489 
490 cc4hs_msg:
491 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
492 		    (char *)&allochspmsg, sizeof (allochspmsg),
493 		    kresult);
494 
495 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
496 #ifdef DEBUG
497 			if (mirror_debug_flag)
498 				mdmn_ksend_show_error(rval, kresult,
499 				    "ALLOCATE HOTSPARE");
500 #endif
501 			/*
502 			 * If message is sent ok but exitval indicates an error
503 			 * it must be because the mirror has been cleared. In
504 			 * this case re-obtain lock and return an error
505 			 */
506 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
507 				if (flags & MD_HOTSPARE_LINKHELD) {
508 					rw_enter(&mirror_md_ops.md_link_rw.lock,
509 					    RW_READER);
510 				}
511 				kmem_free(kresult, sizeof (md_mn_kresult_t));
512 				return (1);
513 			}
514 			/* If we're shutting down already, pause things here. */
515 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
516 				while (!md_mn_is_commd_present()) {
517 					delay(md_hz);
518 				}
519 				/*
520 				 * commd has become reachable again, so retry
521 				 * once. If this fails we'll panic as the
522 				 * system is in an unexpected state.
523 				 */
524 				if (nretries++ == 0)
525 					goto cc4hs_msg;
526 			}
527 			cmn_err(CE_PANIC,
528 			    "ksend_message failure: ALLOCATE_HOTSPARE");
529 		}
530 		kmem_free(kresult, sizeof (md_mn_kresult_t));
531 
532 		/*
533 		 * re-obtain the locks
534 		 */
535 		if (flags & MD_HOTSPARE_LINKHELD)
536 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
537 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
538 
539 		/*
540 		 * As we had to release the locks in order to send the
541 		 * message to all nodes, we need to check to see if the
542 		 * unit has changed. If it has we release the writerlock
543 		 * and return fail.
544 		 */
545 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
546 			md_unit_writerexit(MDI_UNIT(mnum));
547 			return (1);
548 		}
549 	} else {
550 		if (MD_MNSET_SETNO(setno)) {
551 			/*
552 			 * If 2 or more nodes simultaneously see a
553 			 * component failure, these nodes will each
554 			 * send an ALLOCATE_HOTSPARE[2] message.
555 			 * The first message will allocate the hotspare
556 			 * and the subsequent messages should do nothing.
557 			 *
558 			 * If a slave node doesn't have a hotspare allocated
559 			 * at the time the message is initiated, then the
560 			 * passed in hs_id will be 0.  If the node
561 			 * executing this routine has a component shared
562 			 * ms_hs_id of non-zero, but the message shows a
563 			 * hs_id of 0, then just return since a hotspare
564 			 * has already been allocated for this failing
565 			 * component.  When the slave node returns from
566 			 * the ksend_message the hotspare will have
567 			 * already been allocated.
568 			 *
569 			 * If the slave node does send an hs_id of non-zero,
570 			 * and the slave node's hs_id matches this node's
571 			 * ms_hs_id, then the hotspare has error'd and
572 			 * should be replaced.
573 			 *
574 			 * If the slave node sends an hs_id of non-zero and
575 			 * this node has a different shared ms_hs_id, then
576 			 * just return since this hotspare has already
577 			 * been hotspared.
578 			 */
579 			if (shared->ms_hs_id != 0) {
580 				if (hs_id == 0) {
581 #ifdef DEBUG
582 					if (mirror_debug_flag) {
583 						printf("check_comp_4_hotspares"
584 						    "(NOXMIT), short circuit "
585 						    "hs_id=0x%x, "
586 						    "ms_hs_id=0x%x\n",
587 						    hs_id, shared->ms_hs_id);
588 					}
589 #endif
590 					return (0);
591 				}
592 				if (hs_id != shared->ms_hs_id) {
593 #ifdef DEBUG
594 					if (mirror_debug_flag) {
595 						printf("check_comp_4_hotspares"
596 						    "(NOXMIT), short circuit2 "
597 						    "hs_id=0x%x, "
598 						    "ms_hs_id=0x%x\n",
599 						    hs_id, shared->ms_hs_id);
600 					}
601 #endif
602 					return (0);
603 				}
604 			}
605 		}
606 
607 		sm = &un->un_sm[smi];
608 		hs_dev = md_get_named_service(sm->sm_dev, 0,
609 		    "hotspare device", 0);
610 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
611 		    &hs_data) != 0)
612 			return (0);
613 
614 		/*
615 		 * set_sm_comp_state() commits the modified records.
616 		 * As we don't transmit the changes, no need to drop the lock.
617 		 */
618 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
619 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
620 
621 		(*hs_done)(sm->sm_dev, hs_data);
622 
623 		mirror_check_failfast(mnum);
624 
625 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
626 		    setno, MD_SID(un));
627 
628 		/*
629 		 * For a multi-node set we need to reset the un_rs_type,
630 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
631 		 * hot-spare resync must copy all applicable data.
632 		 */
633 		if (MD_MNSET_SETNO(setno)) {
634 			un->un_rs_type = MD_RS_NONE;
635 			un->un_rs_resync_done = 0;
636 			un->un_rs_resync_2_do = 0;
637 		}
638 
639 		/*
640 		 * Must drop writer lock since mirror_resync_unit will
641 		 * open devices and must be able to grab readerlock.
642 		 * Don't need to drop IOLOCK since any descendent routines
643 		 * calling ksend_messages will drop the IOLOCK as needed.
644 		 *
645 		 */
646 		if (lockp) {
647 			md_ioctl_writerexit(lockp);
648 		} else {
649 			md_unit_writerexit(MDI_UNIT(mnum));
650 		}
651 
652 		/* start resync */
653 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
654 
655 		if (lockp) {
656 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
657 		} else {
658 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
659 		}
660 	}
661 	return (0);
662 }
663 
664 /*
665  * check_unit_4_hotspares
666  *
667  * For a given mirror, allocate hotspares, if available for any components
668  * that are in error
669  *
670  * Returns	0 if ok
671  *		1 if check_comp_4_hotspares returns non-zero. This will only
672  *		  happen for a MN unit where the unit has been cleared while
673  *		  the allocate hotspare message is sent to all nodes.
674  */
675 static int
676 check_unit_4_hotspares(mm_unit_t *un, int flags)
677 {
678 	mm_submirror_t		*sm;
679 	mm_submirror_ic_t	*smic;
680 	int			ci;
681 	int			i;
682 	int			compcnt;
683 
684 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
685 		return (0);
686 
687 	for (i = 0; i < NMIRROR; i++) {
688 		sm = &un->un_sm[i];
689 		smic = &un->un_smic[i];
690 		if (!SMS_IS(sm, SMS_INUSE))
691 			continue;
692 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
693 		for (ci = 0; ci < compcnt; ci++) {
694 			md_m_shared_t		*shared;
695 
696 			shared = (md_m_shared_t *)
697 			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
698 			/*
699 			 * Never called from ioctl context, so pass in
700 			 * (IOLOCK *)NULL.  Pass through flags from calling
701 			 * routine, also setting XMIT flag.
702 			 */
703 			if (check_comp_4_hotspares(un, i, ci,
704 			    (MD_HOTSPARE_XMIT | flags),
705 			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
706 				return (1);
707 		}
708 	}
709 	return (0);
710 }
711 
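/*
 * check_4_hotspares:
 * ------------------
 * Daemon request handler that walks all configured mirrors and attempts to
 * allocate hotspares for any components in error. All disksets are held
 * (md_holdset_enter) for the duration of the scan so that a set cannot be
 * released while its mirrors are being examined. For a multi-node set, only
 * the master performs the check.
 */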
712 static void
713 check_4_hotspares(daemon_request_t *drq)
714 {
715 	mdi_unit_t	*ui;
716 	mm_unit_t	*un;
717 	md_link_t	*next;
718 	int		x;
719 
720 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
721 	drq->dr_pending = 0;		/* again in low level routine if */
722 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
723 
724 	/*
725 	 * Used to have a problem here. The disksets weren't marked as being
726 	 * MNHOLD. This opened a window where we could be searching for
727 	 * hotspares and have the disk set unloaded (released) from under
728 	 * us causing a panic in stripe_component_count().
729 	 * The way to prevent that is to mark the set MNHOLD which prevents
730 	 * any diskset from being released while we are scanning the mirrors,
731 	 * submirrors and components.
732 	 */
733 
734 	for (x = 0; x < md_nsets; x++)
735 		md_holdset_enter(x);
736 
737 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
738 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
739 		ui = MDI_UNIT(next->ln_id);
740 
741 		un = (mm_unit_t *)md_unit_readerlock(ui);
742 
743 		/*
744 		 * Only check the unit if we are the master for this set
745 		 * For an MN set, poke_hotspares() is only effective on the
746 		 * master
747 		 */
748 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
749 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
750 			md_unit_readerexit(ui);
751 			continue;
752 		}
753 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
754 			md_unit_readerexit(ui);
755 			continue;
756 		}
757 		md_unit_readerexit(ui);
758 
759 		un = (mm_unit_t *)md_unit_writerlock(ui);
760 		/*
761 		 * check_unit_4_hotspares will return 1 if the unit has been
762 		 * removed during the process of allocating the hotspare.
763 		 * This can only happen for a MN metadevice. If unit no longer
764 		 * exists, no need to release writerlock
765 		 */
766 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
767 			md_unit_writerexit(ui);
768 		else {
769 			/*
770 			 * If check_unit_4_hotspares failed, queue another
771 			 * request and break out of this one
772 			 */
773 			(void) poke_hotspares();
774 			break;
775 		}
776 	}
777 	rw_exit(&mirror_md_ops.md_link_rw.lock);
778 
779 	for (x = 0; x < md_nsets; x++)
780 		md_holdset_exit(x);
781 }
782 
783 /*
784  * poke_hotspares
785  *
786  * If there is not a pending poke_hotspares request pending, queue a requent
787  * to call check_4_hotspares(). This will scan all mirrors and attempt to
788  * allocate hotspares for all components in error.
789  */
790 int
791 poke_hotspares()
792 {
793 	mutex_enter(&hotspare_request.dr_mx);
794 	if (hotspare_request.dr_pending == 0) {
795 		hotspare_request.dr_pending = 1;
796 		daemon_request(&md_mhs_daemon,
797 		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
798 		    REQ_OLD);
799 	}
800 	mutex_exit(&hotspare_request.dr_mx);
801 	return (0);
802 }
803 
804 static void
805 free_all_ecomps(err_comp_t *ecomp)
806 {
807 	err_comp_t	*d;
808 
809 	while (ecomp != NULL) {
810 		d = ecomp;
811 		ecomp = ecomp->ec_next;
812 		kmem_free(d, sizeof (err_comp_t));
813 	}
814 }
815 
816 /*
817  * NAME: mirror_openfail_console_info
818  *
819  * DESCRIPTION: Prints an informative message to the console when the mirror
820  *		cannot be opened.
821  *
822  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
823  *	       int		smi - submirror index
824  *	       int		ci - component index
825  */
826 
827 void
828 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
829 {
830 	void (*get_dev)();
831 	ms_cd_info_t cd;
832 	md_dev64_t tmpdev;
833 
834 	tmpdev = un->un_sm[smi].sm_dev;
835 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
836 	if (get_dev != NULL) {
837 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
838 		cmn_err(CE_WARN, "md %s: open error on %s",
839 		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
840 		    cd.cd_dev, NULL, 0));
841 	} else {
842 		cmn_err(CE_WARN, "md %s: open error",
843 		    md_shortname(MD_SID(un)));
844 	}
845 }
846 
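/*
 * mirror_close_all_devs:
 * ----------------------
 * Perform a layered close of every in-use submirror of the mirror.
 */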
847 static int
848 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
849 {
850 	int i;
851 	md_dev64_t dev;
852 
853 	for (i = 0; i < NMIRROR; i++) {
854 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
855 			continue;
856 		dev = un->un_sm[i].sm_dev;
857 		md_layered_close(dev, md_cflags);
858 	}
859 	return (0);
860 }
861 
862 /*
863  * Keep track of drivers that don't support failfast.  We use this so that
864  * we only log one diagnostic message for each of these drivers, no matter
865  * how many times we run the mirror_check_failfast function.
866  * Return 1 if this is a new driver that does not support failfast,
867  * return 0 if we have already seen this non-failfast driver.
868  */
869 static int
870 new_non_ff_driver(const char *s)
871 {
872 	mutex_enter(&non_ff_drv_mutex);
873 	if (non_ff_drivers == NULL) {
874 		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
875 		    KM_NOSLEEP);
876 		if (non_ff_drivers == NULL) {
877 			mutex_exit(&non_ff_drv_mutex);
878 			return (1);
879 		}
880 
881 		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
882 		    KM_NOSLEEP);
883 		if (non_ff_drivers[0] == NULL) {
884 			kmem_free(non_ff_drivers, 2 * sizeof (char *));
885 			non_ff_drivers = NULL;
886 			mutex_exit(&non_ff_drv_mutex);
887 			return (1);
888 		}
889 
890 		(void) strcpy(non_ff_drivers[0], s);
891 		non_ff_drivers[1] = NULL;
892 
893 	} else {
894 		int i;
895 		char **tnames;
896 		char **tmp;
897 
898 		for (i = 0; non_ff_drivers[i] != NULL; i++) {
899 			if (strcmp(s, non_ff_drivers[i]) == 0) {
900 				mutex_exit(&non_ff_drv_mutex);
901 				return (0);
902 			}
903 		}
904 
905 		/* allow for new element and null */
906 		i += 2;
907 		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
908 		if (tnames == NULL) {
909 			mutex_exit(&non_ff_drv_mutex);
910 			return (1);
911 		}
912 
913 		for (i = 0; non_ff_drivers[i] != NULL; i++)
914 			tnames[i] = non_ff_drivers[i];
915 
916 		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
917 		if (tnames[i] == NULL) {
918 			/* adjust i so that it is the right count to free */
919 			kmem_free(tnames, (i + 2) * sizeof (char *));
920 			mutex_exit(&non_ff_drv_mutex);
921 			return (1);
922 		}
923 
924 		(void) strcpy(tnames[i++], s);
925 		tnames[i] = NULL;
926 
927 		tmp = non_ff_drivers;
928 		non_ff_drivers = tnames;
929 		/* i now represents the count we previously alloced */
930 		kmem_free(tmp, i * sizeof (char *));
931 	}
932 	mutex_exit(&non_ff_drv_mutex);
933 
934 	return (1);
935 }
936 
937 /*
938  * Check for the "ddi-failfast-supported" devtree property on each submirror
939  * component to indicate if we should do I/O to that submirror with the
940  * B_FAILFAST flag set or not.  This check is made at various state transitions
941  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
942  * only need to check one drive (e.g. hotspare) but since the check is
943  * fast and infrequent and sometimes needs to be done on all components we
944  * just check all components on each call.
945  */
946 void
947 mirror_check_failfast(minor_t mnum)
948 {
949 	int		i;
950 	mm_unit_t	*un;
951 
952 	if (md_ff_disable)
953 		return;
954 
955 	un = MD_UNIT(mnum);
956 
957 	for (i = 0; i < NMIRROR; i++) {
958 		int			ci;
959 		int			cnt;
960 		int			ff = 1;
961 		mm_submirror_t		*sm;
962 		mm_submirror_ic_t	*smic;
963 		void			(*get_dev)();
964 
965 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
966 			continue;
967 
968 		sm = &un->un_sm[i];
969 		smic = &un->un_smic[i];
970 
971 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
972 		    "get device", 0);
973 
974 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
975 		for (ci = 0; ci < cnt; ci++) {
976 			int		found = 0;
977 			dev_t		ci_dev;
978 			major_t		major;
979 			dev_info_t	*devi;
980 			ms_cd_info_t	cd;
981 
982 			/*
983 			 * this already returns the hs
984 			 * dev if the device is spared
985 			 */
986 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
987 
988 			ci_dev = md_dev64_to_dev(cd.cd_dev);
989 			major = getmajor(ci_dev);
990 
991 			if (major == md_major) {
992 				/*
993 				 * this component must be a soft
994 				 * partition; get the real dev
995 				 */
996 				minor_t	dev_mnum;
997 				mdi_unit_t	*ui;
998 				mp_unit_t	*un;
999 				set_t	setno;
1000 				side_t	side;
1001 				md_dev64_t	tmpdev;
1002 
1003 				ui = MDI_UNIT(getminor(ci_dev));
1004 
1005 				/* grab necessary lock */
1006 				un = (mp_unit_t *)md_unit_readerlock(ui);
1007 
1008 				dev_mnum = MD_SID(un);
1009 				setno = MD_MIN2SET(dev_mnum);
1010 				side = mddb_getsidenum(setno);
1011 
1012 				tmpdev = un->un_dev;
1013 
1014 				/* Get dev by device id */
1015 				if (md_devid_found(setno, side,
1016 				    un->un_key) == 1) {
1017 					tmpdev = md_resolve_bydevid(dev_mnum,
1018 					    tmpdev, un->un_key);
1019 				}
1020 
1021 				md_unit_readerexit(ui);
1022 
1023 				ci_dev = md_dev64_to_dev(tmpdev);
1024 				major = getmajor(ci_dev);
1025 			}
1026 
1027 			if (ci_dev != NODEV32 &&
1028 			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1029 			    != NULL) {
1030 				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
1031 				int		propvalue = 0;
1032 				int		proplength = sizeof (int);
1033 				int		error;
1034 				struct cb_ops	*cb;
1035 
1036 				if ((cb = devopsp[major]->devo_cb_ops) !=
1037 				    NULL) {
1038 					error = (*cb->cb_prop_op)
1039 					    (DDI_DEV_T_ANY, devi, prop_op,
1040 					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1041 					    "ddi-failfast-supported",
1042 					    (caddr_t)&propvalue, &proplength);
1043 
1044 					if (error == DDI_PROP_SUCCESS)
1045 						found = 1;
1046 				}
1047 
1048 				if (!found && new_non_ff_driver(
1049 				    ddi_driver_name(devi))) {
1050 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
1051 					    "disabled on %s",
1052 					    ddi_driver_name(devi));
1053 				}
1054 
1055 				ddi_release_devi(devi);
1056 			}
1057 
1058 			/*
1059 			 * All components must support
1060 			 * failfast in the submirror.
1061 			 */
1062 			if (!found) {
1063 				ff = 0;
1064 				break;
1065 			}
1066 		}
1067 
1068 		if (ff) {
1069 			sm->sm_flags |= MD_SM_FAILFAST;
1070 		} else {
1071 			sm->sm_flags &= ~MD_SM_FAILFAST;
1072 		}
1073 	}
1074 }
1075 
1076 /*
1077  * Return true if the submirror is unavailable.
1078  * If any of the submirror components are opened then the submirror cannot
1079  * be unavailable (MD_INACCESSIBLE).
1080  * If any of the components are already in the errored state, then the submirror
1081  * cannot be unavailable (MD_INACCESSIBLE).
1082  */
1083 static bool_t
1084 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1085 {
1086 	mm_submirror_t		*sm;
1087 	mm_submirror_ic_t	*smic;
1088 	md_m_shared_t		*shared;
1089 	int			ci;
1090 	int			compcnt;
1091 
1092 	sm = &un->un_sm[smi];
1093 	smic = &un->un_smic[smi];
1094 
1095 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1096 	for (ci = 0; ci < compcnt; ci++) {
1097 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1098 		    (sm->sm_dev, sm, ci);
1099 		if (from_probe) {
1100 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1101 				return (B_FALSE);
1102 		} else {
1103 			if (shared->ms_flags & MDM_S_ISOPEN)
1104 				return (B_FALSE);
1105 		}
1106 		if (shared->ms_state == CS_ERRED ||
1107 		    shared->ms_state == CS_LAST_ERRED)
1108 			return (B_FALSE);
1109 	}
1110 
1111 	return (B_TRUE);
1112 }
1113 
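/*
 * mirror_open_all_devs:
 * ---------------------
 * Open all in-use submirrors of the mirror. Submirrors that fail the first
 * open are re-opened with MD_OFLG_CONT_ERRS and their failing components
 * are errored (and hotspare allocation is requested). The MD_INACCESSIBLE
 * flag is maintained for the mirror and each submirror. Returns 0 on
 * success, or ENXIO (after closing all devices again) if an errored
 * component has no remaining valid source.
 */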
1114 static int
1115 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1116 {
1117 	int		i;
1118 	mm_unit_t	*un;
1119 	mdi_unit_t	*ui;
1120 	int		err;
1121 	int		smi;
1122 	int		ci;
1123 	err_comp_t	*c;
1124 	err_comp_t	*ecomps = NULL;
1125 	int		smmask = 0;
1126 	set_t		setno;
1127 	int		sm_cnt;
1128 	int		sm_unavail_cnt;
1129 
1130 	mirror_check_failfast(mnum);
1131 
1132 	un = MD_UNIT(mnum);
1133 	ui = MDI_UNIT(mnum);
1134 	setno = MD_UN2SET(un);
1135 
1136 	for (i = 0; i < NMIRROR; i++) {
1137 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1138 
1139 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1140 			continue;
1141 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1142 			smmask |= SMI2BIT(i);
1143 		un->un_sm[i].sm_dev = tmpdev;
1144 	}
1145 
1146 	/*
1147 	 * If smmask is clear, all submirrors are accessible. Clear the
1148 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1149 	 * mirror device.   If smmask is set, we have to determine which of the
1150 	 * submirrors are in error. If no submirror is accessible we mark the
1151 	 * whole mirror as MD_INACCESSIBLE.
1152 	 */
1153 	if (smmask == 0) {
1154 		if (lockp) {
1155 			md_ioctl_readerexit(lockp);
1156 			(void) md_ioctl_writerlock(lockp, ui);
1157 		} else {
1158 			md_unit_readerexit(ui);
1159 			(void) md_unit_writerlock(ui);
1160 		}
1161 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1162 		if (lockp) {
1163 			md_ioctl_writerexit(lockp);
1164 			(void) md_ioctl_readerlock(lockp, ui);
1165 		} else {
1166 			md_unit_writerexit(ui);
1167 			(void) md_unit_readerlock(ui);
1168 		}
1169 
1170 		for (i = 0; i < NMIRROR; i++) {
1171 			md_dev64_t	tmpdev;
1172 			mdi_unit_t	*sm_ui;
1173 
1174 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1175 				continue;
1176 
1177 			tmpdev = un->un_sm[i].sm_dev;
1178 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1179 			(void) md_unit_writerlock(sm_ui);
1180 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1181 			md_unit_writerexit(sm_ui);
1182 		}
1183 
1184 		return (0);
1185 	}
1186 
1187 	for (i = 0; i < NMIRROR; i++) {
1188 		md_dev64_t tmpdev;
1189 
1190 		if (!(smmask & SMI2BIT(i)))
1191 			continue;
1192 
1193 		tmpdev = un->un_sm[i].sm_dev;
1194 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1195 		un->un_sm[i].sm_dev = tmpdev;
1196 		ASSERT(err == 0);
1197 	}
1198 
1199 	if (lockp) {
1200 		md_ioctl_readerexit(lockp);
1201 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1202 	} else {
1203 		md_unit_readerexit(ui);
1204 		un = (mm_unit_t *)md_unit_writerlock(ui);
1205 	}
1206 
1207 	/*
1208 	 * We want to make sure the unavailable flag is not masking a real
1209 	 * error on the submirror.
1210 	 * For each submirror,
1211 	 *    if all of the submirror components couldn't be opened and there
1212 	 *    are no errors on the submirror, then set the unavailable flag;
1213 	 *    otherwise, clear the unavailable flag.
1214 	 */
1215 	sm_cnt = 0;
1216 	sm_unavail_cnt = 0;
1217 	for (i = 0; i < NMIRROR; i++) {
1218 		md_dev64_t	tmpdev;
1219 		mdi_unit_t	*sm_ui;
1220 
1221 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1222 			continue;
1223 
1224 		sm_cnt++;
1225 		tmpdev = un->un_sm[i].sm_dev;
1226 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1227 
1228 		(void) md_unit_writerlock(sm_ui);
1229 		if (submirror_unavailable(un, i, 0)) {
1230 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1231 			sm_unavail_cnt++;
1232 		} else {
1233 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1234 		}
1235 		md_unit_writerexit(sm_ui);
1236 	}
1237 
1238 	/*
1239 	 * If all of the submirrors are unavailable, the mirror is also
1240 	 * unavailable.
1241 	 */
1242 	if (sm_cnt == sm_unavail_cnt) {
1243 		ui->ui_tstate |= MD_INACCESSIBLE;
1244 	} else {
1245 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1246 	}
1247 
1248 	smi = 0;
1249 	ci = 0;
1250 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1251 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1252 
1253 			free_all_ecomps(ecomps);
1254 			(void) mirror_close_all_devs(un, md_oflags);
1255 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1256 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1257 			mirror_openfail_console_info(un, smi, ci);
1258 			if (lockp) {
1259 				md_ioctl_writerexit(lockp);
1260 				(void) md_ioctl_readerlock(lockp, ui);
1261 			} else {
1262 				md_unit_writerexit(ui);
1263 				(void) md_unit_readerlock(ui);
1264 			}
1265 			return (ENXIO);
1266 		}
1267 
1268 		/* track all component states that need changing */
1269 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1270 		c->ec_next = ecomps;
1271 		c->ec_smi = smi;
1272 		c->ec_ci = ci;
1273 		ecomps = c;
1274 		ci++;
1275 	}
1276 
1277 	/* Make all state changes and commit them */
1278 	for (c = ecomps; c != NULL; c = c->ec_next) {
1279 		/*
1280 		 * If lockp is set, then entering kernel through ioctl.
1281 		 * For a MN set, the only ioctl path is via a commd message
1282 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1283 		 * being sent to each node.
1284 		 * In this case, set NO_XMIT so that set_sm_comp_state
1285 		 * won't attempt to send a message on a message.
1286 		 * won't attempt to send a message from within a message handler.
1287 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1288 		 * which flag is passed.
1289 		 */
1290 		if (lockp) {
1291 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1292 			    MD_STATE_NO_XMIT, lockp);
1293 		} else {
1294 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1295 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1296 		}
1297 		/*
1298 		 * For a MN set, the NOTIFY is done when the state change is
1299 		 * processed on each node
1300 		 */
1301 		if (!MD_MNSET_SETNO(setno)) {
1302 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1303 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1304 		}
1305 	}
1306 
1307 	if (lockp) {
1308 		md_ioctl_writerexit(lockp);
1309 		(void) md_ioctl_readerlock(lockp, ui);
1310 	} else {
1311 		md_unit_writerexit(ui);
1312 		(void) md_unit_readerlock(ui);
1313 	}
1314 
1315 	free_all_ecomps(ecomps);
1316 
1317 	/* allocate hotspares for all errored components */
1318 	if (MD_MNSET_SETNO(setno)) {
1319 		/*
1320 		 * If we're called from an ioctl (lockp set) then we cannot
1321 		 * directly call send_poke_hotspares as this will block until
1322 		 * the message gets despatched to all nodes. If the cluster is
1323 		 * going through a reconfig cycle then the message will block
1324 		 * until the cycle is complete, and as we originate from a
1325 		 * service call from commd we will livelock.
1326 		 */
1327 		if (lockp == NULL) {
1328 			md_unit_readerexit(ui);
1329 			send_poke_hotspares(setno);
1330 			(void) md_unit_readerlock(ui);
1331 		}
1332 	} else {
1333 		(void) poke_hotspares();
1334 	}
1335 	return (0);
1336 }
1337 
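/*
 * mirror_overlap_tree_remove:
 * ---------------------------
 * Remove a parent save structure from the overlap AVL tree and wake any
 * threads in wait_for_overlaps() that are blocked waiting for an
 * overlapping request to complete.
 */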
1338 void
1339 mirror_overlap_tree_remove(md_mps_t *ps)
1340 {
1341 	mm_unit_t	*un;
1342 
1343 	if (panicstr)
1344 		return;
1345 
1346 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1347 	un = ps->ps_un;
1348 
1349 	mutex_enter(&un->un_overlap_tree_mx);
1350 	avl_remove(&un->un_overlap_root, ps);
1351 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1352 	if (un->un_overlap_tree_flag != 0) {
1353 		un->un_overlap_tree_flag = 0;
1354 		cv_broadcast(&un->un_overlap_tree_cv);
1355 	}
1356 	mutex_exit(&un->un_overlap_tree_mx);
1357 }
1358 
1359 
1360 /*
1361  * wait_for_overlaps:
1362  * -----------------
1363  * Check that the given i/o request does not overlap with already pending
1364  * i/o. If it does, block until the overlapped i/o completes.
1365  *
1366  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1367  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1368  * it must not already be in the tree.
1369  */
1370 static void
1371 wait_for_overlaps(md_mps_t *ps, int flags)
1372 {
1373 	mm_unit_t	*un;
1374 	avl_index_t	where;
1375 	md_mps_t	*ps1;
1376 
1377 	if (panicstr)
1378 		return;
1379 
1380 	un = ps->ps_un;
1381 	mutex_enter(&un->un_overlap_tree_mx);
1382 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1383 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1384 		mutex_exit(&un->un_overlap_tree_mx);
1385 		return;
1386 	}
1387 
1388 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1389 
1390 	do {
1391 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
1392 		if (ps1 == NULL) {
1393 			/*
1394 			 * The candidate range does not overlap with any
1395 			 * range in the tree.  Insert it and be done.
1396 			 */
1397 			avl_insert(&un->un_overlap_root, ps, where);
1398 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
1399 		} else {
1400 			/*
1401 			 * The candidate range would overlap.  Set the flag
1402 			 * indicating we need to be woken up, and sleep
1403 			 * until another thread removes a range.  If upon
1404 			 * waking up we find this mps was put on the tree
1405 			 * by another thread, the loop terminates.
1406 			 */
1407 			un->un_overlap_tree_flag = 1;
1408 			cv_wait(&un->un_overlap_tree_cv,
1409 			    &un->un_overlap_tree_mx);
1410 		}
1411 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1412 	mutex_exit(&un->un_overlap_tree_mx);
1413 }
1414 
1415 /*
1416  * This function is called from mirror_done to check whether any pages have
1417  * been modified while a mirrored write was in progress.  Returns 0 if
1418  * all pages associated with bp are clean, 1 otherwise.
1419  */
1420 static int
1421 any_pages_dirty(struct buf *bp)
1422 {
1423 	int	rval;
1424 
1425 	rval = biomodified(bp);
1426 	if (rval == -1)
1427 		rval = 0;
1428 
1429 	return (rval);
1430 }
1431 
1432 #define	MAX_EXTRAS 10
1433 
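/*
 * mirror_commit:
 * --------------
 * Commit the mirror unit record, the unit records of the metadevice
 * submirrors selected by smmask, and any additional record ids supplied in
 * the null-terminated 'extras' array to the metadevice state database.
 * Nothing is committed if the set is stale.
 */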
1434 void
1435 mirror_commit(
1436 	mm_unit_t	*un,
1437 	int		smmask,
1438 	mddb_recid_t	*extras
1439 )
1440 {
1441 	mm_submirror_t		*sm;
1442 	md_unit_t		*su;
1443 	int			i;
1444 
1445 	/* 2=mirror,null id */
1446 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1447 
1448 	int			ri = 0;
1449 
1450 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1451 		return;
1452 
1453 	/* Add two, this includes the mirror unit and the null recid */
1454 	if (extras != NULL) {
1455 		int	nrecids = 0;
1456 		while (extras[nrecids] != 0) {
1457 			nrecids++;
1458 		}
1459 		ASSERT(nrecids <= MAX_EXTRAS);
1460 	}
1461 
1462 	if (un != NULL)
1463 		recids[ri++] = un->c.un_record_id;
1464 	for (i = 0;  i < NMIRROR; i++) {
1465 		if (!(smmask & SMI2BIT(i)))
1466 			continue;
1467 		sm = &un->un_sm[i];
1468 		if (!SMS_IS(sm, SMS_INUSE))
1469 			continue;
1470 		if (md_getmajor(sm->sm_dev) != md_major)
1471 			continue;
1472 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1473 		recids[ri++] = su->c.un_record_id;
1474 	}
1475 
1476 	if (extras != NULL)
1477 		while (*extras != 0) {
1478 			recids[ri++] = *extras;
1479 			extras++;
1480 		}
1481 
1482 	if (ri == 0)
1483 		return;
1484 	recids[ri] = 0;
1485 
1486 	/*
1487 	 * Ok to hold ioctl lock across record commit to mddb as
1488 	 * long as the record(s) being committed aren't resync records.
1489 	 */
1490 	mddb_commitrecs_wrapper(recids);
1491 }
1492 
1493 
1494 /*
1495  * This routine builds a bitmap (writable_bm) with a bit set for each
1496  * submirror of the metamirror that is writable and stores it in
1497  * ps->ps_writable_sm.  The number of writable submirrors is stored in
1498  * ps->ps_active_cnt.
1499  *
1500  * ps->ps_current_sm is reset to zero before the writes are issued.
1501  */
1502 
1503 static void
1504 select_write_units(struct mm_unit *un, md_mps_t *ps)
1505 {
1506 
1507 	int		i;
1508 	unsigned	writable_bm = 0;
1509 	unsigned	nunits = 0;
1510 
1511 	for (i = 0; i < NMIRROR; i++) {
1512 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1513 			/* set bit of all writable units */
1514 			writable_bm |= SMI2BIT(i);
1515 			nunits++;
1516 		}
1517 	}
1518 	ps->ps_writable_sm = writable_bm;
1519 	ps->ps_active_cnt = nunits;
1520 	ps->ps_current_sm = 0;
1521 }
1522 
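/*
 * select_write_after_read_units:
 * ------------------------------
 * Build the bitmap of writable submirrors that are resync targets,
 * excluding the submirror the read was satisfied from, and return the
 * number of submirrors that require the write-after-read.
 */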
1523 static
1524 unsigned
1525 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1526 {
1527 
1528 	int		i;
1529 	unsigned	writable_bm = 0;
1530 	unsigned	nunits = 0;
1531 
1532 	for (i = 0; i < NMIRROR; i++) {
1533 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1534 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1535 			writable_bm |= SMI2BIT(i);
1536 			nunits++;
1537 		}
1538 	}
1539 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1540 		writable_bm &= ~ps->ps_allfrom_sm;
1541 		nunits--;
1542 	}
1543 	ps->ps_writable_sm = writable_bm;
1544 	ps->ps_active_cnt = nunits;
1545 	ps->ps_current_sm = 0;
1546 	return (nunits);
1547 }
1548 
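/*
 * select_read_unit:
 * -----------------
 * Choose the submirror device from which to satisfy a read of 'reqcount'
 * blocks starting at 'blkno'. A component in the Okay state is preferred
 * (and B_FAILFAST may be set on the child buf); otherwise the Last Erred
 * component with the largest ms_lasterrcnt is chosen. *cando is set to the
 * number of blocks that can be read from the selected submirror.
 */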
1549 static md_dev64_t
1550 select_read_unit(
1551 	mm_unit_t	*un,
1552 	diskaddr_t	blkno,
1553 	u_longlong_t	reqcount,
1554 	u_longlong_t	*cando,
1555 	int		must_be_opened,
1556 	md_m_shared_t	**shared,
1557 	md_mcs_t	*cs)
1558 {
1559 	int			i;
1560 	md_m_shared_t		*s;
1561 	uint_t			lasterrcnt = 0;
1562 	md_dev64_t		dev = 0;
1563 	u_longlong_t		cnt;
1564 	u_longlong_t		mincnt;
1565 	mm_submirror_t		*sm;
1566 	mm_submirror_ic_t	*smic;
1567 	mdi_unit_t		*ui;
1568 
1569 	mincnt = reqcount;
1570 	for (i = 0; i < NMIRROR; i++) {
1571 		if (!SUBMIRROR_IS_READABLE(un, i))
1572 			continue;
1573 		sm = &un->un_sm[i];
1574 		smic = &un->un_smic[i];
1575 		cnt = reqcount;
1576 
1577 		/*
1578 		 * If the current submirror is marked as inaccessible, do not
1579 		 * try to access it.
1580 		 */
1581 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1582 		(void) md_unit_readerlock(ui);
1583 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1584 			md_unit_readerexit(ui);
1585 			continue;
1586 		}
1587 		md_unit_readerexit(ui);
1588 
1589 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1590 		    (sm->sm_dev, sm, blkno, &cnt);
1591 
1592 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1593 			continue;
1594 		if (s->ms_state == CS_OKAY) {
1595 			*cando = cnt;
1596 			if (shared != NULL)
1597 				*shared = s;
1598 
1599 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1600 			    cs != NULL) {
1601 				cs->cs_buf.b_flags |= B_FAILFAST;
1602 			}
1603 
1604 			return (un->un_sm[i].sm_dev);
1605 		}
1606 		if (s->ms_state != CS_LAST_ERRED)
1607 			continue;
1608 
1609 		/* don't use B_FAILFAST since we're Last Erred */
1610 
1611 		if (mincnt > cnt)
1612 			mincnt = cnt;
1613 		if (s->ms_lasterrcnt > lasterrcnt) {
1614 			lasterrcnt = s->ms_lasterrcnt;
1615 			if (shared != NULL)
1616 				*shared = s;
1617 			dev = un->un_sm[i].sm_dev;
1618 		}
1619 	}
1620 	*cando = mincnt;
1621 	return (dev);
1622 }
1623 
1624 /*
1625  * Given a 32-bit bitmap, this routine will return the bit number
1626  * of the nth bit set.	The nth bit set is passed via the index integer.
1627  *
1628  * This routine is used to run through the writable submirror bitmap
1629  * and starting all of the writes.  See the value returned is the
1630  * index to appropriate submirror structure, in the md_sm
1631  * array for metamirrors.
1632  */
1633 static int
1634 md_find_nth_unit(uint_t mask, int index)
1635 {
1636 	int	bit, nfound;
1637 
1638 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1639 		ASSERT(mask != 0);
1640 		nfound += (mask & 1);
1641 		mask >>= 1;
1642 	}
1643 	return (bit);
1644 }
1645 
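/*
 * fast_select_read_unit:
 * ----------------------
 * Select the submirror for a read using the configured read option
 * (geometry, first, or load-balanced), or the DMR-specified side for a
 * directed mirror read. Returns 0 with bp->b_edev and ps->ps_allfrom_sm
 * set, or 1 if no submirror is in the Running state.
 */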
1646 static int
1647 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1648 {
1649 	mm_unit_t	*un;
1650 	buf_t		*bp;
1651 	int		i;
1652 	unsigned	nunits = 0;
1653 	int		iunit;
1654 	uint_t		running_bm = 0;
1655 	uint_t		sm_index;
1656 
1657 	bp = &cs->cs_buf;
1658 	un = ps->ps_un;
1659 
1660 	for (i = 0; i < NMIRROR; i++) {
1661 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1662 			continue;
1663 		running_bm |= SMI2BIT(i);
1664 		nunits++;
1665 	}
1666 	if (nunits == 0)
1667 		return (1);
1668 
1669 	/*
1670 	 * For directed mirror read (DMR) we only use the specified side and
1671 	 * do not compute the source of the read.
1672 	 * If we're running with MD_MPS_DIRTY_RD set we always return the
1673 	 * first mirror side (this prevents unnecessary ownership switching).
1674 	 * Otherwise we return the submirror according to the mirror read option
1675 	 */
1676 	if (ps->ps_flags & MD_MPS_DMR) {
1677 		sm_index = un->un_dmr_last_read;
1678 	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1679 		sm_index = md_find_nth_unit(running_bm, 0);
1680 	} else {
1681 		/* Normal (non-DMR) operation */
1682 		switch (un->un_read_option) {
1683 		case RD_GEOMETRY:
1684 			iunit = (int)(bp->b_lblkno /
1685 			    howmany(un->c.un_total_blocks, nunits));
1686 			sm_index = md_find_nth_unit(running_bm, iunit);
1687 			break;
1688 		case RD_FIRST:
1689 			sm_index = md_find_nth_unit(running_bm, 0);
1690 			break;
1691 		case RD_LOAD_BAL:
1692 			/* intentionally fall through to the default case */
1693 		default:
1694 			un->un_last_read = (un->un_last_read + 1) % nunits;
1695 			sm_index = md_find_nth_unit(running_bm,
1696 			    un->un_last_read);
1697 			break;
1698 		}
1699 	}
1700 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1701 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1702 
1703 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1704 		bp->b_flags |= B_FAILFAST;
1705 	}
1706 
1707 	return (0);
1708 }
1709 
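/*
 * mirror_are_submirrors_available:
 * --------------------------------
 * Return 1 if every in-use metadevice submirror refers to a valid, snarfed
 * unit; return 0 if any submirror's minor number is out of range or its
 * unit structure has not yet been set up.
 */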
1710 static
1711 int
1712 mirror_are_submirrors_available(mm_unit_t *un)
1713 {
1714 	int i;
1715 	for (i = 0; i < NMIRROR; i++) {
1716 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1717 
1718 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1719 		    md_getmajor(tmpdev) != md_major)
1720 			continue;
1721 
1722 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1723 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1724 			return (0);
1725 
1726 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1727 			return (0);
1728 	}
1729 	return (1);
1730 }
1731 
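/*
 * build_submirror:
 * ----------------
 * Set up the in-core state for submirror 'i' of the mirror: resolve its
 * device (from the name key when snarfing), look up the named services used
 * to access the underlying metadevice, and record the parent/child
 * relationship.
 */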
1732 void
1733 build_submirror(mm_unit_t *un, int i, int snarfing)
1734 {
1735 	struct mm_submirror	*sm;
1736 	struct mm_submirror_ic	*smic;
1737 	md_unit_t		*su;
1738 	set_t			setno;
1739 
1740 	sm = &un->un_sm[i];
1741 	smic = &un->un_smic[i];
1742 
1743 	sm->sm_flags = 0; /* sometime we may need to do more here */
1744 
1745 	setno = MD_UN2SET(un);
1746 
1747 	if (!SMS_IS(sm, SMS_INUSE))
1748 		return;
1749 	if (snarfing) {
1750 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1751 		    sm->sm_key, MD_NOTRUST_DEVT);
1752 	} else {
1753 		if (md_getmajor(sm->sm_dev) == md_major) {
1754 			su = MD_UNIT(md_getminor(sm->sm_dev));
1755 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1756 			/* submirror can no longer be soft partitioned */
1757 			MD_CAPAB(su) &= (~MD_CAN_SP);
1758 		}
1759 	}
1760 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1761 	    0, "shared by blk", 0);
1762 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1763 	    0, "shared by indx", 0);
1764 	smic->sm_get_component_count = (int (*)())md_get_named_service(
1765 	    sm->sm_dev, 0, "get component count", 0);
1766 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1767 	    "get block count skip size", 0);
1768 	sm->sm_state &= ~SMS_IGNORE;
1769 	if (SMS_IS(sm, SMS_OFFLINE))
1770 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1771 	md_set_parent(sm->sm_dev, MD_SID(un));
1772 }
1773 
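/*
 * mirror_cleanup:
 * ---------------
 * Delete the mirror unit record (and its dirty-region resync record, if
 * any) and remove the submirror name entries. On a multi-node set this is
 * only done on the master node.
 */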
1774 static void
1775 mirror_cleanup(mm_unit_t *un)
1776 {
1777 	mddb_recid_t	recid;
1778 	int		smi;
1779 	sv_dev_t	sv[NMIRROR];
1780 	int		nsv = 0;
1781 
1782 	/*
1783 	 * If a MN diskset and this node is not the master, do
1784 	 * not delete any records on snarf of the mirror records.
1785 	 */
1786 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1787 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1788 		return;
1789 	}
1790 
1791 	for (smi = 0; smi < NMIRROR; smi++) {
1792 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1793 			continue;
1794 		sv[nsv].setno = MD_UN2SET(un);
1795 		sv[nsv++].key = un->un_sm[smi].sm_key;
1796 	}
1797 
1798 	recid = un->un_rr_dirty_recid;
1799 	mddb_deleterec_wrapper(un->c.un_record_id);
1800 	if (recid > 0)
1801 		mddb_deleterec_wrapper(recid);
1802 
1803 	md_rem_names(sv, nsv);
1804 }
1805 
1806 /*
1807  * Comparison function for the avl tree which tracks
1808  * outstanding writes on submirrors.
1809  *
1810  * Returns:
1811  *	-1: ps1 < ps2
1812  *	 0: ps1 and ps2 overlap
1813  *	 1: ps1 > ps2
1814  */
1815 static int
1816 mirror_overlap_compare(const void *p1, const void *p2)
1817 {
1818 	const md_mps_t *ps1 = (md_mps_t *)p1;
1819 	const md_mps_t *ps2 = (md_mps_t *)p2;
1820 
1821 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
1822 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
1823 			return (0);
1824 		return (-1);
1825 	}
1826 
1827 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
1828 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
1829 			return (0);
1830 		return (1);
1831 	}
1832 
1833 	return (0);
1834 }
1835 
1836 /*
1837  * Collapse any sparse submirror entries snarfed from the on-disk replica.
1838  * Only the in-core entries are updated. The replica will be updated on-disk
1839  * when the in-core replica is committed on shutdown of the SVM subsystem.
1840  */
1841 static void
1842 collapse_submirrors(mm_unit_t *un)
1843 {
1844 	int			smi, nremovals, smiremove;
1845 	mm_submirror_t		*sm, *new_sm, *old_sm;
1846 	mm_submirror_ic_t	*smic;
1847 	int			nsmidx = un->un_nsm - 1;
1848 
1849 rescan:
1850 	nremovals = 0;
1851 	smiremove = -1;
1852 
1853 	for (smi = 0; smi <= nsmidx; smi++) {
1854 		sm = &un->un_sm[smi];
1855 
1856 		/*
1857 		 * Check to see if this submirror is marked as in-use.
1858 		 * If it isn't then it is a potential sparse entry and
1859 		 * may need to be cleared from the configuration.
1860 		 * The records should _already_ have been cleared by the
1861 		 * original mirror_detach() code, but we need to shuffle
1862 		 * any NULL entries in un_sm[] to the end of the array.
1863 		 * Any NULL un_smic[] entries need to be reset to the underlying
1864 		 * submirror/slice accessor functions.
1865 		 */
1866 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1867 			nremovals++;
1868 			smiremove = smi;
1869 			break;
1870 		}
1871 	}
1872 
1873 	if (nremovals == 0) {
1874 		/*
1875 		 * Ensure that we have a matching contiguous set of un_smic[]
1876 		 * entries for the corresponding un_sm[] entries
1877 		 */
1878 		for (smi = 0; smi <= nsmidx; smi++) {
1879 			smic = &un->un_smic[smi];
1880 			sm = &un->un_sm[smi];
1881 
1882 			smic->sm_shared_by_blk =
1883 			    md_get_named_service(sm->sm_dev, 0,
1884 			    "shared by blk", 0);
1885 			smic->sm_shared_by_indx =
1886 			    md_get_named_service(sm->sm_dev, 0,
1887 			    "shared by indx", 0);
1888 			smic->sm_get_component_count =
1889 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
1890 			    "get component count", 0);
1891 			smic->sm_get_bcss =
1892 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
1893 			    "get block count skip size", 0);
1894 		}
1895 		return;
1896 	}
1897 
1898 	/*
1899 	 * Reshuffle the submirror devices so that we do not have a dead record
1900 	 * in the middle of the array. Once we've done this we need to rescan
1901 	 * the mirror to check for any other holes.
1902 	 */
1903 	for (smi = 0; smi < NMIRROR; smi++) {
1904 		if (smi < smiremove)
1905 			continue;
1906 		if (smi > smiremove) {
1907 			old_sm = &un->un_sm[smi];
1908 			new_sm = &un->un_sm[smi - 1];
1909 			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
1910 			bzero(old_sm, sizeof (mm_submirror_t));
1911 		}
1912 	}
1913 
1914 	/*
1915 	 * Now we need to rescan the array to find the next potential dead
1916 	 * entry.
1917 	 */
1918 	goto rescan;
1919 }
1920 
1921 /* Return a -1 if optimized record unavailable and set should be released */
1922 int
1923 mirror_build_incore(mm_unit_t *un, int snarfing)
1924 {
1925 	int		i;
1926 
1927 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1928 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1929 		return (1);
1930 	}
1931 
1932 	if (mirror_are_submirrors_available(un) == 0)
1933 		return (1);
1934 
1935 	if (MD_UNIT(MD_SID(un)) != NULL)
1936 		return (0);
1937 
1938 	MD_STATUS(un) = 0;
1939 
1940 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1941 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1942 
1943 	un->un_overlap_tree_flag = 0;
1944 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
1945 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1946 
1947 	/*
1948 	 * We need to collapse any sparse submirror entries into a non-sparse
1949 	 * array. This is to cover the case where we have an old replica image
1950 	 * which has not been updated (i.e. snarfed) since being modified.
1951 	 * The new code expects all submirror access to be sequential (i.e.
1952 	 * both the un_sm[] and un_smic[] entries correspond to non-empty
1953 	 * submirrors).
1954 	 */
1955 
1956 	collapse_submirrors(un);
1957 
1958 	for (i = 0; i < NMIRROR; i++)
1959 		build_submirror(un, i, snarfing);
1960 
1961 	if (unit_setup_resync(un, snarfing) != 0) {
1962 		if (snarfing) {
1963 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1964 			/*
1965 			 * If a MN set and set is not stale, then return -1
1966 			 * which will force the caller to unload the set.
1967 			 * The MN diskset nodes will return failure if
1968 			 * unit_setup_resync fails so that nodes won't
1969 			 * get out of sync.
1970 			 *
1971 			 * If set is STALE, the master node can't allocate
1972 			 * a resync record (if needed), but node needs to
1973 			 * join the set so that user can delete broken mddbs.
1974 			 * So, if set is STALE, just continue on.
1975 			 */
1976 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1977 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1978 				return (-1);
1979 			}
1980 		} else
1981 			return (1);
1982 	}
1983 
1984 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1985 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1986 
1987 	un->un_suspend_wr_flag = 0;
1988 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1989 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1990 
1991 	/*
1992 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1993 	 * All references to the owner message state field must be guarded
1994 	 * by this mutex.
1995 	 */
1996 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1997 
1998 	/*
1999 	 * Allocate mutex and condvar for resync thread manipulation. These
2000 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
2001 	 */
2002 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
2003 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
2004 
2005 	/*
2006 	 * Allocate mutex and condvar for resync progress thread manipulation.
2007 	 * This allows resyncs to be continued across an intervening reboot.
2008 	 */
2009 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
2010 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
2011 
2012 	/*
2013 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
2014 	 * provides synchronization between a user-ioctl and the resulting
2015 	 * strategy() call that performs the read().
2016 	 */
2017 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
2018 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
2019 
2020 	/*
2021 	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
2022 	 */
2023 	for (i = 0; i < MD_MNMAXSIDES; i++) {
2024 		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
2025 	}
2026 
2027 	/* place various information in the in-core data structures */
2028 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
2029 	MD_UNIT(MD_SID(un)) = un;
2030 
2031 	return (0);
2032 }
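/*
 * The mutexes, condvars and the overlap AVL tree initialized above persist
 * for the life of the in-core unit; they are destroyed again in
 * reset_mirror() when the unit is removed.
 */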
2033 
2034 
2035 void
2036 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
2037 {
2038 	mddb_recid_t	recid, vtoc_id;
2039 	size_t		bitcnt;
2040 	size_t		shortcnt;
2041 	int		smi;
2042 	sv_dev_t	sv[NMIRROR];
2043 	int		nsv = 0;
2044 	uint_t		bits = 0;
2045 	minor_t		selfid;
2046 	md_unit_t	*su;
2047 	int		i;
2048 
2049 	md_destroy_unit_incore(mnum, &mirror_md_ops);
2050 
2051 	shortcnt = un->un_rrd_num * sizeof (short);
2052 	bitcnt = howmany(un->un_rrd_num, NBBY);
2053 
2054 	if (un->un_outstanding_writes)
2055 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
2056 	if (un->un_goingclean_bm)
2057 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
2058 	if (un->un_goingdirty_bm)
2059 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
2060 	if (un->un_resync_bm)
2061 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
2062 	if (un->un_pernode_dirty_sum)
2063 		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
2064 
2065 	/*
2066 	 * Destroy the taskq for deferred processing of DRL clean requests.
2067 	 * This taskq will only be present for Multi Owner mirrors.
2068 	 */
2069 	if (un->un_drl_task != NULL)
2070 		ddi_taskq_destroy(un->un_drl_task);
2071 
2072 	md_nblocks_set(mnum, -1ULL);
2073 	MD_UNIT(mnum) = NULL;
2074 
2075 	/*
2076 	 * Attempt release of its minor node
2077 	 */
2078 	md_remove_minor_node(mnum);
2079 
2080 	if (!removing)
2081 		return;
2082 
2083 	for (smi = 0; smi < NMIRROR; smi++) {
2084 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
2085 			continue;
2086 		/* reallow soft partitioning of submirror and reset parent */
2087 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
2088 		MD_CAPAB(su) |= MD_CAN_SP;
2089 		md_reset_parent(un->un_sm[smi].sm_dev);
2090 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
2091 
2092 		sv[nsv].setno = MD_MIN2SET(mnum);
2093 		sv[nsv++].key = un->un_sm[smi].sm_key;
2094 		bits |= SMI2BIT(smi);
2095 	}
2096 
2097 	MD_STATUS(un) |= MD_UN_BEING_RESET;
2098 	recid = un->un_rr_dirty_recid;
2099 	vtoc_id = un->c.un_vtoc_id;
2100 	selfid = MD_SID(un);
2101 
2102 	mirror_commit(un, bits, 0);
2103 
2104 	avl_destroy(&un->un_overlap_root);
2105 
2106 	/* Destroy all mutexes and condvars before returning. */
2107 	mutex_destroy(&un->un_suspend_wr_mx);
2108 	cv_destroy(&un->un_suspend_wr_cv);
2109 	mutex_destroy(&un->un_overlap_tree_mx);
2110 	cv_destroy(&un->un_overlap_tree_cv);
2111 	mutex_destroy(&un->un_owner_mx);
2112 	mutex_destroy(&un->un_rs_thread_mx);
2113 	cv_destroy(&un->un_rs_thread_cv);
2114 	mutex_destroy(&un->un_rs_progress_mx);
2115 	cv_destroy(&un->un_rs_progress_cv);
2116 	mutex_destroy(&un->un_dmr_mx);
2117 	cv_destroy(&un->un_dmr_cv);
2118 
2119 	for (i = 0; i < MD_MNMAXSIDES; i++) {
2120 		rw_destroy(&un->un_pernode_dirty_mx[i]);
2121 		if (un->un_pernode_dirty_bm[i])
2122 			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2123 	}
2124 
2125 	/*
2126 	 * Remove self from the namespace
2127 	 */
2128 	if (un->c.un_revision & MD_FN_META_DEV) {
2129 		(void) md_rem_selfname(un->c.un_self_id);
2130 	}
2131 
2132 	/* This frees the unit structure. */
2133 	mddb_deleterec_wrapper(un->c.un_record_id);
2134 
2135 	if (recid != 0)
2136 		mddb_deleterec_wrapper(recid);
2137 
2138 	/* Remove the vtoc, if present */
2139 	if (vtoc_id)
2140 		mddb_deleterec_wrapper(vtoc_id);
2141 
2142 	md_rem_names(sv, nsv);
2143 
2144 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2145 	    MD_MIN2SET(selfid), selfid);
2146 }
2147 
2148 int
2149 mirror_internal_open(
2150 	minor_t		mnum,
2151 	int		flag,
2152 	int		otyp,
2153 	int		md_oflags,
2154 	IOLOCK		*lockp		/* can be NULL */
2155 )
2156 {
2157 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2158 	int		err = 0;
2159 
2160 tryagain:
2161 	/* single thread */
2162 	if (lockp) {
2163 		/*
2164 		 * If ioctl lock is held, use openclose_enter
2165 		 * routine that will set the ioctl flag when
2166 		 * grabbing the readerlock.
2167 		 */
2168 		(void) md_ioctl_openclose_enter(lockp, ui);
2169 	} else {
2170 		(void) md_unit_openclose_enter(ui);
2171 	}
2172 
2173 	/*
2174 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2175 	 * message in a MN diskset and this requires that the openclose
2176 	 * lock is dropped in order to send this message.  So, another
2177 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2178 	 * attempting an open while this thread has an open in progress.
2179 	 * Call the *_lh version of the lock exit routines since the ui_mx
2180 	 * mutex must be held from checking for OPENINPROGRESS until
2181 	 * after the cv_wait call.
2182 	 */
2183 	mutex_enter(&ui->ui_mx);
2184 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2185 		if (lockp) {
2186 			(void) md_ioctl_openclose_exit_lh(lockp);
2187 		} else {
2188 			md_unit_openclose_exit_lh(ui);
2189 		}
2190 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2191 		mutex_exit(&ui->ui_mx);
2192 		goto tryagain;
2193 	}
2194 
2195 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2196 	mutex_exit(&ui->ui_mx);
2197 
2198 	/* open devices, if necessary */
2199 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2200 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2201 			goto out;
2202 	}
2203 
2204 	/* count open */
2205 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2206 		goto out;
2207 
2208 	/* unlock, return success */
2209 out:
2210 	mutex_enter(&ui->ui_mx);
2211 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2212 	mutex_exit(&ui->ui_mx);
2213 
2214 	if (lockp) {
2215 		/*
2216 		 * If ioctl lock is held, use openclose_exit
2217 		 * routine that will clear the lockp reader flag.
2218 		 */
2219 		(void) md_ioctl_openclose_exit(lockp);
2220 	} else {
2221 		md_unit_openclose_exit(ui);
2222 	}
2223 	return (err);
2224 }
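/*
 * Illustrative call pattern (hypothetical caller, not taken from this file):
 * a consumer holding an ioctl lock in "lock" would pair the calls as
 *
 *	if (mirror_internal_open(mnum, FREAD | FWRITE, OTYP_LYR,
 *	    MD_OFLG_NULL, &lock) == 0) {
 *		...
 *		(void) mirror_internal_close(mnum, OTYP_LYR, MD_OFLG_NULL,
 *		    &lock);
 *	}
 *
 * passing lockp as NULL when no ioctl lock is held.
 */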
2225 
2226 int
2227 mirror_internal_close(
2228 	minor_t		mnum,
2229 	int		otyp,
2230 	int		md_cflags,
2231 	IOLOCK		*lockp		/* can be NULL */
2232 )
2233 {
2234 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2235 	mm_unit_t	*un;
2236 	int		err = 0;
2237 
2238 	/* single thread */
2239 	if (lockp) {
2240 		/*
2241 		 * If ioctl lock is held, use openclose_enter
2242 		 * routine that will set the ioctl flag when
2243 		 * grabbing the readerlock.
2244 		 */
2245 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2246 	} else {
2247 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2248 	}
2249 
2250 	/* count closed */
2251 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2252 		goto out;
2253 
2254 	/* close devices, if necessary */
2255 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2256 		/*
2257 		 * Clean up dirty bitmap for this unit. Do this
2258 		 * before closing the underlying devices to avoid
2259 		 * race conditions with reset_mirror() as a
2260 		 * result of a 'metaset -r' command running in
2261 		 * parallel. This might cause deallocation of
2262 		 * dirty region bitmaps; with underlying metadevices
2263 		 * in place this can't happen.
2264 		 * Don't do this if it is a MN set and ABR is set.
2265 		 */
2266 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2267 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2268 			    !(ui->ui_tstate & MD_ABR_CAP))
2269 				mirror_process_unit_resync(un);
2270 		}
2271 		(void) mirror_close_all_devs(un, md_cflags);
2272 
2273 		/*
2274 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2275 		 * clear these capabilities on the last open in the cluster.
2276 		 * To do this we send a message to all nodes to see if the
2277 		 * device is open.
2278 		 */
2279 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2280 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2281 			if (lockp) {
2282 				(void) md_ioctl_openclose_exit(lockp);
2283 			} else {
2284 				md_unit_openclose_exit(ui);
2285 			}
2286 
2287 			/*
2288 			 * if we are in the context of an ioctl, drop the
2289 			 * ioctl lock.
2290 			 * Otherwise, no other locks should be held.
2291 			 */
2292 			if (lockp) {
2293 				IOLOCK_RETURN_RELEASE(0, lockp);
2294 			}
2295 
2296 			mdmn_clear_all_capabilities(mnum);
2297 
2298 			/* if dropped the lock previously, regain it */
2299 			if (lockp) {
2300 				IOLOCK_RETURN_REACQUIRE(lockp);
2301 			}
2302 			return (0);
2303 		}
2304 		/* unlock and return success */
2305 	}
2306 out:
2307 	/* Call whether lockp is NULL or not. */
2308 	if (lockp) {
2309 		md_ioctl_openclose_exit(lockp);
2310 	} else {
2311 		md_unit_openclose_exit(ui);
2312 	}
2313 	return (err);
2314 }
2315 
2316 /*
2317  * When a component has completed resyncing and is now ok, check if the
2318  * corresponding component in the other submirrors is in the Last Erred
2319  * state.  If it is, we want to change that to the Erred state so we stop
2320  * using that component and start using this good component instead.
2321  *
2322  * This is called from set_sm_comp_state and recursively calls
2323  * set_sm_comp_state if it needs to change the Last Erred state.
2324  */
2325 static void
2326 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2327 	IOLOCK *lockp)
2328 {
2329 	mm_submirror_t		*sm;
2330 	mm_submirror_ic_t	*smic;
2331 	int			ci;
2332 	int			i;
2333 	int			compcnt;
2334 	int			changed = 0;
2335 
2336 	for (i = 0; i < NMIRROR; i++) {
2337 		sm = &un->un_sm[i];
2338 		smic = &un->un_smic[i];
2339 
2340 		if (!SMS_IS(sm, SMS_INUSE))
2341 			continue;
2342 
2343 		/* ignore the submirror that we just made ok */
2344 		if (i == smi)
2345 			continue;
2346 
2347 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2348 		for (ci = 0; ci < compcnt; ci++) {
2349 			md_m_shared_t	*shared;
2350 
2351 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2352 			    (sm->sm_dev, sm, ci);
2353 
2354 			if ((shared->ms_state & CS_LAST_ERRED) &&
2355 			    !mirror_other_sources(un, i, ci, 1)) {
2356 
2357 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2358 				    flags, lockp);
2359 				changed = 1;
2360 			}
2361 		}
2362 	}
2363 
2364 	/* maybe there is a hotspare for this newly erred component */
2365 	if (changed) {
2366 		set_t	setno;
2367 
2368 		setno = MD_UN2SET(un);
2369 		if (MD_MNSET_SETNO(setno)) {
2370 			send_poke_hotspares(setno);
2371 		} else {
2372 			(void) poke_hotspares();
2373 		}
2374 	}
2375 }
2376 
2377 /*
2378  * set_sm_comp_state
2379  *
2380  * Set the state of a submirror component to the specified new state.
2381  * If the mirror is in a multi-node set, send messages to all nodes to
2382  * block all writes to the mirror and then update the state and release the
2383  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2384  * MD_STATE_XMIT will be unset in 2 cases:
2385  * 1. When the state is changed to CS_RESYNC as this state change
2386  * will already have been updated on each node by the processing of the
2387  * distributed metasync command, hence no need to xmit.
2388  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2389  * the resync completion will already have been processed on each node by
2390  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2391  * resync, hence no need to xmit.
2392  *
2393  * In case we are called from the update of a watermark
2394  * (MD_STATE_WMUPDATE will then be set in ps->flags), this is due to
2395  * a metainit or similar. In this case the message that we send to propagate
2396  * the state change must not be a class1 message as that would deadlock with
2397  * the metainit command that is still being processed.
2398  * We achieve this by creating a class2 message, MD_MN_MSG_STATE_UPDATE2,
2399  * instead. This also makes the submessage generator create a class2
2400  * submessage rather than a class1 (which would also block).
2401  *
2402  * On entry, unit_writerlock is held
2403  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2404  * also held.
2405  */
2406 void
2407 set_sm_comp_state(
2408 	mm_unit_t	*un,
2409 	int		smi,
2410 	int		ci,
2411 	int		newstate,
2412 	mddb_recid_t	*extras,
2413 	uint_t		flags,
2414 	IOLOCK		*lockp
2415 )
2416 {
2417 	mm_submirror_t		*sm;
2418 	mm_submirror_ic_t	*smic;
2419 	md_m_shared_t		*shared;
2420 	int			origstate;
2421 	void			(*get_dev)();
2422 	ms_cd_info_t		cd;
2423 	char			devname[MD_MAX_CTDLEN];
2424 	int			err;
2425 	set_t			setno = MD_UN2SET(un);
2426 	md_mn_msg_stch_t	stchmsg;
2427 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2428 	md_mn_kresult_t		*kresult;
2429 	int			rval;
2430 	uint_t			msgflags;
2431 	md_mn_msgtype_t		msgtype;
2432 	int			save_lock = 0;
2433 	mdi_unit_t		*ui_sm;
2434 	int			nretries = 0;
2435 
2436 	sm = &un->un_sm[smi];
2437 	smic = &un->un_smic[smi];
2438 
2439 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2440 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2441 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2442 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2443 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2444 	}
2445 
2446 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2447 	    (sm->sm_dev, sm, ci);
2448 	origstate = shared->ms_state;
2449 
2450 	/*
2451 	 * If the new state is an error and the old one wasn't, generate
2452 	 * a console message. We do this before we send the state to other
2453 	 * nodes in a MN set because the state change may change the component
2454 	 * name if a hotspare is allocated.
2455 	 */
2456 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2457 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2458 
2459 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2460 		    "get device", 0);
2461 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2462 
2463 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2464 		    cd.cd_dev, devname, sizeof (devname));
2465 
2466 		if (err == ENOENT) {
2467 			(void) md_devname(setno, cd.cd_dev, devname,
2468 			    sizeof (devname));
2469 		}
2470 
2471 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2472 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2473 
2474 		if (newstate & CS_LAST_ERRED) {
2475 			cmn_err(CE_WARN, "md: %s: %s last erred",
2476 			    md_shortname(md_getminor(sm->sm_dev)),
2477 			    devname);
2478 
2479 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2480 			/*
2481 			 * Close the broken device and clear the open flag on
2482 			 * it.  Closing the device means the RCM framework will
2483 			 * be able to unconfigure the device if required.
2484 			 *
2485 			 * We have to check that the device is open, otherwise
2486 			 * the first open on it has resulted in the error that
2487 			 * is being processed and the actual cd.cd_dev will be
2488 			 * NODEV64.
2489 			 *
2490 			 * If this is a multi-node mirror, then the multinode
2491 			 * state checks following this code will cause the
2492 			 * slave nodes to close the mirror in the function
2493 			 * mirror_set_state().
2494 			 */
2495 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2496 			shared->ms_flags &= ~MDM_S_ISOPEN;
2497 		}
2498 
2499 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2500 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2501 		/*
2502 		 * Similar to logic above except no log messages since we
2503 		 * are just transitioning from Last Erred to Erred.
2504 		 */
2505 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2506 		    "get device", 0);
2507 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2508 
2509 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2510 		shared->ms_flags &= ~MDM_S_ISOPEN;
2511 	}
2512 
2513 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2514 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2515 		/*
2516 		 * For a multi-node mirror, send the state change to the
2517 		 * master, which broadcasts to all nodes, including this
2518 		 * one. Once the message is received, the state is set
2519 		 * in-core and the master commits the change to disk.
2520 		 * There is a case, comp_replace, where this function
2521 		 * can be called from within an ioctl and therefore in this
2522 		 * case, as the ioctl will already be called on each node,
2523 		 * there is no need to xmit the state change to the master for
2524 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2525 		 * to indicate whether a xmit is required. The mirror's
2526 		 * transient state is set to MD_ERR_PENDING to avoid sending
2527 		 * multiple messages.
2528 		 */
2529 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2530 			ui->ui_tstate |= MD_ERR_PENDING;
2531 
2532 		/*
2533 		 * Send a state update message to all nodes. This message
2534 		 * will generate 2 submessages, the first one to suspend
2535 		 * all writes to the mirror and the second to update the
2536 		 * state and resume writes.
2537 		 */
2538 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2539 		stchmsg.msg_stch_sm = smi;
2540 		stchmsg.msg_stch_comp = ci;
2541 		stchmsg.msg_stch_new_state = newstate;
2542 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2543 #ifdef DEBUG
2544 		if (mirror_debug_flag)
2545 			printf("send set state, %x, %x, %x, %x, %x\n",
2546 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2547 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2548 			    stchmsg.msg_stch_hs_id);
2549 #endif
2550 		if (flags & MD_STATE_WMUPDATE) {
2551 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2552 			/*
2553 			 * When coming from an update of watermarks, there
2554 			 * must already be a message logged that triggered
2555 			 * this action. So, no need to log this message, too.
2556 			 */
2557 			msgflags = MD_MSGF_NO_LOG;
2558 		} else {
2559 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2560 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2561 		}
2562 
2563 		/*
2564 		 * If we are in the context of an ioctl, drop the ioctl lock.
2565 		 * lockp holds the list of locks held.
2566 		 *
2567 		 * Otherwise, increment the appropriate reacquire counters.
2568 		 * If the openclose lock is held, then we must reacquire the reader
2569 		 * lock before releasing the openclose lock.
2570 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2571 		 * to reacquire it.
2572 		 */
2573 		if (lockp) {
2574 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2575 				save_lock = MD_ARRAY_WRITER;
2576 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2577 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2578 				save_lock = MD_ARRAY_READER;
2579 				lockp->l_flags &= ~MD_ARRAY_READER;
2580 			}
2581 			IOLOCK_RETURN_RELEASE(0, lockp);
2582 		} else {
2583 			if (flags & MD_STATE_OCHELD) {
2584 				md_unit_writerexit(ui);
2585 				(void) md_unit_readerlock(ui);
2586 				md_unit_openclose_exit(ui);
2587 			} else {
2588 				md_unit_writerexit(ui);
2589 			}
2590 		}
2591 
2592 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2593 sscs_msg:
2594 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2595 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
2596 
2597 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2598 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2599 			/* If we're shutting down already, pause things here. */
2600 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2601 				while (!md_mn_is_commd_present()) {
2602 					delay(md_hz);
2603 				}
2604 				/*
2605 				 * commd is now available; retry the message
2606 				 * one time. If that fails we fall through and
2607 				 * panic as the system is in an unexpected state
2608 				 */
2609 				if (nretries++ == 0)
2610 					goto sscs_msg;
2611 			}
2612 			cmn_err(CE_PANIC,
2613 			    "ksend_message failure: STATE_UPDATE");
2614 		}
2615 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2616 
2617 		/* if dropped the lock previously, regain it */
2618 		if (lockp) {
2619 			IOLOCK_RETURN_REACQUIRE(lockp);
2620 			lockp->l_flags |= save_lock;
2621 		} else {
2622 			/*
2623 			 * Reacquire dropped locks and update acquirecnts
2624 			 * appropriately.
2625 			 */
2626 			if (flags & MD_STATE_OCHELD) {
2627 				/*
2628 				 * openclose also grabs readerlock.
2629 				 */
2630 				(void) md_unit_openclose_enter(ui);
2631 				md_unit_readerexit(ui);
2632 				(void) md_unit_writerlock(ui);
2633 			} else {
2634 				(void) md_unit_writerlock(ui);
2635 			}
2636 		}
2637 
2638 		ui->ui_tstate &= ~MD_ERR_PENDING;
2639 	} else {
2640 		shared->ms_state = newstate;
2641 		uniqtime32(&shared->ms_timestamp);
2642 
2643 		if (newstate == CS_ERRED)
2644 			shared->ms_flags |= MDM_S_NOWRITE;
2645 		else
2646 			shared->ms_flags &= ~MDM_S_NOWRITE;
2647 
2648 		shared->ms_flags &= ~MDM_S_IOERR;
2649 		un->un_changecnt++;
2650 		shared->ms_lasterrcnt = un->un_changecnt;
2651 
2652 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2653 		mirror_commit(un, SMI2BIT(smi), extras);
2654 	}
2655 
2656 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2657 		/*
2658 		 * Resetting the Last Erred state will recursively call back
2659 		 * into this function (set_sm_comp_state) to update the state.
2660 		 */
2661 		reset_lasterred(un, smi, extras, flags, lockp);
2662 	}
2663 }
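/*
 * Typical use (illustrative; it mirrors the call made in error_update_unit()
 * below): marking a component Erred outside of ioctl context,
 *
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
 *	    (IOLOCK *)NULL);
 *
 * which, for a multi-node set, results in the suspend-writes/update-state
 * submessage pair described above rather than a purely local commit.
 */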
2664 
2665 static int
2666 find_another_logical(
2667 	mm_unit_t		*un,
2668 	mm_submirror_t		*esm,
2669 	diskaddr_t		blk,
2670 	u_longlong_t		cnt,
2671 	int			must_be_open,
2672 	int			state,
2673 	int			err_cnt)
2674 {
2675 	u_longlong_t	cando;
2676 	md_dev64_t	dev;
2677 	md_m_shared_t	*s;
2678 
2679 	esm->sm_state |= SMS_IGNORE;
2680 	while (cnt != 0) {
2681 		u_longlong_t	 mcnt;
2682 
2683 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2684 
2685 		dev = select_read_unit(un, blk, mcnt, &cando,
2686 		    must_be_open, &s, NULL);
2687 		if (dev == (md_dev64_t)0)
2688 			break;
2689 
2690 		if ((state == CS_LAST_ERRED) &&
2691 		    (s->ms_state == CS_LAST_ERRED) &&
2692 		    (err_cnt > s->ms_lasterrcnt))
2693 			break;
2694 
2695 		cnt -= cando;
2696 		blk += cando;
2697 	}
2698 	esm->sm_state &= ~SMS_IGNORE;
2699 	return (cnt != 0);
2700 }
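/*
 * The SMS_IGNORE flag set above is a temporary marker: it is intended to
 * keep select_read_unit() from offering the submirror under test as a read
 * source, forcing the search onto the remaining submirrors.  A non-zero
 * return therefore means some part of the range has no alternate source.
 */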
2701 
2702 int
2703 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2704 {
2705 	mm_submirror_t		*sm;
2706 	mm_submirror_ic_t	*smic;
2707 	size_t			count;
2708 	diskaddr_t		block;
2709 	u_longlong_t		skip;
2710 	u_longlong_t		size;
2711 	md_dev64_t		dev;
2712 	int			cnt;
2713 	md_m_shared_t		*s;
2714 	int			not_found;
2715 
2716 	sm = &un->un_sm[smi];
2717 	smic = &un->un_smic[smi];
2718 	dev = sm->sm_dev;
2719 
2720 	/*
2721 	 * Make sure every component of the submirror
2722 	 * has other sources.
2723 	 */
2724 	if (ci < 0) {
2725 		/* Find the highest lasterrcnt */
2726 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2727 		for (ci = 0; ci < cnt; ci++) {
2728 			not_found = mirror_other_sources(un, smi, ci,
2729 			    must_be_open);
2730 			if (not_found)
2731 				return (1);
2732 		}
2733 		return (0);
2734 	}
2735 
2736 	/*
2737 	 * Make sure this component has other sources
2738 	 */
2739 	(void) (*(smic->sm_get_bcss))
2740 	    (dev, sm, ci, &block, &count, &skip, &size);
2741 
2742 	if (count == 0)
2743 		return (1);
2744 
2745 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2746 
2747 	while (count--) {
2748 		if (block >= un->c.un_total_blocks)
2749 			return (0);
2750 
2751 		if ((block + size) > un->c.un_total_blocks)
2752 			size = un->c.un_total_blocks - block;
2753 
2754 		not_found = find_another_logical(un, sm, block, size,
2755 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2756 		if (not_found)
2757 			return (1);
2758 
2759 		block += size + skip;
2760 	}
2761 	return (0);
2762 }
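/*
 * Return semantics: 0 means every block of the component (or of the whole
 * submirror when ci is negative) can be supplied by some other submirror;
 * 1 means at least one range has no alternate source, in which case callers
 * keep the component in use by moving it to Last Erred rather than Erred.
 */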
2763 
2764 static void
2765 finish_error(md_mps_t *ps)
2766 {
2767 	struct buf	*pb;
2768 	mm_unit_t	*un;
2769 	mdi_unit_t	*ui;
2770 	uint_t		new_str_flags;
2771 
2772 	pb = ps->ps_bp;
2773 	un = ps->ps_un;
2774 	ui = ps->ps_ui;
2775 
2776 	/*
2777 	 * Must flag any error to the resync originator if we're performing
2778 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2779 	 * target device and in this case we ought to abort the resync as there
2780 	 * is nothing that can be done to recover from this without operator
2781 	 * intervention. If we don't set the B_ERROR flag we will continue
2782 	 * reading from the mirror but won't write to the target (as it will
2783 	 * have been placed into an errored state).
2784 	 * To handle the case of multiple components within a submirror we only
2785 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2786 	 * The originator of the resync read will cause this bit to be set if
2787 	 * the underlying component count is one for a submirror resync. All
2788 	 * other resync types will have the flag set as there is no underlying
2789 	 * resync which can be performed on a contained metadevice for these
2790 	 * resync types (optimized or component).
2791 	 */
2792 
2793 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2794 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2795 			pb->b_flags |= B_ERROR;
2796 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2797 		MPS_FREE(mirror_parent_cache, ps);
2798 		md_unit_readerexit(ui);
2799 		md_biodone(pb);
2800 		return;
2801 	}
2802 	/*
2803 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2804 	 * operation; this I/O request has already been counted, and
2805 	 * the I/O count variable will be decremented by mirror_done()'s
2806 	 * call to md_biodone().
2807 	 */
2808 	if (ps->ps_changecnt != un->un_changecnt) {
2809 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2810 		if (ps->ps_flags & MD_MPS_WOW)
2811 			new_str_flags |= MD_STR_WOW;
2812 		if (ps->ps_flags & MD_MPS_MAPPED)
2813 			new_str_flags |= MD_STR_MAPPED;
2814 		/*
2815 		 * If this I/O request was a read that was part of a resync,
2816 		 * set MD_STR_WAR for the retried read to ensure that the
2817 		 * resync write (i.e. write-after-read) will be performed
2818 		 */
2819 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2820 			new_str_flags |= MD_STR_WAR;
2821 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2822 		MPS_FREE(mirror_parent_cache, ps);
2823 		md_unit_readerexit(ui);
2824 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2825 		return;
2826 	}
2827 
2828 	pb->b_flags |= B_ERROR;
2829 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2830 	MPS_FREE(mirror_parent_cache, ps);
2831 	md_unit_readerexit(ui);
2832 	md_biodone(pb);
2833 }
2834 
2835 static void
2836 error_update_unit(md_mps_t *ps)
2837 {
2838 	mm_unit_t		*un;
2839 	mdi_unit_t		*ui;
2840 	int			smi;	/* sub mirror index */
2841 	int			ci;	/* errored component */
2842 	set_t			setno;
2843 	uint_t			flags;	/* for set_sm_comp_state() */
2844 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2845 
2846 	ui = ps->ps_ui;
2847 	un = (mm_unit_t *)md_unit_writerlock(ui);
2848 	setno = MD_UN2SET(un);
2849 
2850 	/* All of these updates have to be propagated in the case of a MN set */
2851 	flags = MD_STATE_XMIT;
2852 	hspflags = MD_HOTSPARE_XMIT;
2853 
2854 	/* special treatment if we are called while updating watermarks */
2855 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2856 		flags |= MD_STATE_WMUPDATE;
2857 		hspflags |= MD_HOTSPARE_WMUPDATE;
2858 	}
2859 	smi = 0;
2860 	ci = 0;
2861 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2862 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2863 
2864 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2865 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2866 			    (IOLOCK *)NULL);
2867 			/*
2868 			 * For a MN set, the NOTIFY is done when the state
2869 			 * change is processed on each node
2870 			 */
2871 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2872 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2873 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2874 			}
2875 			continue;
2876 		}
2877 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2878 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2879 		    (IOLOCK *)NULL);
2880 		/*
2881 		 * For a MN set, the NOTIFY is done when the state
2882 		 * change is processed on each node
2883 		 */
2884 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2885 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2886 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2887 		}
2888 		smi = 0;
2889 		ci = 0;
2890 	}
2891 
2892 	md_unit_writerexit(ui);
2893 	if (MD_MNSET_SETNO(setno)) {
2894 		send_poke_hotspares(setno);
2895 	} else {
2896 		(void) poke_hotspares();
2897 	}
2898 	(void) md_unit_readerlock(ui);
2899 
2900 	finish_error(ps);
2901 }
2902 
2903 /*
2904  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2905  * retry the IO without B_FAILFAST set so that we try to ensure that the
2906  * component "sees" each IO.
2907  */
2908 static void
2909 last_err_retry(md_mcs_t *cs)
2910 {
2911 	struct buf	*cb;
2912 	md_mps_t	*ps;
2913 	uint_t		flags;
2914 
2915 	cb = &cs->cs_buf;
2916 	cb->b_flags &= ~B_FAILFAST;
2917 
2918 	/* if we're panicking just let this I/O error out */
2919 	if (panicstr) {
2920 		(void) mirror_done(cb);
2921 		return;
2922 	}
2923 
2924 	/* reissue the I/O */
2925 
2926 	ps = cs->cs_ps;
2927 
2928 	bioerror(cb, 0);
2929 
2930 	mutex_enter(&ps->ps_mx);
2931 
2932 	flags = MD_STR_NOTTOP;
2933 	if (ps->ps_flags & MD_MPS_MAPPED)
2934 		flags |= MD_STR_MAPPED;
2935 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2936 		flags |= MD_NOBLOCK;
2937 
2938 	mutex_exit(&ps->ps_mx);
2939 
2940 	clear_retry_error(cb);
2941 
2942 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2943 	    md_shortname(getminor(cb->b_edev)));
2944 
2945 	md_call_strategy(cb, flags, NULL);
2946 }
2947 
2948 static void
2949 mirror_error(md_mps_t *ps)
2950 {
2951 	int		smi;	/* sub mirror index */
2952 	int		ci;	/* errored component */
2953 
2954 	if (panicstr) {
2955 		finish_error(ps);
2956 		return;
2957 	}
2958 
2959 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2960 		mirror_overlap_tree_remove(ps);
2961 
2962 	smi = 0;
2963 	ci = 0;
2964 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2965 		md_unit_readerexit(ps->ps_ui);
2966 		daemon_request(&md_mstr_daemon, error_update_unit,
2967 		    (daemon_queue_t *)ps, REQ_OLD);
2968 		return;
2969 	}
2970 
2971 	finish_error(ps);
2972 }
2973 
2974 static int
2975 copy_write_done(struct buf *cb)
2976 {
2977 	md_mps_t	*ps;
2978 	buf_t		*pb;
2979 	char		*wowbuf;
2980 	wowhdr_t	*wowhdr;
2981 	ssize_t		wow_resid;
2982 
2983 	/* get wowbuf and save structure */
2984 	wowbuf = cb->b_un.b_addr;
2985 	wowhdr = WOWBUF_HDR(wowbuf);
2986 	ps = wowhdr->wow_ps;
2987 	pb = ps->ps_bp;
2988 
2989 	/* Save error information, then free cb */
2990 	if (cb->b_flags & B_ERROR)
2991 		pb->b_flags |= B_ERROR;
2992 
2993 	if (cb->b_flags & B_REMAPPED)
2994 		bp_mapout(cb);
2995 
2996 	freerbuf(cb);
2997 
2998 	/* update residual and continue if needed */
2999 	if ((pb->b_flags & B_ERROR) == 0) {
3000 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
3001 		pb->b_resid = wow_resid;
3002 		if (wow_resid > 0)  {
3003 			daemon_request(&md_mstr_daemon, copy_write_cont,
3004 			    (daemon_queue_t *)wowhdr, REQ_OLD);
3005 			return (1);
3006 		}
3007 	}
3008 
3009 	/* Write is complete, release resources. */
3010 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
3011 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3012 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3013 	MPS_FREE(mirror_parent_cache, ps);
3014 	md_biodone(pb);
3015 	return (0);
3016 }
3017 
3018 static void
3019 copy_write_cont(wowhdr_t *wowhdr)
3020 {
3021 	buf_t		*pb;
3022 	buf_t		*cb;
3023 	char		*wowbuf;
3024 	int		wow_offset;
3025 	size_t		wow_resid;
3026 	diskaddr_t	wow_blkno;
3027 
3028 	wowbuf = WOWHDR_BUF(wowhdr);
3029 	pb = wowhdr->wow_ps->ps_bp;
3030 
3031 	/* get data on current location */
3032 	wow_offset = wowhdr->wow_offset;
3033 	wow_resid = pb->b_bcount - wow_offset;
3034 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
3035 
3036 	/* setup child buffer */
3037 	cb = getrbuf(KM_SLEEP);
3038 	cb->b_flags = B_WRITE;
3039 	cb->b_edev = pb->b_edev;
3040 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
3041 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
3042 	cb->b_iodone = copy_write_done;
3043 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
3044 	cb->b_lblkno = wow_blkno;
3045 
3046 	/* move offset to next section */
3047 	wowhdr->wow_offset += cb->b_bcount;
3048 
3049 	/* copy and setup write for current section */
3050 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
3051 
3052 	/* do it */
3053 	/*
3054 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
3055 	 * that handles the WOW condition. The resultant increment on the
3056 	 * I/O count variable is cleared by copy_write_done()'s call to
3057 	 * md_biodone().
3058 	 */
3059 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
3060 	    | MD_STR_MAPPED, NULL);
3061 }
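/*
 * Worked example (sizes are illustrative): with md_wowbuf_size at 32k, a
 * 96k parent write is replayed as three sequential 32k child writes.  On
 * each pass copy_write_cont() copies the next section into the private
 * wowbuf, advances wow_offset (0 -> 32k -> 64k -> 96k) and issues the
 * child; copy_write_done() requeues copy_write_cont() until the residual
 * reaches zero and then completes the parent buf.
 */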
3062 
3063 static void
3064 md_mirror_copy_write(md_mps_t *ps)
3065 {
3066 	wowhdr_t	*wowhdr;
3067 
3068 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
3069 	mirror_wowblk_init(wowhdr);
3070 	wowhdr->wow_ps = ps;
3071 	wowhdr->wow_offset = 0;
3072 	copy_write_cont(wowhdr);
3073 }
3074 
3075 static void
3076 handle_wow(md_mps_t *ps)
3077 {
3078 	buf_t		*pb;
3079 
3080 	pb = ps->ps_bp;
3081 
3082 	bp_mapin(pb);
3083 
3084 	md_mirror_wow_cnt++;
3085 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
3086 		cmn_err(CE_NOTE,
3087 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
3088 		    md_shortname(getminor(pb->b_edev)),
3089 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
3090 	}
3091 
3092 	/*
3093 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
3094 	 * operation; this I/O request has already been counted, and
3095 	 * the I/O count variable will be decremented by mirror_done()'s
3096 	 * call to md_biodone().
3097 	 */
3098 	if (md_mirror_wow_flg & WOW_NOCOPY)
3099 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
3100 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
3101 	else
3102 		md_mirror_copy_write(ps);
3103 }
3104 
3105 /*
3106  * Return true if the specified submirror is either in the Last Erred
3107  * state or is transitioning into the Last Erred state.
3108  */
3109 static bool_t
3110 submirror_is_lasterred(mm_unit_t *un, int smi)
3111 {
3112 	mm_submirror_t		*sm;
3113 	mm_submirror_ic_t	*smic;
3114 	md_m_shared_t		*shared;
3115 	int			ci;
3116 	int			compcnt;
3117 
3118 	sm = &un->un_sm[smi];
3119 	smic = &un->un_smic[smi];
3120 
3121 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3122 	for (ci = 0; ci < compcnt; ci++) {
3123 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3124 		    (sm->sm_dev, sm, ci);
3125 
3126 		if (shared->ms_state == CS_LAST_ERRED)
3127 			return (B_TRUE);
3128 
3129 		/*
3130 		 * It is not currently Last Erred; check if it is entering Last Erred.
3131 		 */
3132 		if ((shared->ms_flags & MDM_S_IOERR) &&
3133 		    ((shared->ms_state == CS_OKAY) ||
3134 		    (shared->ms_state == CS_RESYNC))) {
3135 			if (mirror_other_sources(un, smi, ci, 0) == 1)
3136 				return (B_TRUE);
3137 		}
3138 	}
3139 
3140 	return (B_FALSE);
3141 }
3142 
3143 
3144 static int
3145 mirror_done(struct buf *cb)
3146 {
3147 	md_mps_t	*ps;
3148 	md_mcs_t	*cs;
3149 
3150 	/*LINTED*/
3151 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3152 	ps = cs->cs_ps;
3153 
3154 	mutex_enter(&ps->ps_mx);
3155 
3156 	/* check if we need to retry an errored failfast I/O */
3157 	if (cb->b_flags & B_ERROR) {
3158 		struct buf *pb = ps->ps_bp;
3159 
3160 		if (cb->b_flags & B_FAILFAST) {
3161 			int		i;
3162 			mm_unit_t	*un = ps->ps_un;
3163 
3164 			for (i = 0; i < NMIRROR; i++) {
3165 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3166 					continue;
3167 
3168 				if (cb->b_edev ==
3169 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3170 
3171 					/*
3172 					 * This is the submirror that had the
3173 					 * error.  Check if it is Last Erred.
3174 					 */
3175 					if (submirror_is_lasterred(un, i)) {
3176 						daemon_queue_t *dqp;
3177 
3178 						mutex_exit(&ps->ps_mx);
3179 						dqp = (daemon_queue_t *)cs;
3180 						dqp->dq_prev = NULL;
3181 						dqp->dq_next = NULL;
3182 						daemon_request(&md_done_daemon,
3183 						    last_err_retry, dqp,
3184 						    REQ_OLD);
3185 						return (1);
3186 					}
3187 					break;
3188 				}
3189 			}
3190 		}
3191 
3192 		/* continue to process the buf without doing a retry */
3193 		ps->ps_flags |= MD_MPS_ERROR;
3194 		pb->b_error = cb->b_error;
3195 	}
3196 
3197 	return (mirror_done_common(cb));
3198 }
3199 
3200 /*
3201  * Split from the original mirror_done function so we can handle bufs after a
3202  * retry.
3203  * ps->ps_mx is already held in the caller of this function and the cb error
3204  * has already been checked and handled in the caller.
3205  */
3206 static int
3207 mirror_done_common(struct buf *cb)
3208 {
3209 	struct buf	*pb;
3210 	mm_unit_t	*un;
3211 	mdi_unit_t	*ui;
3212 	md_mps_t	*ps;
3213 	md_mcs_t	*cs;
3214 	size_t		end_rr, start_rr, current_rr;
3215 
3216 	/*LINTED*/
3217 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3218 	ps = cs->cs_ps;
3219 	pb = ps->ps_bp;
3220 
3221 	if (cb->b_flags & B_REMAPPED)
3222 		bp_mapout(cb);
3223 
3224 	ps->ps_frags--;
3225 	if (ps->ps_frags != 0) {
3226 		mutex_exit(&ps->ps_mx);
3227 		kmem_cache_free(mirror_child_cache, cs);
3228 		return (1);
3229 	}
3230 	un = ps->ps_un;
3231 	ui = ps->ps_ui;
3232 
3233 	/*
3234 	 * Do not update outstanding_writes if we're running with ABR
3235 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3236 	 * Also a resync initiated write() has no outstanding_writes update
3237 	 * either.
3238 	 */
3239 	if (((cb->b_flags & B_READ) == 0) &&
3240 	    (un->un_nsm >= 2) &&
3241 	    (ps->ps_call == NULL) &&
3242 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3243 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3244 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3245 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3246 		mutex_enter(&un->un_resync_mx);
3247 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3248 			un->un_outstanding_writes[current_rr]--;
3249 		mutex_exit(&un->un_resync_mx);
3250 	}
3251 	kmem_cache_free(mirror_child_cache, cs);
3252 	mutex_exit(&ps->ps_mx);
3253 
3254 	if (ps->ps_call != NULL) {
3255 		daemon_request(&md_done_daemon, ps->ps_call,
3256 		    (daemon_queue_t *)ps, REQ_OLD);
3257 		return (1);
3258 	}
3259 
3260 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3261 		daemon_request(&md_done_daemon, mirror_error,
3262 		    (daemon_queue_t *)ps, REQ_OLD);
3263 		return (1);
3264 	}
3265 
3266 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3267 		mirror_overlap_tree_remove(ps);
3268 
3269 	/*
3270 	 * Handle Write-on-Write problem.
3271 	 * Skip in the case of raw and direct I/O as they are
3272 	 * handled earlier.
3273 	 *
3274 	 */
3275 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3276 	    !(pb->b_flags & B_READ) &&
3277 	    !(ps->ps_flags & MD_MPS_WOW) &&
3278 	    !(pb->b_flags & B_PHYS) &&
3279 	    any_pages_dirty(pb)) {
3280 		md_unit_readerexit(ps->ps_ui);
3281 		daemon_request(&md_mstr_daemon, handle_wow,
3282 		    (daemon_queue_t *)ps, REQ_OLD);
3283 		return (1);
3284 	}
3285 
3286 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3287 	MPS_FREE(mirror_parent_cache, ps);
3288 	md_unit_readerexit(ui);
3289 	md_biodone(pb);
3290 	return (0);
3291 }
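/*
 * The un_outstanding_writes[] accounting above is the inverse of the
 * increment performed when the write was issued: BLK_TO_RR() maps the first
 * and last block of the parent request to a range of resync regions, and
 * each region's counter is dropped by one so the dirty region logging code
 * can tell when a region becomes quiescent.
 */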
3292 
3293 /*
3294  * Clear error state in submirror component if the retry worked after
3295  * a failfast error.
3296  */
3297 static void
3298 clear_retry_error(struct buf *cb)
3299 {
3300 	int			smi;
3301 	md_mcs_t		*cs;
3302 	mm_unit_t		*un;
3303 	mdi_unit_t		*ui_sm;
3304 	mm_submirror_t		*sm;
3305 	mm_submirror_ic_t	*smic;
3306 	u_longlong_t		cnt;
3307 	md_m_shared_t		*shared;
3308 
3309 	/*LINTED*/
3310 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3311 	un = cs->cs_ps->ps_un;
3312 
3313 	for (smi = 0; smi < NMIRROR; smi++) {
3314 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3315 			continue;
3316 
3317 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3318 			break;
3319 	}
3320 
3321 	if (smi >= NMIRROR)
3322 		return;
3323 
3324 	sm = &un->un_sm[smi];
3325 	smic = &un->un_smic[smi];
3326 	cnt = cb->b_bcount;
3327 
3328 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3329 	(void) md_unit_writerlock(ui_sm);
3330 
3331 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3332 	    cb->b_blkno, &cnt);
3333 
3334 	if (shared->ms_flags & MDM_S_IOERR) {
3335 		shared->ms_flags &= ~MDM_S_IOERR;
3336 
3337 	} else {
3338 		/* the buf spans components and the first one is not erred */
3339 		int	cnt;
3340 		int	i;
3341 
3342 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3343 		for (i = 0; i < cnt; i++) {
3344 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3345 			    (sm->sm_dev, sm, i);
3346 
3347 			if (shared->ms_flags & MDM_S_IOERR &&
3348 			    shared->ms_state == CS_OKAY) {
3349 
3350 				shared->ms_flags &= ~MDM_S_IOERR;
3351 				break;
3352 			}
3353 		}
3354 	}
3355 
3356 	md_unit_writerexit(ui_sm);
3357 }
3358 
3359 static size_t
3360 mirror_map_read(
3361 	md_mps_t *ps,
3362 	md_mcs_t *cs,
3363 	diskaddr_t blkno,
3364 	u_longlong_t	count
3365 )
3366 {
3367 	mm_unit_t	*un;
3368 	buf_t		*bp;
3369 	u_longlong_t	cando;
3370 
3371 	bp = &cs->cs_buf;
3372 	un = ps->ps_un;
3373 
3374 	bp->b_lblkno = blkno;
3375 	if (fast_select_read_unit(ps, cs) == 0) {
3376 		bp->b_bcount = ldbtob(count);
3377 		return (0);
3378 	}
3379 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3380 	    count, &cando, 0, NULL, cs));
3381 	bp->b_bcount = ldbtob(cando);
3382 	if (count != cando)
3383 		return (cando);
3384 	return (0);
3385 }
3386 
3387 static void
3388 write_after_read(md_mps_t *ps)
3389 {
3390 	struct buf	*pb;
3391 	int		flags;
3392 
3393 	if (ps->ps_flags & MD_MPS_ERROR) {
3394 		mirror_error(ps);
3395 		return;
3396 	}
3397 
3398 	pb = ps->ps_bp;
3399 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3400 	ps->ps_call = NULL;
3401 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3402 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3403 	if (ps->ps_flags & MD_MPS_MAPPED)
3404 		flags |= MD_STR_MAPPED;
3405 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3406 		flags |= MD_NOBLOCK;
3407 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3408 		flags |= MD_STR_DIRTY_RD;
3409 	(void) mirror_write_strategy(pb, flags, ps);
3410 }
3411 
3412 static void
3413 continue_serial(md_mps_t *ps)
3414 {
3415 	md_mcs_t	*cs;
3416 	buf_t		*cb;
3417 	mm_unit_t	*un;
3418 	int		flags;
3419 
3420 	un = ps->ps_un;
3421 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3422 	mirror_child_init(cs);
3423 	cb = &cs->cs_buf;
3424 	ps->ps_call = NULL;
3425 	ps->ps_frags = 1;
3426 	(void) mirror_map_write(un, cs, ps, 0);
3427 	flags = MD_STR_NOTTOP;
3428 	if (ps->ps_flags & MD_MPS_MAPPED)
3429 		flags |= MD_STR_MAPPED;
3430 	md_call_strategy(cb, flags, NULL);
3431 }
3432 
3433 static int
3434 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3435 {
3436 	int i;
3437 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3438 	buf_t		*cb;
3439 	buf_t		*pb;
3440 	diskaddr_t	blkno;
3441 	size_t		bcount;
3442 	off_t		offset;
3443 
3444 	pb = ps->ps_bp;
3445 	cb = &cs->cs_buf;
3446 	cs->cs_ps = ps;
3447 
3448 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3449 
3450 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3451 
3452 	blkno = pb->b_lblkno;
3453 	bcount = pb->b_bcount;
3454 	offset = 0;
3455 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3456 		blkno = DK_LABEL_LOC + 1;
3457 		/*
3458 		 * This handles the case where we're requesting
3459 		 * a write to block 0 on a label partition
3460 		 * and the request size was smaller than the
3461 		 * size of the label.  If this is the case
3462 		 * then we'll return -1.  Failure to do so will
3463 		 * either cause the calling thread to hang due to
3464 		 * an ssd bug, or worse if the bcount were allowed
3465 		 * to go negative (ie large).
3466 		 */
3467 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3468 			return (-1);
3469 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3470 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3471 	}
3472 
3473 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3474 	    cb, KM_NOSLEEP);
3475 	if (war)
3476 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3477 
3478 	/*
3479 	 * If the submirror is in the erred state, check if any component is
3480 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3481 	 * flag on the IO.
3482 	 *
3483 	 * Provide a fast path for the non-erred case (which should be the
3484 	 * normal case).
3485 	 */
3486 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3487 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3488 			mm_submirror_t		*sm;
3489 			mm_submirror_ic_t	*smic;
3490 			int			ci;
3491 			int			compcnt;
3492 
3493 			sm = &un->un_sm[i];
3494 			smic = &un->un_smic[i];
3495 
3496 			compcnt = (*(smic->sm_get_component_count))
3497 			    (sm->sm_dev, un);
3498 			for (ci = 0; ci < compcnt; ci++) {
3499 				md_m_shared_t	*shared;
3500 
3501 				shared = (md_m_shared_t *)
3502 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3503 				    sm, ci);
3504 
3505 				if (shared->ms_state == CS_LAST_ERRED)
3506 					break;
3507 			}
3508 			if (ci >= compcnt)
3509 				cb->b_flags |= B_FAILFAST;
3510 
3511 		} else {
3512 			cb->b_flags |= B_FAILFAST;
3513 		}
3514 	}
3515 
3516 	ps->ps_current_sm++;
3517 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3518 		if (un->un_write_option == WR_SERIAL) {
3519 			ps->ps_call = continue_serial;
3520 			return (0);
3521 		}
3522 		return (1);
3523 	}
3524 	return (0);
3525 }
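/*
 * The label-preservation logic above only applies to a write-after-read
 * (war) that starts at block 0 of a labeled mirror: the cloned child buf is
 * offset past the DK_LABEL_LOC block so the resync never rewrites the
 * label, and a request too small to extend beyond the label is rejected
 * with -1 so that a zero or negative byte count is never generated.
 */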
3526 
3527 /*
3528  * directed_read_done:
3529  * ------------------
3530  * Completion routine called when a DMR request has been returned from the
3531  * underlying driver. Wake-up the original ioctl() and return the data to
3532  * the user.
3533  */
3534 static void
3535 directed_read_done(md_mps_t *ps)
3536 {
3537 	mm_unit_t	*un;
3538 	mdi_unit_t	*ui;
3539 
3540 	un = ps->ps_un;
3541 	ui = ps->ps_ui;
3542 
3543 	md_unit_readerexit(ui);
3544 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3545 	ps->ps_call = NULL;
3546 
3547 	mutex_enter(&un->un_dmr_mx);
3548 	cv_signal(&un->un_dmr_cv);
3549 	mutex_exit(&un->un_dmr_mx);
3550 
3551 	/* release the parent structure */
3552 	kmem_cache_free(mirror_parent_cache, ps);
3553 }
3554 
3555 /*
3556  * daemon_io:
3557  * ------------
3558  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3559  * call from a blockable context. NOTE: no mutex can be held on entry to this
3560  * routine
3561  */
3562 static void
3563 daemon_io(daemon_queue_t *dq)
3564 {
3565 	md_mps_t	*ps = (md_mps_t *)dq;
3566 	int		flag = MD_STR_NOTTOP;
3567 	buf_t		*pb = ps->ps_bp;
3568 
3569 	if (ps->ps_flags & MD_MPS_MAPPED)
3570 		flag |= MD_STR_MAPPED;
3571 	if (ps->ps_flags & MD_MPS_WOW)
3572 		flag |= MD_STR_WOW;
3573 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3574 		flag |= MD_STR_WAR;
3575 	if (ps->ps_flags & MD_MPS_ABR)
3576 		flag |= MD_STR_ABR;
3577 	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3578 		flag |= MD_STR_BLOCK_OK;
3579 
3580 	/*
3581 	 * If this is a resync read, i.e. MD_STR_DIRTY_RD is not set, set
3582 	 * MD_STR_WAR before calling mirror_read_strategy.
3583 	 */
3584 	if (pb->b_flags & B_READ) {
3585 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3586 			flag |= MD_STR_WAR;
3587 		mirror_read_strategy(pb, flag, ps);
3588 	} else
3589 		mirror_write_strategy(pb, flag, ps);
3590 }
3591 
3592 /*
3593  * update_resync:
3594  * -------------
3595  * Called to update the in-core version of the resync record with the latest
3596  * version that was committed to disk when the previous mirror owner
3597  * relinquished ownership. This call is likely to block as we must hold-off
3598  * any current resync processing that may be occurring.
3599  * On completion of the resync record update we issue the mirror_write_strategy
3600  * call to complete the i/o that first started this sequence. To remove a race
3601  * condition between a new write() request which is submitted and the resync
3602  * record update we acquire the writerlock. This will hold off all i/o to the
3603  * mirror until the resync update has completed.
3604  * NOTE: no mutex can be held on entry to this routine
3605  */
3606 static void
3607 update_resync(daemon_queue_t *dq)
3608 {
3609 	md_mps_t	*ps = (md_mps_t *)dq;
3610 	buf_t		*pb = ps->ps_bp;
3611 	mdi_unit_t	*ui = ps->ps_ui;
3612 	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
3613 	set_t		setno;
3614 	int		restart_resync;
3615 
3616 	mutex_enter(&un->un_rrp_inflight_mx);
3617 	(void) md_unit_writerlock(ui);
3618 	ps->ps_un = un;
3619 	setno = MD_MIN2SET(getminor(pb->b_edev));
3620 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3621 		/*
3622 		 * Synchronize our in-core view of what regions need to be
3623 		 * resync'd with the on-disk version.
3624 		 */
3625 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3626 		    un->un_dirty_bm);
3627 
3628 		/* Region dirty map is now up to date */
3629 	}
3630 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3631 	md_unit_writerexit(ui);
3632 	mutex_exit(&un->un_rrp_inflight_mx);
3633 
3634 	/* Restart the resync thread if it was previously blocked */
3635 	if (restart_resync) {
3636 		mutex_enter(&un->un_rs_thread_mx);
3637 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3638 		cv_signal(&un->un_rs_thread_cv);
3639 		mutex_exit(&un->un_rs_thread_mx);
3640 	}
3641 	/* Continue with original deferred i/o */
3642 	daemon_io(dq);
3643 }
3644 
3645 /*
3646  * owner_timeout:
3647  * -------------
3648  * Called if the original mdmn_ksend_message() failed and the request is to be
3649  * retried. Reattempt the original ownership change.
3650  *
3651  * NOTE: called at interrupt context (see timeout(9f)).
3652  */
3653 static void
3654 owner_timeout(void *arg)
3655 {
3656 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3657 
3658 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3659 }
3660 
3661 /*
3662  * become_owner:
3663  * ------------
3664  * Called to issue RPC request to become the owner of the mirror
3665  * associated with this i/o request. We assume that the ownership request
3666  * is synchronous, so if it succeeds we will issue the request via
3667  * mirror_write_strategy().
3668  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3669  * service thread.
3670  * NOTE: no mutex should be held on entry to this routine.
3671  */
3672 static void
3673 become_owner(daemon_queue_t *dq)
3674 {
3675 	md_mps_t	*ps = (md_mps_t *)dq;
3676 	mm_unit_t	*un = ps->ps_un;
3677 	buf_t		*pb = ps->ps_bp;
3678 	set_t		setno;
3679 	md_mn_kresult_t	*kres;
3680 	int		msg_flags = md_mirror_msg_flags;
3681 	md_mps_t	*ps1;
3682 
3683 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3684 
3685 	/*
3686 	 * If we're already the mirror owner we do not need to send a message
3687 	 * but can simply process the i/o request immediately.
3688 	 * If we've already sent the request to become owner we requeue the
3689 	 * request as we're waiting for the synchronous ownership message to
3690 	 * be processed.
3691 	 */
3692 	if (MD_MN_MIRROR_OWNER(un)) {
3693 		/*
3694 		 * As the strategy() call will potentially block we need to
3695 		 * punt this to a separate thread and complete this request
3696 		 * as quickly as possible. Note: if we're a read request
3697 		 * this must be a resync, we cannot afford to be queued
3698 		 * behind any intervening i/o requests. In this case we put the
3699 		 * request on the md_mirror_rs_daemon queue.
3700 		 */
3701 		if (pb->b_flags & B_READ) {
3702 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3703 			    REQ_OLD);
3704 		} else {
3705 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3706 			    REQ_OLD);
3707 		}
3708 	} else {
3709 		mutex_enter(&un->un_owner_mx);
3710 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3711 			md_mn_req_owner_t	*msg;
3712 			int			rval = 0;
3713 
3714 			/*
3715 			 * Check to see that we haven't exceeded the maximum
3716 			 * retry count. If we have we fail the i/o as the
3717 			 * comms mechanism has become wedged beyond recovery.
3718 			 */
3719 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3720 				mutex_exit(&un->un_owner_mx);
3721 				cmn_err(CE_WARN,
3722 				    "md_mirror: Request exhausted ownership "
3723 				    "retry limit of %d attempts", dq->qlen);
3724 				pb->b_error = EIO;
3725 				pb->b_flags |= B_ERROR;
3726 				pb->b_resid = pb->b_bcount;
3727 				kmem_cache_free(mirror_parent_cache, ps);
3728 				md_biodone(pb);
3729 				return;
3730 			}
3731 
3732 			/*
3733 			 * Issue request to change ownership. The call is
3734 			 * synchronous so when it returns we can complete the
3735 			 * i/o (if successful), or enqueue it again so that
3736 			 * the operation will be retried.
3737 			 */
3738 			un->un_owner_state |= MM_MN_OWNER_SENT;
3739 			mutex_exit(&un->un_owner_mx);
3740 
3741 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3742 			setno = MD_MIN2SET(getminor(pb->b_edev));
3743 			msg->mnum = MD_SID(un);
3744 			msg->owner = md_mn_mynode_id;
3745 			msg_flags |= MD_MSGF_NO_LOG;
3746 			/*
3747 			 * If this IO is triggered by updating a watermark,
3748 			 * it might be issued by the creation of a softpartition
3749 			 * while the commd subsystem is suspended.
3750 			 * We don't want this message to block.
3751 			 */
3752 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3753 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3754 			}
3755 
3756 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3757 			rval = mdmn_ksend_message(setno,
3758 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3759 			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
3760 
3761 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3762 
3763 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
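				/* Ownership granted; reset retry counter */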
3764 				dq->qlen = 0;
3765 				/*
3766 				 * Successfully changed owner, reread the
3767 				 * resync record so that we have a valid idea of
3768 				 * any previously committed incomplete write()s.
3769 				 * NOTE: As we need to acquire the resync mutex
3770 				 * this may block, so we defer it to a separate
3771 				 * thread handler. This makes us (effectively)
3772 				 * non-blocking once the ownership message
3773 				 * handling has completed.
3774 				 */
3775 				mutex_enter(&un->un_owner_mx);
3776 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3777 					un->un_mirror_owner = md_mn_mynode_id;
3778 					/* Sets owner of un_rr_dirty record */
3779 					if (un->un_rr_dirty_recid)
3780 						(void) mddb_setowner(
3781 						    un->un_rr_dirty_recid,
3782 						    md_mn_mynode_id);
3783 					un->un_owner_state &=
3784 					    ~MM_MN_BECOME_OWNER;
3785 					/*
3786 					 * Release the block on the current
3787 					 * resync region if it is blocked
3788 					 */
3789 					ps1 = un->un_rs_prev_overlap;
3790 					if ((ps1 != NULL) &&
3791 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3792 						mirror_overlap_tree_remove(ps1);
3793 					mutex_exit(&un->un_owner_mx);
3794 
3795 					/*
3796 					 * If we're a read, this must be a
3797 					 * resync request, issue
3798 					 * the i/o request on the
3799 					 * md_mirror_rs_daemon queue. This is
3800 					 * to avoid a deadlock between the
3801 					 * resync_unit thread and
3802 					 * subsequent i/o requests that may
3803 					 * block on the resync region.
3804 					 */
3805 					if (pb->b_flags & B_READ) {
3806 						daemon_request(
3807 						    &md_mirror_rs_daemon,
3808 						    update_resync, dq, REQ_OLD);
3809 					} else {
3810 						daemon_request(
3811 						    &md_mirror_io_daemon,
3812 						    update_resync, dq, REQ_OLD);
3813 					}
3814 					kmem_free(kres,
3815 					    sizeof (md_mn_kresult_t));
3816 					return;
3817 				} else {
3818 					/*
3819 					 * Some other node has beaten us to
3820 					 * obtain ownership. We need to
3821 					 * reschedule our ownership request
3822 					 */
3823 					mutex_exit(&un->un_owner_mx);
3824 				}
3825 			} else {
3826 				mdmn_ksend_show_error(rval, kres,
3827 				    "MD_MN_MSG_REQUIRE_OWNER");
3828 				/*
3829 				 * Message transport failure is handled by the
3830 				 * comms layer. If the ownership change request
3831 				 * does not succeed we need to flag the error to
3832 				 * the initiator of the i/o. This is handled by
3833 				 * the retry logic above. As the request failed
3834 				 * we do not know _who_ the owner of the mirror
3835 				 * currently is. We reset our idea of the owner
3836 				 * to None so that any further write()s will
3837 				 * attempt to become the owner again. This stops
3838 				 * multiple nodes writing to the same mirror
3839 				 * simultaneously.
3840 				 */
3841 				mutex_enter(&un->un_owner_mx);
3842 				un->un_owner_state &=
3843 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3844 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3845 				mutex_exit(&un->un_owner_mx);
3846 			}
3847 			kmem_free(kres, sizeof (md_mn_kresult_t));
3848 		} else
3849 			mutex_exit(&un->un_owner_mx);
3850 
3851 		/*
3852 		 * Re-enqueue this request on the deferred i/o list. Delay the
3853 		 * request for md_mirror_owner_to usecs to stop thrashing.
3854 		 */
3855 		(void) timeout(owner_timeout, dq,
3856 		    drv_usectohz(md_mirror_owner_to));
3857 	}
3858 }
3859 
3860 static void
3861 mirror_write_strategy(buf_t *pb, int flag, void *private)
3862 {
3863 	md_mps_t	*ps;
3864 	md_mcs_t	*cs;
3865 	int		more;
3866 	mm_unit_t	*un;
3867 	mdi_unit_t	*ui;
3868 	buf_t		*cb;		/* child buf pointer */
3869 	set_t		setno;
3870 	int		rs_on_overlap = 0;
3871 
3872 	ui = MDI_UNIT(getminor(pb->b_edev));
3873 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3874 
3875 
3876 	md_kstat_waitq_enter(ui);
3877 
3878 	/*
3879 	 * If a state change is in progress for this mirror in a MN set,
3880 	 * suspend all non-resync writes until the state change is complete.
3881 	 * The objective of this suspension is to ensure that one node cannot
3882 	 * read data from a submirror that another node has not yet written
3883 	 * to because of the state change; hence all non-resync writes are
3884 	 * suspended until the state change has been made. As it is not
3885 	 * possible to read from the target of a resync, there is no need
3886 	 * to suspend resync writes.
3887 	 * Note that we only block here if the caller can handle a busy-wait.
3888 	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3889 	 */
3890 
3891 	if (!(flag & MD_STR_WAR)) {
3892 		if (flag & MD_STR_BLOCK_OK) {
3893 			mutex_enter(&un->un_suspend_wr_mx);
3894 			while (un->un_suspend_wr_flag) {
3895 				cv_wait(&un->un_suspend_wr_cv,
3896 				    &un->un_suspend_wr_mx);
3897 			}
3898 			mutex_exit(&un->un_suspend_wr_mx);
3899 		}
3900 		(void) md_unit_readerlock(ui);
3901 	}
3902 
3903 	if (!(flag & MD_STR_NOTTOP)) {
3904 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3905 			md_kstat_waitq_exit(ui);
3906 			return;
3907 		}
3908 	}
3909 
3910 	setno = MD_MIN2SET(getminor(pb->b_edev));
3911 
3912 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3913 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3914 		flag |= MD_STR_ABR;
3915 
3916 	if (private == NULL) {
3917 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3918 		mirror_parent_init(ps);
3919 	} else {
3920 		ps = private;
3921 		private = NULL;
3922 	}
3923 	if (flag & MD_STR_MAPPED)
3924 		ps->ps_flags |= MD_MPS_MAPPED;
3925 
3926 	if (flag & MD_STR_WOW)
3927 		ps->ps_flags |= MD_MPS_WOW;
3928 
3929 	if (flag & MD_STR_ABR)
3930 		ps->ps_flags |= MD_MPS_ABR;
3931 
3932 	if (flag & MD_STR_WMUPDATE)
3933 		ps->ps_flags |= MD_MPS_WMUPDATE;
3934 
3935 	/*
3936 	 * Save essential information from the original buffhdr
3937 	 * in the md_save structure.
3938 	 */
3939 	ps->ps_un = un;
3940 	ps->ps_ui = ui;
3941 	ps->ps_bp = pb;
3942 	ps->ps_addr = pb->b_un.b_addr;
3943 	ps->ps_firstblk = pb->b_lblkno;
3944 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3945 	ps->ps_changecnt = un->un_changecnt;
3946 
3947 	/*
3948 	 * Check for suspended writes here. This is where we can defer the
3949 	 * write request to the daemon_io queue which will then call us with
3950 	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3951 	 * the top of this routine.
3952 	 */
3953 	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3954 		mutex_enter(&un->un_suspend_wr_mx);
3955 		if (un->un_suspend_wr_flag) {
3956 			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3957 			mutex_exit(&un->un_suspend_wr_mx);
3958 			md_unit_readerexit(ui);
3959 			daemon_request(&md_mirror_daemon, daemon_io,
3960 			    (daemon_queue_t *)ps, REQ_OLD);
3961 			return;
3962 		}
3963 		mutex_exit(&un->un_suspend_wr_mx);
3964 	}
3965 
3966 	/*
3967 	 * If not MN owner and this is an ABR write, make sure the current
3968 	 * resync region is in the overlaps tree
3969 	 */
3970 	mutex_enter(&un->un_owner_mx);
3971 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3972 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3973 		md_mps_t	*ps1;
3974 		/* Block the current resync region, if not already blocked */
3975 		ps1 = un->un_rs_prev_overlap;
3976 
3977 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3978 		    (ps1->ps_lastblk != 0))) {
3979 			/* Drop locks to avoid deadlock */
3980 			mutex_exit(&un->un_owner_mx);
3981 			md_unit_readerexit(ui);
3982 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3983 			rs_on_overlap = 1;
3984 			(void) md_unit_readerlock(ui);
3985 			mutex_enter(&un->un_owner_mx);
3986 			/*
3987 			 * Check to see if we have obtained ownership
3988 			 * while waiting for overlaps. If we have, remove
3989 			 * the resync_region entry from the overlap tree
3990 			 */
3991 			if (MD_MN_MIRROR_OWNER(un) &&
3992 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3993 				mirror_overlap_tree_remove(ps1);
3994 				rs_on_overlap = 0;
3995 			}
3996 		}
3997 	}
3998 	mutex_exit(&un->un_owner_mx);
3999 
4000 
4001 	/*
4002 	 * The following keeps a write-after-read from writing back to the
4003 	 * source in the case where all of the data came from one place.
4004 	 */
4005 	if (flag & MD_STR_WAR) {
4006 		int	abort_write = 0;
4007 		/*
4008 		 * We are performing a write-after-read. This is either the
4009 		 * result of a resync read or of a read in a dirty resync
4010 		 * region when the optimized resync is not complete. For a
4011 		 * resync-generated i/o in a MN set, if the current block is
4012 		 * no longer in the current resync region, terminate the
4013 		 * write as another node must have completed this resync
4014 		 * region.
4015 		 */
4016 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
4017 		    (!(flag & MD_STR_DIRTY_RD))) {
4018 			if (!IN_RESYNC_REGION(un, ps))
4019 				abort_write = 1;
4020 		}
4021 		if ((select_write_after_read_units(un, ps) == 0) ||
4022 		    (abort_write)) {
4023 #ifdef DEBUG
4024 			if (mirror_debug_flag)
4025 				printf("Abort resync write on %x, block %lld\n",
4026 				    MD_SID(un), ps->ps_firstblk);
4027 #endif
4028 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4029 				mirror_overlap_tree_remove(ps);
4030 			kmem_cache_free(mirror_parent_cache, ps);
4031 			md_kstat_waitq_exit(ui);
4032 			md_unit_readerexit(ui);
4033 			md_biodone(pb);
4034 			return;
4035 		}
4036 	} else {
4037 		select_write_units(un, ps);
4038 
4039 		/* Drop readerlock to avoid deadlock */
4040 		md_unit_readerexit(ui);
4041 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4042 		un = md_unit_readerlock(ui);
4043 		/*
4044 		 * For a MN set with an ABR write, if we are now the
4045 		 * owner and we have a resync region in the overlap
4046 		 * tree, remove the entry from overlaps and retry the write.
4047 		 */
4048 
4049 		if (MD_MNSET_SETNO(setno) &&
4050 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4051 			mutex_enter(&un->un_owner_mx);
4052 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4053 				mirror_overlap_tree_remove(ps);
4054 				md_kstat_waitq_exit(ui);
4055 				mutex_exit(&un->un_owner_mx);
4056 				md_unit_readerexit(ui);
4057 				daemon_request(&md_mirror_daemon, daemon_io,
4058 				    (daemon_queue_t *)ps, REQ_OLD);
4059 				return;
4060 			}
4061 			mutex_exit(&un->un_owner_mx);
4062 		}
4063 	}
4064 
4065 	/*
4066 	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
4067 	 * we need to become the mirror owner before continuing with the
4068 	 * write(). For ABR mirrors we check that we 'own' the resync if
4069 	 * we're in write-after-read mode. We do this _after_ ensuring that
4070 	 * there are no overlaps to ensure that once we know that we are
4071 	 * the owner, the readerlock will not be released until the write is
4072 	 * complete. As a change of ownership in a MN set requires the
4073 	 * writerlock, this ensures that ownership cannot be changed until
4074 	 * the write is complete.
4075 	 */
4076 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
4077 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
4078 		if (MD_MN_NO_MIRROR_OWNER(un))  {
4079 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4080 				mirror_overlap_tree_remove(ps);
4081 			md_kstat_waitq_exit(ui);
4082 			ASSERT(!(flag & MD_STR_WAR));
4083 			md_unit_readerexit(ui);
4084 			daemon_request(&md_mirror_daemon, become_owner,
4085 			    (daemon_queue_t *)ps, REQ_OLD);
4086 			return;
4087 		}
4088 	}
4089 
4090 	/*
4091 	 * Mark resync region if mirror has a Resync Region _and_ we are not
4092 	 * a resync initiated write(). Don't mark region if we're flagged as
4093 	 * an ABR write.
4094 	 */
4095 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
4096 	    !(flag & MD_STR_WAR)) {
4097 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
4098 		    ps->ps_lastblk, md_mn_mynode_id)) {
4099 			pb->b_flags |= B_ERROR;
4100 			pb->b_resid = pb->b_bcount;
4101 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4102 				mirror_overlap_tree_remove(ps);
4103 			kmem_cache_free(mirror_parent_cache, ps);
4104 			md_kstat_waitq_exit(ui);
4105 			md_unit_readerexit(ui);
4106 			md_biodone(pb);
4107 			return;
4108 		}
4109 	}
4110 
4111 	ps->ps_childbflags = pb->b_flags | B_WRITE;
4112 	ps->ps_childbflags &= ~B_READ;
4113 	if (flag & MD_STR_MAPPED)
4114 		ps->ps_childbflags &= ~B_PAGEIO;
4115 
4116 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4117 		/* Disable WOW and don't free ps */
4118 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
4119 
4120 	md_kstat_waitq_to_runq(ui);
4121 
4122 	/*
4123 	 * Treat Raw and Direct I/O as Write-on-Write always
4124 	 */
4125 
4126 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4127 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4128 	    (pb->b_flags & B_PHYS) &&
4129 	    !(ps->ps_flags & MD_MPS_WOW)) {
4130 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4131 			mirror_overlap_tree_remove(ps);
4132 		md_unit_readerexit(ui);
4133 		daemon_request(&md_mstr_daemon, handle_wow,
4134 		    (daemon_queue_t *)ps, REQ_OLD);
4135 		return;
4136 	}
4137 
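	/*
	 * ps_frags tracks the number of child fragments issued for this
	 * parent; start at one and bump it for each extra fragment mapped
	 * by mirror_map_write() below.
	 */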
4138 	ps->ps_frags = 1;
4139 	do {
4140 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4141 		mirror_child_init(cs);
4142 		cb = &cs->cs_buf;
4143 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4144 
4145 		/*
4146 		 * This handles the case where we're requesting
4147 		 * a write to block 0 on a label partition.  (more < 0)
4148 		 * means that the request size was smaller than the
4149 		 * size of the label.  If so this request is done.
4150 		 */
4151 		if (more < 0) {
4152 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4153 				mirror_overlap_tree_remove(ps);
4154 			md_kstat_runq_exit(ui);
4155 			kmem_cache_free(mirror_child_cache, cs);
4156 			kmem_cache_free(mirror_parent_cache, ps);
4157 			md_unit_readerexit(ui);
4158 			md_biodone(pb);
4159 			return;
4160 		}
4161 		if (more) {
4162 			mutex_enter(&ps->ps_mx);
4163 			ps->ps_frags++;
4164 			mutex_exit(&ps->ps_mx);
4165 		}
4166 		md_call_strategy(cb, flag, private);
4167 	} while (more);
4168 
4169 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4170 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4171 			md_daemon(1, &md_done_daemon);
4172 			drv_usecwait(10);
4173 		}
4174 		kmem_cache_free(mirror_parent_cache, ps);
4175 	}
4176 }
4177 
4178 static void
4179 mirror_read_strategy(buf_t *pb, int flag, void *private)
4180 {
4181 	md_mps_t	*ps;
4182 	md_mcs_t	*cs;
4183 	size_t		more;
4184 	mm_unit_t	*un;
4185 	mdi_unit_t	*ui;
4186 	size_t		current_count;
4187 	diskaddr_t	current_blkno;
4188 	off_t		current_offset;
4189 	buf_t		*cb;		/* child buf pointer */
4190 	set_t		setno;
4191 
4192 	ui = MDI_UNIT(getminor(pb->b_edev));
4193 
4194 	md_kstat_waitq_enter(ui);
4195 
4196 	un = (mm_unit_t *)md_unit_readerlock(ui);
4197 
4198 	if (!(flag & MD_STR_NOTTOP)) {
4199 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4200 			md_kstat_waitq_exit(ui);
4201 			return;
4202 		}
4203 	}
4204 
4205 	if (private == NULL) {
4206 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4207 		mirror_parent_init(ps);
4208 	} else {
4209 		ps = private;
4210 		private = NULL;
4211 	}
4212 
4213 	if (flag & MD_STR_MAPPED)
4214 		ps->ps_flags |= MD_MPS_MAPPED;
4215 	if (flag & MD_NOBLOCK)
4216 		ps->ps_flags |= MD_MPS_NOBLOCK;
4217 	if (flag & MD_STR_WMUPDATE)
4218 		ps->ps_flags |= MD_MPS_WMUPDATE;
4219 
4220 	/*
4221 	 * Check to see if this is a DMR driven read. If so we need to use the
4222 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4223 	 */
4224 	if (flag & MD_STR_DMR)
4225 		ps->ps_flags |= MD_MPS_DMR;
4226 
4227 	/*
4228 	 * Save essential information from the original buffhdr
4229 	 * in the md_save structure.
4230 	 */
4231 	ps->ps_un = un;
4232 	ps->ps_ui = ui;
4233 	ps->ps_bp = pb;
4234 	ps->ps_addr = pb->b_un.b_addr;
4235 	ps->ps_firstblk = pb->b_lblkno;
4236 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4237 	ps->ps_changecnt = un->un_changecnt;
4238 
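	/*
	 * Progress counters used if the read has to be split into more
	 * than one child buf by mirror_map_read() below.
	 */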
4239 	current_count = btodb(pb->b_bcount);
4240 	current_blkno = pb->b_lblkno;
4241 	current_offset = 0;
4242 
4243 	/*
4244 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4245 	 * resync thread which may or may not be an optimised resync.
4246 	 *
4247 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4248 	 * code has not completed; either a resync has not started since snarf,
4249 	 * or there is an optimized resync in progress.
4250 	 *
4251 	 * We need to generate a write after this read in the following two
4252 	 * cases,
4253 	 *
4254 	 * 1. Any Resync-Generated read
4255 	 *
4256 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4257 	 *    pending or in progress.
4258 	 *
4259 	 * The write after read is done in these cases to ensure that all sides
4260 	 * of the mirror are in sync with the read data and that it is not
4261 	 * possible for an application to read the same block multiple times
4262 	 * and get different data.
4263 	 *
4264 	 * This would be possible if the block was in a dirty region.
4265 	 *
4266 	 * If we're performing a directed read we don't write the data out as
4267 	 * the application is responsible for restoring the mirror to a known
4268 	 * state.
4269 	 */
4270 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4271 	    !(flag & MD_STR_DMR)) {
4272 		size_t	start_rr, i, end_rr;
4273 		int	region_dirty = 1;
4274 
4275 		/*
4276 		 * We enter here under three circumstances,
4277 		 *
4278 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4279 		 * 0			1
4280 		 * 1			0
4281 		 * 1			1
4282 		 *
4283 		 * To be optimal we only care to explicitly check for dirty
4284 		 * regions in the second case since if MD_STR_WAR is set we
4285 		 * always do the write after read.
4286 		 */
4287 		if (!(flag & MD_STR_WAR)) {
4288 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4289 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4290 
4291 			for (i = start_rr; i <= end_rr; i++)
4292 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4293 					break;
4294 		}
4295 
4296 		if ((region_dirty) &&
4297 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4298 			ps->ps_call = write_after_read;
4299 			/*
4300 			 * Mark this as a RESYNC_READ in ps_flags.
4301 			 * This is used if the read fails during a
4302 			 * resync of a 3-way mirror to ensure that
4303 			 * the retried read to the remaining
4304 			 * good submirror has MD_STR_WAR set. This
4305 			 * is needed to ensure that the resync write
4306 			 * (write-after-read) takes place.
4307 			 */
4308 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4309 
4310 			/*
4311 			 * If MD_STR_FLAG_ERR is set in the flags we
4312 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4313 			 * write (issued by write_after_read) will be flagged
4314 			 * to the biowait'ing resync thread. This allows us to
4315 			 * avoid issuing further resync requests to a device
4316 			 * that has had a write failure.
4317 			 */
4318 			if (flag & MD_STR_FLAG_ERR)
4319 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4320 
4321 			setno = MD_UN2SET(un);
4322 			/*
4323 			 * Drop the readerlock to avoid
4324 			 * deadlock
4325 			 */
4326 			md_unit_readerexit(ui);
4327 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4328 			un = md_unit_readerlock(ui);
4329 			/*
4330 			 * Ensure that we are owner
4331 			 */
4332 			if (MD_MNSET_SETNO(setno)) {
4333 				/*
4334 				 * For a non-resync read that requires a
4335 				 * write-after-read to be done, set a flag
4336 				 * in the parent structure, so that the
4337 				 * write_strategy routine can omit the
4338 				 * test that the write is still within the
4339 				 * resync region
4340 				 */
4341 				if (!(flag & MD_STR_WAR))
4342 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4343 
4344 				/*
4345 				 * Before reading the buffer, see if
4346 				 * there is an owner.
4347 				 */
4348 				if (MD_MN_NO_MIRROR_OWNER(un))  {
4349 					ps->ps_call = NULL;
4350 					mirror_overlap_tree_remove(ps);
4351 					md_kstat_waitq_exit(ui);
4352 					md_unit_readerexit(ui);
4353 					daemon_request(
4354 					    &md_mirror_daemon,
4355 					    become_owner,
4356 					    (daemon_queue_t *)ps,
4357 					    REQ_OLD);
4358 					return;
4359 				}
4360 				/*
4361 				 * For a resync read, check to see if I/O is
4362 				 * outside of the current resync region, or
4363 				 * the resync has finished. If so
4364 				 * just terminate the I/O
4365 				 */
4366 				if ((flag & MD_STR_WAR) &&
4367 				    (!(un->c.un_status & MD_UN_WAR) ||
4368 				    (!IN_RESYNC_REGION(un, ps)))) {
4369 #ifdef DEBUG
4370 					if (mirror_debug_flag)
4371 						printf("Abort resync read "
4372 						    "%x: %lld\n",
4373 						    MD_SID(un),
4374 						    ps->ps_firstblk);
4375 #endif
4376 					mirror_overlap_tree_remove(ps);
4377 					kmem_cache_free(mirror_parent_cache,
4378 					    ps);
4379 					md_kstat_waitq_exit(ui);
4380 					md_unit_readerexit(ui);
4381 					md_biodone(pb);
4382 					return;
4383 				}
4384 			}
4385 		}
4386 	}
4387 
4388 	if (flag & MD_STR_DMR) {
4389 		ps->ps_call = directed_read_done;
4390 	}
4391 
4392 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4393 		ps->ps_flags |= MD_MPS_DONTFREE;
4394 
4395 	md_kstat_waitq_to_runq(ui);
4396 
4397 	ps->ps_frags++;
4398 	do {
4399 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4400 		mirror_child_init(cs);
4401 		cb = &cs->cs_buf;
4402 		cs->cs_ps = ps;
4403 
4404 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4405 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4406 
4407 		more = mirror_map_read(ps, cs, current_blkno,
4408 		    (u_longlong_t)current_count);
4409 		if (more) {
4410 			mutex_enter(&ps->ps_mx);
4411 			ps->ps_frags++;
4412 			mutex_exit(&ps->ps_mx);
4413 		}
4414 
4415 		/*
4416 		 * Do these calculations now,
4417 		 *  so that we pick up a valid b_bcount from the child buf.
4418 		 */
4419 		current_count -= more;
4420 		current_offset += cb->b_bcount;
4421 		current_blkno +=  more;
4422 		md_call_strategy(cb, flag, private);
4423 	} while (more);
4424 
4425 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4426 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4427 			md_daemon(1, &md_done_daemon);
4428 			drv_usecwait(10);
4429 		}
4430 		kmem_cache_free(mirror_parent_cache, ps);
4431 	}
4432 }
4433 
4434 void
4435 md_mirror_strategy(buf_t *bp, int flag, void *private)
4436 {
4437 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4438 
4439 	/*
4440 	 * When doing IO to a multi-owner metadevice, check if set is halted.
4441 	 * We do this check without the needed lock held, for performance
4442 	 * reasons.
4443 	 * If an IO just slips through while the set is locked via an
4444 	 * MD_MN_SUSPEND_SET, we don't care about it.
4445 	 * Only check for suspension if we are a top-level i/o request
4446 	 * (MD_STR_NOTTOP is cleared in 'flag').
4447 	 */
4448 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4449 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4450 		if ((flag & MD_STR_NOTTOP) == 0) {
4451 			mutex_enter(&md_mx);
4452 			/* Here we loop until the set is no longer halted */
4453 			while (md_set[setno].s_status & MD_SET_HALTED) {
4454 				cv_wait(&md_cv, &md_mx);
4455 			}
4456 			mutex_exit(&md_mx);
4457 		}
4458 	}
4459 
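	/* Account for this i/o unless the caller has already counted it */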
4460 	if ((flag & MD_IO_COUNTED) == 0) {
4461 		if ((flag & MD_NOBLOCK) == 0) {
4462 			if (md_inc_iocount(setno) != 0) {
4463 				bp->b_flags |= B_ERROR;
4464 				bp->b_error = ENXIO;
4465 				bp->b_resid = bp->b_bcount;
4466 				biodone(bp);
4467 				return;
4468 			}
4469 		} else {
4470 			md_inc_iocount_noblock(setno);
4471 		}
4472 	}
4473 
4474 	if (bp->b_flags & B_READ)
4475 		mirror_read_strategy(bp, flag, private);
4476 	else
4477 		mirror_write_strategy(bp, flag, private);
4478 }
4479 
4480 /*
4481  * mirror_directed_read:
4482  * --------------------
4483  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4484  * so that the application can determine what (if any) resync needs to be
4485  * performed. The data is copied out to the user-supplied buffer.
4486  *
4487  * Parameters:
4488  *	mdev	- dev_t for the mirror device
4489  *	vdr	- directed read parameters specifying location and submirror
4490  *		  to perform the read from
4491  *	mode	- used to ddi_copyout() any resulting data from the read
4492  *
4493  * Returns:
4494  *	0	success
4495  *	!0	error code
4496  *		EINVAL - invalid request format
4497  */
4498 int
4499 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4500 {
4501 	buf_t		*bp;
4502 	minor_t		mnum = getminor(mdev);
4503 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4504 	mm_unit_t	*un;
4505 	mm_submirror_t	*sm;
4506 	char		*sm_nm;
4507 	uint_t		next_side;
4508 	void		*kbuffer;
4509 
4510 	if (ui == NULL)
4511 		return (ENXIO);
4512 
4513 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4514 		return (EINVAL);
4515 	}
4516 
4517 	/* Check for aligned block access. We disallow non-aligned requests. */
4518 	if (vdr->vdr_offset % DEV_BSIZE) {
4519 		return (EINVAL);
4520 	}
4521 
4522 	/*
4523 	 * Allocate kernel buffer for target of read(). If we had a reliable
4524 	 * (sorry functional) DDI this wouldn't be needed.
4525 	 */
4526 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4527 	if (kbuffer == NULL) {
4528 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4529 		    " bytes\n", vdr->vdr_nbytes);
4530 		return (ENOMEM);
4531 	}
4532 
4533 	bp = getrbuf(KM_SLEEP);
4534 
4535 	bp->b_un.b_addr = kbuffer;
4536 	bp->b_flags = B_READ;
4537 	bp->b_bcount = vdr->vdr_nbytes;
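	/* vdr_offset is a byte offset; convert it to a block number */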
4538 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4539 	bp->b_edev = mdev;
4540 
4541 	un = md_unit_readerlock(ui);
4542 
4543 	/*
4544 	 * If DKV_SIDE_INIT is set we need to determine the first available
4545 	 * side to start reading from. If it isn't set we increment to the
4546 	 * next readable submirror.
4547 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4548 	 * Note: we check for a readable submirror on completion of the i/o so
4549 	 * we should _always_ have one available. If this becomes unavailable
4550 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4551 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4552 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4553 	 * The chance of this is small, but not non-existent.
4554 	 */
4555 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4556 		next_side = 0;
4557 	} else {
4558 		next_side = vdr->vdr_side + 1;
4559 	}
4560 	while ((next_side < NMIRROR) &&
4561 	    !SUBMIRROR_IS_READABLE(un, next_side))
4562 		next_side++;
4563 	if (next_side >= NMIRROR) {
4564 		vdr->vdr_flags |= DKV_DMR_ERROR;
4565 		freerbuf(bp);
		kmem_free(kbuffer, vdr->vdr_nbytes);
4566 		vdr->vdr_bytesread = 0;
4567 		md_unit_readerexit(ui);
4568 		return (0);
4569 	}
4570 
4571 	/* Set the side to read from */
4572 	un->un_dmr_last_read = next_side;
4573 
4574 	md_unit_readerexit(ui);
4575 
4576 	/*
4577 	 * Save timestamp for verification purposes. Can be read by debugger
4578 	 * to verify that this ioctl has been executed and to find the number
4579 	 * of DMR reads and the time of the last DMR read.
4580 	 */
4581 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4582 	mirror_dmr_stats.dmr_count++;
4583 
4584 	/* Issue READ request and wait for completion */
4585 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4586 
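	/* Wait here until the DMR read completes and un_dmr_cv is signalled */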
4587 	mutex_enter(&un->un_dmr_mx);
4588 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4589 	mutex_exit(&un->un_dmr_mx);
4590 
4591 	/*
4592 	 * Check to see if we encountered an error during the read. If so we
4593 	 * can make no guarantee about any possibly returned data.
4594 	 */
4595 	if ((bp->b_flags & B_ERROR) == 0) {
4596 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4597 		if (bp->b_resid) {
4598 			vdr->vdr_flags |= DKV_DMR_SHORT;
4599 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4600 		} else {
4601 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4602 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4603 		}
4604 		/* Copy the data read back out to the user supplied buffer */
4605 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4606 		    mode)) {
4607 			kmem_free(kbuffer, vdr->vdr_nbytes);
			freerbuf(bp);
4608 			return (EFAULT);
4609 		}
4610 
4611 	} else {
4612 		/* Error out with DKV_DMR_ERROR */
4613 		vdr->vdr_flags |= DKV_DMR_ERROR;
4614 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4615 	}
4616 	/*
4617 	 * Update the DMR parameters with the side and name of submirror that
4618 	 * we have just read from (un->un_dmr_last_read)
4619 	 */
4620 	un = md_unit_readerlock(ui);
4621 
4622 	vdr->vdr_side = un->un_dmr_last_read;
4623 	sm = &un->un_sm[un->un_dmr_last_read];
4624 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4625 
4626 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4627 
4628 	/*
4629 	 * Determine if we've completed the read cycle. This is true iff the
4630 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4631 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4632 	 * can occur if a submirror is metadetached).
4633 	 */
4634 	next_side = un->un_dmr_last_read + 1;
4635 	while ((next_side < NMIRROR) &&
4636 	    !SUBMIRROR_IS_READABLE(un, next_side))
4637 		next_side++;
4638 	if (next_side >= NMIRROR) {
4639 		/* We've finished */
4640 		vdr->vdr_flags |= DKV_DMR_DONE;
4641 	}
4642 
4643 	md_unit_readerexit(ui);
4644 	freerbuf(bp);
4645 	kmem_free(kbuffer, vdr->vdr_nbytes);
4646 
4647 	return (0);
4648 }
4649 
4650 /*
4651  * mirror_resync_message:
4652  * ---------------------
4653  * Handle the multi-node resync messages that keep all nodes within a given
4654  * disk-set in sync with their view of a mirror's resync status.
4655  *
4656  * The message types dealt with are:
4657  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4658  * MD_MN_MSG_RESYNC_NEXT	- specifies the next region to be resynced
4659  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4660  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4661  *
4662  * Returns:
4663  *	0	Success
4664  *	>0	Failure error number
4665  */
4666 int
4667 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4668 {
4669 	mdi_unit_t		*ui;
4670 	mm_unit_t		*un;
4671 	set_t			setno;
4672 	int			is_ABR;
4673 	int			smi;
4674 	int			ci;
4675 	sm_state_t		state;
4676 	int			broke_out;
4677 	mm_submirror_t		*sm;
4678 	mm_submirror_ic_t	*smic;
4679 	md_m_shared_t		*shared;
4680 	md_error_t		mde = mdnullerror;
4681 	md_mps_t		*ps;
4682 	int			rs_active;
4683 	int			rr, rr_start, rr_end;
4684 
4685 	/* Check that the given device is part of a multi-node set */
4686 	setno = MD_MIN2SET(p->mnum);
4687 	if (setno >= md_nsets) {
4688 		return (ENXIO);
4689 	}
4690 	if (!MD_MNSET_SETNO(setno)) {
4691 		return (EINVAL);
4692 	}
4693 
4694 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4695 		return (EINVAL);
4696 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4697 		return (EINVAL);
4698 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4699 
4700 	/* Obtain the current resync status */
4701 	(void) md_ioctl_readerlock(lockp, ui);
4702 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4703 	md_ioctl_readerexit(lockp);
4704 
4705 	switch ((md_mn_msgtype_t)p->msg_type) {
4706 	case MD_MN_MSG_RESYNC_STARTING:
4707 		/* Start the resync thread for the mirror */
4708 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4709 		break;
4710 
4711 	case MD_MN_MSG_RESYNC_NEXT:
4712 		/*
4713 		 * We have to release any previously marked overlap regions
4714 		 * so that i/o can resume. Then we need to block the region
4715 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4716 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4717 		 */
4718 		(void) md_ioctl_readerlock(lockp, ui);
4719 		/*
4720 		 * Ignore the message if there is no active resync thread or
4721 		 * if it is for a resync type that we have already completed.
4722 		 * un_resync_completed is set to the last resync completed
4723 		 * when processing a PHASE_DONE message.
4724 		 */
4725 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4726 			break;
4727 		/*
4728 		 * If this message is for the same resync and is for an earlier
4729 		 * resync region, just ignore it. This can only occur if this
4730 		 * node has progressed on to the next resync region before
4731 		 * we receive this message. This can occur if the class for
4732 		 * this message is busy and the originator has to retry thus
4733 		 * allowing this node to move onto the next resync_region.
4734 		 */
4735 		if ((p->rs_type == un->un_rs_type) &&
4736 		    (p->rs_start < un->un_resync_startbl))
4737 			break;
4738 		ps = un->un_rs_prev_overlap;
4739 
4740 		/* Allocate previous overlap reference if needed */
4741 		if (ps == NULL) {
4742 			ps = kmem_cache_alloc(mirror_parent_cache,
4743 			    MD_ALLOCFLAGS);
4744 			ps->ps_un = un;
4745 			ps->ps_ui = ui;
4746 			ps->ps_firstblk = 0;
4747 			ps->ps_lastblk = 0;
4748 			ps->ps_flags = 0;
4749 			md_ioctl_readerexit(lockp);
4750 			(void) md_ioctl_writerlock(lockp, ui);
4751 			un->un_rs_prev_overlap = ps;
4752 			md_ioctl_writerexit(lockp);
4753 		} else
4754 			md_ioctl_readerexit(lockp);
4755 
4756 		if (p->rs_originator != md_mn_mynode_id) {
4757 			/*
4758 			 * Clear our un_resync_bm for the regions completed.
4759 			 * The owner (originator) will take care of itself.
4760 			 */
4761 			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4762 			BLK_TO_RR(rr_start, p->rs_start, un);
4763 			if (ps->ps_lastblk && rr_end < rr_start) {
4764 				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4765 				mutex_enter(&un->un_resync_mx);
4766 				/*
4767 				 * Update our resync bitmap to reflect that
4768 				 * another node has synchronized this range.
4769 				 */
4770 				for (rr = rr_start; rr <= rr_end; rr++) {
4771 					CLR_KEEPDIRTY(rr, un);
4772 				}
4773 				mutex_exit(&un->un_resync_mx);
4774 			}
4775 
4776 			/*
4777 			 * On all but the originating node, first update
4778 			 * the resync state, then unblock the previous
4779 			 * region and block the next one. No need
4780 			 * to do this if the region is already blocked.
4781 			 * Update the submirror state and flags from the
4782 			 * originator. This keeps the cluster in sync with
4783 			 * regards to the resync status.
4784 			 */
4785 
4786 			(void) md_ioctl_writerlock(lockp, ui);
4787 			un->un_rs_resync_done = p->rs_done;
4788 			un->un_rs_resync_2_do = p->rs_2_do;
4789 			un->un_rs_type = p->rs_type;
4790 			un->un_resync_startbl = p->rs_start;
4791 			md_ioctl_writerexit(lockp);
4792 			/*
4793 			 * Use un_owner_mx to ensure that an ownership change
4794 			 * cannot happen at the same time as this message
4795 			 */
4796 			mutex_enter(&un->un_owner_mx);
4797 			if (MD_MN_MIRROR_OWNER(un)) {
4798 				ps->ps_firstblk = p->rs_start;
4799 				ps->ps_lastblk = ps->ps_firstblk +
4800 				    p->rs_size - 1;
4801 			} else {
4802 				if ((ps->ps_firstblk != p->rs_start) ||
4803 				    (ps->ps_lastblk != p->rs_start +
4804 				    p->rs_size - 1)) {
4805 					/* Remove previous overlap range */
4806 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4807 						mirror_overlap_tree_remove(ps);
4808 
4809 					ps->ps_firstblk = p->rs_start;
4810 					ps->ps_lastblk = ps->ps_firstblk +
4811 					    p->rs_size - 1;
4812 
4813 					mutex_exit(&un->un_owner_mx);
4814 					/* Block this range from all i/o. */
4815 					if (ps->ps_firstblk != 0 ||
4816 					    ps->ps_lastblk != 0)
4817 						wait_for_overlaps(ps,
4818 						    MD_OVERLAP_ALLOW_REPEAT);
4819 					mutex_enter(&un->un_owner_mx);
4820 					/*
4821 					 * Check to see if we have obtained
4822 					 * ownership while waiting for
4823 					 * overlaps. If we have, remove
4824 					 * the resync_region entry from the
4825 					 * overlap tree
4826 					 */
4827 					if (MD_MN_MIRROR_OWNER(un) &&
4828 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4829 						mirror_overlap_tree_remove(ps);
4830 				}
4831 			}
4832 			mutex_exit(&un->un_owner_mx);
4833 
4834 			/*
4835 			 * If this is the first RESYNC_NEXT message (i.e.
4836 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4837 			 * issue RESYNC_START NOTIFY event
4838 			 */
4839 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4840 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4841 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4842 				    MD_SID(un));
4843 			}
4844 
4845 			/* Ensure that our local resync thread is running */
4846 			if (un->un_rs_thread == NULL) {
4847 				(void) mirror_resync_unit(p->mnum, NULL,
4848 				    &p->mde, lockp);
4849 			}
4850 		}
4851 
4852 		break;
4853 	case MD_MN_MSG_RESYNC_FINISH:
4854 		/*
4855 		 * Complete the resync by stopping the resync thread.
4856 		 * Also release the previous overlap region field.
4857 		 * Update the resync_progress_thread by cv_signal'ing it so
4858 		 * that we mark the end of the resync as soon as possible. This
4859 		 * avoids an unnecessary delay should the system panic after
4860 		 * resync completion.
4861 		 */
4862 #ifdef DEBUG
4863 		if (!rs_active) {
4864 			if (mirror_debug_flag)
4865 				printf("RESYNC_FINISH (mnum = %x), "
4866 				    "Resync *NOT* active",
4867 				    p->mnum);
4868 		}
4869 #endif
4870 
4871 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4872 		    (p->rs_originator != md_mn_mynode_id)) {
4873 			mutex_enter(&un->un_rs_thread_mx);
4874 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4875 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4876 			un->un_rs_thread_flags &=
4877 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4878 			cv_signal(&un->un_rs_thread_cv);
4879 			mutex_exit(&un->un_rs_thread_mx);
4880 		}
4881 		if (is_ABR) {
4882 			/* Resync finished; if ABR, clear the mirror owner */
4883 			mutex_enter(&un->un_owner_mx);
4884 			un->un_mirror_owner = 0;
4885 			mutex_exit(&un->un_owner_mx);
4886 		}
4887 		(void) md_ioctl_writerlock(lockp, ui);
4888 		ps = un->un_rs_prev_overlap;
4889 		if (ps != NULL) {
4890 			/* Remove previous overlap range */
4891 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4892 				mirror_overlap_tree_remove(ps);
4893 			/*
4894 			 * Release the overlap range reference
4895 			 */
4896 			un->un_rs_prev_overlap = NULL;
4897 			kmem_cache_free(mirror_parent_cache,
4898 			    ps);
4899 		}
4900 		md_ioctl_writerexit(lockp);
4901 
4902 		/* Mark the resync as complete in the metadb */
4903 		un->un_rs_resync_done = p->rs_done;
4904 		un->un_rs_resync_2_do = p->rs_2_do;
4905 		un->un_rs_type = p->rs_type;
4906 		mutex_enter(&un->un_rs_progress_mx);
4907 		cv_signal(&un->un_rs_progress_cv);
4908 		mutex_exit(&un->un_rs_progress_mx);
4909 
4910 		un = md_ioctl_writerlock(lockp, ui);
4911 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4912 		/* Deal with any pending grow_unit */
4913 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4914 			if ((mirror_grow_unit(un, &mde) != 0) ||
4915 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4916 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4917 			}
4918 		}
4919 		md_ioctl_writerexit(lockp);
4920 		break;
4921 
4922 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4923 		/*
4924 		 * A phase of the resync (optimized, component or
4925 		 * submirror) is complete. Update the mirror status.
4926 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4927 		 * mirror owner is performing a resync. If we have just snarfed
4928 		 * this set, then we must clear any of the flags set at snarf
4929 		 * time by unit_setup_resync().
4930 		 * Note that unit_setup_resync() sets up these flags to
4931 		 * indicate that an optimized resync is required. These flags
4932 		 * need to be reset because if we get here, the mirror owner
4933 		 * will have handled the optimized resync.
4934 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4935 		 * MD_UN_WAR. In addition, for each submirror,
4936 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4937 		 * set to SMS_OFFLINE.
4938 		 */
4939 #ifdef DEBUG
4940 		if (mirror_debug_flag)
4941 			printf("phase done mess received from %d, mnum=%x,"
4942 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4943 			    p->rs_type, p->rs_flags);
4944 #endif
4945 		/*
4946 		 * Ignore the message if there is no active resync thread.
4947 		 */
4948 		if (!rs_active)
4949 			break;
4950 
4951 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4952 		switch (RS_TYPE(p->rs_type)) {
4953 		case MD_RS_OPTIMIZED:
4954 			un = md_ioctl_writerlock(lockp, ui);
4955 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4956 				/* If we are originator, just clear rs_type */
4957 				if (p->rs_originator == md_mn_mynode_id) {
4958 					SET_RS_TYPE_NONE(un->un_rs_type);
4959 					md_ioctl_writerexit(lockp);
4960 					break;
4961 				}
4962 				/*
4963 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4964 				 * flags if OPT_NOT_DONE is set *and* rs_type
4965 				 * is MD_RS_NONE.
4966 				 */
4967 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4968 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4969 					/* No resync in progress */
4970 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4971 					un->c.un_status &= ~MD_UN_WAR;
4972 				} else {
4973 					/*
4974 					 * We are in the middle of an
4975 					 * optimized resync and this message
4976 					 * should be ignored.
4977 					 */
4978 					md_ioctl_writerexit(lockp);
4979 					break;
4980 				}
4981 			} else {
4982 				/*
4983 				 * This is the end of an optimized resync,
4984 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4985 				 */
4986 
4987 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4988 				if (!broke_out)
4989 					un->c.un_status &= ~MD_UN_WAR;
4990 
4991 				/*
4992 				 * Clear our un_resync_bm for the regions
4993 				 * completed.  The owner (originator) will
4994 				 * take care of itself.
4995 				 */
4996 				if (p->rs_originator != md_mn_mynode_id &&
4997 				    (ps = un->un_rs_prev_overlap) != NULL) {
4998 					BLK_TO_RR(rr_start, ps->ps_firstblk,
4999 					    un);
5000 					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
5001 					mutex_enter(&un->un_resync_mx);
5002 					for (rr = rr_start; rr <= rr_end;
5003 					    rr++) {
5004 						CLR_KEEPDIRTY(rr, un);
5005 					}
5006 					mutex_exit(&un->un_resync_mx);
5007 				}
5008 			}
5009 
5010 			/*
5011 			 * Set resync_completed to last resync type and then
5012 			 * clear resync_type to indicate no resync in progress
5013 			 */
5014 			un->un_resync_completed = un->un_rs_type;
5015 			SET_RS_TYPE_NONE(un->un_rs_type);
5016 
5017 			/*
5018 			 * If resync is as a result of a submirror ONLINE,
5019 			 * reset the submirror state to SMS_RUNNING if the
5020 			 * resync was ok else set back to SMS_OFFLINE.
5021 			 */
5022 			for (smi = 0; smi < NMIRROR; smi++) {
5023 				un->un_sm[smi].sm_flags &=
5024 				    ~MD_SM_RESYNC_TARGET;
5025 				if (SMS_BY_INDEX_IS(un, smi,
5026 				    SMS_OFFLINE_RESYNC)) {
5027 					if (p->rs_flags &
5028 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
5029 						state = SMS_OFFLINE;
5030 					} else {
5031 						state = (broke_out ?
5032 						    SMS_OFFLINE : SMS_RUNNING);
5033 					}
5034 					mirror_set_sm_state(
5035 					    &un->un_sm[smi],
5036 					    &un->un_smic[smi], state,
5037 					    broke_out);
5038 					mirror_commit(un, NO_SUBMIRRORS,
5039 					    0);
5040 				}
5041 				/*
5042 				 * If we still have an offline submirror, reset
5043 				 * the OFFLINE_SM flag in the mirror status
5044 				 */
5045 				if (SMS_BY_INDEX_IS(un, smi,
5046 				    SMS_OFFLINE))
5047 					un->c.un_status |=
5048 					    MD_UN_OFFLINE_SM;
5049 			}
5050 			md_ioctl_writerexit(lockp);
5051 			break;
5052 		case MD_RS_SUBMIRROR:
5053 			un = md_ioctl_writerlock(lockp, ui);
5054 			smi = RS_SMI(p->rs_type);
5055 			sm = &un->un_sm[smi];
5056 			smic = &un->un_smic[smi];
5057 			/* Clear RESYNC target */
5058 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5059 			/*
5060 			 * Set resync_completed to last resync type and then
5061 			 * clear resync_type to indicate no resync in progress
5062 			 */
5063 			un->un_resync_completed = un->un_rs_type;
5064 			SET_RS_TYPE_NONE(un->un_rs_type);
5065 			/*
5066 			 * If the resync completed ok reset the submirror
5067 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
5068 			 */
5069 			state = (broke_out ?
5070 			    SMS_ATTACHED : SMS_RUNNING);
5071 			mirror_set_sm_state(sm, smic, state, broke_out);
5072 			un->c.un_status &= ~MD_UN_WAR;
5073 			mirror_commit(un, SMI2BIT(smi), 0);
5074 			md_ioctl_writerexit(lockp);
5075 			break;
5076 		case MD_RS_COMPONENT:
5077 			un = md_ioctl_writerlock(lockp, ui);
5078 			smi = RS_SMI(p->rs_type);
5079 			ci = RS_CI(p->rs_type);
5080 			sm = &un->un_sm[smi];
5081 			smic = &un->un_smic[smi];
5082 			shared = (md_m_shared_t *)
5083 			    (*(smic->sm_shared_by_indx))
5084 			    (sm->sm_dev, sm, ci);
5085 			un->c.un_status &= ~MD_UN_WAR;
5086 			/* Clear RESYNC target */
5087 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5088 			/*
5089 			 * Set resync_completed to last resync type and then
5090 			 * clear resync_type to indicate no resync in progress
5091 			 */
5092 			un->un_resync_completed = un->un_rs_type;
5093 			SET_RS_TYPE_NONE(un->un_rs_type);
5094 
5095 			/*
5096 			 * If the resync completed ok, set the component state
5097 			 * to CS_OKAY.
5098 			 */
5099 			if (broke_out)
5100 				shared->ms_flags |= MDM_S_RS_TRIED;
5101 			else {
5102 				/*
5103 				 * As we don't transmit the changes,
5104 				 * no need to drop the lock.
5105 				 */
5106 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
5107 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
5108 			}
5109 			md_ioctl_writerexit(lockp);
			break;
5110 		default:
5111 			break;
5112 		}
5113 		/*
5114 		 * If the purpose of this PHASE_DONE message is just to
5115 		 * indicate to all other nodes that the optimized resync
5116 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
5117 		 * no need to generate a notify event as there has not
5118 		 * actually been a resync.
5119 		 */
5120 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
5121 			if (broke_out) {
5122 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
5123 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5124 				    MD_SID(un));
5125 			} else {
5126 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5127 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
5128 				    MD_SID(un));
5129 			}
5130 		}
5131 		break;
5132 
5133 	default:
5134 #ifdef DEBUG
5135 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5136 		    " %x\n", p->msg_type);
5137 #endif
5138 		return (EINVAL);
5139 	}
5140 	return (0);
5141 }
5142 
5143 /* Return a -1 if snarf of optimized record failed and set should be released */
5144 static int
5145 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5146 {
5147 	mddb_recid_t	recid;
5148 	int		gotsomething;
5149 	int		all_mirrors_gotten;
5150 	mm_unit_t	*un;
5151 	mddb_type_t	typ1;
5152 	mddb_de_ic_t    *dep;
5153 	mddb_rb32_t	*rbp;
5154 	size_t		newreqsize;
5155 	mm_unit_t	*big_un;
5156 	mm_unit32_od_t	*small_un;
5157 	int		retval;
5158 	mdi_unit_t	*ui;
5159 
5160 	if (cmd == MD_SNARF_CLEANUP) {
5161 		if (md_get_setstatus(setno) & MD_SET_STALE)
5162 			return (0);
5163 
5164 		recid = mddb_makerecid(setno, 0);
5165 		typ1 = (mddb_type_t)md_getshared_key(setno,
5166 		    mirror_md_ops.md_driver.md_drivername);
5167 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5168 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5169 				un = (mm_unit_t *)mddb_getrecaddr(recid);
5170 				mirror_cleanup(un);
5171 				recid = mddb_makerecid(setno, 0);
5172 			}
5173 		}
5174 		return (0);
5175 	}
5176 
5177 	all_mirrors_gotten = 1;
5178 	gotsomething = 0;
5179 
5180 	recid = mddb_makerecid(setno, 0);
5181 	typ1 = (mddb_type_t)md_getshared_key(setno,
5182 	    mirror_md_ops.md_driver.md_drivername);
5183 
5184 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5185 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5186 			continue;
5187 
5188 		dep = mddb_getrecdep(recid);
5189 		dep->de_flags = MDDB_F_MIRROR;
5190 		rbp = dep->de_rb;
5191 
5192 		switch (rbp->rb_revision) {
5193 		case MDDB_REV_RB:
5194 		case MDDB_REV_RBFN:
5195 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5196 				/*
5197 				 * This means we have an old and small
5198 				 * record and this record hasn't already
5199 				 * been converted.  Before we create an
5200 				 * incore metadevice from this we have to
5201 				 * convert it to a big record.
5202 				 */
5203 				small_un =
5204 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
5205 				newreqsize = sizeof (mm_unit_t);
5206 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5207 				    KM_SLEEP);
5208 				mirror_convert((caddr_t)small_un,
5209 				    (caddr_t)big_un, SMALL_2_BIG);
5210 				kmem_free(small_un, dep->de_reqsize);
5211 
5212 				/*
5213 				 * Update userdata and incore userdata;
5214 				 * incores are at the end of un.
5215 				 */
5216 				dep->de_rb_userdata_ic = big_un;
5217 				dep->de_rb_userdata = big_un;
5218 				dep->de_icreqsize = newreqsize;
5219 				un = big_un;
5220 				rbp->rb_private |= MD_PRV_CONVD;
5221 			} else {
5222 				/*
5223 				 * Unit already converted, just get the
5224 				 * record address.
5225 				 */
5226 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5227 				    sizeof (*un), 0);
5228 			}
5229 			un->c.un_revision &= ~MD_64BIT_META_DEV;
5230 			break;
5231 		case MDDB_REV_RB64:
5232 		case MDDB_REV_RB64FN:
5233 			/* Big device */
5234 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5235 			    sizeof (*un), 0);
5236 			un->c.un_revision |= MD_64BIT_META_DEV;
5237 			un->c.un_flag |= MD_EFILABEL;
5238 			break;
5239 		}
5240 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5241 
5242 		/*
5243 		 * Create minor device node for snarfed entry.
5244 		 */
5245 		(void) md_create_minor_node(setno, MD_SID(un));
5246 
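		/* An incore unit already exists; mark this record pending delete */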
5247 		if (MD_UNIT(MD_SID(un)) != NULL) {
5248 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5249 			continue;
5250 		}
5251 		all_mirrors_gotten = 0;
5252 		retval = mirror_build_incore(un, 1);
5253 		if (retval == 0) {
5254 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5255 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5256 			resync_start_timeout(setno);
5257 			gotsomething = 1;
5258 		} else {
5259 			return (retval);
5260 		}
5261 		/*
5262 		 * Set flag to indicate that the mirror has not yet
5263 		 * been through a reconfig. This flag is used for MN sets
5264 		 * when determining whether to update the mirror state from
5265 		 * the Master node.
5266 		 */
5267 		if (MD_MNSET_SETNO(setno)) {
5268 			ui = MDI_UNIT(MD_SID(un));
5269 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5270 		}
5271 	}
5272 
5273 	if (!all_mirrors_gotten)
5274 		return (gotsomething);
5275 
5276 	recid = mddb_makerecid(setno, 0);
5277 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5278 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5279 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5280 
5281 	return (0);
5282 }
5283 
5284 static int
5285 mirror_halt(md_haltcmd_t cmd, set_t setno)
5286 {
5287 	unit_t		i;
5288 	mdi_unit_t	*ui;
5289 	minor_t		mnum;
5290 	int		reset_mirror_flag = 0;
5291 
5292 	if (cmd == MD_HALT_CLOSE)
5293 		return (0);
5294 
5295 	if (cmd == MD_HALT_OPEN)
5296 		return (0);
5297 
5298 	if (cmd == MD_HALT_UNLOAD)
5299 		return (0);
5300 
5301 	if (cmd == MD_HALT_CHECK) {
5302 		for (i = 0; i < md_nunits; i++) {
5303 			mnum = MD_MKMIN(setno, i);
5304 			if ((ui = MDI_UNIT(mnum)) == NULL)
5305 				continue;
5306 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5307 				continue;
5308 			if (md_unit_isopen(ui))
5309 				return (1);
5310 		}
5311 		return (0);
5312 	}
5313 
5314 	if (cmd != MD_HALT_DOIT)
5315 		return (1);
5316 
5317 	for (i = 0; i < md_nunits; i++) {
5318 		mnum = MD_MKMIN(setno, i);
5319 		if ((ui = MDI_UNIT(mnum)) == NULL)
5320 			continue;
5321 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5322 			continue;
5323 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5324 
5325 		/* Set a flag if there is at least one mirror metadevice. */
5326 		reset_mirror_flag = 1;
5327 	}
5328 
5329 	/*
5330 	 * Only wait for the global dr_timeout to finish
5331 	 *  - if there are mirror metadevices in this diskset or
5332 	 *  - if this is the local set since an unload of the md_mirror
5333 	 *    driver could follow a successful mirror halt in the local set.
5334 	 */
5335 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5336 		while ((mirror_md_ops.md_head == NULL) &&
5337 		    (mirror_timeout.dr_timeout_id != 0))
5338 			delay(md_hz);
5339 	}
5340 
5341 	return (0);
5342 }
5343 
5344 /*ARGSUSED3*/
5345 static int
5346 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5347 {
5348 	IOLOCK	lock;
5349 	minor_t		mnum = getminor(*dev);
5350 	set_t		setno;
5351 
5352 	/*
5353 	 * When doing an open of a multi-owner metadevice, check to see if this
5354 	 * node is a starting node and if a reconfig cycle is underway.
5355 	 * If so, the system isn't sufficiently set up to handle the
5356 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5357 	 */
5358 	setno = MD_MIN2SET(mnum);
5359 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5360 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5361 			return (ENXIO);
5362 	}
5363 
5364 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5365 		/*
5366 		 * This indicates that the caller is an ioctl service routine.
5367 		 * In this case we initialise our stack-based IOLOCK and pass
5368 		 * this into the internal open routine. This allows multi-owner
5369 		 * metadevices to avoid deadlocking if an error is encountered
5370 		 * during the open() attempt. The failure case is:
5371 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5372 		 * this configuration would deadlock as the mirror code has to
5373 		 * send a state-update to the other nodes when it detects the
5374 		 * failure of the underlying submirror with an errored soft-part
5375 		 * on it. As there is a class1 message in progress (metaclear)
5376 		 * set_sm_comp_state() cannot send another class1 message;
5377 		 * instead we do not send a state_update message as the
5378 		 * metaclear is distributed and the failed submirror will be
5379 		 * cleared from the configuration by the metaclear.
5380 		 */
5381 		IOLOCK_INIT(&lock);
5382 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5383 		    md_oflags, &lock));
5384 	} else {
5385 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5386 		    md_oflags, (IOLOCK *)NULL));
5387 	}
5388 }
5389 
5390 
5391 /*ARGSUSED1*/
5392 static int
5393 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5394 {
5395 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5396 	    (IOLOCK *)NULL));
5397 }
5398 
5399 
5400 /*
5401  * This routine dumps memory to the disk.  It assumes that the memory has
5402  * already been mapped into mainbus space.  It is called at disk interrupt
5403  * priority when the system is in trouble.
5404  *
5405  */
5406 static int
5407 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5408 {
5409 	mm_unit_t	*un;
5410 	dev_t		mapdev;
5411 	int		result;
5412 	int		smi;
5413 	int		any_succeed = 0;
5414 	int		save_result = 0;
5415 
5416 	/*
5417 	 * There is no need to grab the unit lock,
5418 	 * because nothing else is supposed to be happening.
5419 	 * Also, dump is not supposed to sleep.
5420 	 */
5421 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5422 
5423 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5424 		return (EINVAL);
5425 
5426 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5427 		return (EINVAL);
5428 
5429 	for (smi = 0; smi < NMIRROR; smi++) {
5430 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5431 			continue;
5432 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5433 		result = bdev_dump(mapdev, addr, blkno, nblk);
5434 		if (result)
5435 			save_result = result;
5436 
5437 		if (result == 0)
5438 			any_succeed++;
5439 	}
5440 
5441 	if (any_succeed)
5442 		return (0);
5443 
5444 	return (save_result);
5445 }
5446 
5447 /*
5448  * NAME: mirror_probe_dev
5449  *
5450  * DESCRIPTION: force-opens every component of a mirror.
5451  *
5452  * On entry the unit writerlock is held
5453  */
5454 static int
5455 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5456 {
5457 	int		i;
5458 	int		smi;
5459 	int		ci;
5460 	mm_unit_t	*un;
5461 	int		md_devopen = 0;
5462 	set_t		setno;
5463 	int		sm_cnt;
5464 	int		sm_unavail_cnt;
5465 
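	/*
	 * Remember whether the metadevice is currently open; probe failures
	 * only mark components erred when the device is in use (see below).
	 */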
5466 	if (md_unit_isopen(ui))
5467 		md_devopen++;
5468 
5469 	un = MD_UNIT(mnum);
5470 	setno = MD_UN2SET(un);
5471 
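	/*
	 * Force-open each in-use submirror, counting how many submirrors
	 * exist and how many of them are unavailable.
	 */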
5472 	sm_cnt = 0;
5473 	sm_unavail_cnt = 0;
5474 	for (i = 0; i < NMIRROR; i++) {
5475 		md_dev64_t tmpdev;
5476 		mdi_unit_t	*sm_ui;
5477 
5478 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5479 			continue;
5480 		}
5481 
5482 		sm_cnt++;
5483 		tmpdev = un->un_sm[i].sm_dev;
5484 		(void) md_layered_open(mnum, &tmpdev,
5485 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5486 		un->un_sm[i].sm_dev = tmpdev;
5487 
5488 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5489 
5490 		/*
5491 		 * Logic similar to that in mirror_open_all_devs.  We set or
5492 		 * clear the submirror Unavailable bit.
5493 		 */
5494 		(void) md_unit_writerlock(sm_ui);
5495 		if (submirror_unavailable(un, i, 1)) {
5496 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5497 			sm_unavail_cnt++;
5498 		} else {
5499 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5500 		}
5501 		md_unit_writerexit(sm_ui);
5502 	}
5503 
5504 	/*
5505 	 * If all of the submirrors are unavailable, the mirror is also
5506 	 * unavailable.
5507 	 */
5508 	if (sm_cnt == sm_unavail_cnt) {
5509 		ui->ui_tstate |= MD_INACCESSIBLE;
5510 	} else {
5511 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5512 	}
5513 
5514 	/*
5515 	 * Now check for probe failures. If any occurred, we set the
5516 	 * appropriate erred state only if the metadevice is in use. This
5517 	 * specifically prevents unnecessary resyncs: for instance, if the
5518 	 * disks were accidentally disconnected when the system booted up,
5519 	 * then until the metadevice is accessed (e.g. by a file system
5520 	 * mount) the user can shut down, recable and reboot without
5521 	 * incurring a potentially huge resync.
5522 	 */
5523 
5524 	smi = 0;
5525 	ci = 0;
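	/* Walk the components that mirror_geterror() reports as erred. */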
5526 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5527 
5528 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5529 			/*
5530 			 * Note that for a MN set, there is no need to call
5531 			 * SE_NOTIFY as that is done when processing the
5532 			 * state change
5533 			 */
5534 			if (md_devopen) {
5535 				/*
5536 				 * Never called from ioctl context,
5537 				 * so (IOLOCK *)NULL
5538 				 */
5539 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5540 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5541 				if (!MD_MNSET_SETNO(setno)) {
5542 					SE_NOTIFY(EC_SVM_STATE,
5543 					    ESC_SVM_LASTERRED,
5544 					    SVM_TAG_METADEVICE, setno,
5545 					    MD_SID(un));
5546 				}
5547 				continue;
5548 			} else {
5549 				(void) mirror_close_all_devs(un,
5550 				    MD_OFLG_PROBEDEV);
5551 				if (!MD_MNSET_SETNO(setno)) {
5552 					SE_NOTIFY(EC_SVM_STATE,
5553 					    ESC_SVM_OPEN_FAIL,
5554 					    SVM_TAG_METADEVICE, setno,
5555 					    MD_SID(un));
5556 				}
5557 				mirror_openfail_console_info(un, smi, ci);
5558 				return (ENXIO);
5559 			}
5560 		}
5561 
5562 		/*
5563 		 * Note that for a MN set, there is no need to call
5564 		 * SE_NOTIFY as that is done when processing the
5565 		 * state change
5566 		 */
5567 		if (md_devopen) {
5568 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5569 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5570 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5571 			if (!MD_MNSET_SETNO(setno)) {
5572 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5573 				    SVM_TAG_METADEVICE, setno,
5574 				    MD_SID(un));
5575 			}
5576 		}
5577 		mirror_openfail_console_info(un, smi, ci);
5578 		ci++;
5579 	}
5580 
5581 	if (MD_MNSET_SETNO(setno)) {
5582 		send_poke_hotspares(setno);
5583 	} else {
5584 		(void) poke_hotspares();
5585 	}
5586 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5587 
5588 	return (0);
5589 }
5590 
5591 
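/*
 * NAME: mirror_imp_set
 *
 * DESCRIPTION: import-set support (the "import set" entry in mirror_md_ops
 * below). Walks the mirror records in the MDDB for the given set and
 * rewrites unit self/parent minors, submirror device minors, record ids
 * and namespace entries to reflect the imported set number. Returns
 * nonzero if at least one record was updated; a failed namespace update
 * aborts the walk.
 */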
5592 static int
5593 mirror_imp_set(
5594 	set_t	setno
5595 )
5596 {
5597 
5598 	mddb_recid_t	recid;
5599 	int		gotsomething, i;
5600 	mddb_type_t	typ1;
5601 	mddb_de_ic_t	*dep;
5602 	mddb_rb32_t	*rbp;
5603 	mm_unit32_od_t	*un32;
5604 	mm_unit_t	*un64;
5605 	md_dev64_t	self_devt;
5606 	minor_t		*self_id;	/* minor needs to be updated */
5607 	md_parent_t	*parent_id;	/* parent needs to be updated */
5608 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5609 	mddb_recid_t	*optrec_id;
5610 	md_dev64_t	tmpdev;
5611 
5612 
5613 	gotsomething = 0;
5614 
5615 	typ1 = (mddb_type_t)md_getshared_key(setno,
5616 	    mirror_md_ops.md_driver.md_drivername);
5617 	recid = mddb_makerecid(setno, 0);
5618 
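	/*
	 * Walk every mirror record in this set, skipping records already
	 * claimed (MD_PRV_GOTIT), and remap their identifiers into the
	 * imported set.
	 */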
5619 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5620 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5621 			continue;
5622 
5623 		dep = mddb_getrecdep(recid);
5624 		rbp = dep->de_rb;
5625 
5626 		switch (rbp->rb_revision) {
5627 		case MDDB_REV_RB:
5628 		case MDDB_REV_RBFN:
5629 			/*
5630 			 * Small device
5631 			 */
5632 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5633 			self_id = &(un32->c.un_self_id);
5634 			parent_id = &(un32->c.un_parent);
5635 			record_id = &(un32->c.un_record_id);
5636 			optrec_id = &(un32->un_rr_dirty_recid);
5637 
5638 			for (i = 0; i < un32->un_nsm; i++) {
5639 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5640 				un32->un_sm[i].sm_dev = md_cmpldev
5641 				    (md_makedevice(md_major, MD_MKMIN(setno,
5642 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5643 
5644 				if (!md_update_minor(setno, mddb_getsidenum
5645 				    (setno), un32->un_sm[i].sm_key))
5646 					goto out;
5647 			}
5648 			break;
5649 		case MDDB_REV_RB64:
5650 		case MDDB_REV_RB64FN:
5651 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5652 			self_id = &(un64->c.un_self_id);
5653 			parent_id = &(un64->c.un_parent);
5654 			record_id = &(un64->c.un_record_id);
5655 			optrec_id = &(un64->un_rr_dirty_recid);
5656 
5657 			for (i = 0; i < un64->un_nsm; i++) {
5658 				tmpdev = un64->un_sm[i].sm_dev;
5659 				un64->un_sm[i].sm_dev = md_makedevice
5660 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5661 				    (md_getminor(tmpdev))));
5662 
5663 				if (!md_update_minor(setno, mddb_getsidenum
5664 				    (setno), un64->un_sm[i].sm_key))
5665 					goto out;
5666 			}
5667 			break;
5668 		}
5669 
5670 		/*
5671 		 * If this is a top level and a friendly name metadevice,
5672 		 * update its minor in the namespace.
5673 		 */
5674 		if ((*parent_id == MD_NO_PARENT) &&
5675 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5676 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5677 
5678 			self_devt = md_makedevice(md_major, *self_id);
5679 			if (!md_update_top_device_minor(setno,
5680 			    mddb_getsidenum(setno), self_devt))
5681 				goto out;
5682 		}
5683 
5684 		/*
5685 		 * Update unit with the imported setno
5686 		 *
5687 		 */
5688 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5689 
5690 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5691 		if (*parent_id != MD_NO_PARENT)
5692 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5693 		*record_id = MAKERECID(setno, DBID(*record_id));
5694 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5695 
5696 		gotsomething = 1;
5697 	}
5698 
5699 out:
5700 	return (gotsomething);
5701 }
5702 
5703 /*
5704  * NAME: mirror_check_offline
5705  *
5706  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5707  *
5708  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5709  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5710  * ioctl.
5711  */
5712 int
5713 mirror_check_offline(md_dev64_t dev, int *offline_status)
5714 {
5715 	mm_unit_t		*un;
5716 	md_error_t		mde = mdnullerror;
5717 
5718 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5719 		return (EINVAL);
5720 	*offline_status = 0;
5721 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5722 		*offline_status = 1;
5723 	return (0);
5724 }
5725 
5726 /*
5727  * NAME: mirror_inc_abr_count
5728  *
5729  * DESCRIPTION: increment the count of layered soft parts with ABR set
5730  *
5731  * Called from ioctl, so access to un_abr_count is protected by the global
5732  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5733  */
5734 int
5735 mirror_inc_abr_count(md_dev64_t dev)
5736 {
5737 	mm_unit_t		*un;
5738 	md_error_t		mde = mdnullerror;
5739 
5740 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5741 		return (EINVAL);
5742 	un->un_abr_count++;
5743 	return (0);
5744 }
5745 
5746 /*
5747  * NAME: mirror_dec_abr_count
5748  *
5749  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5750  *
5751  * Called from ioctl, so access to un_abr_count is protected by the global
5752  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5753  */
5754 int
5755 mirror_dec_abr_count(md_dev64_t dev)
5756 {
5757 	mm_unit_t		*un;
5758 	md_error_t		mde = mdnullerror;
5759 
5760 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5761 		return (EINVAL);
5762 	un->un_abr_count--;
5763 	return (0);
5764 }
5765 
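/*
 * Named services exported by the mirror driver; each entry pairs a handler
 * with the name used to look it up. The table ends with a NULL entry.
 */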
5766 static md_named_services_t mirror_named_services[] = {
5767 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5768 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5769 	{mirror_rename_check,				MDRNM_CHECK	    },
5770 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5771 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5772 			MDRNM_PARENT_UPDATE_TO},
5773 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5774 			MDRNM_SELF_UPDATE_FROM_DOWN },
5775 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5776 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5777 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5778 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5779 	{ NULL,						0		    }
5780 };
5781 
5782 md_ops_t mirror_md_ops = {
5783 	mirror_open,		/* open */
5784 	mirror_close,		/* close */
5785 	md_mirror_strategy,	/* strategy */
5786 	NULL,			/* print */
5787 	mirror_dump,		/* dump */
5788 	NULL,			/* read */
5789 	NULL,			/* write */
5790 	md_mirror_ioctl,	/* ioctl */
5791 	mirror_snarf,		/* snarf */
5792 	mirror_halt,		/* halt */
5793 	NULL,			/* aread */
5794 	NULL,			/* awrite */
5795 	mirror_imp_set,		/* import set */
5796 	mirror_named_services
5797 };
5798 
5799 /* module-specific initialization */
5800 static void
5801 init_init()
5802 {
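	/*
	 * Offset of the buf_t that ends the child save structure (md_mcs_t);
	 * the child cache created below allocates biosize() bytes for it.
	 */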
5803 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5804 
5805 	/* Initialize the parent and child save memory pools */
5806 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5807 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5808 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5809 	    0);
5810 
5811 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5812 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5813 	    mirror_child_constructor, mirror_child_destructor,
5814 	    mirror_run_queue, NULL, NULL, 0);
5815 
5816 	/*
5817 	 * Insure wowbuf_size is a multiple of DEV_BSIZE,
5818 	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5819 	 */
5820 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5821 	if (md_wowbuf_size <= 0)
5822 		md_wowbuf_size = 2 * DEV_BSIZE;
5823 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5824 		md_wowbuf_size = (32 * DEV_BSIZE);
5825 
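	/*
	 * md_wowbuf_size is now a DEV_BSIZE multiple capped at 32 blocks;
	 * a wowblk comprises that buffer plus a wowhdr_t.
	 */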
5826 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5827 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5828 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5829 
5830 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5831 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5832 
5833 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5834 }
5835 
5836 /* module-specific uninitialization (undo init_init()) */
5837 static void
5838 fini_uninit()
5839 {
5840 	kmem_cache_destroy(mirror_parent_cache);
5841 	kmem_cache_destroy(mirror_child_cache);
5842 	kmem_cache_destroy(mirror_wowblk_cache);
5843 	mirror_parent_cache = mirror_child_cache =
5844 	    mirror_wowblk_cache = NULL;
5845 
5846 	mutex_destroy(&mirror_timeout.dr_mx);
5847 	mutex_destroy(&hotspare_request.dr_mx);
5848 	mutex_destroy(&non_ff_drv_mutex);
5849 }
5850 
5851 /* define the module linkage */
5852 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5853