xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 2063:a6ebd483c3cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/conf.h>
31 #include <sys/file.h>
32 #include <sys/user.h>
33 #include <sys/uio.h>
34 #include <sys/t_lock.h>
35 #include <sys/buf.h>
36 #include <sys/dkio.h>
37 #include <sys/vtoc.h>
38 #include <sys/kmem.h>
39 #include <vm/page.h>
40 #include <sys/cmn_err.h>
41 #include <sys/sysmacros.h>
42 #include <sys/types.h>
43 #include <sys/mkdev.h>
44 #include <sys/stat.h>
45 #include <sys/open.h>
46 #include <sys/modctl.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/dklabel.h>
51 #include <vm/hat.h>
52 #include <sys/lvm/mdvar.h>
53 #include <sys/lvm/md_mirror.h>
54 #include <sys/lvm/md_convert.h>
55 #include <sys/lvm/md_mddb.h>
56 #include <sys/esunddi.h>
57 
58 #include <sys/sysevent/eventdefs.h>
59 #include <sys/sysevent/svm.h>
60 #include <sys/lvm/mdmn_commd.h>
61 
62 md_ops_t		mirror_md_ops;
63 #ifndef	lint
64 char			_depends_on[] = "drv/md";
65 md_ops_t		*md_interface_ops = &mirror_md_ops;
66 #endif
67 
68 extern mdq_anchor_t	md_done_daemon;
69 extern mdq_anchor_t	md_mstr_daemon;
70 extern mdq_anchor_t	md_mirror_daemon;
71 extern mdq_anchor_t	md_mirror_io_daemon;
72 extern mdq_anchor_t	md_mirror_rs_daemon;
73 extern mdq_anchor_t	md_mhs_daemon;
74 
75 extern unit_t		md_nunits;
76 extern set_t		md_nsets;
77 extern md_set_t		md_set[];
78 
79 extern int		md_status;
80 extern clock_t		md_hz;
81 
82 extern md_krwlock_t	md_unit_array_rw;
83 extern kmutex_t		md_mx;
84 extern kcondvar_t	md_cv;
85 extern int		md_mtioctl_cnt;
86 
87 daemon_request_t	mirror_timeout;
88 static daemon_request_t	hotspare_request;
89 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
90 
91 int	md_mirror_mcs_buf_off;
92 
93 /* Flags for mdmn_ksend_message to allow debugging */
94 int	md_mirror_msg_flags;
95 
96 #ifdef DEBUG
97 /* Flag to switch on debug messages */
98 int	mirror_debug_flag = 0;
99 #endif
100 
101 /*
102  * Struct used to hold a count of DMR reads and the timestamp of the last DMR read.
103  * It is used to verify, using a debugger, that the DMR read ioctl has been
104  * executed.
105  */
106 dmr_stats_t	mirror_dmr_stats = {0, 0};
107 
108 /*
109  * Mutex protecting list of non-failfast drivers.
110  */
111 static kmutex_t	non_ff_drv_mutex;
112 extern char	**non_ff_drivers;
113 
114 extern major_t	md_major;
115 
116 /*
117  * Write-On-Write memory pool.
118  */
119 static void		copy_write_cont(wowhdr_t *wowhdr);
120 static kmem_cache_t	*mirror_wowblk_cache = NULL;
121 static int		md_wowbuf_size = 16384;
122 static size_t		md_wowblk_size;
123 
124 /*
125  * This is a flag that allows:
126  *	- disabling the write-on-write mechanism.
127  *	- logging occurrences of write-on-write
128  *	- switching the write-on-write handling procedure.
129  * md_mirror_wow_cnt counts occurrences of write-on-write.
130  */
131 static uint_t	md_mirror_wow_flg = 0;
132 static int	md_mirror_wow_cnt = 0;
133 
134 /*
135  * Tunable to enable/disable dirty region
136  * processing when closing down a mirror.
137  */
138 static int	new_resync = 1;
139 kmem_cache_t	*mirror_parent_cache = NULL;
140 kmem_cache_t	*mirror_child_cache = NULL;
141 
142 extern int	md_ff_disable;		/* disable failfast */
143 
144 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
145 static void	mirror_read_strategy(buf_t *, int, void *);
146 static void	mirror_write_strategy(buf_t *, int, void *);
147 static void	become_owner(daemon_queue_t *);
148 static int	mirror_done(struct buf *cb);
149 static int	mirror_done_common(struct buf *cb);
150 static void	clear_retry_error(struct buf *cb);
151 
152 /*
153  * patchables
154  */
155 int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
156 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
157 
158 /*
159  * patchable to change delay before rescheduling mirror ownership request.
160  * Value is clock ticks, default 0.5 seconds
161  */
162 clock_t	md_mirror_owner_to = 500000;
163 
164 /*ARGSUSED1*/
165 static int
166 mirror_parent_constructor(void *p, void *d1, int d2)
167 {
168 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
169 	return (0);
170 }
171 
172 static void
173 mirror_parent_init(md_mps_t *ps)
174 {
175 	bzero(ps, offsetof(md_mps_t, ps_mx));
176 }
177 
178 /*ARGSUSED1*/
179 static void
180 mirror_parent_destructor(void *p, void *d)
181 {
182 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
183 }
184 
185 /*ARGSUSED1*/
186 static int
187 mirror_child_constructor(void *p, void *d1, int d2)
188 {
189 	bioinit(&((md_mcs_t *)p)->cs_buf);
190 	return (0);
191 }
192 
193 void
194 mirror_child_init(md_mcs_t *cs)
195 {
196 	cs->cs_ps = NULL;
197 	cs->cs_mdunit = 0;
198 	md_bioreset(&cs->cs_buf);
199 }
200 
201 /*ARGSUSED1*/
202 static void
203 mirror_child_destructor(void *p, void *d)
204 {
205 	biofini(&((md_mcs_t *)p)->cs_buf);
206 }
207 
208 static void
209 mirror_wowblk_init(wowhdr_t *p)
210 {
211 	bzero(p, md_wowblk_size);
212 }
213 
214 static void
215 send_poke_hotspares_msg(daemon_request_t *drq)
216 {
217 	int			rval;
218 	md_mn_msg_pokehsp_t	pokehsp;
219 	md_mn_kresult_t		*kresult;
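	/*
	 * The set number is passed to this daemon request through the
	 * queue-length field (dq.qlen) of the request structure; see
	 * send_poke_hotspares() below, which stores it there before
	 * queueing the request.
	 */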
220 	set_t			setno = (set_t)drq->dq.qlen;
221 
222 	pokehsp.pokehsp_setno = setno;
223 
224 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
225 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
226 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
227 	    sizeof (pokehsp), kresult);
228 
229 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
230 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
231 		cmn_err(CE_PANIC,
232 		    "ksend_message failure: POKE_HOTSPARES");
233 	}
234 	kmem_free(kresult, sizeof (md_mn_kresult_t));
235 
236 	/* Allow further requests to use this set's queue structure */
237 	mutex_enter(&drq->dr_mx);
238 	drq->dr_pending = 0;
239 	mutex_exit(&drq->dr_mx);
240 }
241 
242 /*
243  * Send a poke_hotspares message to the master node. To avoid swamping the
244  * commd handler with requests we only send a message if there is not one
245  * already outstanding. We punt the request to a separate thread context as
246  * we cannot afford to block waiting on the request to be serviced. This is
247  * essential when a reconfig cycle is in progress as any open() of a multinode
248  * metadevice may result in a livelock.
249  */
250 static void
251 send_poke_hotspares(set_t setno)
252 {
253 	daemon_request_t	*drq = &mn_hs_request[setno];
254 
255 	mutex_enter(&drq->dr_mx);
256 	if (drq->dr_pending == 0) {
257 		drq->dr_pending = 1;
258 		drq->dq.qlen = (int)setno;
259 		daemon_request(&md_mhs_daemon,
260 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
261 	}
262 	mutex_exit(&drq->dr_mx);
263 }
264 
265 void
266 mirror_set_sm_state(
267 	mm_submirror_t		*sm,
268 	mm_submirror_ic_t	*smic,
269 	sm_state_t		newstate,
270 	int			force)
271 {
272 	int			compcnt;
273 	int			i;
274 	int			errcnt;
275 	sm_state_t		origstate;
276 	md_m_shared_t		*shared;
277 
278 	if (force) {
279 		sm->sm_state = newstate;
280 		uniqtime32(&sm->sm_timestamp);
281 		return;
282 	}
283 
284 	origstate = newstate;
285 
286 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
287 	for (i = 0, errcnt = 0; i < compcnt; i++) {
288 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
289 		    (sm->sm_dev, sm, i);
290 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
291 			newstate |= SMS_COMP_ERRED;
292 		if (shared->ms_state & (CS_RESYNC))
293 			newstate |= SMS_COMP_RESYNC;
294 		if (shared->ms_state & CS_ERRED)
295 			errcnt++;
296 	}
297 
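	/*
	 * If any component-derived state (SMS_COMP_ERRED/SMS_COMP_RESYNC)
	 * was found above, drop the caller's requested state bits so that
	 * only the component-derived state remains.
	 */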
298 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
299 		newstate &= ~origstate;
300 
301 	if (errcnt == compcnt)
302 		newstate |= SMS_ALL_ERRED;
303 	else
304 		newstate &= ~SMS_ALL_ERRED;
305 
306 	sm->sm_state = newstate;
307 	uniqtime32(&sm->sm_timestamp);
308 }
309 
310 static int
311 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
312 							int frm_probe)
313 {
314 	mm_submirror_t		*sm;
315 	mm_submirror_ic_t	*smic;
316 	md_m_shared_t		*shared;
317 	int			ci;
318 	int			i;
319 	int			compcnt;
320 	int			open_comp; /* flag for open component */
321 
322 	for (i = *smi; i < NMIRROR; i++) {
323 		sm = &un->un_sm[i];
324 		smic = &un->un_smic[i];
325 
326 		if (!SMS_IS(sm, SMS_INUSE))
327 			continue;
328 
329 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
330 		for (ci = *cip; ci < compcnt; ci++) {
331 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
332 			    (sm->sm_dev, sm, ci);
333 			/*
334 			 * if called from any routine but probe, we check for the
335 			 * MDM_S_ISOPEN flag. Since probe does a pseudo open, it
336 			 * sets the MDM_S_PROBEOPEN flag and we test for that
337 			 * flag instead. The two tests are mutually exclusive.
338 			 */
339 			open_comp = (frm_probe) ?
340 					(shared->ms_flags & MDM_S_PROBEOPEN):
341 					(shared->ms_flags & MDM_S_ISOPEN);
342 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
343 				((shared->ms_state == CS_OKAY) ||
344 				(shared->ms_state == CS_RESYNC))) {
345 				if (clr_error) {
346 					shared->ms_flags &= ~MDM_S_IOERR;
347 				}
348 				*cip = ci;
349 				*smi = i;
350 				return (1);
351 			}
352 
353 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
354 				shared->ms_flags &= ~MDM_S_IOERR;
355 			}
356 		}
357 
358 		*cip = 0;
359 	}
360 	return (0);
361 }
362 
363 /*ARGSUSED*/
364 static void
365 mirror_run_queue(void *d)
366 {
367 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
368 		md_daemon(1, &md_done_daemon);
369 }
370 /*
371  * check_comp_4_hotspares
372  *
373  * This function attempts to allocate a hotspare for this component if the
374  * component is in error. In a MN set, the function can be called in 2 modes.
375  * It can be called either when a component error has been detected or when a
376  * new hotspare has been allocated. In both of these cases, MD_HOTSPARE_XMIT
377  * is set in flags and the request is sent to all nodes.
378  * The handler on each of the nodes then calls this function with
379  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
380  *
381  * For non-MN sets the function simply attempts to allocate a hotspare.
382  *
383  * On entry, the following locks are held
384  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
385  *	md_unit_writerlock
386  *
387  * Returns	0 if ok
388  *		1 if the unit containing the component has been cleared while
389  *		  the mdmn_ksend_message() was being executed
390  */
391 extern int
392 check_comp_4_hotspares(
393 	mm_unit_t	*un,
394 	int		smi,
395 	int		ci,
396 	uint_t		flags,
397 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
398 	IOLOCK		*lockp	/* can be NULL */
399 )
400 {
401 	mm_submirror_t		*sm;
402 	mm_submirror_ic_t	*smic;
403 	md_m_shared_t		*shared;
404 	mddb_recid_t		recids[6];
405 	minor_t			mnum;
406 	intptr_t		(*hs_dev)();
407 	void			(*hs_done)();
408 	void			*hs_data;
409 	md_error_t		mde = mdnullerror;
410 	set_t			setno;
411 	md_mn_msg_allochsp_t	allochspmsg;
412 	md_mn_kresult_t		*kresult;
413 	mm_unit_t		*new_un;
414 	int			rval;
415 
416 	mnum = MD_SID(un);
417 	setno = MD_UN2SET(un);
418 	sm = &un->un_sm[smi];
419 	smic = &un->un_smic[smi];
420 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
421 		(sm->sm_dev, sm, ci);
422 
423 	if (shared->ms_state != CS_ERRED)
424 		return (0);
425 
426 	/* Don't start a new component resync if a resync is already running. */
427 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
428 		return (0);
429 
430 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
431 		uint_t		msgflags;
432 		md_mn_msgtype_t	msgtype;
433 
434 		/* Send allocate hotspare message to all nodes */
435 
436 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
437 		allochspmsg.msg_allochsp_sm = smi;
438 		allochspmsg.msg_allochsp_comp = ci;
439 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
440 
441 		/*
442 		 * Before calling mdmn_ksend_message(), release locks
443 		 * Can never be in the context of an ioctl.
444 		 */
445 		md_unit_writerexit(MDI_UNIT(mnum));
446 		if (flags & MD_HOTSPARE_LINKHELD)
447 			rw_exit(&mirror_md_ops.md_link_rw.lock);
448 #ifdef DEBUG
449 		if (mirror_debug_flag)
450 		    printf("send alloc hotspare, flags=0x%x %x, %x, %x, %x\n",
451 			flags,
452 			allochspmsg.msg_allochsp_mnum,
453 			allochspmsg.msg_allochsp_sm,
454 			allochspmsg.msg_allochsp_comp,
455 			allochspmsg.msg_allochsp_hs_id);
456 #endif
457 		if (flags & MD_HOTSPARE_WMUPDATE) {
458 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
459 			/*
460 			 * When coming from an update of watermarks, there
461 			 * must already be a message logged that triggered
462 			 * this action. So, no need to log this message, too.
463 			 */
464 			msgflags = MD_MSGF_NO_LOG;
465 		} else {
466 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
467 			msgflags = MD_MSGF_DEFAULT_FLAGS;
468 		}
469 
470 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
471 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
472 		    (char *)&allochspmsg, sizeof (allochspmsg),
473 		    kresult);
474 
475 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
476 #ifdef DEBUG
477 			if (mirror_debug_flag)
478 				mdmn_ksend_show_error(rval, kresult,
479 				    "ALLOCATE HOTSPARE");
480 #endif
481 			/*
482 			 * If message is sent ok but exitval indicates an error
483 			 * it must be because the mirror has been cleared. In
484 			 * this case, re-obtain the lock and return an error.
485 			 */
486 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
487 				if (flags & MD_HOTSPARE_LINKHELD) {
488 					rw_enter(&mirror_md_ops.md_link_rw.lock,
489 					    RW_READER);
490 				}
491 				kmem_free(kresult, sizeof (md_mn_kresult_t));
492 				return (1);
493 			}
494 			cmn_err(CE_PANIC,
495 			    "ksend_message failure: ALLOCATE_HOTSPARE");
496 		}
497 		kmem_free(kresult, sizeof (md_mn_kresult_t));
498 
499 		/*
500 		 * re-obtain the locks
501 		 */
502 		if (flags & MD_HOTSPARE_LINKHELD)
503 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
504 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
505 
506 		/*
507 		 * As we had to release the locks in order to send the
508 		 * message to all nodes, we need to check to see if the
509 		 * unit has changed. If it has we release the writerlock
510 		 * and return fail.
511 		 */
512 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
513 			md_unit_writerexit(MDI_UNIT(mnum));
514 			return (1);
515 		}
516 	} else {
517 		if (MD_MNSET_SETNO(setno)) {
518 			/*
519 			 * If 2 or more nodes simultaneously see a
520 			 * component failure, these nodes will each
521 			 * send an ALLOCATE_HOTSPARE[2] message.
522 			 * The first message will allocate the hotspare
523 			 * and the subsequent messages should do nothing.
524 			 *
525 			 * If a slave node doesn't have a hotspare allocated
526 			 * at the time the message is initiated, then the
527 			 * passed in hs_id will be 0.  If the node
528 			 * executing this routine has a component shared
529 			 * ms_hs_id of non-zero, but the message shows a
530 			 * hs_id of 0, then just return since a hotspare
531 			 * has already been allocated for this failing
532 			 * component.  When the slave node returns from
533 			 * the ksend_message the hotspare will have
534 			 * already been allocated.
535 			 *
536 			 * If the slave node does send an hs_id of non-zero,
537 			 * and the slave node's hs_id matches this node's
538 			 * ms_hs_id, then the hotspare has errored and
539 			 * should be replaced.
540 			 *
541 			 * If the slave node sends an hs_id of non-zero and
542 			 * this node has a different shared ms_hs_id, then
543 			 * just return since this hotspare has already
544 			 * been hotspared.
545 			 */
546 			if (shared->ms_hs_id != 0) {
547 				if (hs_id == 0) {
548 #ifdef DEBUG
549 					if (mirror_debug_flag) {
550 						printf("check_comp_4_hotspares"
551 						    "(NOXMIT), short circuit "
552 						    "hs_id=0x%x, "
553 						    "ms_hs_id=0x%x\n",
554 						    hs_id, shared->ms_hs_id);
555 					}
556 #endif
557 					return (0);
558 				}
559 				if (hs_id != shared->ms_hs_id) {
560 #ifdef DEBUG
561 					if (mirror_debug_flag) {
562 						printf("check_comp_4_hotspares"
563 						    "(NOXMIT), short circuit2 "
564 						    "hs_id=0x%x, "
565 						    "ms_hs_id=0x%x\n",
566 						    hs_id, shared->ms_hs_id);
567 					}
568 #endif
569 					return (0);
570 				}
571 			}
572 		}
573 
574 		sm = &un->un_sm[smi];
575 		hs_dev = md_get_named_service(sm->sm_dev, 0,
576 		    "hotspare device", 0);
577 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
578 		    &hs_data) != 0)
579 			return (0);
580 
581 		/*
582 		 * set_sm_comp_state() commits the modified records.
583 		 * As we don't transmit the changes, no need to drop the lock.
584 		 */
585 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
586 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
587 
588 		(*hs_done)(sm->sm_dev, hs_data);
589 
590 		mirror_check_failfast(mnum);
591 
592 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
593 		    setno, MD_SID(un));
594 
595 		/*
596 		 * For a multi-node set we need to reset the un_rs_type,
597 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
598 		 * hot-spare resync must copy all applicable data.
599 		 */
600 		if (MD_MNSET_SETNO(setno)) {
601 			un->un_rs_type = MD_RS_NONE;
602 			un->un_rs_resync_done = 0;
603 			un->un_rs_resync_2_do = 0;
604 		}
605 
606 		/*
607 		 * Must drop writer lock since mirror_resync_unit will
608 		 * open devices and must be able to grab readerlock.
609 		 * Don't need to drop IOLOCK since any descendent routines
610 		 * calling ksend_messages will drop the IOLOCK as needed.
611 		 *
612 		 */
613 		if (lockp) {
614 			md_ioctl_writerexit(lockp);
615 		} else {
616 			md_unit_writerexit(MDI_UNIT(mnum));
617 		}
618 
619 		/* start resync */
620 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
621 
622 		if (lockp) {
623 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
624 		} else {
625 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
626 		}
627 	}
628 	return (0);
629 }
630 
631 /*
632  * check_unit_4_hotspares
633  *
634  * For a given mirror, allocate hotspares, if available, for any components
635  * that are in error.
636  *
637  * Returns	0 if ok
638  *		1 if check_comp_4_hotspares returns non-zero. This will only
639  *		  happen for a MN unit where the unit has been cleared while
640  *		  the allocate hotspare message is sent to all nodes.
641  */
642 static int
643 check_unit_4_hotspares(mm_unit_t *un, int flags)
644 {
645 	mm_submirror_t		*sm;
646 	mm_submirror_ic_t	*smic;
647 	int			ci;
648 	int			i;
649 	int			compcnt;
650 
651 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
652 		return (0);
653 
654 	for (i = 0; i < NMIRROR; i++) {
655 		sm = &un->un_sm[i];
656 		smic = &un->un_smic[i];
657 		if (!SMS_IS(sm, SMS_INUSE))
658 			continue;
659 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
660 		for (ci = 0; ci < compcnt; ci++) {
661 			md_m_shared_t		*shared;
662 
663 			shared = (md_m_shared_t *)
664 				(*(smic->sm_shared_by_indx))(sm->sm_dev,
665 				sm, ci);
666 			/*
667 			 * Never called from ioctl context, so pass in
668 			 * (IOLOCK *)NULL.  Pass through flags from calling
669 			 * routine, also setting XMIT flag.
670 			 */
671 			if (check_comp_4_hotspares(un, i, ci,
672 				(MD_HOTSPARE_XMIT | flags),
673 				shared->ms_hs_id, (IOLOCK *)NULL) != 0)
674 				return (1);
675 		}
676 	}
677 	return (0);
678 }
679 
680 static void
681 check_4_hotspares(daemon_request_t *drq)
682 {
683 	mdi_unit_t	*ui;
684 	mm_unit_t	*un;
685 	md_link_t	*next;
686 	int		x;
687 
688 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
689 	drq->dr_pending = 0;		/* again in low level routine if */
690 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
691 
692 	/*
693 	 * Used to have a problem here. The disksets weren't marked as being
694 	 * MNHOLD. This opened a window where we could be searching for
695 	 * hotspares and have the disk set unloaded (released) from under
696 	 * us causing a panic in stripe_component_count().
697 	 * The way to prevent that is to mark the set MNHOLD which prevents
698 	 * any diskset from being released while we are scanning the mirrors,
699 	 * submirrors and components.
700 	 */
701 
702 	for (x = 0; x < md_nsets; x++)
703 		md_holdset_enter(x);
704 
705 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
706 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
707 		ui = MDI_UNIT(next->ln_id);
708 
709 		un = (mm_unit_t *)md_unit_readerlock(ui);
710 
711 		/*
712 		 * Only check the unit if we are the master for this set
713 		 * For an MN set, poke_hotspares() is only effective on the
714 		 * master
715 		 */
716 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
717 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
718 			md_unit_readerexit(ui);
719 			continue;
720 		}
721 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
722 			md_unit_readerexit(ui);
723 			continue;
724 		}
725 		md_unit_readerexit(ui);
726 
727 		un = (mm_unit_t *)md_unit_writerlock(ui);
728 		/*
729 		 * check_unit_4_hotspares will return 1 if the unit has been
730 		 * removed during the process of allocating the hotspare.
731 		 * This can only happen for a MN metadevice. If unit no longer
732 		 * exists, no need to release writerlock
733 		 */
734 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
735 			md_unit_writerexit(ui);
736 		else {
737 			/*
738 			 * If check_unit_4_hotspares failed, queue another
739 			 * request and break out of this one
740 			 */
741 			(void) poke_hotspares();
742 			break;
743 		}
744 	}
745 	rw_exit(&mirror_md_ops.md_link_rw.lock);
746 
747 	for (x = 0; x < md_nsets; x++)
748 		md_holdset_exit(x);
749 }
750 
751 /*
752  * poke_hotspares
753  *
754  * If there is not a poke_hotspares request already pending, queue a request
755  * to call check_4_hotspares(). This will scan all mirrors and attempt to
756  * allocate hotspares for all components in error.
757  */
758 int
759 poke_hotspares()
760 {
761 	mutex_enter(&hotspare_request.dr_mx);
762 	if (hotspare_request.dr_pending == 0) {
763 		hotspare_request.dr_pending = 1;
764 		daemon_request(&md_mhs_daemon,
765 		    check_4_hotspares,
766 				(daemon_queue_t *)&hotspare_request, REQ_OLD);
767 	}
768 	mutex_exit(&hotspare_request.dr_mx);
769 	return (0);
770 }
771 
772 static void
773 free_all_ecomps(err_comp_t *ecomp)
774 {
775 	err_comp_t	*d;
776 
777 	while (ecomp != NULL) {
778 		d = ecomp;
779 		ecomp = ecomp->ec_next;
780 		kmem_free(d, sizeof (err_comp_t));
781 	}
782 }
783 
784 /*
785  * NAME: mirror_openfail_console_info
786  *
787  * DESCRIPTION: Prints an informative message to the console when a mirror
788  *		cannot be opened.
789  *
790  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
791  *	       int		smi - submirror index
792  *	       int		ci - component index
793  */
794 
795 void
796 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
797 {
798 	void (*get_dev)();
799 	ms_cd_info_t cd;
800 	md_dev64_t tmpdev;
801 
802 	tmpdev = un->un_sm[smi].sm_dev;
803 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
804 	if (get_dev != NULL) {
805 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
806 		cmn_err(CE_WARN, "md %s: open error on %s",
807 			md_shortname(MD_SID(un)),
808 			md_devname(MD_UN2SET(un), cd.cd_dev,
809 			NULL, 0));
810 	} else {
811 		cmn_err(CE_WARN, "md %s: open error",
812 			md_shortname(MD_SID(un)));
813 	}
814 }
815 
816 static int
817 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
818 {
819 	int i;
820 	md_dev64_t dev;
821 
822 	for (i = 0; i < NMIRROR; i++) {
823 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
824 			continue;
825 		dev = un->un_sm[i].sm_dev;
826 		md_layered_close(dev, md_cflags);
827 	}
828 	return (0);
829 }
830 
831 /*
832  * Keep track of drivers that don't support failfast.  We use this so that
833  * we only log one diagnostic message for each of these drivers, no matter
834  * how many times we run the mirror_check_failfast function.
835  * Return 1 if this is a new driver that does not support failfast,
836  * return 0 if we have already seen this non-failfast driver.
837  */
838 static int
839 new_non_ff_driver(const char *s)
840 {
841 	mutex_enter(&non_ff_drv_mutex);
842 	if (non_ff_drivers == NULL) {
843 	    non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
844 		KM_NOSLEEP);
845 	    if (non_ff_drivers == NULL) {
846 		mutex_exit(&non_ff_drv_mutex);
847 		return (1);
848 	    }
849 
850 	    non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
851 	    if (non_ff_drivers[0] == NULL) {
852 		kmem_free(non_ff_drivers, 2 * sizeof (char *));
853 		non_ff_drivers = NULL;
854 		mutex_exit(&non_ff_drv_mutex);
855 		return (1);
856 	    }
857 
858 	    (void) strcpy(non_ff_drivers[0], s);
859 	    non_ff_drivers[1] = NULL;
860 
861 	} else {
862 	    int i;
863 	    char **tnames;
864 	    char **tmp;
865 
866 	    for (i = 0; non_ff_drivers[i] != NULL; i++) {
867 		if (strcmp(s, non_ff_drivers[i]) == 0) {
868 		    mutex_exit(&non_ff_drv_mutex);
869 		    return (0);
870 		}
871 	    }
872 
873 	    /* allow for new element and null */
874 	    i += 2;
875 	    tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
876 	    if (tnames == NULL) {
877 		mutex_exit(&non_ff_drv_mutex);
878 		return (1);
879 	    }
880 
881 	    for (i = 0; non_ff_drivers[i] != NULL; i++)
882 		tnames[i] = non_ff_drivers[i];
883 
884 	    tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
885 	    if (tnames[i] == NULL) {
886 		/* adjust i so that it is the right count to free */
887 		kmem_free(tnames, (i + 2) * sizeof (char *));
888 		mutex_exit(&non_ff_drv_mutex);
889 		return (1);
890 	    }
891 
892 	    (void) strcpy(tnames[i++], s);
893 	    tnames[i] = NULL;
894 
895 	    tmp = non_ff_drivers;
896 	    non_ff_drivers = tnames;
897 	    /* i now represents the count we previously alloced */
898 	    kmem_free(tmp, i * sizeof (char *));
899 	}
900 	mutex_exit(&non_ff_drv_mutex);
901 
902 	return (1);
903 }
904 
905 /*
906  * Check for the "ddi-failfast-supported" devtree property on each submirror
907  * component to indicate if we should do I/O to that submirror with the
908  * B_FAILFAST flag set or not.  This check is made at various state transitions
909  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
910  * only need to check one drive (e.g. hotspare) but since the check is
911  * fast and infrequent and sometimes needs to be done on all components we
912  * just check all components on each call.
913  */
914 void
915 mirror_check_failfast(minor_t mnum)
916 {
917 	int		i;
918 	mm_unit_t	*un;
919 
920 	if (md_ff_disable)
921 	    return;
922 
923 	un = MD_UNIT(mnum);
924 
925 	for (i = 0; i < NMIRROR; i++) {
926 	    int			ci;
927 	    int			cnt;
928 	    int			ff = 1;
929 	    mm_submirror_t	*sm;
930 	    mm_submirror_ic_t	*smic;
931 	    void		(*get_dev)();
932 
933 	    if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
934 		continue;
935 
936 	    sm = &un->un_sm[i];
937 	    smic = &un->un_smic[i];
938 
939 	    get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
940 		"get device", 0);
941 
942 	    cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
943 	    for (ci = 0; ci < cnt; ci++) {
944 		int		found = 0;
945 		dev_t		ci_dev;
946 		major_t		major;
947 		dev_info_t	*devi;
948 		ms_cd_info_t	cd;
949 
950 		/* this already returns the hs dev if the device is spared */
951 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
952 
953 		ci_dev = md_dev64_to_dev(cd.cd_dev);
954 		major = getmajor(ci_dev);
955 
956 		if (major == md_major) {
957 		    /* this component must be a soft partition; get real dev */
958 		    minor_t	dev_mnum;
959 		    mdi_unit_t	*ui;
960 		    mp_unit_t	*un;
961 		    set_t	setno;
962 		    side_t	side;
963 		    md_dev64_t	tmpdev;
964 
965 		    ui = MDI_UNIT(getminor(ci_dev));
966 
967 		    /* grab necessary lock */
968 		    un = (mp_unit_t *)md_unit_readerlock(ui);
969 
970 		    dev_mnum = MD_SID(un);
971 		    setno = MD_MIN2SET(dev_mnum);
972 		    side = mddb_getsidenum(setno);
973 
974 		    tmpdev = un->un_dev;
975 
976 		    /* Get dev by device id */
977 		    if (md_devid_found(setno, side, un->un_key) == 1) {
978 			tmpdev = md_resolve_bydevid(dev_mnum, tmpdev,
979 				un->un_key);
980 		    }
981 
982 		    md_unit_readerexit(ui);
983 
984 		    ci_dev = md_dev64_to_dev(tmpdev);
985 		    major = getmajor(ci_dev);
986 		}
987 
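		/*
		 * Query the component driver's cb_prop_op entry point for
		 * the "ddi-failfast-supported" property; a successful lookup
		 * means the driver supports B_FAILFAST i/o.
		 */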
988 		if (ci_dev != NODEV32 &&
989 		    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) {
990 		    ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
991 		    int			propvalue = 0;
992 		    int			proplength = sizeof (int);
993 		    int			error;
994 		    struct cb_ops	*cb;
995 
996 		    if ((cb = devopsp[major]->devo_cb_ops) != NULL) {
997 			error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op,
998 			    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
999 			    "ddi-failfast-supported",
1000 			    (caddr_t)&propvalue, &proplength);
1001 
1002 			if (error == DDI_PROP_SUCCESS)
1003 			    found = 1;
1004 		    }
1005 
1006 		    if (!found && new_non_ff_driver(ddi_driver_name(devi)))
1007 			cmn_err(CE_NOTE, "!md: B_FAILFAST I/O disabled on %s",
1008 			    ddi_driver_name(devi));
1009 
1010 		    ddi_release_devi(devi);
1011 		}
1012 
1013 		/* All components must support failfast in the submirror. */
1014 		if (!found) {
1015 		    ff = 0;
1016 		    break;
1017 		}
1018 	    }
1019 
1020 	    if (ff) {
1021 		sm->sm_flags |= MD_SM_FAILFAST;
1022 	    } else {
1023 		sm->sm_flags &= ~MD_SM_FAILFAST;
1024 	    }
1025 	}
1026 }
1027 
1028 /*
1029  * Return true if the submirror is unavailable.
1030  * If any of the submirror components are opened then the submirror cannot
1031  * be unavailable (MD_INACCESSIBLE).
1032  * If any of the components are already in the errored state, then the submirror
1033  * cannot be unavailable (MD_INACCESSIBLE).
1034  */
1035 static bool_t
1036 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1037 {
1038 	mm_submirror_t		*sm;
1039 	mm_submirror_ic_t	*smic;
1040 	md_m_shared_t		*shared;
1041 	int			ci;
1042 	int			compcnt;
1043 
1044 	sm = &un->un_sm[smi];
1045 	smic = &un->un_smic[smi];
1046 
1047 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1048 	for (ci = 0; ci < compcnt; ci++) {
1049 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1050 		    (sm->sm_dev, sm, ci);
1051 		if (from_probe) {
1052 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1053 				return (B_FALSE);
1054 		} else {
1055 			if (shared->ms_flags & MDM_S_ISOPEN)
1056 				return (B_FALSE);
1057 		}
1058 		if (shared->ms_state == CS_ERRED ||
1059 		    shared->ms_state == CS_LAST_ERRED)
1060 			return (B_FALSE);
1061 	}
1062 
1063 	return (B_TRUE);
1064 }
1065 
1066 static int
1067 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1068 {
1069 	int		i;
1070 	mm_unit_t	*un;
1071 	mdi_unit_t	*ui;
1072 	int		err;
1073 	int		smi;
1074 	int		ci;
1075 	err_comp_t	*c;
1076 	err_comp_t	*ecomps = NULL;
1077 	int		smmask = 0;
1078 	set_t		setno;
1079 	int		sm_cnt;
1080 	int		sm_unavail_cnt;
1081 
1082 	mirror_check_failfast(mnum);
1083 
1084 	un = MD_UNIT(mnum);
1085 	ui = MDI_UNIT(mnum);
1086 	setno = MD_UN2SET(un);
1087 
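	/*
	 * First pass: attempt a normal layered open of each in-use
	 * submirror, recording the submirrors that fail to open in smmask.
	 */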
1088 	for (i = 0; i < NMIRROR; i++) {
1089 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1090 
1091 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1092 			continue;
1093 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1094 			smmask |= SMI2BIT(i);
1095 		un->un_sm[i].sm_dev = tmpdev;
1096 	}
1097 
1098 	/*
1099 	 * If smmask is clear, all submirrors are accessible. Clear the
1100 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1101 	 * mirror device.   If smmask is set, we have to determine which of the
1102 	 * submirrors are in error. If no submirror is accessible we mark the
1103 	 * whole mirror as MD_INACCESSIBLE.
1104 	 */
1105 	if (smmask == 0) {
1106 		if (lockp) {
1107 			md_ioctl_readerexit(lockp);
1108 			(void) md_ioctl_writerlock(lockp, ui);
1109 		} else {
1110 			md_unit_readerexit(ui);
1111 			(void) md_unit_writerlock(ui);
1112 		}
1113 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1114 		if (lockp) {
1115 			md_ioctl_writerexit(lockp);
1116 			(void) md_ioctl_readerlock(lockp, ui);
1117 		} else {
1118 			md_unit_writerexit(ui);
1119 			(void) md_unit_readerlock(ui);
1120 		}
1121 
1122 		for (i = 0; i < NMIRROR; i++) {
1123 			md_dev64_t	tmpdev;
1124 			mdi_unit_t	*sm_ui;
1125 
1126 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1127 				continue;
1128 
1129 			tmpdev = un->un_sm[i].sm_dev;
1130 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1131 			(void) md_unit_writerlock(sm_ui);
1132 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1133 			md_unit_writerexit(sm_ui);
1134 		}
1135 
1136 		return (0);
1137 	}
1138 
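	/*
	 * Re-open the submirrors that failed above, this time with
	 * MD_OFLG_CONT_ERRS so that component errors are recorded rather
	 * than failing the open; the errored components are then collected
	 * by the mirror_geterror() loop below.
	 */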
1139 	for (i = 0; i < NMIRROR; i++) {
1140 		md_dev64_t tmpdev;
1141 
1142 		if (!(smmask & SMI2BIT(i)))
1143 			continue;
1144 
1145 		tmpdev = un->un_sm[i].sm_dev;
1146 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1147 		un->un_sm[i].sm_dev = tmpdev;
1148 		ASSERT(err == 0);
1149 	}
1150 
1151 	if (lockp) {
1152 		md_ioctl_readerexit(lockp);
1153 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1154 	} else {
1155 		md_unit_readerexit(ui);
1156 		un = (mm_unit_t *)md_unit_writerlock(ui);
1157 	}
1158 
1159 	/*
1160 	 * We want to make sure the unavailable flag is not masking a real
1161 	 * error on the submirror.
1162 	 * For each submirror,
1163 	 *    if all of the submirror components couldn't be opened and there
1164 	 *    are no errors on the submirror, then set the unavailable flag;
1165 	 *    otherwise, clear it.
1166 	 */
1167 	sm_cnt = 0;
1168 	sm_unavail_cnt = 0;
1169 	for (i = 0; i < NMIRROR; i++) {
1170 		md_dev64_t	tmpdev;
1171 		mdi_unit_t	*sm_ui;
1172 
1173 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1174 			continue;
1175 
1176 		sm_cnt++;
1177 		tmpdev = un->un_sm[i].sm_dev;
1178 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1179 
1180 		(void) md_unit_writerlock(sm_ui);
1181 		if (submirror_unavailable(un, i, 0)) {
1182 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1183 			sm_unavail_cnt++;
1184 		} else {
1185 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1186 		}
1187 		md_unit_writerexit(sm_ui);
1188 	}
1189 
1190 	/*
1191 	 * If all of the submirrors are unavailable, the mirror is also
1192 	 * unavailable.
1193 	 */
1194 	if (sm_cnt == sm_unavail_cnt) {
1195 		ui->ui_tstate |= MD_INACCESSIBLE;
1196 	} else {
1197 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1198 	}
1199 
1200 	smi = 0;
1201 	ci = 0;
1202 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1203 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1204 
1205 			free_all_ecomps(ecomps);
1206 			(void) mirror_close_all_devs(un, md_oflags);
1207 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1208 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1209 			mirror_openfail_console_info(un, smi, ci);
1210 			if (lockp) {
1211 				md_ioctl_writerexit(lockp);
1212 				(void) md_ioctl_readerlock(lockp, ui);
1213 			} else {
1214 				md_unit_writerexit(ui);
1215 				(void) md_unit_readerlock(ui);
1216 			}
1217 			return (ENXIO);
1218 		}
1219 
1220 		/* track all component states that need changing */
1221 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1222 		c->ec_next = ecomps;
1223 		c->ec_smi = smi;
1224 		c->ec_ci = ci;
1225 		ecomps = c;
1226 		ci++;
1227 	}
1228 
1229 	/* Make all state changes and commit them */
1230 	for (c = ecomps; c != NULL; c = c->ec_next) {
1231 		/*
1232 		 * If lockp is set, then entering kernel through ioctl.
1233 		 * For a MN set, the only ioctl path is via a commd message
1234 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1235 		 * being sent to each node.
1236 		 * In this case, set NO_XMIT so that set_sm_comp_state
1237 		 * won't attempt to send a message while processing a message.
1238 		 *
1239 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1240 		 * which flag is passed.
1241 		 */
1242 		if (lockp) {
1243 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1244 			    MD_STATE_NO_XMIT, lockp);
1245 		} else {
1246 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1247 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1248 		}
1249 		/*
1250 		 * For a MN set, the NOTIFY is done when the state change is
1251 		 * processed on each node
1252 		 */
1253 		if (!MD_MNSET_SETNO(setno)) {
1254 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1255 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1256 		}
1257 	}
1258 
1259 	if (lockp) {
1260 		md_ioctl_writerexit(lockp);
1261 		(void) md_ioctl_readerlock(lockp, ui);
1262 	} else {
1263 		md_unit_writerexit(ui);
1264 		(void) md_unit_readerlock(ui);
1265 	}
1266 
1267 	free_all_ecomps(ecomps);
1268 
1269 	/* allocate hotspares for all errored components */
1270 	if (MD_MNSET_SETNO(setno)) {
1271 		/*
1272 		 * If we're called from an ioctl (lockp set) then we cannot
1273 		 * directly call send_poke_hotspares as this will block until
1274 		 * the message gets despatched to all nodes. If the cluster is
1275 		 * going through a reconfig cycle then the message will block
1276 		 * until the cycle is complete, and as we originate from a
1277 		 * service call from commd we will livelock.
1278 		 */
1279 		if (lockp == NULL) {
1280 			md_unit_readerexit(ui);
1281 			send_poke_hotspares(setno);
1282 			(void) md_unit_readerlock(ui);
1283 		}
1284 	} else {
1285 		(void) poke_hotspares();
1286 	}
1287 	return (0);
1288 }
1289 
1290 void
1291 mirror_overlap_chain_remove(md_mps_t *ps)
1292 {
1293 	mm_unit_t	*un;
1294 
1295 	if (panicstr)
1296 		return;
1297 
1298 	ASSERT(ps->ps_flags & MD_MPS_ON_OVERLAP);
1299 
1300 	un = ps->ps_un;
1301 
1302 	mutex_enter(&un->un_ovrlap_chn_mx);
1303 	if (ps->ps_ovrlap_prev != &un->un_ovrlap_chn)
1304 		ps->ps_ovrlap_prev->ps_ovrlap_next = ps->ps_ovrlap_next;
1305 	else
1306 		un->un_ovrlap_chn.ps_ovrlap_next = ps->ps_ovrlap_next;
1307 	if (ps->ps_ovrlap_next != &un->un_ovrlap_chn)
1308 		ps->ps_ovrlap_next->ps_ovrlap_prev = ps->ps_ovrlap_prev;
1309 	else
1310 		un->un_ovrlap_chn.ps_ovrlap_prev = ps->ps_ovrlap_prev;
1311 	/* Handle empty overlap chain */
1312 	if (un->un_ovrlap_chn.ps_ovrlap_prev == &un->un_ovrlap_chn) {
1313 		un->un_ovrlap_chn.ps_ovrlap_prev =
1314 		    un->un_ovrlap_chn.ps_ovrlap_next = NULL;
1315 	}
1316 	if (un->un_ovrlap_chn_flg) {
1317 		un->un_ovrlap_chn_flg = 0;
1318 		cv_broadcast(&un->un_ovrlap_chn_cv);
1319 	}
1320 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1321 	mutex_exit(&un->un_ovrlap_chn_mx);
1322 }
1323 
1324 
1325 /*
1326  * wait_for_overlaps:
1327  * -----------------
1328  * Check that the given i/o request does not overlap with any already-pending
1329  * i/o. If it does, block until the overlapping i/o completes.
1330  *
1331  * Note: the overlap chain is held as a monotonically increasing doubly-linked
1332  * list with the sentinel contained in un->un_ovrlap_chn. We avoid a linear
1333  * search of the list by the following logic:
1334  *	ps->ps_lastblk < un_ovrlap_chn.ps_ovrlap_next->ps_firstblk => No overlap
1335  *	ps->ps_firstblk > un_ovrlap_chn.ps_ovrlap_prev->ps_lastblk => No overlap
1336  * otherwise
1337  *	scan un_ovrlap_chn.ps_ovrlap_next for location where ps->ps_firstblk
1338  *	> chain->ps_lastblk. This is the insertion point. As the list is
1339  *	guaranteed to be ordered there is no need to continue scanning.
1340  *
1341  * The flags argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1342  *	structure to be already on the overlap chain, and MD_OVERLAP_NO_REPEAT
1343  *	if it must not already be on the chain.
1344  */
1345 static void
1346 wait_for_overlaps(md_mps_t *ps, int flags)
1347 {
1348 	mm_unit_t	*un;
1349 	md_mps_t	*ps1, **head, **tail;
1350 
1351 	if (panicstr)
1352 		return;
1353 
1354 
1355 	un = ps->ps_un;
1356 
1357 	mutex_enter(&un->un_ovrlap_chn_mx);
1358 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1359 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1360 		mutex_exit(&un->un_ovrlap_chn_mx);
1361 		return;
1362 	}
1363 
1364 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1365 	head = &(un->un_ovrlap_chn.ps_ovrlap_next);
1366 	tail = &(un->un_ovrlap_chn.ps_ovrlap_prev);
1367 	ps1 = *head;
1368 	/*
1369 	 * Check for simple limit cases:
1370 	 *	*head == NULL
1371 	 *		insert ps at head of list
1372 	 *	lastblk < head->firstblk
1373 	 *		insert at head of list
1374 	 *	firstblk > tail->lastblk
1375 	 *		insert at tail of list
1376 	 */
1377 	if (ps1 == NULL) {
1378 		/* Insert at head */
1379 		ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1380 		ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1381 		*head = ps;
1382 		*tail = ps;
1383 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1384 		mutex_exit(&un->un_ovrlap_chn_mx);
1385 		return;
1386 	} else if (ps->ps_lastblk < (*head)->ps_firstblk) {
1387 		/* Insert at head */
1388 		ps->ps_ovrlap_next = (*head);
1389 		ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1390 		(*head)->ps_ovrlap_prev = ps;
1391 		*head = ps;
1392 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1393 		mutex_exit(&un->un_ovrlap_chn_mx);
1394 		return;
1395 	} else if (ps->ps_firstblk > (*tail)->ps_lastblk) {
1396 		/* Insert at tail */
1397 		ps->ps_ovrlap_prev = (*tail);
1398 		ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1399 		(*tail)->ps_ovrlap_next = ps;
1400 		*tail = ps;
1401 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1402 		mutex_exit(&un->un_ovrlap_chn_mx);
1403 		return;
1404 	}
1405 	/* Now we have to scan the list for possible overlaps */
1406 	while (ps1 != NULL) {
1407 		/*
1408 		 * If this region has been put on the chain by another thread
1409 		 * just exit
1410 		 */
1411 		if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1412 		    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1413 			mutex_exit(&un->un_ovrlap_chn_mx);
1414 			return;
1415 
1416 		}
1417 		for (ps1 = *head; ps1 && (ps1 != &un->un_ovrlap_chn);
1418 		    ps1 = ps1->ps_ovrlap_next) {
1419 			if (ps->ps_firstblk > (*tail)->ps_lastblk) {
1420 				/* Insert at tail */
1421 				ps->ps_ovrlap_prev = (*tail);
1422 				ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1423 				(*tail)->ps_ovrlap_next = ps;
1424 				*tail = ps;
1425 				ps->ps_flags |= MD_MPS_ON_OVERLAP;
1426 				mutex_exit(&un->un_ovrlap_chn_mx);
1427 				return;
1428 			}
1429 			if (ps->ps_firstblk > ps1->ps_lastblk)
1430 				continue;
1431 			if (ps->ps_lastblk < ps1->ps_firstblk) {
1432 				/* Insert into list at current 'ps1' position */
1433 				ps->ps_ovrlap_next = ps1;
1434 				ps->ps_ovrlap_prev = ps1->ps_ovrlap_prev;
1435 				ps1->ps_ovrlap_prev->ps_ovrlap_next = ps;
1436 				ps1->ps_ovrlap_prev = ps;
1437 				ps->ps_flags |= MD_MPS_ON_OVERLAP;
1438 				mutex_exit(&un->un_ovrlap_chn_mx);
1439 				return;
1440 			}
1441 			break;
1442 		}
1443 		if (ps1 != NULL) {
1444 			un->un_ovrlap_chn_flg = 1;
1445 			cv_wait(&un->un_ovrlap_chn_cv, &un->un_ovrlap_chn_mx);
1446 			/*
1447 			 * Now ps1 refers to the old insertion point and we
1448 			 * have to check the whole chain to see if we're still
1449 			 * overlapping any other i/o.
1450 			 */
1451 		}
1452 	}
1453 
1454 	/*
1455 	 * Only get here if we had one overlapping i/o on the list and that
1456 	 * has now completed. In this case the list is empty so we insert <ps>
1457 	 * at the head of the chain.
1458 	 */
1459 	ASSERT(*head == NULL);
1460 	*tail = *head = ps;
1461 	ps->ps_ovrlap_next = ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1462 	ps->ps_flags |= MD_MPS_ON_OVERLAP;
1463 	mutex_exit(&un->un_ovrlap_chn_mx);
1464 }
1465 
1466 /*
1467  * This function is called from mirror_done to check whether any pages have
1468  * been modified while a mirrored write was in progress.  Returns 0 if
1469  * all pages associated with bp are clean, 1 otherwise.
1470  */
1471 static int
1472 any_pages_dirty(struct buf *bp)
1473 {
1474 	int	rval;
1475 
1476 	rval = biomodified(bp);
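	/*
	 * biomodified() returns -1 when the buffer is not mapped (there is
	 * no page list to inspect); treat that case as clean.
	 */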
1477 	if (rval == -1)
1478 		rval = 0;
1479 
1480 	return (rval);
1481 }
1482 
1483 #define	MAX_EXTRAS 10
1484 
1485 void
1486 mirror_commit(
1487 	mm_unit_t	*un,
1488 	int		smmask,
1489 	mddb_recid_t	*extras
1490 )
1491 {
1492 	mm_submirror_t		*sm;
1493 	md_unit_t		*su;
1494 	int			i;
1495 
1496 	/* 2=mirror,null id */
1497 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1498 
1499 	int			ri = 0;
1500 
1501 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1502 		return;
1503 
1504 	/* Add two, this includes the mirror unit and the null recid */
1505 	if (extras != NULL) {
1506 		int	nrecids = 0;
1507 		while (extras[nrecids] != 0) {
1508 			nrecids++;
1509 		}
1510 		ASSERT(nrecids <= MAX_EXTRAS);
1511 	}
1512 
1513 	if (un != NULL)
1514 		recids[ri++] = un->c.un_record_id;
1515 	for (i = 0;  i < NMIRROR; i++) {
1516 		if (!(smmask & SMI2BIT(i)))
1517 			continue;
1518 		sm = &un->un_sm[i];
1519 		if (!SMS_IS(sm, SMS_INUSE))
1520 			continue;
1521 		if (md_getmajor(sm->sm_dev) != md_major)
1522 			continue;
1523 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1524 		recids[ri++] = su->c.un_record_id;
1525 	}
1526 
1527 	if (extras != NULL)
1528 		while (*extras != 0) {
1529 			recids[ri++] = *extras;
1530 			extras++;
1531 		}
1532 
1533 	if (ri == 0)
1534 		return;
1535 	recids[ri] = 0;
1536 
1537 	/*
1538 	 * Ok to hold ioctl lock across record commit to mddb as
1539 	 * long as the record(s) being committed aren't resync records.
1540 	 */
1541 	mddb_commitrecs_wrapper(recids);
1542 }
1543 
1544 
1545 /*
1546  * This routine sets a bit in a writable_bm bitmap for each submirror of the
1547  * metamirror that is writable. The bitmap of writable submirrors is stored
1548  * in ps->ps_writable_sm and the number of writable submirrors is stored in
1549  * ps->ps_active_cnt.
1550  *
1551  * The current submirror index, ps->ps_current_sm, is reset to zero.
1552  */
1553 
1554 static void
1555 select_write_units(struct mm_unit *un, md_mps_t *ps)
1556 {
1557 
1558 	int		i;
1559 	unsigned	writable_bm = 0;
1560 	unsigned	nunits = 0;
1561 
1562 	for (i = 0; i < NMIRROR; i++) {
1563 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1564 			/* set bit of all writable units */
1565 			writable_bm |= SMI2BIT(i);
1566 			nunits++;
1567 		}
1568 	}
1569 	ps->ps_writable_sm = writable_bm;
1570 	ps->ps_active_cnt = nunits;
1571 	ps->ps_current_sm = 0;
1572 }
1573 
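/*
 * select_write_after_read_units:
 * Build the bitmap of submirrors to be written after a read: writable
 * submirrors that are flagged as resync targets, excluding the submirror
 * the data was read from (ps_allfrom_sm).  The bitmap and count are stored
 * in ps_writable_sm and ps_active_cnt, and the count is returned.
 */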
1574 static
1575 unsigned
1576 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1577 {
1578 
1579 	int		i;
1580 	unsigned	writable_bm = 0;
1581 	unsigned	nunits = 0;
1582 
1583 	for (i = 0; i < NMIRROR; i++) {
1584 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1585 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1586 			writable_bm |= SMI2BIT(i);
1587 			nunits++;
1588 		}
1589 	}
1590 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1591 		writable_bm &= ~ps->ps_allfrom_sm;
1592 		nunits--;
1593 	}
1594 	ps->ps_writable_sm = writable_bm;
1595 	ps->ps_active_cnt = nunits;
1596 	ps->ps_current_sm = 0;
1597 	return (nunits);
1598 }
1599 
1600 static md_dev64_t
1601 select_read_unit(
1602 	mm_unit_t	*un,
1603 	diskaddr_t	blkno,
1604 	u_longlong_t	reqcount,
1605 	u_longlong_t	*cando,
1606 	int		must_be_opened,
1607 	md_m_shared_t	**shared,
1608 	md_mcs_t	*cs)
1609 {
1610 	int			i;
1611 	md_m_shared_t		*s;
1612 	uint_t			lasterrcnt = 0;
1613 	md_dev64_t		dev = 0;
1614 	u_longlong_t		cnt;
1615 	u_longlong_t		mincnt;
1616 	mm_submirror_t		*sm;
1617 	mm_submirror_ic_t	*smic;
1618 	mdi_unit_t		*ui;
1619 
1620 	mincnt = reqcount;
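	/*
	 * Prefer a submirror whose component covering <blkno> is in the
	 * okay state; the first such submirror found is returned.  If none
	 * is okay, fall back to the last-erred component with the highest
	 * ms_lasterrcnt, limiting the returned count (*cando) to the
	 * smallest count seen among the last-erred candidates.
	 */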
1621 	for (i = 0; i < NMIRROR; i++) {
1622 		if (!SUBMIRROR_IS_READABLE(un, i))
1623 			continue;
1624 		sm = &un->un_sm[i];
1625 		smic = &un->un_smic[i];
1626 		cnt = reqcount;
1627 
1628 		/*
1629 		 * If the current submirror is marked as inaccessible, do not
1630 		 * try to access it.
1631 		 */
1632 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1633 		(void) md_unit_readerlock(ui);
1634 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1635 			md_unit_readerexit(ui);
1636 			continue;
1637 		}
1638 		md_unit_readerexit(ui);
1639 
1640 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1641 		    (sm->sm_dev, sm, blkno, &cnt);
1642 
1643 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1644 			continue;
1645 		if (s->ms_state == CS_OKAY) {
1646 			*cando = cnt;
1647 			if (shared != NULL)
1648 				*shared = s;
1649 
1650 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1651 			    cs != NULL) {
1652 				cs->cs_buf.b_flags |= B_FAILFAST;
1653 			}
1654 
1655 			return (un->un_sm[i].sm_dev);
1656 		}
1657 		if (s->ms_state != CS_LAST_ERRED)
1658 			continue;
1659 
1660 		/* don't use B_FAILFAST since we're Last Erred */
1661 
1662 		if (mincnt > cnt)
1663 			mincnt = cnt;
1664 		if (s->ms_lasterrcnt > lasterrcnt) {
1665 			lasterrcnt = s->ms_lasterrcnt;
1666 			if (shared != NULL)
1667 				*shared = s;
1668 			dev = un->un_sm[i].sm_dev;
1669 		}
1670 	}
1671 	*cando = mincnt;
1672 	return (dev);
1673 }
1674 
1675 /*
1676  * Given a 32-bit bitmap, this routine will return the bit number
1677  * of the nth bit set.  The value of n is passed in via the index argument.
1678  *
1679  * This routine is used to run through the writable submirror bitmap
1680  * and start all of the writes.  The value returned is the
1681  * index of the appropriate submirror structure in the un_sm
1682  * array of the metamirror.
1683  */
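/*
 * Example: for mask = 0xA (binary 1010) and index = 1 (the second set bit,
 * counting from zero), the routine returns bit number 3.
 */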
1684 static int
1685 md_find_nth_unit(uint_t mask, int index)
1686 {
1687 	int	bit, nfound;
1688 
1689 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1690 		ASSERT(mask != 0);
1691 		nfound += (mask & 1);
1692 		mask >>= 1;
1693 	}
1694 	return (bit);
1695 }
1696 
1697 static int
1698 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1699 {
1700 	mm_unit_t	*un;
1701 	buf_t		*bp;
1702 	int		i;
1703 	unsigned	nunits = 0;
1704 	int		iunit;
1705 	uint_t		running_bm = 0;
1706 	uint_t		sm_index;
1707 
1708 	bp = &cs->cs_buf;
1709 	un = ps->ps_un;
1710 
1711 	for (i = 0; i < NMIRROR; i++) {
1712 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1713 			continue;
1714 		running_bm |= SMI2BIT(i);
1715 		nunits++;
1716 	}
1717 	if (nunits == 0)
1718 		return (1);
1719 
1720 	/*
1721 	 * For directed mirror read (DMR) we only use the specified side and
1722 	 * do not compute the source of the read.
1723 	 */
1724 	if (ps->ps_flags & MD_MPS_DMR) {
1725 		sm_index = un->un_dmr_last_read;
1726 	} else {
1727 		/* Normal (non-DMR) operation */
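		/*
		 * Read policy selection:
		 *   RD_GEOMETRY - choose the submirror based on the block
		 *		   offset of the request,
		 *   RD_FIRST	 - always read from the first running
		 *		   submirror,
		 *   RD_LOAD_BAL - round-robin across the running submirrors.
		 */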
1728 		switch (un->un_read_option) {
1729 		case RD_GEOMETRY:
1730 			iunit = (int)(bp->b_lblkno /
1731 			    howmany(un->c.un_total_blocks, nunits));
1732 			sm_index = md_find_nth_unit(running_bm, iunit);
1733 			break;
1734 		case RD_FIRST:
1735 			sm_index = md_find_nth_unit(running_bm, 0);
1736 			break;
1737 		case RD_LOAD_BAL:
1738 			/* this is intentional to fall into the default */
1739 		default:
1740 			un->un_last_read = (un->un_last_read + 1) % nunits;
1741 			sm_index = md_find_nth_unit(running_bm,
1742 			    un->un_last_read);
1743 			break;
1744 		}
1745 	}
1746 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1747 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1748 
1749 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1750 	    bp->b_flags |= B_FAILFAST;
1751 	}
1752 
1753 	return (0);
1754 }
1755 
1756 static
1757 int
1758 mirror_are_submirrors_available(mm_unit_t *un)
1759 {
1760 	int i;
1761 	for (i = 0; i < NMIRROR; i++) {
1762 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1763 
1764 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1765 		    md_getmajor(tmpdev) != md_major)
1766 			continue;
1767 
1768 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1769 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1770 			return (0);
1771 
1772 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1773 			return (0);
1774 	}
1775 	return (1);
1776 }
1777 
1778 void
1779 build_submirror(mm_unit_t *un, int i, int snarfing)
1780 {
1781 	struct mm_submirror	*sm;
1782 	struct mm_submirror_ic	*smic;
1783 	md_unit_t		*su;
1784 	set_t			setno;
1785 
1786 	sm = &un->un_sm[i];
1787 	smic = &un->un_smic[i];
1788 
1789 	sm->sm_flags = 0; /* sometime we may need to do more here */
1790 
1791 	setno = MD_UN2SET(un);
1792 
1793 	if (!SMS_IS(sm, SMS_INUSE))
1794 		return;
1795 	if (snarfing) {
1796 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1797 						sm->sm_key, MD_NOTRUST_DEVT);
1798 	} else {
1799 		if (md_getmajor(sm->sm_dev) == md_major) {
1800 			su = MD_UNIT(md_getminor(sm->sm_dev));
1801 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1802 			/* submirror can no longer be soft partitioned */
1803 			MD_CAPAB(su) &= (~MD_CAN_SP);
1804 		}
1805 	}
1806 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1807 	    0, "shared by blk", 0);
1808 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1809 	    0, "shared by indx", 0);
1810 	smic->sm_get_component_count =
1811 	    (int (*)())md_get_named_service(sm->sm_dev, 0,
1812 		    "get component count", 0);
1813 	smic->sm_get_bcss =
1814 	    (int (*)())md_get_named_service(sm->sm_dev, 0,
1815 		    "get block count skip size", 0);
1816 	sm->sm_state &= ~SMS_IGNORE;
1817 	if (SMS_IS(sm, SMS_OFFLINE))
1818 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1819 	md_set_parent(sm->sm_dev, MD_SID(un));
1820 }
1821 
1822 static void
1823 mirror_cleanup(mm_unit_t *un)
1824 {
1825 	mddb_recid_t	recid;
1826 	int		smi;
1827 	sv_dev_t	sv[NMIRROR];
1828 	int		nsv = 0;
1829 
1830 	/*
1831 	 * If a MN diskset and this node is not the master, do
1832 	 * not delete any records on snarf of the mirror records.
1833 	 */
1834 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1835 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1836 		return;
1837 	}
1838 
1839 	for (smi = 0; smi < NMIRROR; smi++) {
1840 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1841 			continue;
1842 		sv[nsv].setno = MD_UN2SET(un);
1843 		sv[nsv++].key = un->un_sm[smi].sm_key;
1844 	}
1845 
1846 	recid = un->un_rr_dirty_recid;
1847 	mddb_deleterec_wrapper(un->c.un_record_id);
1848 	if (recid > 0)
1849 		mddb_deleterec_wrapper(recid);
1850 
1851 	md_rem_names(sv, nsv);
1852 }
1853 
1854 /* Return -1 if the optimized record is unavailable and the set should be released */
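/*
 * The routine otherwise returns 0 on success (or if the unit is already
 * built in-core) and 1 if the in-core state could not be built (the unit
 * is being reset, its submirrors are unavailable, or resync setup failed
 * while not snarfing).
 */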
1855 int
1856 mirror_build_incore(mm_unit_t *un, int snarfing)
1857 {
1858 	int		i;
1859 
1860 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1861 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1862 		return (1);
1863 	}
1864 
1865 	if (mirror_are_submirrors_available(un) == 0)
1866 		return (1);
1867 
1868 	if (MD_UNIT(MD_SID(un)) != NULL)
1869 		return (0);
1870 
1871 	MD_STATUS(un) = 0;
1872 
1873 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1874 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1875 
1876 	un->un_ovrlap_chn_flg = 0;
1877 	bzero(&un->un_ovrlap_chn, sizeof (un->un_ovrlap_chn));
1878 
1879 	for (i = 0; i < NMIRROR; i++)
1880 		build_submirror(un, i, snarfing);
1881 
1882 	if (unit_setup_resync(un, snarfing) != 0) {
1883 		if (snarfing) {
1884 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1885 			/*
1886 			 * If a MN set and set is not stale, then return -1
1887 			 * which will force the caller to unload the set.
1888 			 * The MN diskset nodes will return failure if
1889 			 * unit_setup_resync fails so that nodes won't
1890 			 * get out of sync.
1891 			 *
1892 			 * If set is STALE, the master node can't allocate
1893 			 * a resync record (if needed), but node needs to
1894 			 * join the set so that user can delete broken mddbs.
1895 			 * So, if set is STALE, just continue on.
1896 			 */
1897 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1898 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1899 				return (-1);
1900 			}
1901 		} else
1902 			return (1);
1903 	}
1904 
1905 	mutex_init(&un->un_ovrlap_chn_mx, NULL, MUTEX_DEFAULT, NULL);
1906 	cv_init(&un->un_ovrlap_chn_cv, NULL, CV_DEFAULT, NULL);
1907 
1908 	un->un_suspend_wr_flag = 0;
1909 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1910 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1911 
1912 	/*
1913 	 * Allocate a mutex for mirror-owner and resync-owner changes.
1914 	 * All references to the owner message state field must be guarded
1915 	 * by this mutex.
1916 	 */
1917 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1918 
1919 	/*
1920 	 * Allocate mutex and condvar for resync thread manipulation. These
1921 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1922 	 */
1923 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1924 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1925 
1926 	/*
1927 	 * Allocate mutex and condvar for resync progress thread manipulation.
1928 	 * This allows resyncs to be continued across an intervening reboot.
1929 	 */
1930 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1931 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1932 
1933 	/*
1934 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1935 	 * provides synchronization between a user-ioctl and the resulting
1936 	 * strategy() call that performs the read().
1937 	 */
1938 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1939 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1940 
1941 	MD_UNIT(MD_SID(un)) = un;
1942 	return (0);
1943 }
1944 
1945 
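/*
 * reset_mirror:
 * ------------
 * Tear down the in-core state of a mirror unit: free the dirty-region
 * tracking arrays, clear the unit array entry and remove the minor node.
 * If 'removing' is set, also re-enable soft partitioning of the
 * submirrors, reset their parent associations, destroy the unit's
 * mutexes and condvars, delete the unit, resync and vtoc records, remove
 * the names from the namespace and generate a delete sysevent.
 */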
1946 void
1947 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
1948 {
1949 	mddb_recid_t	recid, vtoc_id;
1950 	size_t		bitcnt;
1951 	size_t		shortcnt;
1952 	int		smi;
1953 	sv_dev_t	sv[NMIRROR];
1954 	int		nsv = 0;
1955 	uint_t		bits = 0;
1956 	minor_t		selfid;
1957 	md_unit_t	*su;
1958 
1959 	md_destroy_unit_incore(mnum, &mirror_md_ops);
1960 
1961 	shortcnt = un->un_rrd_num * sizeof (short);
1962 	bitcnt = howmany(un->un_rrd_num, NBBY);
1963 
1964 	if (un->un_outstanding_writes)
1965 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
1966 	if (un->un_goingclean_bm)
1967 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
1968 	if (un->un_goingdirty_bm)
1969 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
1970 	if (un->un_resync_bm)
1971 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
1972 
1973 	MD_UNIT(mnum) = NULL;
1974 
1975 	/*
1976 	 * Attempt release of its minor node
1977 	 */
1978 	(void) md_remove_minor_node(mnum);
1979 
1980 	if (!removing)
1981 		return;
1982 
1983 	for (smi = 0; smi < NMIRROR; smi++) {
1984 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1985 			continue;
1986 		/* reallow soft partitioning of submirror and reset parent */
1987 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
1988 		MD_CAPAB(su) |= MD_CAN_SP;
1989 		md_reset_parent(un->un_sm[smi].sm_dev);
1990 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
1991 
1992 		sv[nsv].setno = MD_MIN2SET(mnum);
1993 		sv[nsv++].key = un->un_sm[smi].sm_key;
1994 		bits |= SMI2BIT(smi);
1995 	}
1996 
1997 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1998 	recid = un->un_rr_dirty_recid;
1999 	vtoc_id = un->c.un_vtoc_id;
2000 	selfid = MD_SID(un);
2001 
2002 	mirror_commit(un, bits, 0);
2003 
2004 	/* Destroy all mutexes and condvars before returning. */
2005 	mutex_destroy(&un->un_suspend_wr_mx);
2006 	cv_destroy(&un->un_suspend_wr_cv);
2007 	mutex_destroy(&un->un_ovrlap_chn_mx);
2008 	cv_destroy(&un->un_ovrlap_chn_cv);
2009 	mutex_destroy(&un->un_owner_mx);
2010 	mutex_destroy(&un->un_rs_thread_mx);
2011 	cv_destroy(&un->un_rs_thread_cv);
2012 	mutex_destroy(&un->un_rs_progress_mx);
2013 	cv_destroy(&un->un_rs_progress_cv);
2014 	mutex_destroy(&un->un_dmr_mx);
2015 	cv_destroy(&un->un_dmr_cv);
2016 
2017 	/*
2018 	 * Remove self from the namespace
2019 	 */
2020 	if (un->c.un_revision & MD_FN_META_DEV) {
2021 		(void) md_rem_selfname(un->c.un_self_id);
2022 	}
2023 
2024 	mddb_deleterec_wrapper(un->c.un_record_id);
2025 	if (recid != 0)
2026 		mddb_deleterec_wrapper(recid);
2027 
2028 	/* Remove the vtoc, if present */
2029 	if (vtoc_id)
2030 		mddb_deleterec_wrapper(vtoc_id);
2031 
2032 	md_rem_names(sv, nsv);
2033 
2034 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2035 	    MD_MIN2SET(selfid), selfid);
2036 }
2037 
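/*
 * mirror_internal_open:
 * --------------------
 * Open a mirror unit, single-threading against other opens/closes via the
 * unit openclose lock and the MD_UL_OPENINPROGRESS flag. The underlying
 * devices are opened if the unit is not already open (or is currently
 * inaccessible) and the open count is then updated. lockp may be NULL if
 * the caller does not hold the ioctl lock.
 */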
2038 int
2039 mirror_internal_open(
2040 	minor_t		mnum,
2041 	int		flag,
2042 	int		otyp,
2043 	int		md_oflags,
2044 	IOLOCK		*lockp		/* can be NULL */
2045 )
2046 {
2047 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2048 	int		err = 0;
2049 
2050 tryagain:
2051 	/* single thread */
2052 	if (lockp) {
2053 		/*
2054 		 * If ioctl lock is held, use openclose_enter
2055 		 * routine that will set the ioctl flag when
2056 		 * grabbing the readerlock.
2057 		 */
2058 		(void) md_ioctl_openclose_enter(lockp, ui);
2059 	} else {
2060 		(void) md_unit_openclose_enter(ui);
2061 	}
2062 
2063 	/*
2064 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2065 	 * message in a MN diskset and this requires that the openclose
2066 	 * lock is dropped in order to send this message.  So, another
2067 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2068 	 * attempting an open while this thread has an open in progress.
2069 	 * Call the *_lh version of the lock exit routines since the ui_mx
2070 	 * mutex must be held from checking for OPENINPROGRESS until
2071 	 * after the cv_wait call.
2072 	 */
2073 	mutex_enter(&ui->ui_mx);
2074 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2075 		if (lockp) {
2076 			(void) md_ioctl_openclose_exit_lh(lockp);
2077 		} else {
2078 			md_unit_openclose_exit_lh(ui);
2079 		}
2080 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2081 		mutex_exit(&ui->ui_mx);
2082 		goto tryagain;
2083 	}
2084 
2085 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2086 	mutex_exit(&ui->ui_mx);
2087 
2088 	/* open devices, if necessary */
2089 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2090 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2091 			goto out;
2092 	}
2093 
2094 	/* count open */
2095 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2096 		goto out;
2097 
2098 	/* unlock, return success */
2099 out:
2100 	mutex_enter(&ui->ui_mx);
2101 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2102 	mutex_exit(&ui->ui_mx);
2103 
2104 	if (lockp) {
2105 		/*
2106 		 * If ioctl lock is held, use openclose_exit
2107 		 * routine that will clear the lockp reader flag.
2108 		 */
2109 		(void) md_ioctl_openclose_exit(lockp);
2110 	} else {
2111 		md_unit_openclose_exit(ui);
2112 	}
2113 	return (err);
2114 }
2115 
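/*
 * mirror_internal_close:
 * ---------------------
 * Close a mirror unit. The open count is decremented and, on the last
 * close, the dirty region state is processed and the underlying devices
 * are closed. For a multi-node set with transient ABR/DMR capabilities
 * set, a message is sent so that the capabilities are cleared once no
 * node has the device open. lockp may be NULL if the caller does not
 * hold the ioctl lock.
 */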
2116 int
2117 mirror_internal_close(
2118 	minor_t		mnum,
2119 	int		otyp,
2120 	int		md_cflags,
2121 	IOLOCK		*lockp		/* can be NULL */
2122 )
2123 {
2124 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2125 	mm_unit_t	*un;
2126 	int		err = 0;
2127 
2128 	/* single thread */
2129 	if (lockp) {
2130 		/*
2131 		 * If ioctl lock is held, use openclose_enter
2132 		 * routine that will set the ioctl flag when
2133 		 * grabbing the readerlock.
2134 		 */
2135 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2136 	} else {
2137 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2138 	}
2139 
2140 	/* count closed */
2141 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2142 		goto out;
2143 
2144 	/* close devices, if necessary */
2145 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2146 		/*
2147 		 * Clean up dirty bitmap for this unit. Do this
2148 		 * before closing the underlying devices to avoid
2149 		 * race conditions with reset_mirror() as a
2150 		 * result of a 'metaset -r' command running in
2151 		 * parallel. This might cause deallocation of
2152 		 * dirty region bitmaps; with underlying metadevices
2153 		 * in place this can't happen.
2154 		 * Don't do this if a MN set and ABR not set
2155 		 * Don't do this if a MN set and ABR is set.
2156 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2157 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2158 			    !(ui->ui_tstate & MD_ABR_CAP))
2159 				mirror_process_unit_resync(un);
2160 		}
2161 		(void) mirror_close_all_devs(un, md_cflags);
2162 
2163 		/*
2164 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2165 		 * clear these capabilities on the last open in the cluster.
2166 		 * To do this we send a message to all nodes to see if the
2167 		 * device is open.
2168 		 */
2169 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2170 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2171 			if (lockp) {
2172 				(void) md_ioctl_openclose_exit(lockp);
2173 			} else {
2174 				md_unit_openclose_exit(ui);
2175 			}
2176 
2177 			/*
2178 			 * if we are in the context of an ioctl, drop the
2179 			 * ioctl lock.
2180 			 * Otherwise, no other locks should be held.
2181 			 */
2182 			if (lockp) {
2183 				IOLOCK_RETURN_RELEASE(0, lockp);
2184 			}
2185 
2186 			mdmn_clear_all_capabilities(mnum);
2187 
2188 			/* if dropped the lock previously, regain it */
2189 			if (lockp) {
2190 				IOLOCK_RETURN_REACQUIRE(lockp);
2191 			}
2192 			return (0);
2193 		}
2194 		/* unlock and return success */
2195 	}
2196 out:
2197 	/* Call whether lockp is NULL or not. */
2198 	if (lockp) {
2199 		md_ioctl_openclose_exit(lockp);
2200 	} else {
2201 		md_unit_openclose_exit(ui);
2202 	}
2203 	return (err);
2204 }
2205 
2206 /*
2207  * When a component has completed resyncing and is now ok, check if the
2208  * corresponding component in the other submirrors is in the Last Erred
2209  * state.  If it is, we want to change that to the Erred state so we stop
2210  * using that component and start using this good component instead.
2211  *
2212  * This is called from set_sm_comp_state and recursively calls
2213  * set_sm_comp_state if it needs to change the Last Erred state.
2214  */
2215 static void
2216 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2217 	IOLOCK *lockp)
2218 {
2219 	mm_submirror_t		*sm;
2220 	mm_submirror_ic_t	*smic;
2221 	int			ci;
2222 	int			i;
2223 	int			compcnt;
2224 	int			changed = 0;
2225 
2226 	for (i = 0; i < NMIRROR; i++) {
2227 		sm = &un->un_sm[i];
2228 		smic = &un->un_smic[i];
2229 
2230 		if (!SMS_IS(sm, SMS_INUSE))
2231 			continue;
2232 
2233 		/* ignore the submirror that we just made ok */
2234 		if (i == smi)
2235 			continue;
2236 
2237 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2238 		for (ci = 0; ci < compcnt; ci++) {
2239 			md_m_shared_t	*shared;
2240 
2241 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2242 			    (sm->sm_dev, sm, ci);
2243 
2244 			if ((shared->ms_state & CS_LAST_ERRED) &&
2245 			    !mirror_other_sources(un, i, ci, 1)) {
2246 
2247 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2248 				    flags, lockp);
2249 				changed = 1;
2250 			}
2251 		}
2252 	}
2253 
2254 	/* maybe there is a hotspare for this newly erred component */
2255 	if (changed) {
2256 		set_t	setno;
2257 
2258 		setno = MD_UN2SET(un);
2259 		if (MD_MNSET_SETNO(setno)) {
2260 			send_poke_hotspares(setno);
2261 		} else {
2262 			(void) poke_hotspares();
2263 		}
2264 	}
2265 }
2266 
2267 /*
2268  * set_sm_comp_state
2269  *
2270  * Set the state of a submirror component to the specified new state.
2271  * If the mirror is in a multi-node set, send messages to all nodes to
2272  * block all writes to the mirror and then update the state and release the
2273  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2274  * MD_STATE_XMIT will be unset in 2 cases:
2275  * 1. When the state is changed to CS_RESYNC as this state change
2276  * will already have been updated on each node by the processing of the
2277  * distributed metasync command, hence no need to xmit.
2278  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2279  * the resync completion will already have been processed on each node by
2280  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2281  * resync, hence no need to xmit.
2282  *
2283  * If we are called as a result of a watermark update (in which case
2284  * MD_STATE_WMUPDATE is set in ps->flags), this is due to a metainit or
2285  * similar. In this case the message that we send to propagate the state
2286  * change must not be a class1 message, as that would deadlock with the
2287  * metainit command that is still being processed. We achieve this by
2288  * creating a class2 message, MD_MN_MSG_STATE_UPDATE2, instead. This also
2289  * causes the submessage generator to create a class2 submessage rather
2290  * than a class1 (which would also block).
2291  *
2292  * On entry, unit_writerlock is held
2293  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2294  * also held.
2295  */
2296 void
2297 set_sm_comp_state(
2298 	mm_unit_t	*un,
2299 	int		smi,
2300 	int		ci,
2301 	int		newstate,
2302 	mddb_recid_t	*extras,
2303 	uint_t		flags,
2304 	IOLOCK		*lockp
2305 )
2306 {
2307 	mm_submirror_t		*sm;
2308 	mm_submirror_ic_t	*smic;
2309 	md_m_shared_t		*shared;
2310 	int			origstate;
2311 	void			(*get_dev)();
2312 	ms_cd_info_t		cd;
2313 	char			devname[MD_MAX_CTDLEN];
2314 	int			err;
2315 	set_t			setno = MD_UN2SET(un);
2316 	md_mn_msg_stch_t	stchmsg;
2317 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2318 	md_mn_kresult_t		*kresult;
2319 	int			rval;
2320 	uint_t			msgflags;
2321 	md_mn_msgtype_t		msgtype;
2322 	int			save_lock = 0;
2323 	mdi_unit_t		*ui_sm;
2324 
2325 	sm = &un->un_sm[smi];
2326 	smic = &un->un_smic[smi];
2327 
2328 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2329 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2330 	if ((newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED)) &&
2331 	    (ui_sm->ui_tstate & MD_INACCESSIBLE)) {
2332 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2333 	}
2334 
2335 	shared = (md_m_shared_t *)
2336 		(*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
2337 	origstate = shared->ms_state;
2338 
2339 	/*
2340 	 * If the new state is an error and the old one wasn't, generate
2341 	 * a console message. We do this before we send the state to other
2342 	 * nodes in a MN set because the state change may change the component
2343 	 * name  if a hotspare is allocated.
2344 	 */
2345 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2346 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2347 
2348 		get_dev =
2349 		    (void (*)())md_get_named_service(sm->sm_dev, 0,
2350 				"get device", 0);
2351 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2352 
2353 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2354 		    cd.cd_dev, devname, sizeof (devname));
2355 
2356 		if (err == ENOENT) {
2357 			(void) md_devname(setno, cd.cd_dev, devname,
2358 				sizeof (devname));
2359 		}
2360 
2361 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2362 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2363 
2364 		if (newstate & CS_LAST_ERRED) {
2365 			cmn_err(CE_WARN, "md: %s: %s last erred",
2366 			    md_shortname(md_getminor(sm->sm_dev)),
2367 			    devname);
2368 
2369 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2370 			/*
2371 			 * Close the broken device and clear the open flag on
2372 			 * it.  Closing the device means the RCM framework will
2373 			 * be able to unconfigure the device if required.
2374 			 *
2375 			 * We have to check that the device is open, otherwise
2376 			 * the first open on it has resulted in the error that
2377 			 * is being processed and the actual cd.cd_dev will be
2378 			 * NODEV64.
2379 			 *
2380 			 * If this is a multi-node mirror, then the multinode
2381 			 * state checks following this code will cause the
2382 			 * slave nodes to close the mirror in the function
2383 			 * mirror_set_state().
2384 			 */
2385 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2386 			shared->ms_flags &= ~MDM_S_ISOPEN;
2387 		}
2388 
2389 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2390 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2391 		/*
2392 		 * Similar to logic above except no log messages since we
2393 		 * are just transitioning from Last Erred to Erred.
2394 		 */
2395 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2396 		    "get device", 0);
2397 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2398 
2399 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2400 		shared->ms_flags &= ~MDM_S_ISOPEN;
2401 	}
2402 
2403 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2404 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2405 		/*
2406 		 * For a multi-node mirror, send the state change to the
2407 		 * master, which broadcasts to all nodes, including this
2408 		 * one. Once the message is received, the state is set
2409 		 * in-core and the master commits the change to disk.
2410 		 * There is a case, comp_replace,  where this function
2411 		 * can be called from within an ioctl and therefore in this
2412 		 * case, as the ioctl will already be called on each node,
2413 		 * there is no need to xmit the state change to the master for
2414 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2415 		 * to indicate whether a xmit is required. The mirror's
2416 		 * transient state is set to MD_ERR_PENDING to avoid sending
2417 		 * multiple messages.
2418 		 */
2419 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2420 			ui->ui_tstate |= MD_ERR_PENDING;
2421 
2422 		/*
2423 		 * Send a state update message to all nodes. This message
2424 		 * will generate 2 submessages, the first one to suspend
2425 		 * all writes to the mirror and the second to update the
2426 		 * state and resume writes.
2427 		 */
2428 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2429 		stchmsg.msg_stch_sm = smi;
2430 		stchmsg.msg_stch_comp = ci;
2431 		stchmsg.msg_stch_new_state = newstate;
2432 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2433 #ifdef DEBUG
2434 		if (mirror_debug_flag)
2435 			printf("send set state, %x, %x, %x, %x, %x\n",
2436 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2437 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2438 			    stchmsg.msg_stch_hs_id);
2439 #endif
2440 		if (flags & MD_STATE_WMUPDATE) {
2441 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2442 			/*
2443 			 * When coming from an update of watermarks, there
2444 			 * must already be a message logged that triggered
2445 			 * this action. So, no need to log this message, too.
2446 			 */
2447 			msgflags = MD_MSGF_NO_LOG;
2448 		} else {
2449 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2450 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2451 		}
2452 
2453 		/*
2454 		 * If we are in the context of an ioctl, drop the ioctl lock.
2455 		 * lockp holds the list of locks held.
2456 		 *
2457 		 * Otherwise, increment the appropriate reacquire counters.
2458 		 * If the openclose lock is held, then we must reacquire the reader
2459 		 * lock before releasing the openclose lock.
2460 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2461 		 * to reacquire it.
2462 		 */
2463 		if (lockp) {
2464 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2465 				save_lock = MD_ARRAY_WRITER;
2466 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2467 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2468 				save_lock = MD_ARRAY_READER;
2469 				lockp->l_flags &= ~MD_ARRAY_READER;
2470 			}
2471 			IOLOCK_RETURN_RELEASE(0, lockp);
2472 		} else {
2473 			if (flags & MD_STATE_OCHELD) {
2474 				md_unit_writerexit(ui);
2475 				(void) md_unit_readerlock(ui);
2476 				md_unit_openclose_exit(ui);
2477 			} else {
2478 				md_unit_writerexit(ui);
2479 			}
2480 		}
2481 
2482 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2483 		rval = mdmn_ksend_message(setno,
2484 					msgtype,
2485 					msgflags,
2486 					(char *)&stchmsg,
2487 					sizeof (stchmsg),
2488 					kresult);
2489 
2490 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2491 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2492 			cmn_err(CE_PANIC,
2493 			    "ksend_message failure: STATE_UPDATE");
2494 		}
2495 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2496 
2497 		/* if dropped the lock previously, regain it */
2498 		if (lockp) {
2499 			IOLOCK_RETURN_REACQUIRE(lockp);
2500 			lockp->l_flags |= save_lock;
2501 		} else {
2502 			/*
2503 			 * Reacquire dropped locks and update acquirecnts
2504 			 * appropriately.
2505 			 */
2506 			if (flags & MD_STATE_OCHELD) {
2507 				/*
2508 				 * openclose also grabs readerlock.
2509 				 */
2510 				(void) md_unit_openclose_enter(ui);
2511 				md_unit_readerexit(ui);
2512 				(void) md_unit_writerlock(ui);
2513 			} else {
2514 				(void) md_unit_writerlock(ui);
2515 			}
2516 		}
2517 
2518 		ui->ui_tstate &= ~MD_ERR_PENDING;
2519 	} else {
2520 		shared->ms_state = newstate;
2521 		uniqtime32(&shared->ms_timestamp);
2522 
2523 		if (newstate == CS_ERRED)
2524 			shared->ms_flags |= MDM_S_NOWRITE;
2525 		else
2526 			shared->ms_flags &= ~MDM_S_NOWRITE;
2527 
2528 		shared->ms_flags &= ~MDM_S_IOERR;
2529 		un->un_changecnt++;
2530 		shared->ms_lasterrcnt = un->un_changecnt;
2531 
2532 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2533 		mirror_commit(un, SMI2BIT(smi), extras);
2534 	}
2535 
2536 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2537 		/*
2538 		 * Resetting the Last Erred state will recursively call back
2539 		 * into this function (set_sm_comp_state) to update the state.
2540 		 */
2541 		reset_lasterred(un, smi, extras, flags, lockp);
2542 	}
2543 }
2544 
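/*
 * find_another_logical:
 * --------------------
 * Check whether the block range [blk, blk + cnt) can be read from a
 * submirror other than 'esm'. The given submirror is temporarily marked
 * SMS_IGNORE so that select_read_unit() will not choose it. Returns 0 if
 * another source covers the whole range, non-zero otherwise.
 */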
2545 static int
2546 find_another_logical(
2547 	mm_unit_t		*un,
2548 	mm_submirror_t		*esm,
2549 	diskaddr_t		blk,
2550 	u_longlong_t		cnt,
2551 	int			must_be_open,
2552 	int			state,
2553 	int			err_cnt)
2554 {
2555 	u_longlong_t	cando;
2556 	md_dev64_t	dev;
2557 	md_m_shared_t	*s;
2558 
2559 	esm->sm_state |= SMS_IGNORE;
2560 	while (cnt != 0) {
2561 		u_longlong_t	 mcnt;
2562 
2563 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2564 
2565 		dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s,
2566 			NULL);
2567 		if (dev == (md_dev64_t)0)
2568 			break;
2569 
2570 		if ((state == CS_LAST_ERRED) &&
2571 		    (s->ms_state == CS_LAST_ERRED) &&
2572 		    (err_cnt > s->ms_lasterrcnt))
2573 			break;
2574 
2575 		cnt -= cando;
2576 		blk += cando;
2577 	}
2578 	esm->sm_state &= ~SMS_IGNORE;
2579 	return (cnt != 0);
2580 }
2581 
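/*
 * mirror_other_sources:
 * --------------------
 * Determine whether the data held on component 'ci' of submirror 'smi'
 * (or on every component of the submirror if ci is negative) is also
 * available from another submirror. Returns 0 if other sources exist,
 * 1 if this is the last accessible copy of some part of the data.
 */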
2582 int
2583 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2584 {
2585 	mm_submirror_t		*sm;
2586 	mm_submirror_ic_t	*smic;
2587 	size_t			count;
2588 	diskaddr_t		block;
2589 	u_longlong_t		skip;
2590 	u_longlong_t		size;
2591 	md_dev64_t		dev;
2592 	int			cnt;
2593 	md_m_shared_t		*s;
2594 	int			not_found;
2595 
2596 	sm = &un->un_sm[smi];
2597 	smic = &un->un_smic[smi];
2598 	dev = sm->sm_dev;
2599 
2600 	/*
2601 	 * Make sure every component of the submirror
2602 	 * has other sources.
2603 	 */
2604 	if (ci < 0) {
2605 		/* Find the highest lasterrcnt */
2606 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2607 		for (ci = 0; ci < cnt; ci++) {
2608 			not_found = mirror_other_sources(un, smi, ci,
2609 			    must_be_open);
2610 			if (not_found)
2611 				return (1);
2612 		}
2613 		return (0);
2614 	}
2615 
2616 	/*
2617 	 * Make sure this component has other sources
2618 	 */
2619 	(void) (*(smic->sm_get_bcss))
2620 		(dev, sm, ci, &block, &count, &skip, &size);
2621 
2622 	if (count == 0)
2623 		return (1);
2624 
2625 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2626 
2627 	while (count--) {
2628 		if (block >= un->c.un_total_blocks)
2629 			return (0);
2630 
2631 		if ((block + size) > un->c.un_total_blocks)
2632 			size = un->c.un_total_blocks - block;
2633 
2634 		not_found = find_another_logical(un, sm, block, size,
2635 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2636 		if (not_found)
2637 			return (1);
2638 
2639 		block += size + skip;
2640 	}
2641 	return (0);
2642 }
2643 
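/*
 * finish_error:
 * ------------
 * Complete an errored parent i/o. For a write-after-read (resync) request
 * the error is flagged back to the resync originator. If the mirror
 * configuration changed while the request was outstanding (changecnt
 * mismatch) the i/o is reissued via md_mirror_strategy(); otherwise the
 * buf is returned to the caller with B_ERROR set.
 */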
2644 static void
2645 finish_error(md_mps_t *ps)
2646 {
2647 	struct buf	*pb;
2648 	mm_unit_t	*un;
2649 	mdi_unit_t	*ui;
2650 	uint_t		new_str_flags;
2651 
2652 	pb = ps->ps_bp;
2653 	un = ps->ps_un;
2654 	ui = ps->ps_ui;
2655 
2656 	/*
2657 	 * Must flag any error to the resync originator if we're performing
2658 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2659 	 * target device and in this case we ought to abort the resync as there
2660 	 * is nothing that can be done to recover from this without operator
2661 	 * intervention. If we don't set the B_ERROR flag we will continue
2662 	 * reading from the mirror but won't write to the target (as it will
2663 	 * have been placed into an errored state).
2664 	 * To handle the case of multiple components within a submirror we only
2665 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2666 	 * The originator of the resync read will cause this bit to be set if
2667 	 * the underlying component count is one for a submirror resync. All
2668 	 * other resync types will have the flag set as there is no underlying
2669 	 * resync which can be performed on a contained metadevice for these
2670 	 * resync types (optimized or component).
2671 	 */
2672 
2673 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2674 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2675 			pb->b_flags |= B_ERROR;
2676 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2677 		MPS_FREE(mirror_parent_cache, ps);
2678 		md_unit_readerexit(ui);
2679 		md_biodone(pb);
2680 		return;
2681 	}
2682 	/*
2683 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2684 	 * operation therefore this I/O request has already been counted,
2685 	 * the I/O count variable will be decremented by mirror_done()'s
2686 	 * call to md_biodone().
2687 	 */
2688 	if (ps->ps_changecnt != un->un_changecnt) {
2689 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2690 		if (ps->ps_flags & MD_MPS_WOW)
2691 			new_str_flags |= MD_STR_WOW;
2692 		if (ps->ps_flags & MD_MPS_MAPPED)
2693 			new_str_flags |= MD_STR_MAPPED;
2694 		/*
2695 		 * If this I/O request was a read that was part of a resync,
2696 		 * set MD_STR_WAR for the retried read to ensure that the
2697 		 * resync write (i.e. write-after-read) will be performed
2698 		 */
2699 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2700 			new_str_flags |= MD_STR_WAR;
2701 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2702 		MPS_FREE(mirror_parent_cache, ps);
2703 		md_unit_readerexit(ui);
2704 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2705 		return;
2706 	}
2707 
2708 	pb->b_flags |= B_ERROR;
2709 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2710 	MPS_FREE(mirror_parent_cache, ps);
2711 	md_unit_readerexit(ui);
2712 	md_biodone(pb);
2713 }
2714 
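/*
 * error_update_unit:
 * -----------------
 * Daemon routine that processes component errors for a mirror unit. Each
 * errored component is moved to the Erred state, or to Last Erred if no
 * other source for its data exists, hotspares are poked and the original
 * parent i/o is then completed via finish_error().
 */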
2715 static void
2716 error_update_unit(md_mps_t *ps)
2717 {
2718 	mm_unit_t		*un;
2719 	mdi_unit_t		*ui;
2720 	int			smi;	/* sub mirror index */
2721 	int			ci;	/* errored component */
2722 	set_t			setno;
2723 	uint_t			flags;	/* for set_sm_comp_state() */
2724 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2725 
2726 	ui = ps->ps_ui;
2727 	un = (mm_unit_t *)md_unit_writerlock(ui);
2728 	setno = MD_UN2SET(un);
2729 
2730 	/* All of these updates have to be propagated in case of an MN set */
2731 	flags = MD_STATE_XMIT;
2732 	hspflags = MD_HOTSPARE_XMIT;
2733 
2734 	/* special treatment if we are called during updating watermarks */
2735 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2736 		flags |= MD_STATE_WMUPDATE;
2737 		hspflags |= MD_HOTSPARE_WMUPDATE;
2738 	}
2739 	smi = 0;
2740 	ci = 0;
2741 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2742 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2743 
2744 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2745 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2746 				(IOLOCK *)NULL);
2747 			/*
2748 			 * For a MN set, the NOTIFY is done when the state
2749 			 * change is processed on each node
2750 			 */
2751 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2752 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2753 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2754 			}
2755 			continue;
2756 		}
2757 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2758 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2759 			(IOLOCK *)NULL);
2760 		/*
2761 		 * For a MN set, the NOTIFY is done when the state
2762 		 * change is processed on each node
2763 		 */
2764 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2765 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2766 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2767 		}
2768 		smi = 0;
2769 		ci = 0;
2770 	}
2771 
2772 	md_unit_writerexit(ui);
2773 	if (MD_MNSET_SETNO(setno)) {
2774 		send_poke_hotspares(setno);
2775 	} else {
2776 		(void) poke_hotspares();
2777 	}
2778 	(void) md_unit_readerlock(ui);
2779 
2780 	finish_error(ps);
2781 }
2782 
2783 /*
2784  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2785  * retry the IO without B_FAILFAST set so that we try to ensure that the
2786  * component "sees" each IO.
2787  */
2788 static void
2789 last_err_retry(md_mcs_t *cs)
2790 {
2791 	struct buf	*cb;
2792 	md_mps_t	*ps;
2793 	uint_t		flags;
2794 
2795 	cb = &cs->cs_buf;
2796 	cb->b_flags &= ~B_FAILFAST;
2797 
2798 	/* if we're panicking just let this I/O error out */
2799 	if (panicstr) {
2800 		(void) mirror_done(cb);
2801 		return;
2802 	}
2803 
2804 	/* reissue the I/O */
2805 
2806 	ps = cs->cs_ps;
2807 
2808 	bioerror(cb, 0);
2809 
2810 	mutex_enter(&ps->ps_mx);
2811 
2812 	flags = MD_STR_NOTTOP;
2813 	if (ps->ps_flags & MD_MPS_MAPPED)
2814 		flags |= MD_STR_MAPPED;
2815 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2816 		flags |= MD_NOBLOCK;
2817 
2818 	mutex_exit(&ps->ps_mx);
2819 
2820 	clear_retry_error(cb);
2821 
2822 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2823 		md_shortname(getminor(cb->b_edev)));
2824 
2825 	md_call_strategy(cb, flags, NULL);
2826 }
2827 
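/*
 * mirror_error:
 * ------------
 * Handle an errored parent i/o: remove it from the overlap chain if
 * required and, if a component error needs to be recorded, hand the
 * request to the master daemon (error_update_unit); otherwise complete
 * it via finish_error().
 */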
2828 static void
2829 mirror_error(md_mps_t *ps)
2830 {
2831 	int		smi;	/* sub mirror index */
2832 	int		ci;	/* errored component */
2833 
2834 	if (panicstr) {
2835 		finish_error(ps);
2836 		return;
2837 	}
2838 
2839 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2840 		mirror_overlap_chain_remove(ps);
2841 
2842 	smi = 0;
2843 	ci = 0;
2844 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2845 		md_unit_readerexit(ps->ps_ui);
2846 		daemon_request(&md_mstr_daemon, error_update_unit,
2847 		    (daemon_queue_t *)ps, REQ_OLD);
2848 		return;
2849 	}
2850 
2851 	finish_error(ps);
2852 }
2853 
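/*
 * copy_write_done:
 * ---------------
 * biodone routine for a write-on-write (WOW) child buf. If more of the
 * original request remains and no error has occurred, the next section
 * is scheduled via copy_write_cont(); otherwise the WOW resources are
 * freed and the parent i/o is completed.
 */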
2854 static int
2855 copy_write_done(struct buf *cb)
2856 {
2857 	md_mps_t	*ps;
2858 	buf_t		*pb;
2859 	char		*wowbuf;
2860 	wowhdr_t	*wowhdr;
2861 	ssize_t		wow_resid;
2862 
2863 	/* get wowbuf and save structure */
2864 	wowbuf = cb->b_un.b_addr;
2865 	wowhdr = WOWBUF_HDR(wowbuf);
2866 	ps = wowhdr->wow_ps;
2867 	pb = ps->ps_bp;
2868 
2869 	/* Save error information, then free cb */
2870 	if (cb->b_flags & B_ERROR)
2871 		pb->b_flags |= B_ERROR;
2872 
2873 	if (cb->b_flags & B_REMAPPED)
2874 		bp_mapout(cb);
2875 
2876 	freerbuf(cb);
2877 
2878 	/* update residual and continue if needed */
2879 	if ((pb->b_flags & B_ERROR) == 0) {
2880 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2881 		pb->b_resid = wow_resid;
2882 		if (wow_resid > 0)  {
2883 			daemon_request(&md_mstr_daemon, copy_write_cont,
2884 			    (daemon_queue_t *)wowhdr, REQ_OLD);
2885 			return (1);
2886 		}
2887 	}
2888 
2889 	/* Write is complete, release resources. */
2890 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2891 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2892 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2893 	MPS_FREE(mirror_parent_cache, ps);
2894 	md_biodone(pb);
2895 	return (0);
2896 }
2897 
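/*
 * copy_write_cont:
 * ---------------
 * Copy the next section of the original write into the private WOW buffer
 * and issue it as a new write request via md_mirror_strategy(). Sections
 * are at most md_wowbuf_size bytes; completion is handled by
 * copy_write_done().
 */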
2898 static void
2899 copy_write_cont(wowhdr_t *wowhdr)
2900 {
2901 	buf_t		*pb;
2902 	buf_t		*cb;
2903 	char		*wowbuf;
2904 	int		wow_offset;
2905 	size_t		wow_resid;
2906 	diskaddr_t	wow_blkno;
2907 
2908 	wowbuf = WOWHDR_BUF(wowhdr);
2909 	pb = wowhdr->wow_ps->ps_bp;
2910 
2911 	/* get data on current location */
2912 	wow_offset = wowhdr->wow_offset;
2913 	wow_resid = pb->b_bcount - wow_offset;
2914 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
2915 
2916 	/* setup child buffer */
2917 	cb = getrbuf(KM_SLEEP);
2918 	cb->b_flags = B_WRITE;
2919 	cb->b_edev = pb->b_edev;
2920 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
2921 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
2922 	cb->b_iodone = copy_write_done;
2923 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
2924 	cb->b_lblkno = wow_blkno;
2925 
2926 	/* move offset to next section */
2927 	wowhdr->wow_offset += cb->b_bcount;
2928 
2929 	/* copy and setup write for current section */
2930 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
2931 
2932 	/* do it */
2933 	/*
2934 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
2935 	 * that handles the WOW condition. The resultant increment on the
2936 	 * I/O count variable is cleared by copy_write_done()'s call to
2937 	 * md_biodone().
2938 	 */
2939 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
2940 				    | MD_STR_MAPPED, NULL);
2941 }
2942 
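/*
 * md_mirror_copy_write:
 * --------------------
 * Start a write-on-write copy sequence for the given parent request by
 * allocating a WOW header and kicking off the first section copy.
 */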
2943 static void
2944 md_mirror_copy_write(md_mps_t *ps)
2945 {
2946 	wowhdr_t	*wowhdr;
2947 
2948 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
2949 	mirror_wowblk_init(wowhdr);
2950 	wowhdr->wow_ps = ps;
2951 	wowhdr->wow_offset = 0;
2952 	copy_write_cont(wowhdr);
2953 }
2954 
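/*
 * handle_wow:
 * ----------
 * Deal with a write-on-write (WOW) condition detected for a parent write.
 * Depending on md_mirror_wow_flg the request is either reissued directly
 * (WOW_NOCOPY) or the data is first copied into a stable buffer by
 * md_mirror_copy_write().
 */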
2955 static void
2956 handle_wow(md_mps_t *ps)
2957 {
2958 	buf_t		*pb;
2959 
2960 	pb = ps->ps_bp;
2961 
2962 	bp_mapin(pb);
2963 
2964 	md_mirror_wow_cnt++;
2965 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
2966 		cmn_err(CE_NOTE,
2967 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
2968 		    md_shortname(getminor(pb->b_edev)),
2969 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
2970 	}
2971 
2972 	/*
2973 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2974 	 * operation therefore this I/O request has already been counted,
2975 	 * the I/O count variable will be decremented by mirror_done()'s
2976 	 * call to md_biodone().
2977 	 */
2978 	if (md_mirror_wow_flg & WOW_NOCOPY)
2979 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
2980 					    MD_STR_MAPPED | MD_IO_COUNTED, ps);
2981 	else
2982 		md_mirror_copy_write(ps);
2983 }
2984 
2985 /*
2986  * Return true if the specified submirror is either in the Last Erred
2987  * state or is transitioning into the Last Erred state.
2988  */
2989 static bool_t
2990 submirror_is_lasterred(mm_unit_t *un, int smi)
2991 {
2992 	mm_submirror_t		*sm;
2993 	mm_submirror_ic_t	*smic;
2994 	md_m_shared_t		*shared;
2995 	int			ci;
2996 	int			compcnt;
2997 
2998 	sm = &un->un_sm[smi];
2999 	smic = &un->un_smic[smi];
3000 
3001 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3002 	for (ci = 0; ci < compcnt; ci++) {
3003 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3004 		    (sm->sm_dev, sm, ci);
3005 
3006 		if (shared->ms_state == CS_LAST_ERRED)
3007 			return (B_TRUE);
3008 
3009 		/*
3010 		 * It is not currently Last Erred, check if entering Last Erred.
3011 		 */
3012 		if ((shared->ms_flags & MDM_S_IOERR) &&
3013 		    ((shared->ms_state == CS_OKAY) ||
3014 		    (shared->ms_state == CS_RESYNC))) {
3015 			if (mirror_other_sources(un, smi, ci, 0) == 1)
3016 				return (B_TRUE);
3017 		}
3018 	}
3019 
3020 	return (B_FALSE);
3021 }
3022 
3023 
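/*
 * mirror_done:
 * -----------
 * biodone routine for a mirror child buf. If a B_FAILFAST i/o failed on a
 * submirror that is (or is becoming) Last Erred, the i/o is retried
 * without B_FAILFAST via last_err_retry(); otherwise any error is
 * recorded in the parent and the common completion path is taken.
 */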
3024 static int
3025 mirror_done(struct buf *cb)
3026 {
3027 	md_mps_t	*ps;
3028 	md_mcs_t	*cs;
3029 
3030 	/*LINTED*/
3031 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3032 	ps = cs->cs_ps;
3033 
3034 	mutex_enter(&ps->ps_mx);
3035 
3036 	/* check if we need to retry an errored failfast I/O */
3037 	if (cb->b_flags & B_ERROR) {
3038 		struct buf *pb = ps->ps_bp;
3039 
3040 		if (cb->b_flags & B_FAILFAST) {
3041 			int		i;
3042 			mm_unit_t	*un = ps->ps_un;
3043 
3044 			for (i = 0; i < NMIRROR; i++) {
3045 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3046 					continue;
3047 
3048 				if (cb->b_edev ==
3049 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3050 
3051 					/*
3052 					 * This is the submirror that had the
3053 					 * error.  Check if it is Last Erred.
3054 					 */
3055 					if (submirror_is_lasterred(un, i)) {
3056 						daemon_queue_t *dqp;
3057 
3058 						mutex_exit(&ps->ps_mx);
3059 						dqp = (daemon_queue_t *)cs;
3060 						dqp->dq_prev = NULL;
3061 						dqp->dq_next = NULL;
3062 						daemon_request(&md_done_daemon,
3063 						    last_err_retry, dqp,
3064 						    REQ_OLD);
3065 						return (1);
3066 					}
3067 					break;
3068 				}
3069 			}
3070 		}
3071 
3072 		/* continue to process the buf without doing a retry */
3073 		ps->ps_flags |= MD_MPS_ERROR;
3074 		pb->b_error = cb->b_error;
3075 	}
3076 
3077 	return (mirror_done_common(cb));
3078 }
3079 
3080 /*
3081  * Split from the original mirror_done function so we can handle bufs after a
3082  * retry.
3083  * ps->ps_mx is already held in the caller of this function and the cb error
3084  * has already been checked and handled in the caller.
3085  */
3086 static int
3087 mirror_done_common(struct buf *cb)
3088 {
3089 	struct buf	*pb;
3090 	mm_unit_t	*un;
3091 	mdi_unit_t	*ui;
3092 	md_mps_t	*ps;
3093 	md_mcs_t	*cs;
3094 	size_t		end_rr, start_rr, current_rr;
3095 
3096 	/*LINTED*/
3097 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3098 	ps = cs->cs_ps;
3099 	pb = ps->ps_bp;
3100 
3101 	if (cb->b_flags & B_REMAPPED)
3102 		bp_mapout(cb);
3103 
3104 	ps->ps_frags--;
3105 	if (ps->ps_frags != 0) {
3106 		mutex_exit(&ps->ps_mx);
3107 		kmem_cache_free(mirror_child_cache, cs);
3108 		return (1);
3109 	}
3110 	un = ps->ps_un;
3111 	ui = ps->ps_ui;
3112 
3113 	/*
3114 	 * Do not update outstanding_writes if we're running with ABR
3115 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3116 	 * Also a resync initiated write() has no outstanding_writes update
3117 	 * either.
3118 	 */
3119 	if (((cb->b_flags & B_READ) == 0) &&
3120 	    (un->un_nsm >= 2) &&
3121 	    (ps->ps_call == NULL) &&
3122 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3123 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3124 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3125 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3126 		mutex_enter(&un->un_resync_mx);
3127 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3128 			un->un_outstanding_writes[current_rr]--;
3129 		mutex_exit(&un->un_resync_mx);
3130 	}
3131 	kmem_cache_free(mirror_child_cache, cs);
3132 	mutex_exit(&ps->ps_mx);
3133 
3134 	if (ps->ps_call != NULL) {
3135 		daemon_request(&md_done_daemon, ps->ps_call,
3136 		    (daemon_queue_t *)ps, REQ_OLD);
3137 		return (1);
3138 	}
3139 
3140 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3141 		daemon_request(&md_done_daemon, mirror_error,
3142 		    (daemon_queue_t *)ps, REQ_OLD);
3143 		return (1);
3144 	}
3145 
3146 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3147 		mirror_overlap_chain_remove(ps);
3148 
3149 	/*
3150 	 * Handle Write-on-Write problem.
3151 	 * Skip in the case of Raw and Direct I/O as they are
3152 	 * handled earlier.
3153 	 *
3154 	 */
3155 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3156 	    !(pb->b_flags & B_READ) &&
3157 	    !(ps->ps_flags & MD_MPS_WOW) &&
3158 	    !(pb->b_flags & B_PHYS) &&
3159 	    any_pages_dirty(pb)) {
3160 		md_unit_readerexit(ps->ps_ui);
3161 		daemon_request(&md_mstr_daemon, handle_wow,
3162 		    (daemon_queue_t *)ps, REQ_OLD);
3163 		return (1);
3164 	}
3165 
3166 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3167 	MPS_FREE(mirror_parent_cache, ps);
3168 	md_unit_readerexit(ui);
3169 	md_biodone(pb);
3170 	return (0);
3171 }
3172 
3173 /*
3174  * Clear error state in submirror component if the retry worked after
3175  * a failfast error.
3176  */
3177 static void
3178 clear_retry_error(struct buf *cb)
3179 {
3180 	int			smi;
3181 	md_mcs_t		*cs;
3182 	mm_unit_t		*un;
3183 	mdi_unit_t		*ui_sm;
3184 	mm_submirror_t		*sm;
3185 	mm_submirror_ic_t	*smic;
3186 	u_longlong_t		cnt;
3187 	md_m_shared_t		*shared;
3188 
3189 	/*LINTED*/
3190 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3191 	un = cs->cs_ps->ps_un;
3192 
3193 	for (smi = 0; smi < NMIRROR; smi++) {
3194 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3195 			continue;
3196 
3197 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) {
3198 			break;
3199 		}
3200 	}
3201 
3202 	if (smi >= NMIRROR)
3203 		return;
3204 
3205 	sm = &un->un_sm[smi];
3206 	smic = &un->un_smic[smi];
3207 	cnt = cb->b_bcount;
3208 
3209 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3210 	(void) md_unit_writerlock(ui_sm);
3211 
3212 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3213 	    cb->b_blkno, &cnt);
3214 
3215 	if (shared->ms_flags & MDM_S_IOERR) {
3216 		shared->ms_flags &= ~MDM_S_IOERR;
3217 
3218 	} else {
3219 		/* the I/O buf spans components and the first one is not erred */
3220 		int	cnt;
3221 		int	i;
3222 
3223 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3224 		for (i = 0; i < cnt; i++) {
3225 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3226 			    (sm->sm_dev, sm, i);
3227 
3228 			if ((shared->ms_flags & MDM_S_IOERR) &&
3229 			    (shared->ms_state == CS_OKAY)) {
3230 
3231 				shared->ms_flags &= ~MDM_S_IOERR;
3232 				break;
3233 			}
3234 		}
3235 	}
3236 
3237 	md_unit_writerexit(ui_sm);
3238 }
3239 
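/*
 * mirror_map_read:
 * ---------------
 * Set up the child buf for a mirror read by selecting a submirror to read
 * from. Returns 0 if the chosen submirror covers the whole request, or
 * the number of blocks that could be mapped if the request must be split
 * into further fragments.
 */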
3240 static size_t
3241 mirror_map_read(
3242 	md_mps_t *ps,
3243 	md_mcs_t *cs,
3244 	diskaddr_t blkno,
3245 	u_longlong_t	count
3246 )
3247 {
3248 	mm_unit_t	*un;
3249 	buf_t		*bp;
3250 	u_longlong_t	cando;
3251 
3252 	bp = &cs->cs_buf;
3253 	un = ps->ps_un;
3254 
3255 	bp->b_lblkno = blkno;
3256 	if (fast_select_read_unit(ps, cs) == 0) {
3257 		bp->b_bcount = ldbtob(count);
3258 		return (0);
3259 	}
3260 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando,
3261 							0, NULL, cs));
3262 	bp->b_bcount = ldbtob(cando);
3263 	if (count != cando)
3264 		return (cando);
3265 	return (0);
3266 }
3267 
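/*
 * write_after_read:
 * ----------------
 * ps_call routine invoked when the read phase of a write-after-read
 * (resync) request completes. On error the normal error path is taken;
 * otherwise the parent buf is resubmitted as a write (MD_STR_WAR) to
 * propagate the data to the resync target.
 */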
3268 static void
3269 write_after_read(md_mps_t *ps)
3270 {
3271 	struct buf	*pb;
3272 	int		flags;
3273 
3274 	if (ps->ps_flags & MD_MPS_ERROR) {
3275 		mirror_error(ps);
3276 		return;
3277 	}
3278 
3279 	pb = ps->ps_bp;
3280 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3281 	ps->ps_call = NULL;
3282 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3283 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3284 	if (ps->ps_flags & MD_MPS_MAPPED)
3285 		flags |= MD_STR_MAPPED;
3286 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3287 		flags |= MD_NOBLOCK;
3288 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3289 		flags |= MD_STR_DIRTY_RD;
3290 	(void) mirror_write_strategy(pb, flags, ps);
3291 }
3292 
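/*
 * continue_serial:
 * ---------------
 * ps_call routine used when the mirror write option is WR_SERIAL. Issues
 * the write to the next submirror once the previous submirror write has
 * completed.
 */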
3293 static void
3294 continue_serial(md_mps_t *ps)
3295 {
3296 	md_mcs_t	*cs;
3297 	buf_t		*cb;
3298 	mm_unit_t	*un;
3299 	int		flags;
3300 
3301 	un = ps->ps_un;
3302 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3303 	mirror_child_init(cs);
3304 	cb = &cs->cs_buf;
3305 	ps->ps_call = NULL;
3306 	ps->ps_frags = 1;
3307 	(void) mirror_map_write(un, cs, ps, 0);
3308 	flags = MD_STR_NOTTOP;
3309 	if (ps->ps_flags & MD_MPS_MAPPED)
3310 		flags |= MD_STR_MAPPED;
3311 	md_call_strategy(cb, flags, NULL);
3312 }
3313 
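/*
 * mirror_map_write:
 * ----------------
 * Clone the parent buf onto the next writable submirror. For a
 * write-after-read (war) to block 0 of a labeled mirror the label area is
 * skipped. B_FAILFAST is set unless a component of the submirror is in
 * the Last Erred state. Returns 1 if further submirrors remain to be
 * written in parallel, 0 otherwise, and -1 if the request lies entirely
 * within the label area.
 */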
3314 static int
3315 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3316 {
3317 	int i;
3318 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3319 	buf_t		*cb;
3320 	buf_t		*pb;
3321 	diskaddr_t	blkno;
3322 	size_t		bcount;
3323 	off_t		offset;
3324 
3325 	pb = ps->ps_bp;
3326 	cb = &cs->cs_buf;
3327 	cs->cs_ps = ps;
3328 
3329 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3330 
3331 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3332 
3333 	blkno = pb->b_lblkno;
3334 	bcount = pb->b_bcount;
3335 	offset = 0;
3336 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3337 		blkno = DK_LABEL_LOC + 1;
3338 		/*
3339 		 * This handles the case where we're requesting
3340 		 * a write to block 0 on a label partition
3341 		 * and the request size was smaller than the
3342 		 * size of the label.  If this is the case
3343 		 * then we'll return -1.  Failure to do so will
3344 		 * either cause the calling thread to hang due to
3345 		 * an ssd bug, or worse if the bcount were allowed
3346 		 * to go negative (ie large).
3347 		 */
3348 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3349 			return (-1);
3350 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3351 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3352 	}
3353 
3354 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3355 	    cb, KM_NOSLEEP);
3356 	if (war)
3357 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3358 
3359 	/*
3360 	 * If the submirror is in the erred state, check if any component is
3361 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3362 	 * flag on the IO.
3363 	 *
3364 	 * Provide a fast path for the non-erred case (which should be the
3365 	 * normal case).
3366 	 */
3367 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3368 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3369 			mm_submirror_t		*sm;
3370 			mm_submirror_ic_t	*smic;
3371 			int			ci;
3372 			int			compcnt;
3373 
3374 			sm = &un->un_sm[i];
3375 			smic = &un->un_smic[i];
3376 
3377 			compcnt = (*(smic->sm_get_component_count))
3378 			    (sm->sm_dev, un);
3379 			for (ci = 0; ci < compcnt; ci++) {
3380 				md_m_shared_t	*shared;
3381 
3382 				shared = (md_m_shared_t *)
3383 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3384 				    sm, ci);
3385 
3386 				if (shared->ms_state == CS_LAST_ERRED)
3387 					break;
3388 			}
3389 			if (ci >= compcnt)
3390 				cb->b_flags |= B_FAILFAST;
3391 
3392 		} else {
3393 			cb->b_flags |= B_FAILFAST;
3394 		}
3395 	}
3396 
3397 	ps->ps_current_sm++;
3398 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3399 		if (un->un_write_option == WR_SERIAL) {
3400 			ps->ps_call = continue_serial;
3401 			return (0);
3402 		}
3403 		return (1);
3404 	}
3405 	return (0);
3406 }
3407 
3408 /*
3409  * directed_read_done:
3410  * ------------------
3411  * Completion routine called when a DMR request has been returned from the
3412  * underlying driver. Wake-up the original ioctl() and return the data to
3413  * the user.
3414  */
3415 static void
3416 directed_read_done(md_mps_t *ps)
3417 {
3418 	mm_unit_t	*un;
3419 	mdi_unit_t	*ui;
3420 
3421 	un = ps->ps_un;
3422 	ui = ps->ps_ui;
3423 
3424 	md_unit_readerexit(ui);
3425 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3426 	ps->ps_call = NULL;
3427 
3428 	mutex_enter(&un->un_dmr_mx);
3429 	cv_signal(&un->un_dmr_cv);
3430 	mutex_exit(&un->un_dmr_mx);
3431 
3432 	/* release the parent structure */
3433 	kmem_cache_free(mirror_parent_cache, ps);
3434 }
3435 
3436 /*
3437  * daemon_io:
3438  * ------------
3439  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3440  * call from a blockable context. NOTE: no mutex can be held on entry to this
3441  * Called to issue a mirror_write_strategy() or mirror_read_strategy()
3442  */
3443 static void
3444 daemon_io(daemon_queue_t *dq)
3445 {
3446 	md_mps_t	*ps = (md_mps_t *)dq;
3447 	int		flag = MD_STR_NOTTOP;
3448 	buf_t		*pb = ps->ps_bp;
3449 
3450 	if (ps->ps_flags & MD_MPS_MAPPED)
3451 		flag |= MD_STR_MAPPED;
3452 	if (ps->ps_flags & MD_MPS_WOW)
3453 		flag |= MD_STR_WOW;
3454 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3455 		flag |= MD_STR_WAR;
3456 	if (ps->ps_flags & MD_MPS_ABR)
3457 		flag |= MD_STR_ABR;
3458 
3459 	/*
3460 	 * If this is a resync read (i.e. MD_STR_DIRTY_RD is not set), set
3461 	 * MD_STR_WAR before calling mirror_read_strategy.
3462 	 */
3463 	if (pb->b_flags & B_READ) {
3464 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3465 			flag |= MD_STR_WAR;
3466 		mirror_read_strategy(pb, flag, ps);
3467 	} else
3468 		mirror_write_strategy(pb, flag, ps);
3469 }
3470 
3471 /*
3472  * update_resync:
3473  * -------------
3474  * Called to update the in-core version of the resync record with the latest
3475  * version that was committed to disk when the previous mirror owner
3476  * relinquished ownership. This call is likely to block as we must hold-off
3477  * any current resync processing that may be occurring.
3478  * On completion of the resync record update we issue the mirror_write_strategy
3479  * call to complete the i/o that first started this sequence. To remove a race
3480  * condition between a new write() request which is submitted and the resync
3481  * record update we acquire the writerlock. This will hold off all i/o to the
3482  * mirror until the resync update has completed.
3483  * NOTE: no mutex can be held on entry to this routine
3484  */
3485 static void
3486 update_resync(daemon_queue_t *dq)
3487 {
3488 	md_mps_t	*ps = (md_mps_t *)dq;
3489 	buf_t		*pb = ps->ps_bp;
3490 	mdi_unit_t	*ui = ps->ps_ui;
3491 	mm_unit_t	*un;
3492 	set_t		setno;
3493 	int		restart_resync;
3494 
3495 	un = md_unit_writerlock(ui);
3496 	ps->ps_un = un;
3497 	setno = MD_MIN2SET(getminor(pb->b_edev));
3498 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3499 		/*
3500 		 * Synchronize our in-core view of what regions need to be
3501 		 * resync'd with the on-disk version.
3502 		 */
3503 		mutex_enter(&un->un_rrp_inflight_mx);
3504 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3505 		    un->un_dirty_bm);
3506 		mutex_exit(&un->un_rrp_inflight_mx);
3507 
3508 		/* Region dirty map is now up to date */
3509 	}
3510 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3511 	md_unit_writerexit(ui);
3512 
3513 	/* Restart the resync thread if it was previously blocked */
3514 	if (restart_resync) {
3515 		mutex_enter(&un->un_rs_thread_mx);
3516 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3517 		cv_signal(&un->un_rs_thread_cv);
3518 		mutex_exit(&un->un_rs_thread_mx);
3519 	}
3520 	/* Continue with original deferred i/o */
3521 	daemon_io(dq);
3522 }
3523 
3524 /*
3525  * owner_timeout:
3526  * -------------
3527  * Called if the original mdmn_ksend_message() failed and the request is to be
3528  * retried. Reattempt the original ownership change.
3529  *
3530  * NOTE: called at interrupt context (see timeout(9f)).
3531  */
3532 static void
3533 owner_timeout(void *arg)
3534 {
3535 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3536 
3537 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3538 }
3539 
3540 /*
3541  * become_owner:
3542  * ------------
3543  * Called to issue RPC request to become the owner of the mirror
3544  * associated with this i/o request. We assume that the ownership request
3545  * is synchronous, so if it succeeds we will issue the request via
3546  * mirror_write_strategy().
3547  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3548  * service thread.
3549  * NOTE: no mutex should be held on entry to this routine.
3550  */
3551 static void
3552 become_owner(daemon_queue_t *dq)
3553 {
3554 	md_mps_t	*ps = (md_mps_t *)dq;
3555 	mm_unit_t	*un = ps->ps_un;
3556 	buf_t		*pb = ps->ps_bp;
3557 	set_t		setno;
3558 	md_mn_kresult_t	*kres;
3559 	int		msg_flags = md_mirror_msg_flags;
3560 	md_mps_t	*ps1;
3561 
3562 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3563 
3564 	/*
3565 	 * If we're already the mirror owner we do not need to send a message
3566 	 * but can simply process the i/o request immediately.
3567 	 * If we've already sent the request to become owner we requeue the
3568 	 * request as we're waiting for the synchronous ownership message to
3569 	 * be processed.
3570 	 */
3571 	if (MD_MN_MIRROR_OWNER(un)) {
3572 		/*
3573 		 * As the strategy() call will potentially block we need to
3574 		 * punt this to a separate thread and complete this request
3575 		 * as quickly as possible. Note: if we're a read request
3576 		 * this must be a resync, we cannot afford to be queued
3577 		 * behind any intervening i/o requests. In this case we put the
3578 		 * request on the md_mirror_rs_daemon queue.
3579 		 */
3580 		if (pb->b_flags & B_READ) {
3581 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3582 			    REQ_OLD);
3583 		} else {
3584 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3585 			    REQ_OLD);
3586 		}
3587 	} else {
3588 		mutex_enter(&un->un_owner_mx);
3589 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3590 			md_mn_req_owner_t	*msg;
3591 			int			rval = 0;
3592 
3593 			/*
3594 			 * Check to see that we haven't exceeded the maximum
3595 			 * retry count. If we have we fail the i/o as the
3596 			 * comms mechanism has become wedged beyond recovery.
3597 			 */
3598 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3599 				mutex_exit(&un->un_owner_mx);
3600 				cmn_err(CE_WARN,
3601 				    "md_mirror: Request exhausted ownership "
3602 				    "retry limit of %d attempts", dq->qlen);
3603 				pb->b_error = EIO;
3604 				pb->b_flags |= B_ERROR;
3605 				pb->b_resid = pb->b_bcount;
3606 				kmem_cache_free(mirror_parent_cache, ps);
3607 				md_biodone(pb);
3608 				return;
3609 			}
3610 
3611 			/*
3612 			 * Issue request to change ownership. The call is
3613 			 * synchronous so when it returns we can complete the
3614 			 * i/o (if successful), or enqueue it again so that
3615 			 * the operation will be retried.
3616 			 */
3617 			un->un_owner_state |= MM_MN_OWNER_SENT;
3618 			mutex_exit(&un->un_owner_mx);
3619 
3620 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3621 			setno = MD_MIN2SET(getminor(pb->b_edev));
3622 			msg->mnum = MD_SID(un);
3623 			msg->owner = md_mn_mynode_id;
3624 			msg_flags |= MD_MSGF_NO_LOG;
3625 			/*
3626 			 * If this IO is triggered by updating a watermark,
3627 			 * it might be issued by the creation of a softpartition
3628 			 * while the commd subsystem is suspended.
3629 			 * We don't want this message to block.
3630 			 */
3631 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3632 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3633 			}
3634 
3635 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3636 			rval = mdmn_ksend_message(setno,
3637 						MD_MN_MSG_REQUIRE_OWNER,
3638 						msg_flags, /* flags */
3639 						(char *)msg,
3640 						sizeof (md_mn_req_owner_t),
3641 						kres);
3642 
3643 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3644 
3645 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3646 				dq->qlen = 0;
3647 				/*
3648 				 * Successfully changed owner, reread the
3649 				 * resync record so that we have a valid idea of
3650 				 * any previously committed incomplete write()s.
3651 				 * NOTE: As we need to acquire the resync mutex
3652 				 * this may block, so we defer it to a separate
3653 				 * thread handler. This makes us (effectively)
3654 				 * non-blocking once the ownership message
3655 				 * handling has completed.
3656 				 */
3657 				mutex_enter(&un->un_owner_mx);
3658 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3659 					un->un_mirror_owner = md_mn_mynode_id;
3660 					/* Sets owner of un_rr_dirty record */
3661 					if (un->un_rr_dirty_recid)
3662 						(void) mddb_setowner(
3663 						    un->un_rr_dirty_recid,
3664 						    md_mn_mynode_id);
3665 					un->un_owner_state &=
3666 					    ~MM_MN_BECOME_OWNER;
3667 					/*
3668 					 * Release the block on the current
3669 					 * resync region if it is blocked
3670 					 */
3671 					ps1 = un->un_rs_prev_ovrlap;
3672 					if ((ps1 != NULL) &&
3673 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3674 						mirror_overlap_chain_remove(
3675 						    ps1);
3676 					mutex_exit(&un->un_owner_mx);
3677 
3678 					/*
3679 					 * If we're a read, this must be a
3680 					 * resync request, issue
3681 					 * the i/o request on the
3682 					 * md_mirror_rs_daemon queue. This is
3683 					 * to avoid a deadlock between the
3684 					 * resync_unit thread and
3685 					 * subsequent i/o requests that may
3686 					 * block on the resync region.
3687 					 */
3688 					if (pb->b_flags & B_READ) {
3689 						daemon_request(
3690 						    &md_mirror_rs_daemon,
3691 						    update_resync, dq, REQ_OLD);
3692 					} else {
3693 						daemon_request(
3694 						    &md_mirror_io_daemon,
3695 						    update_resync, dq, REQ_OLD);
3696 					}
3697 					kmem_free(kres,
3698 					    sizeof (md_mn_kresult_t));
3699 					return;
3700 				} else {
3701 					/*
3702 					 * Some other node has beaten us to
3703 					 * obtain ownership. We need to
3704 					 * reschedule our ownership request
3705 					 */
3706 					mutex_exit(&un->un_owner_mx);
3707 				}
3708 			} else {
3709 				mdmn_ksend_show_error(rval, kres,
3710 				    "MD_MN_MSG_REQUIRE_OWNER");
3711 				/*
3712 				 * Message transport failure is handled by the
3713 				 * comms layer. If the ownership change request
3714 				 * does not succeed we need to flag the error to
3715 				 * the initiator of the i/o. This is handled by
3716 				 * the retry logic above. As the request failed
3717 				 * we do not know _who_ the owner of the mirror
3718 				 * currently is. We reset our idea of the owner
3719 				 * to None so that any further write()s will
3720 				 * attempt to become the owner again. This stops
3721 				 * multiple nodes writing to the same mirror
3722 				 * simultaneously.
3723 				 */
3724 				mutex_enter(&un->un_owner_mx);
3725 				un->un_owner_state &=
3726 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3727 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3728 				mutex_exit(&un->un_owner_mx);
3729 			}
3730 			kmem_free(kres, sizeof (md_mn_kresult_t));
3731 		} else
3732 			mutex_exit(&un->un_owner_mx);
3733 
3734 		/*
3735 		 * Re-enqueue this request on the deferred i/o list. Delay the
3736 		 * request for md_mirror_owner_to usecs to stop thrashing.
3737 		 */
3738 		(void) timeout(owner_timeout, dq,
3739 		    drv_usectohz(md_mirror_owner_to));
3740 	}
3741 }
3742 
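/*
 * mirror_write_strategy:
 * ---------------------
 * Top-level write path for a mirror. Waits for any suspended-write window
 * (MN state change) to clear, allocates or reuses a parent save structure,
 * selects the submirrors to write to, handles resync-region marking and
 * multi-owner (MN) ownership requirements, and then fans the write out to
 * one child buf per fragment via mirror_map_write()/md_call_strategy().
 */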
3743 static void
3744 mirror_write_strategy(buf_t *pb, int flag, void *private)
3745 {
3746 	md_mps_t	*ps;
3747 	md_mcs_t	*cs;
3748 	int		more;
3749 	mm_unit_t	*un;
3750 	mdi_unit_t	*ui;
3751 	buf_t		*cb;		/* child buf pointer */
3752 	set_t		setno;
3753 	int		rs_on_overlap = 0;
3754 
3755 	ui = MDI_UNIT(getminor(pb->b_edev));
3756 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3757 
3758 
3759 	md_kstat_waitq_enter(ui);
3760 
3761 	/*
3762 	 * If a state change is in progress for this mirror in a MN set,
3763 	 * suspend all non-resync writes until the state change is complete.
3764 	 * The objective of this suspend is to ensure that one node cannot
3765 	 * read data from a submirror that another node has not yet written
3766 	 * because of the state change. As it is not possible to read from
3767 	 * the target of a resync, there is no need to suspend resync writes.
3770 	 */
3771 
3772 	if (!(flag & MD_STR_WAR)) {
3773 		mutex_enter(&un->un_suspend_wr_mx);
3774 		while (un->un_suspend_wr_flag) {
3775 			cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx);
3776 		}
3777 		mutex_exit(&un->un_suspend_wr_mx);
3778 		(void) md_unit_readerlock(ui);
3779 	}
3780 
3781 	if (!(flag & MD_STR_NOTTOP)) {
3782 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3783 			md_kstat_waitq_exit(ui);
3784 			return;
3785 		}
3786 	}
3787 
3788 	setno = MD_MIN2SET(getminor(pb->b_edev));
3789 
3790 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3791 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3792 		flag |= MD_STR_ABR;
3793 
3794 	if (private == NULL) {
3795 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3796 		mirror_parent_init(ps);
3797 	} else {
3798 		ps = private;
3799 		private = NULL;
3800 	}
3801 	if (flag & MD_STR_MAPPED)
3802 		ps->ps_flags |= MD_MPS_MAPPED;
3803 
3804 	if (flag & MD_STR_WOW)
3805 		ps->ps_flags |= MD_MPS_WOW;
3806 
3807 	if (flag & MD_STR_ABR)
3808 		ps->ps_flags |= MD_MPS_ABR;
3809 
3810 	if (flag & MD_STR_WMUPDATE)
3811 		ps->ps_flags |= MD_MPS_WMUPDATE;
3812 
3813 	/*
3814 	 * Save essential information from the original buf header
3815 	 * in the parent save (md_mps_t) structure.
3816 	 */
3817 	ps->ps_un = un;
3818 	ps->ps_ui = ui;
3819 	ps->ps_bp = pb;
3820 	ps->ps_addr = pb->b_un.b_addr;
3821 	ps->ps_firstblk = pb->b_lblkno;
3822 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3823 	ps->ps_changecnt = un->un_changecnt;
3824 
3825 	/*
3826 	 * If not MN owner and this is an ABR write, make sure the current
3827 	 * resync region is on the overlaps chain
3828 	 */
3829 	mutex_enter(&un->un_owner_mx);
3830 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3831 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3832 		md_mps_t	*ps1;
3833 		/* Block the current resync region, if not already blocked */
3834 		ps1 = un->un_rs_prev_ovrlap;
3835 
3836 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3837 		    (ps1->ps_lastblk != 0))) {
3838 			/* Drop locks to avoid deadlock */
3839 			mutex_exit(&un->un_owner_mx);
3840 			md_unit_readerexit(ui);
3841 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3842 			rs_on_overlap = 1;
3843 			(void) md_unit_readerlock(ui);
3844 			mutex_enter(&un->un_owner_mx);
3845 			/*
3846 			 * Check to see if we have obtained ownership
3847 			 * while waiting for overlaps. If we have, remove
3848 			 * the resync_region entry from the overlap chain
3849 			 */
3850 			if (MD_MN_MIRROR_OWNER(un) &&
3851 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3852 				mirror_overlap_chain_remove(ps1);
3853 				rs_on_overlap = 0;
3854 			}
3855 		}
3856 	}
3857 	mutex_exit(&un->un_owner_mx);
3858 
3859 
3860 	/*
3861 	 * The following keeps a write-after-read from writing back to the
3862 	 * source in the case where it all came from one place.
3863 	 */
3864 	if (flag & MD_STR_WAR) {
3865 		int	abort_write = 0;
3866 		/*
3867 		 * We are performing a write-after-read. This is either the
3868 		 * result of a resync read or of a read in a dirty resync
3869 		 * region when the optimized resync is not complete. If this
3870 		 * is a MN set and a resync-generated i/o, and the current
3871 		 * block is not in the current resync region, terminate the
3872 		 * write as another node must have completed this resync
3873 		 * region.
3874 		 */
3875 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3876 		    !(flag & MD_STR_DIRTY_RD)) {
3877 			if (!IN_RESYNC_REGION(un, ps))
3878 				abort_write = 1;
3879 		}
3880 		if ((select_write_after_read_units(un, ps) == 0) ||
3881 		    (abort_write)) {
3882 #ifdef DEBUG
3883 			if (mirror_debug_flag)
3884 				printf("Abort resync write on %x, block %lld\n",
3885 				    MD_SID(un), ps->ps_firstblk);
3886 #endif
3887 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3888 				mirror_overlap_chain_remove(ps);
3889 			kmem_cache_free(mirror_parent_cache, ps);
3890 			md_kstat_waitq_exit(ui);
3891 			md_unit_readerexit(ui);
3892 			md_biodone(pb);
3893 			return;
3894 		}
3895 	} else {
3896 		select_write_units(un, ps);
3897 
3898 		/* Drop readerlock to avoid deadlock */
3899 		md_unit_readerexit(ui);
3900 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3901 		un = md_unit_readerlock(ui);
3902 		/*
3903 		 * For a MN set with an ABR write, if we are now the
3904 		 * owner and we have a resync region on the overlap
3905 		 * chain, remove the entry from overlaps and retry the write.
3906 		 */
3907 
3908 		if (MD_MNSET_SETNO(setno) &&
3909 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3910 			mutex_enter(&un->un_owner_mx);
3911 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3912 				mirror_overlap_chain_remove(ps);
3913 				md_kstat_waitq_exit(ui);
3914 				mutex_exit(&un->un_owner_mx);
3915 				md_unit_readerexit(ui);
3916 				daemon_request(&md_mirror_daemon, daemon_io,
3917 				    (daemon_queue_t *)ps, REQ_OLD);
3918 				return;
3919 			}
3920 			mutex_exit(&un->un_owner_mx);
3921 		}
3922 	}
3923 
3924 	/*
3925 	 * For Multinode mirrors with a Resync Region (not ABR) we need to
3926 	 * become the mirror owner before continuing with the write(). For ABR
3927 	 * mirrors we check that we 'own' the resync if we're in
3928 	 * write-after-read mode. We do this _after_ ensuring that there are no
3929 	 * overlaps to ensure that, once we know that we are the owner, the
3930 	 * readerlock will not be released until the write is complete. As a
3931 	 * change of ownership in a MN set requires the writerlock, this
3932 	 * ensures that ownership cannot be changed until the write is
3933 	 * complete.
3934 	 */
3935 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3936 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3937 		if (!MD_MN_MIRROR_OWNER(un))  {
3938 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3939 				mirror_overlap_chain_remove(ps);
3940 			md_kstat_waitq_exit(ui);
3941 			ASSERT(!(flag & MD_STR_WAR));
3942 			md_unit_readerexit(ui);
3943 			daemon_request(&md_mirror_daemon, become_owner,
3944 			    (daemon_queue_t *)ps, REQ_OLD);
3945 			return;
3946 		}
3947 	}
3948 
3949 	/*
3950 	 * Mark resync region if mirror has a Resync Region _and_ we are not
3951 	 * a resync initiated write(). Don't mark region if we're flagged as
3952 	 * an ABR write.
3953 	 */
3954 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
3955 	    !(flag & MD_STR_WAR)) {
3956 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
3957 		    ps->ps_lastblk)) {
3958 			pb->b_flags |= B_ERROR;
3959 			pb->b_resid = pb->b_bcount;
3960 			ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3961 			kmem_cache_free(mirror_parent_cache, ps);
3962 			md_kstat_waitq_exit(ui);
3963 			md_unit_readerexit(ui);
3964 			md_biodone(pb);
3965 			return;
3966 		}
3967 	}
3968 
3969 	ps->ps_childbflags = pb->b_flags | B_WRITE;
3970 	ps->ps_childbflags &= ~B_READ;
3971 	if (flag & MD_STR_MAPPED)
3972 		ps->ps_childbflags &= ~B_PAGEIO;
3973 
3974 	if (!(flag & MD_STR_NOTTOP) && panicstr)
3975 		/* Disable WOW and don't free ps */
3976 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
3977 
3978 	md_kstat_waitq_to_runq(ui);
3979 
3980 	/*
3981 	 * Treat Raw and Direct I/O as Write-on-Write always
3982 	 */
3983 
3984 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3985 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
3986 	    (pb->b_flags & B_PHYS) &&
3987 	    !(ps->ps_flags & MD_MPS_WOW)) {
3988 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3989 			mirror_overlap_chain_remove(ps);
3990 		md_unit_readerexit(ui);
3991 		daemon_request(&md_mstr_daemon, handle_wow,
3992 			(daemon_queue_t *)ps, REQ_OLD);
3993 		return;
3994 	}
3995 
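	/*
	 * Fan the write out to the selected submirrors. Each pass allocates
	 * a child save structure and maps one fragment of the parent write;
	 * mirror_map_write() indicates whether further fragments remain.
	 */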
3996 	ps->ps_frags = 1;
3997 	do {
3998 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3999 		mirror_child_init(cs);
4000 		cb = &cs->cs_buf;
4001 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4002 
4003 		/*
4004 		 * This handles the case where we're requesting
4005 		 * a write to block 0 on a label partition.  (more < 0)
4006 		 * means that the request size was smaller than the
4007 		 * size of the label.  If so this request is done.
4008 		 */
4009 		if (more < 0) {
4010 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4011 				mirror_overlap_chain_remove(ps);
4012 			md_kstat_runq_exit(ui);
4013 			kmem_cache_free(mirror_child_cache, cs);
4014 			kmem_cache_free(mirror_parent_cache, ps);
4015 			md_unit_readerexit(ui);
4016 			md_biodone(pb);
4017 			return;
4018 		}
4019 		if (more) {
4020 			mutex_enter(&ps->ps_mx);
4021 			ps->ps_frags++;
4022 			mutex_exit(&ps->ps_mx);
4023 		}
4024 		md_call_strategy(cb, flag, private);
4025 	} while (more);
4026 
4027 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4028 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4029 			md_daemon(1, &md_done_daemon);
4030 			drv_usecwait(10);
4031 		}
4032 		kmem_cache_free(mirror_parent_cache, ps);
4033 	}
4034 }
4035 
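/*
 * mirror_read_strategy:
 * --------------------
 * Top-level read path for a mirror. Allocates or reuses a parent save
 * structure, decides whether a write-after-read is required (resync reads
 * and reads of dirty regions while an optimized resync is outstanding),
 * honors directed (DMR) reads, and then issues one child buf per fragment
 * via md_bioclone()/mirror_map_read()/md_call_strategy().
 */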
4036 static void
4037 mirror_read_strategy(buf_t *pb, int flag, void *private)
4038 {
4039 	md_mps_t	*ps;
4040 	md_mcs_t	*cs;
4041 	size_t		more;
4042 	mm_unit_t	*un;
4043 	mdi_unit_t	*ui;
4044 	size_t		current_count;
4045 	diskaddr_t	current_blkno;
4046 	off_t		current_offset;
4047 	buf_t		*cb;		/* child buf pointer */
4048 	set_t		setno;
4049 
4050 	ui = MDI_UNIT(getminor(pb->b_edev));
4051 
4052 	md_kstat_waitq_enter(ui);
4053 
4054 	un = (mm_unit_t *)md_unit_readerlock(ui);
4055 
4056 	if (!(flag & MD_STR_NOTTOP)) {
4057 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4058 			md_kstat_waitq_exit(ui);
4059 			return;
4060 		}
4061 	}
4062 
4063 	if (private == NULL) {
4064 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4065 		mirror_parent_init(ps);
4066 	} else {
4067 		ps = private;
4068 		private = NULL;
4069 	}
4070 
4071 	if (flag & MD_STR_MAPPED)
4072 		ps->ps_flags |= MD_MPS_MAPPED;
4073 	if (flag & MD_NOBLOCK)
4074 		ps->ps_flags |= MD_MPS_NOBLOCK;
4075 	if (flag & MD_STR_WMUPDATE)
4076 		ps->ps_flags |= MD_MPS_WMUPDATE;
4077 
4078 	/*
4079 	 * Check to see if this is a DMR driven read. If so we need to use the
4080 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4081 	 */
4082 	if (flag & MD_STR_DMR)
4083 		ps->ps_flags |= MD_MPS_DMR;
4084 
4085 	/*
4086 	 * Save essential information from the original buf header
4087 	 * in the parent save (md_mps_t) structure.
4088 	 */
4089 	ps->ps_un = un;
4090 	ps->ps_ui = ui;
4091 	ps->ps_bp = pb;
4092 	ps->ps_addr = pb->b_un.b_addr;
4093 	ps->ps_firstblk = pb->b_lblkno;
4094 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4095 	ps->ps_changecnt = un->un_changecnt;
4096 
4097 	current_count = btodb(pb->b_bcount);
4098 	current_blkno = pb->b_lblkno;
4099 	current_offset = 0;
4100 
4101 	/*
4102 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4103 	 * resync thread; the resync may or may not be an optimized resync.
4104 	 *
4105 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4106 	 * code has not completed; either a resync has not started since snarf,
4107 	 * or there is an optimized resync in progress.
4108 	 *
4109 	 * We need to generate a write after this read in the following two
4110 	 * cases,
4111 	 *
4112 	 * 1. Any Resync-Generated read
4113 	 *
4114 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4115 	 *    pending or in progress.
4116 	 *
4117 	 * The write after read is done in these cases to ensure that all sides
4118 	 * of the mirror are in sync with the read data and that it is not
4119 	 * possible for an application to read the same block multiple times
4120 	 * and get different data.
4121 	 *
4122 	 * This would be possible if the block was in a dirty region.
4123 	 *
4124 	 * If we're performing a directed read we don't write the data out as
4125 	 * the application is responsible for restoring the mirror to a known
4126 	 * state.
4127 	 */
4128 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4129 	    !(flag & MD_STR_DMR)) {
4130 		size_t	start_rr, i, end_rr;
4131 		int	region_dirty = 1;
4132 
4133 		/*
4134 		 * We enter here under three circumstances,
4135 		 *
4136 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4137 		 * 0			1
4138 		 * 1			0
4139 		 * 1			1
4140 		 *
4141 		 * To be optimal we only care to explicitly check for dirty
4142 		 * regions in the second case since if MD_STR_WAR is set we
4143 		 * always do the write after read.
4144 		 */
4145 		if (!(flag & MD_STR_WAR)) {
4146 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4147 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4148 
4149 			for (i = start_rr; i <= end_rr; i++)
4150 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4151 					break;
4152 		}
4153 
4154 		if ((region_dirty) &&
4155 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4156 			ps->ps_call = write_after_read;
4157 			/*
4158 			 * Mark this as a RESYNC_READ in ps_flags.
4159 			 * This is used if the read fails during a
4160 			 * resync of a 3-way mirror to ensure that
4161 			 * the retried read to the remaining
4162 			 * good submirror has MD_STR_WAR set. This
4163 			 * is needed to ensure that the resync write
4164 			 * (write-after-read) takes place.
4165 			 */
4166 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4167 
4168 			/*
4169 			 * If MD_STR_FLAG_ERR is set in the flags we
4170 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4171 			 * write (issued by write_after_read) will be flagged
4172 			 * to the biowait'ing resync thread. This allows us to
4173 			 * avoid issuing further resync requests to a device
4174 			 * that has had a write failure.
4175 			 */
4176 			if (flag & MD_STR_FLAG_ERR)
4177 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4178 
4179 			setno = MD_UN2SET(un);
4180 			/*
4181 			 * Drop the readerlock to avoid
4182 			 * deadlock
4183 			 */
4184 			md_unit_readerexit(ui);
4185 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4186 			un = md_unit_readerlock(ui);
4187 			/*
4188 			 * Ensure that we are owner
4189 			 */
4190 			if (MD_MNSET_SETNO(setno)) {
4191 				/*
4192 				 * For a non-resync read that requires a
4193 				 * write-after-read to be done, set a flag
4194 				 * in the parent structure, so that the
4195 				 * write_strategy routine can omit the
4196 				 * test that the write is still within the
4197 				 * resync region
4198 				 */
4199 				if (!(flag & MD_STR_WAR))
4200 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4201 
4202 				/*
4203 				 * Before reading the buffer, see if
4204 				 * we are the owner
4205 				 */
4206 				if (!MD_MN_MIRROR_OWNER(un))  {
4207 					ps->ps_call = NULL;
4208 					mirror_overlap_chain_remove(ps);
4209 					md_kstat_waitq_exit(ui);
4210 					md_unit_readerexit(ui);
4211 					daemon_request(
4212 					    &md_mirror_daemon,
4213 					    become_owner,
4214 					    (daemon_queue_t *)ps,
4215 					    REQ_OLD);
4216 					return;
4217 				}
4218 				/*
4219 				 * For a resync read, check to see if I/O is
4220 				 * outside of the current resync region, or
4221 				 * the resync has finished. If so
4222 				 * just terminate the I/O
4223 				 */
4224 				if ((flag & MD_STR_WAR) &&
4225 				    (!(un->c.un_status & MD_UN_WAR) ||
4226 				    (!IN_RESYNC_REGION(un, ps)))) {
4227 #ifdef DEBUG
4228 					if (mirror_debug_flag)
4229 						printf("Abort resync read "
4230 						    "%x: %lld\n",
4231 						    MD_SID(un),
4232 						    ps->ps_firstblk);
4233 #endif
4234 					mirror_overlap_chain_remove(ps);
4235 					kmem_cache_free(mirror_parent_cache,
4236 					    ps);
4237 					md_kstat_waitq_exit(ui);
4238 					md_unit_readerexit(ui);
4239 					md_biodone(pb);
4240 					return;
4241 				}
4242 			}
4243 		}
4244 	}
4245 
4246 	if (flag & MD_STR_DMR) {
4247 		ps->ps_call = directed_read_done;
4248 	}
4249 
4250 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4251 		ps->ps_flags |= MD_MPS_DONTFREE;
4252 
4253 	md_kstat_waitq_to_runq(ui);
4254 
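	/*
	 * Issue the read as one or more child bufs. Each iteration clones
	 * the parent buf for the next fragment, maps it onto a submirror via
	 * mirror_map_read(), and advances the block, offset and count by the
	 * amount actually mapped.
	 */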
4255 	ps->ps_frags++;
4256 	do {
4257 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4258 		mirror_child_init(cs);
4259 		cb = &cs->cs_buf;
4260 		cs->cs_ps = ps;
4261 
4262 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4263 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4264 
4265 		more = mirror_map_read(ps, cs, current_blkno,
4266 				(u_longlong_t)current_count);
4267 		if (more) {
4268 			mutex_enter(&ps->ps_mx);
4269 			ps->ps_frags++;
4270 			mutex_exit(&ps->ps_mx);
4271 		}
4272 
4273 		/*
4274 		 * Do these calculations now, so that we pick up a valid
4275 		 * b_bcount from the child buf.
4276 		 */
4277 		current_count -= more;
4278 		current_offset += cb->b_bcount;
4279 		current_blkno +=  more;
4280 		md_call_strategy(cb, flag, private);
4281 	} while (more);
4282 
4283 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4284 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4285 			md_daemon(1, &md_done_daemon);
4286 			drv_usecwait(10);
4287 		}
4288 		kmem_cache_free(mirror_parent_cache, ps);
4289 	}
4290 }
4291 
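/*
 * md_mirror_strategy:
 * ------------------
 * Driver entry point for mirror i/o. For multi-owner sets, top-level
 * requests block while the set is halted; the i/o is then counted against
 * the set (unless the caller has already done so) and dispatched to the
 * read or write strategy routine.
 */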
4292 void
4293 md_mirror_strategy(buf_t *bp, int flag, void *private)
4294 {
4295 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4296 
4297 	/*
4298 	 * When doing IO to a multi owner meta device, check if set is halted.
4299 	 * We do this check without the needed lock held, for performance
4300 	 * reasons.
4301 	 * If an IO just slips through while the set is locked via an
4302 	 * MD_MN_SUSPEND_SET, we don't care about it.
4303 	 * Only check for suspension if we are a top-level i/o request
4304 	 * (MD_STR_NOTTOP is cleared in 'flag').
4305 	 */
4306 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4307 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4308 		if ((flag & MD_STR_NOTTOP) == 0) {
4309 			mutex_enter(&md_mx);
4310 			/* Here we loop until the set is no longer halted */
4311 			while (md_set[setno].s_status & MD_SET_HALTED) {
4312 				cv_wait(&md_cv, &md_mx);
4313 			}
4314 			mutex_exit(&md_mx);
4315 		}
4316 	}
4317 
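	/*
	 * Account for this i/o against the set unless the caller has already
	 * counted it (MD_IO_COUNTED). If md_inc_iocount() fails, the request
	 * is errored with ENXIO.
	 */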
4318 	if ((flag & MD_IO_COUNTED) == 0) {
4319 		if ((flag & MD_NOBLOCK) == 0) {
4320 			if (md_inc_iocount(setno) != 0) {
4321 				bp->b_flags |= B_ERROR;
4322 				bp->b_error = ENXIO;
4323 				bp->b_resid = bp->b_bcount;
4324 				biodone(bp);
4325 				return;
4326 			}
4327 		} else {
4328 			md_inc_iocount_noblock(setno);
4329 		}
4330 	}
4331 
4332 	if (bp->b_flags & B_READ)
4333 		mirror_read_strategy(bp, flag, private);
4334 	else
4335 		mirror_write_strategy(bp, flag, private);
4336 }
4337 
4338 /*
4339  * mirror_directed_read:
4340  * --------------------
4341  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4342  * so that the application can determine what (if any) resync needs to be
4343  * performed. The data is copied out to the user-supplied buffer.
4344  *
4345  * Parameters:
4346  *	mdev	- dev_t for the mirror device
4347  *	vdr	- directed read parameters specifying location and submirror
4348  *		  to perform the read from
4349  *	mode	- used to ddi_copyout() any resulting data from the read
4350  *
4351  * Returns:
4352  *	0	success
4353  *	!0	error code
4354  *		EINVAL - invalid request format
4355  */
4356 int
4357 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4358 {
4359 	buf_t		*bp;
4360 	minor_t		mnum = getminor(mdev);
4361 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4362 	mm_unit_t	*un;
4363 	mm_submirror_t	*sm;
4364 	char		*sm_nm;
4365 	uint_t		next_side;
4366 	void		*kbuffer;
4367 
4368 	if (ui == NULL)
4369 		return (ENXIO);
4370 
4371 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4372 		return (EINVAL);
4373 	}
4374 
4375 	/* Check for aligned block access. We disallow non-aligned requests. */
4376 	if (vdr->vdr_offset % DEV_BSIZE) {
4377 		return (EINVAL);
4378 	}
4379 
4380 	/*
4381 	 * Allocate kernel buffer for target of read(). If we had a reliable
4382 	 * (sorry functional) DDI this wouldn't be needed.
4383 	 */
4384 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4385 	if (kbuffer == NULL) {
4386 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4387 		    " bytes\n", vdr->vdr_nbytes);
4388 		return (ENOMEM);
4389 	}
4390 
4391 	bp = getrbuf(KM_SLEEP);
4392 
4393 	bp->b_un.b_addr = kbuffer;
4394 	bp->b_flags = B_READ;
4395 	bp->b_bcount = vdr->vdr_nbytes;
4396 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4397 	bp->b_edev = mdev;
4398 
4399 	un = md_unit_readerlock(ui);
4400 
4401 	/*
4402 	 * If DKV_SIDE_INIT is set we need to determine the first available
4403 	 * side to start reading from. If it isn't set we increment to the
4404 	 * next readable submirror.
4405 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4406 	 * Note: we check for a readable submirror on completion of the i/o so
4407 	 * we should _always_ have one available. If this becomes unavailable
4408 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4409 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4410 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4411 	 * The chance of this is small, but not non-existent.
4412 	 */
4413 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4414 		next_side = 0;
4415 	} else {
4416 		next_side = vdr->vdr_side + 1;
4417 	}
4418 	while ((next_side < NMIRROR) &&
4419 	    !SUBMIRROR_IS_READABLE(un, next_side))
4420 		next_side++;
4421 	if (next_side >= NMIRROR) {
4422 		vdr->vdr_flags |= DKV_DMR_ERROR;
4423 		freerbuf(bp);
		kmem_free(kbuffer, vdr->vdr_nbytes);
4424 		vdr->vdr_bytesread = 0;
4425 		md_unit_readerexit(ui);
4426 		return (0);
4427 	}
4428 
4429 	/* Set the side to read from */
4430 	un->un_dmr_last_read = next_side;
4431 
4432 	md_unit_readerexit(ui);
4433 
4434 	/*
4435 	 * Save timestamp for verification purposes. Can be read by debugger
4436 	 * to verify that this ioctl has been executed and to find the number
4437 	 * of DMR reads and the time of the last DMR read.
4438 	 */
4439 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4440 	mirror_dmr_stats.dmr_count++;
4441 
4442 	/* Issue READ request and wait for completion */
4443 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4444 
4445 	mutex_enter(&un->un_dmr_mx);
4446 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4447 	mutex_exit(&un->un_dmr_mx);
4448 
4449 	/*
4450 	 * Check to see if we encountered an error during the read. If so we
4451 	 * can make no guarantee about any possibly returned data.
4452 	 */
4453 	if ((bp->b_flags & B_ERROR) == 0) {
4454 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4455 		if (bp->b_resid) {
4456 			vdr->vdr_flags |= DKV_DMR_SHORT;
4457 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4458 		} else {
4459 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4460 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4461 		}
4462 		/* Copy the data read back out to the user supplied buffer */
4463 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4464 		    mode)) {
4465 			kmem_free(kbuffer, vdr->vdr_nbytes);
			freerbuf(bp);
4466 			return (EFAULT);
4467 		}
4468 
4469 	} else {
4470 		/* Error out with DKV_DMR_ERROR */
4471 		vdr->vdr_flags |= DKV_DMR_ERROR;
4472 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4473 	}
4474 	/*
4475 	 * Update the DMR parameters with the side and name of submirror that
4476 	 * we have just read from (un->un_dmr_last_read)
4477 	 */
4478 	un = md_unit_readerlock(ui);
4479 
4480 	vdr->vdr_side = un->un_dmr_last_read;
4481 	sm = &un->un_sm[un->un_dmr_last_read];
4482 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4483 
4484 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4485 
4486 	/*
4487 	 * Determine if we've completed the read cycle. This is true iff the
4488 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4489 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4490 	 * can occur if a submirror is metadetached).
4491 	 */
4492 	next_side = un->un_dmr_last_read + 1;
4493 	while ((next_side < NMIRROR) &&
4494 	    !SUBMIRROR_IS_READABLE(un, next_side))
4495 		next_side++;
4496 	if (next_side >= NMIRROR) {
4497 		/* We've finished */
4498 		vdr->vdr_flags |= DKV_DMR_DONE;
4499 	}
4500 
4501 	md_unit_readerexit(ui);
4502 	freerbuf(bp);
4503 	kmem_free(kbuffer, vdr->vdr_nbytes);
4504 
4505 	return (0);
4506 }
4507 
4508 /*
4509  * mirror_resync_message:
4510  * ---------------------
4511  * Handle the multi-node resync messages that keep all nodes within a given
4512  * disk-set in sync with their view of a mirror's resync status.
4513  *
4514  * The message types dealt with are:
4515  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4516  * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
4517  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4518  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4519  *
4520  * Returns:
4521  *	0	Success
4522  *	>0	Failure error number
4523  */
4524 int
4525 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4526 {
4527 	mdi_unit_t		*ui;
4528 	mm_unit_t		*un;
4529 	set_t			setno;
4530 	int			is_ABR;
4531 	int			smi;
4532 	int			ci;
4533 	sm_state_t		state;
4534 	int			broke_out;
4535 	mm_submirror_t		*sm;
4536 	mm_submirror_ic_t	*smic;
4537 	md_m_shared_t		*shared;
4538 	md_error_t		mde = mdnullerror;
4539 	md_mps_t		*ps;
4540 	int			rs_active;
4541 
4542 	/* Check that the given device is part of a multi-node set */
4543 	setno = MD_MIN2SET(p->mnum);
4544 	if (setno >= md_nsets) {
4545 		return (ENXIO);
4546 	}
4547 	if (!MD_MNSET_SETNO(setno)) {
4548 		return (EINVAL);
4549 	}
4550 
4551 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4552 		return (EINVAL);
4553 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4554 		return (EINVAL);
4555 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4556 
4557 	/* Obtain the current resync status */
4558 	(void) md_ioctl_readerlock(lockp, ui);
4559 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4560 	md_ioctl_readerexit(lockp);
4561 
4562 	switch ((md_mn_msgtype_t)p->msg_type) {
4563 	case MD_MN_MSG_RESYNC_STARTING:
4564 		/* Start the resync thread for the mirror */
4565 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4566 		break;
4567 
4568 	case MD_MN_MSG_RESYNC_NEXT:
4569 		/*
4570 		 * We have to release any previously marked overlap regions
4571 		 * so that i/o can resume. Then we need to block the region
4572 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4573 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4574 		 */
4575 		(void) md_ioctl_readerlock(lockp, ui);
4576 		/*
4577 		 * Ignore the message if there is no active resync thread or
4578 		 * if it is for a resync type that we have already completed.
4579 		 * un_resync_completed is set to the last resync completed
4580 		 * when processing a PHASE_DONE message.
4581 		 */
4582 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4583 			break;
4584 		/*
4585 		 * If this message is for the same resync and is for an earlier
4586 		 * resync region, just ignore it. This can only occur if this
4587 		 * node has progressed on to the next resync region before
4588 		 * we receive this message. This can occur if the class for
4589 		 * this message is busy and the originator has to retry thus
4590 		 * allowing this node to move onto the next resync_region.
4591 		 */
4592 		if ((p->rs_type == un->un_rs_type) &&
4593 		    (p->rs_start < un->un_resync_startbl))
4594 			break;
4595 		ps = un->un_rs_prev_ovrlap;
4596 
4597 		/* Allocate previous overlap reference if needed */
4598 		if (ps == NULL) {
4599 			ps = kmem_cache_alloc(mirror_parent_cache,
4600 				MD_ALLOCFLAGS);
4601 			ps->ps_un = un;
4602 			ps->ps_ui = ui;
4603 			ps->ps_firstblk = 0;
4604 			ps->ps_lastblk = 0;
4605 			ps->ps_flags = 0;
4606 			md_ioctl_readerexit(lockp);
4607 			(void) md_ioctl_writerlock(lockp, ui);
4608 			un->un_rs_prev_ovrlap = ps;
4609 			md_ioctl_writerexit(lockp);
4610 		} else
4611 			md_ioctl_readerexit(lockp);
4612 
4613 		if (p->rs_originator != md_mn_mynode_id) {
4614 			/*
4615 			 * On all but the originating node, first update
4616 			 * the resync state, then unblock the previous
4617 			 * region and block the next one. No need
4618 			 * to do this if the region is already blocked.
4619 			 * Update the submirror state and flags from the
4620 			 * originator. This keeps the cluster in sync with
4621 			 * regards to the resync status.
4622 			 */
4623 
4624 			(void) md_ioctl_writerlock(lockp, ui);
4625 			un->un_rs_resync_done = p->rs_done;
4626 			un->un_rs_resync_2_do = p->rs_2_do;
4627 			un->un_rs_type = p->rs_type;
4628 			un->un_resync_startbl = p->rs_start;
4629 			md_ioctl_writerexit(lockp);
4630 			/*
4631 			 * Use un_owner_mx to ensure that an ownership change
4632 			 * cannot happen at the same time as this message
4633 			 */
4634 			mutex_enter(&un->un_owner_mx);
4635 			if (MD_MN_MIRROR_OWNER(un)) {
4636 				ps->ps_firstblk = p->rs_start;
4637 				ps->ps_lastblk = ps->ps_firstblk +
4638 				    p->rs_size - 1;
4639 			} else {
4640 				if ((ps->ps_firstblk != p->rs_start) ||
4641 				    (ps->ps_lastblk != p->rs_start +
4642 				    p->rs_size - 1)) {
4643 					/* Remove previous overlap range */
4644 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4645 						mirror_overlap_chain_remove(ps);
4646 
4647 					ps->ps_firstblk = p->rs_start;
4648 					ps->ps_lastblk = ps->ps_firstblk +
4649 					    p->rs_size - 1;
4650 
4651 					mutex_exit(&un->un_owner_mx);
4652 					/* Block this range from all i/o. */
4653 					if (ps->ps_firstblk != 0 ||
4654 					    ps->ps_lastblk != 0)
4655 						wait_for_overlaps(ps,
4656 						    MD_OVERLAP_ALLOW_REPEAT);
4657 					mutex_enter(&un->un_owner_mx);
4658 					/*
4659 					 * Check to see if we have obtained
4660 					 * ownership while waiting for
4661 					 * overlaps. If we have, remove
4662 					 * the resync_region entry from the
4663 					 * overlap chain
4664 					 */
4665 					if (MD_MN_MIRROR_OWNER(un) &&
4666 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4667 						mirror_overlap_chain_remove(ps);
4668 				}
4669 			}
4670 			mutex_exit(&un->un_owner_mx);
4671 
4672 			/*
4673 			 * If this is the first RESYNC_NEXT message (i.e.
4674 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4675 			 * issue RESYNC_START NOTIFY event
4676 			 */
4677 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4678 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4679 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4680 				    MD_SID(un));
4681 			}
4682 
4683 			/* Ensure that our local resync thread is running */
4684 			if (un->un_rs_thread == NULL) {
4685 				(void) mirror_resync_unit(p->mnum, NULL,
4686 				    &p->mde, lockp);
4687 			}
4688 		}
4689 		break;
4690 	case MD_MN_MSG_RESYNC_FINISH:
4691 		/*
4692 		 * Complete the resync by stopping the resync thread.
4693 		 * Also release the previous overlap region field.
4694 		 * Update the resync_progress_thread by cv_signal'ing it so
4695 		 * that we mark the end of the resync as soon as possible. This
4696 		 * avoids an unnecessary delay should a panic occur after resync
4697 		 * completion.
4698 		 */
4699 #ifdef DEBUG
4700 		if (!rs_active) {
4701 			if (mirror_debug_flag)
4702 				printf("RESYNC_FINISH (mnum = %x), "
4703 				    "Resync *NOT* active",
4704 				    p->mnum);
4705 		}
4706 #endif
4707 
4708 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4709 		    (p->rs_originator != md_mn_mynode_id)) {
4710 			mutex_enter(&un->un_rs_thread_mx);
4711 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4712 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4713 			un->un_rs_thread_flags &=
4714 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4715 			cv_signal(&un->un_rs_thread_cv);
4716 			mutex_exit(&un->un_rs_thread_mx);
4717 		}
4718 		if (is_ABR) {
4719 			/* Resync finished, if ABR set owner to NULL */
4720 			mutex_enter(&un->un_owner_mx);
4721 			un->un_mirror_owner = 0;
4722 			mutex_exit(&un->un_owner_mx);
4723 		}
4724 		(void) md_ioctl_writerlock(lockp, ui);
4725 		ps = un->un_rs_prev_ovrlap;
4726 		if (ps != NULL) {
4727 			/* Remove previous overlap range */
4728 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4729 				mirror_overlap_chain_remove(ps);
4730 			/*
4731 			 * Release the overlap range reference
4732 			 */
4733 			un->un_rs_prev_ovrlap = NULL;
4734 			kmem_cache_free(mirror_parent_cache,
4735 			    ps);
4736 		}
4737 		md_ioctl_writerexit(lockp);
4738 
4739 		/* Mark the resync as complete in the metadb */
4740 		un->un_rs_resync_done = p->rs_done;
4741 		un->un_rs_resync_2_do = p->rs_2_do;
4742 		un->un_rs_type = p->rs_type;
4743 		mutex_enter(&un->un_rs_progress_mx);
4744 		cv_signal(&un->un_rs_progress_cv);
4745 		mutex_exit(&un->un_rs_progress_mx);
4746 
4747 		un = md_ioctl_writerlock(lockp, ui);
4748 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4749 		/* Deal with any pending grow_unit */
4750 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4751 			if ((mirror_grow_unit(un, &mde) != 0) ||
4752 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4753 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4754 			}
4755 		}
4756 		md_ioctl_writerexit(lockp);
4757 		break;
4758 
4759 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4760 		/*
4761 		 * A phase of the resync (optimized, component or submirror)
4762 		 * is complete. Update mirror status.
4763 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4764 		 * mirror owner is performing a resync. If we have just snarfed
4765 		 * this set, then we must clear any of the flags set at snarf
4766 		 * time by unit_setup_resync().
4767 		 * Note that unit_setup_resync() sets up these flags to
4768 		 * indicate that an optimized resync is required. These flags
4769 		 * need to be reset because if we get here, the mirror owner
4770 		 * will have handled the optimized resync.
4771 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4772 		 * MD_UN_WAR. In addition, for each submirror,
4773 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4774 		 * set to SMS_OFFLINE.
4775 		 */
4776 #ifdef DEBUG
4777 		if (mirror_debug_flag)
4778 			printf("phase done mess received from %d, mnum=%x,"
4779 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4780 			    p->rs_type, p->rs_flags);
4781 #endif
4782 		/*
4783 		 * Ignore the message if there is no active resync thread.
4784 		 */
4785 		if (!rs_active)
4786 			break;
4787 
4788 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4789 		switch (RS_TYPE(p->rs_type)) {
4790 		case MD_RS_OPTIMIZED:
4791 			un = md_ioctl_writerlock(lockp, ui);
4792 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4793 				/* If we are originator, just clear rs_type */
4794 				if (p->rs_originator == md_mn_mynode_id) {
4795 					SET_RS_TYPE_NONE(un->un_rs_type);
4796 					md_ioctl_writerexit(lockp);
4797 					break;
4798 				}
4799 				/*
4800 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4801 				 * flags if OPT_NOT_DONE is set *and* rs_type
4802 				 * is MD_RS_NONE.
4803 				 */
4804 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4805 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4806 					/* No resync in progress */
4807 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4808 					un->c.un_status &= ~MD_UN_WAR;
4809 				} else {
4810 					/*
4811 					 * We are in the middle of an
4812 					 * optimized resync and this message
4813 					 * should be ignored.
4814 					 */
4815 					md_ioctl_writerexit(lockp);
4816 					break;
4817 				}
4818 			} else {
4819 				/*
4820 				 * This is the end of an optimized resync,
4821 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4822 				 */
4823 
4824 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4825 				if (!broke_out)
4826 					un->c.un_status &= ~MD_UN_WAR;
4827 			}
4828 
4829 			/*
4830 			 * Set resync_completed to last resync type and then
4831 			 * clear resync_type to indicate no resync in progress
4832 			 */
4833 			un->un_resync_completed = un->un_rs_type;
4834 			SET_RS_TYPE_NONE(un->un_rs_type);
4835 
4836 			/*
4837 			 * If resync is as a result of a submirror ONLINE,
4838 			 * reset the submirror state to SMS_RUNNING if the
4839 			 * resync was ok else set back to SMS_OFFLINE.
4840 			 */
4841 			for (smi = 0; smi < NMIRROR; smi++) {
4842 				un->un_sm[smi].sm_flags &=
4843 				    ~MD_SM_RESYNC_TARGET;
4844 				if (SMS_BY_INDEX_IS(un, smi,
4845 				    SMS_OFFLINE_RESYNC)) {
4846 					if (p->rs_flags &
4847 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4848 						state = SMS_OFFLINE;
4849 					} else {
4850 						state = (broke_out ?
4851 						    SMS_OFFLINE : SMS_RUNNING);
4852 					}
4853 					mirror_set_sm_state(
4854 					    &un->un_sm[smi],
4855 					    &un->un_smic[smi], state,
4856 					    broke_out);
4857 					mirror_commit(un, NO_SUBMIRRORS,
4858 					    0);
4859 				}
4860 				/*
4861 				 * If we still have an offline submirror, reset
4862 				 * the OFFLINE_SM flag in the mirror status
4863 				 */
4864 				if (SMS_BY_INDEX_IS(un, smi,
4865 				    SMS_OFFLINE))
4866 					un->c.un_status |=
4867 					    MD_UN_OFFLINE_SM;
4868 			}
4869 			md_ioctl_writerexit(lockp);
4870 			break;
4871 		case MD_RS_SUBMIRROR:
4872 			un = md_ioctl_writerlock(lockp, ui);
4873 			smi = RS_SMI(p->rs_type);
4874 			sm = &un->un_sm[smi];
4875 			smic = &un->un_smic[smi];
4876 			/* Clear RESYNC target */
4877 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4878 			/*
4879 			 * Set resync_completed to last resync type and then
4880 			 * clear resync_type to indicate no resync in progress
4881 			 */
4882 			un->un_resync_completed = un->un_rs_type;
4883 			SET_RS_TYPE_NONE(un->un_rs_type);
4884 			/*
4885 			 * If the resync completed ok reset the submirror
4886 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
4887 			 */
4888 			state = (broke_out ?
4889 			    SMS_ATTACHED : SMS_RUNNING);
4890 			mirror_set_sm_state(sm, smic, state, broke_out);
4891 			un->c.un_status &= ~MD_UN_WAR;
4892 			mirror_commit(un, SMI2BIT(smi), 0);
4893 			md_ioctl_writerexit(lockp);
4894 			break;
4895 		case MD_RS_COMPONENT:
4896 			un = md_ioctl_writerlock(lockp, ui);
4897 			smi = RS_SMI(p->rs_type);
4898 			ci = RS_CI(p->rs_type);
4899 			sm = &un->un_sm[smi];
4900 			smic = &un->un_smic[smi];
4901 			shared = (md_m_shared_t *)
4902 			    (*(smic->sm_shared_by_indx))
4903 			    (sm->sm_dev, sm, ci);
4904 			un->c.un_status &= ~MD_UN_WAR;
4905 			/* Clear RESYNC target */
4906 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4907 			/*
4908 			 * Set resync_completed to last resync type and then
4909 			 * clear resync_type to indicate no resync in progress
4910 			 */
4911 			un->un_resync_completed = un->un_rs_type;
4912 			SET_RS_TYPE_NONE(un->un_rs_type);
4913 
4914 			/*
4915 			 * If the resync completed ok, set the component state
4916 			 * to CS_OKAY.
4917 			 */
4918 			if (broke_out)
4919 				shared->ms_flags |= MDM_S_RS_TRIED;
4920 			else {
4921 				/*
4922 				 * As we don't transmit the changes,
4923 				 * no need to drop the lock.
4924 				 */
4925 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
4926 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
4927 			}
4928 			md_ioctl_writerexit(lockp);
4929 			md_ioctl_writerexit(lockp);
			break;
4930 			break;
4931 		}
4932 		/*
4933 		 * If the purpose of this PHASE_DONE message is just to
4934 		 * indicate to all other nodes that the optimized resync
4935 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
4936 		 * no need to generate a notify event as there has not
4937 		 * actually been a resync.
4938 		 */
4939 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
4940 			if (broke_out) {
4941 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
4942 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4943 				    MD_SID(un));
4944 			} else {
4945 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
4946 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4947 				    MD_SID(un));
4948 			}
4949 		}
4950 		break;
4951 
4952 	default:
4953 #ifdef DEBUG
4954 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
4955 		    " %x\n", p->msg_type);
4956 #endif
4957 		return (EINVAL);
4958 	}
4959 	return (0);
4960 }
4961 
4962 /* Return a -1 if snarf of optimized record failed and set should be released */
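/*
 * mirror_snarf:
 * ------------
 * Build incore mirror units from the records in a set's metadb. Old small
 * (32-bit) records are converted to the current record format before use,
 * a minor node is created for each snarfed unit, and for MN sets the unit
 * is flagged as not yet having been through a reconfig (MD_RESYNC_NOT_DONE).
 * MD_SNARF_CLEANUP simply cleans up records marked MD_PRV_CLEANUP.
 */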
4963 static int
4964 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
4965 {
4966 	mddb_recid_t	recid;
4967 	int		gotsomething;
4968 	int		all_mirrors_gotten;
4969 	mm_unit_t	*un;
4970 	mddb_type_t	typ1;
4971 	mddb_de_ic_t    *dep;
4972 	mddb_rb32_t	*rbp;
4973 	size_t		newreqsize;
4974 	mm_unit_t	*big_un;
4975 	mm_unit32_od_t	*small_un;
4976 	int		retval;
4977 	mdi_unit_t	*ui;
4978 
4979 	if (cmd == MD_SNARF_CLEANUP) {
4980 		if (md_get_setstatus(setno) & MD_SET_STALE)
4981 			return (0);
4982 
4983 		recid = mddb_makerecid(setno, 0);
4984 		typ1 = (mddb_type_t)md_getshared_key(setno,
4985 		    mirror_md_ops.md_driver.md_drivername);
4986 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4987 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
4988 				un = (mm_unit_t *)mddb_getrecaddr(recid);
4989 				mirror_cleanup(un);
4990 				recid = mddb_makerecid(setno, 0);
4991 			}
4992 		}
4993 		return (0);
4994 	}
4995 
4996 	all_mirrors_gotten = 1;
4997 	gotsomething = 0;
4998 
4999 	recid = mddb_makerecid(setno, 0);
5000 	typ1 = (mddb_type_t)md_getshared_key(setno,
5001 	    mirror_md_ops.md_driver.md_drivername);
5002 
5003 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5004 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5005 			continue;
5006 
5007 		dep = mddb_getrecdep(recid);
5008 		dep->de_flags = MDDB_F_MIRROR;
5009 		rbp = dep->de_rb;
5010 
5011 		switch (rbp->rb_revision) {
5012 		case MDDB_REV_RB:
5013 		case MDDB_REV_RBFN:
5014 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5015 				/*
5016 				 * This means we have an old, small
5017 				 * record that hasn't already
5018 				 * been converted.  Before we create an
5019 				 * incore metadevice from this we have to
5020 				 * convert it to a big record.
5021 				 */
5022 				small_un =
5023 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
5024 				newreqsize = sizeof (mm_unit_t);
5025 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5026 					KM_SLEEP);
5027 				mirror_convert((caddr_t)small_un,
5028 					(caddr_t)big_un, SMALL_2_BIG);
5029 				kmem_free(small_un, dep->de_reqsize);
5030 
5031 				/*
5032 				 * Update userdata and incore userdata;
5033 				 * incores are at the end of un.
5034 				 */
5035 				dep->de_rb_userdata_ic = big_un;
5036 				dep->de_rb_userdata = big_un;
5037 				dep->de_icreqsize = newreqsize;
5038 				un = big_un;
5039 				rbp->rb_private |= MD_PRV_CONVD;
5040 			} else {
5041 				/*
5042 				 * Unit already converted, just get the
5043 				 * record address.
5044 				 */
5045 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5046 					sizeof (*un), 0);
5047 			}
5048 			un->c.un_revision &= ~MD_64BIT_META_DEV;
5049 			break;
5050 		case MDDB_REV_RB64:
5051 		case MDDB_REV_RB64FN:
5052 			/* Big device */
5053 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5054 				sizeof (*un), 0);
5055 			un->c.un_revision |= MD_64BIT_META_DEV;
5056 			un->c.un_flag |= MD_EFILABEL;
5057 			break;
5058 		}
5059 		NOTE_FN(rbp->rb_revision, un->c.un_revision);
5060 
5061 		/*
5062 		 * Create minor device node for snarfed entry.
5063 		 */
5064 		(void) md_create_minor_node(setno, MD_SID(un));
5065 
5066 		if (MD_UNIT(MD_SID(un)) != NULL) {
5067 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5068 			continue;
5069 		}
5070 		all_mirrors_gotten = 0;
5071 		retval = mirror_build_incore(un, 1);
5072 		if (retval == 0) {
5073 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5074 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5075 			resync_start_timeout(setno);
5076 			gotsomething = 1;
5077 		} else {
5078 			return (retval);
5079 		}
5080 		/*
5081 		 * Set flag to indicate that the mirror has not yet
5082 		 * been through a reconfig. This flag is used for MN sets
5083 		 * when determining whether to update the mirror state from
5084 		 * the Master node.
5085 		 */
5086 		if (MD_MNSET_SETNO(setno)) {
5087 			ui = MDI_UNIT(MD_SID(un));
5088 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5089 		}
5090 	}
5091 
5092 	if (!all_mirrors_gotten)
5093 		return (gotsomething);
5094 
5095 	recid = mddb_makerecid(setno, 0);
5096 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5097 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5098 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5099 
5100 	return (0);
5101 }
5102 
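/*
 * mirror_halt:
 * -----------
 * MD_HALT_CHECK reports whether any mirror in the set is still open;
 * MD_HALT_DOIT resets every mirror unit in the set and, if the set
 * contained mirrors (or is the local set), waits for the global dr_timeout
 * to finish before returning. All other halt commands are no-ops.
 */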
5103 static int
5104 mirror_halt(md_haltcmd_t cmd, set_t setno)
5105 {
5106 	unit_t		i;
5107 	mdi_unit_t	*ui;
5108 	minor_t		mnum;
5109 	int		reset_mirror_flag = 0;
5110 
5111 	if (cmd == MD_HALT_CLOSE)
5112 		return (0);
5113 
5114 	if (cmd == MD_HALT_OPEN)
5115 		return (0);
5116 
5117 	if (cmd == MD_HALT_UNLOAD)
5118 		return (0);
5119 
5120 	if (cmd == MD_HALT_CHECK) {
5121 		for (i = 0; i < md_nunits; i++) {
5122 			mnum = MD_MKMIN(setno, i);
5123 			if ((ui = MDI_UNIT(mnum)) == NULL)
5124 				continue;
5125 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5126 				continue;
5127 			if (md_unit_isopen(ui))
5128 				return (1);
5129 		}
5130 		return (0);
5131 	}
5132 
5133 	if (cmd != MD_HALT_DOIT)
5134 		return (1);
5135 
5136 	for (i = 0; i < md_nunits; i++) {
5137 		mnum = MD_MKMIN(setno, i);
5138 		if ((ui = MDI_UNIT(mnum)) == NULL)
5139 			continue;
5140 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5141 			continue;
5142 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5143 
5144 		/* Set a flag if there is at least one mirror metadevice. */
5145 		reset_mirror_flag = 1;
5146 	}
5147 
5148 	/*
5149 	 * Only wait for the global dr_timeout to finish
5150 	 *  - if there are mirror metadevices in this diskset or
5151 	 *  - if this is the local set since an unload of the md_mirror
5152 	 *    driver could follow a successful mirror halt in the local set.
5153 	 */
5154 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5155 		while ((mirror_md_ops.md_head == NULL) &&
5156 		    (mirror_timeout.dr_timeout_id != 0))
5157 			delay(md_hz);
5158 	}
5159 
5160 	return (0);
5161 }
5162 
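/*
 * mirror_open:
 * -----------
 * Open entry point for mirror metadevices. Fails with ENXIO if this node
 * is starting into a multi-owner set while a reconfig cycle is underway.
 * When called from an ioctl service routine (MD_OFLG_FROMIOCTL) a
 * stack-based IOLOCK is passed to the internal open to avoid the deadlock
 * described below.
 */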
5163 /*ARGSUSED3*/
5164 static int
5165 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5166 {
5167 	IOLOCK	lock;
5168 	minor_t		mnum = getminor(*dev);
5169 	set_t		setno;
5170 
5171 	/*
5172 	 * When doing an open of a multi owner metadevice, check to see if this
5173 	 * node is a starting node and if a reconfig cycle is underway.
5174 	 * If so, the system isn't sufficiently set up enough to handle the
5175 	 * If so, the system isn't sufficiently set up to handle the
5176 	 */
5177 	setno = MD_MIN2SET(mnum);
5178 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5179 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5180 			return (ENXIO);
5181 	}
5182 
5183 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5184 		/*
5185 		 * This indicates that the caller is an ioctl service routine.
5186 		 * In this case we initialise our stack-based IOLOCK and pass
5187 		 * this into the internal open routine. This allows multi-owner
5188 		 * metadevices to avoid deadlocking if an error is encountered
5189 		 * during the open() attempt. The failure case is:
5190 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5191 		 * this configuration would deadlock as the mirror code has to
5192 		 * send a state-update to the other nodes when it detects the
5193 		 * failure of the underlying submirror with an errored soft-part
5194 		 * on it. As there is a class1 message in progress (metaclear)
5195 		 * set_sm_comp_state() cannot send another class1 message;
5196 		 * instead we do not send a state_update message as the
5197 		 * metaclear is distributed and the failed submirror will be
5198 		 * cleared from the configuration by the metaclear.
5199 		 */
5200 		IOLOCK_INIT(&lock);
5201 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5202 		    md_oflags, &lock));
5203 	} else {
5204 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5205 		    md_oflags, (IOLOCK *)NULL));
5206 	}
5207 }
5208 
5209 
5210 /*ARGSUSED1*/
5211 static int
5212 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5213 {
5214 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5215 		(IOLOCK *)NULL));
5216 }
5217 
5218 
5219 /*
5220  * This routine dumps memory to the disk.  It assumes that the memory has
5221  * already been mapped into mainbus space.  It is called at disk interrupt
5222  * priority when the system is in trouble.
5223  *
5224  */
5225 static int
5226 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5227 {
5228 	mm_unit_t	*un;
5229 	dev_t		mapdev;
5230 	int		result;
5231 	int		smi;
5232 	int		any_succeed = 0;
5233 	int		save_result = 0;
5234 
5235 	/*
5236 	 * Don't need to grab the unit lock.
5237 	 * Don't need to grab the unit lock,
5238 	 * because nothing else is supposed to be happening.
5239 	 * Also, dump is not supposed to sleep.
5240 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5241 
5242 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5243 		return (EINVAL);
5244 
5245 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5246 		return (EINVAL);
5247 
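	/*
	 * Write the dump to every writeable submirror. The dump is
	 * considered successful if at least one submirror accepts it;
	 * otherwise the last error encountered is returned.
	 */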
5248 	for (smi = 0; smi < NMIRROR; smi++) {
5249 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5250 			continue;
5251 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5252 		result = bdev_dump(mapdev, addr, blkno, nblk);
5253 		if (result)
5254 			save_result = result;
5255 
5256 		if (result == 0)
5257 			any_succeed++;
5258 	}
5259 
5260 	if (any_succeed)
5261 		return (0);
5262 
5263 	return (save_result);
5264 }
5265 
5266 /*
5267  * NAME: mirror_probe_dev
5268  *
5269  * DESCRIPTION: force opens every component of a mirror.
5270  *
5271  * On entry the unit writerlock is held
5272  */
5273 static int
5274 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5275 {
5276 	int		i;
5277 	int		smi;
5278 	int		ci;
5279 	mm_unit_t	*un;
5280 	int		md_devopen = 0;
5281 	set_t		setno;
5282 	int		sm_cnt;
5283 	int		sm_unavail_cnt;
5284 
5285 	if (md_unit_isopen(ui))
5286 		md_devopen++;
5287 
5288 	un = MD_UNIT(mnum);
5289 	setno = MD_UN2SET(un);
5290 
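	/*
	 * Force open every in-use submirror, recording how many there are
	 * and how many are unavailable, so that the mirror itself can be
	 * marked inaccessible if none of its submirrors can be reached.
	 */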
5291 	sm_cnt = 0;
5292 	sm_unavail_cnt = 0;
5293 	for (i = 0; i < NMIRROR; i++) {
5294 		md_dev64_t tmpdev;
5295 		mdi_unit_t	*sm_ui;
5296 
5297 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5298 			continue;
5299 		}
5300 
5301 		sm_cnt++;
5302 		tmpdev = un->un_sm[i].sm_dev;
5303 		(void) md_layered_open(mnum, &tmpdev,
5304 				MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5305 		un->un_sm[i].sm_dev = tmpdev;
5306 
5307 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5308 
5309 		/*
5310 		 * Logic similar to that in mirror_open_all_devs.  We set or
5311 		 * clear the submirror Unavailable bit.
5312 		 */
5313 		(void) md_unit_writerlock(sm_ui);
5314 		if (submirror_unavailable(un, i, 1)) {
5315 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5316 			sm_unavail_cnt++;
5317 		} else {
5318 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5319 		}
5320 		md_unit_writerexit(sm_ui);
5321 	}
5322 
5323 	/*
5324 	 * If all of the submirrors are unavailable, the mirror is also
5325 	 * unavailable.
5326 	 */
5327 	if (sm_cnt == sm_unavail_cnt) {
5328 		ui->ui_tstate |= MD_INACCESSIBLE;
5329 	} else {
5330 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5331 	}
5332 
5333 	/*
5334 	 * Now check for probe failures. If failures occur we
5335 	 * set the appropriate erred state only if the metadevice is in
5336 	 * use. This is specifically to prevent unnecessary resyncs.
5337 	 * For instance if the disks were accidentally disconnected when
5338 	 * the system booted up then until the metadevice is accessed
5339 	 * (like file system mount) the user can shutdown, recable and
5340 	 * reboot w/o incurring a potentially huge resync.
5341 	 */
5342 
5343 	smi = 0;
5344 	ci = 0;
5345 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5346 
5347 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5348 			/*
5349 			 * Note that for a MN set, there is no need to call
5350 			 * SE_NOTIFY as that is done when processing the
5351 			 * state change
5352 			 */
5353 			if (md_devopen) {
5354 				/*
5355 				 * Never called from ioctl context,
5356 				 * so (IOLOCK *)NULL
5357 				 */
5358 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5359 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5360 				if (!MD_MNSET_SETNO(setno)) {
5361 					SE_NOTIFY(EC_SVM_STATE,
5362 					    ESC_SVM_LASTERRED,
5363 					    SVM_TAG_METADEVICE, setno,
5364 					    MD_SID(un));
5365 				}
5366 				continue;
5367 			} else {
5368 				(void) mirror_close_all_devs(un,
5369 				    MD_OFLG_PROBEDEV);
5370 				if (!MD_MNSET_SETNO(setno)) {
5371 					SE_NOTIFY(EC_SVM_STATE,
5372 					    ESC_SVM_OPEN_FAIL,
5373 					    SVM_TAG_METADEVICE, setno,
5374 					    MD_SID(un));
5375 				}
5376 				mirror_openfail_console_info(un, smi, ci);
5377 				return (ENXIO);
5378 			}
5379 		}
5380 
5381 		/*
5382 		 * Note that for a MN set, there is no need to call
5383 		 * SE_NOTIFY as that is done when processing the
5384 		 * state change
5385 		 */
5386 		if (md_devopen) {
5387 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5388 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5389 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5390 			if (!MD_MNSET_SETNO(setno)) {
5391 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5392 				    SVM_TAG_METADEVICE, setno,
5393 				    MD_SID(un));
5394 			}
5395 		}
5396 		mirror_openfail_console_info(un, smi, ci);
5397 		ci++;
5398 	}
5399 
5400 	if (MD_MNSET_SETNO(setno)) {
5401 		send_poke_hotspares(setno);
5402 	} else {
5403 		(void) poke_hotspares();
5404 	}
5405 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5406 
5407 	return (0);
5408 }
5409 
5410 
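/*
 * mirror_imp_set:
 * --------------
 * Update mirror records for a set that is being imported: remap each
 * submirror device and the unit's self id, parent id, record id and
 * optimized-resync record id into the imported set's number, and update
 * the relevant namespace entries. Returns 1 if any record was updated.
 */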
5411 static int
5412 mirror_imp_set(
5413 	set_t	setno
5414 )
5415 {
5416 
5417 	mddb_recid_t	recid;
5418 	int		gotsomething, i;
5419 	mddb_type_t	typ1;
5420 	mddb_de_ic_t	*dep;
5421 	mddb_rb32_t	*rbp;
5422 	mm_unit32_od_t	*un32;
5423 	mm_unit_t	*un64;
5424 	md_dev64_t	self_devt;
5425 	minor_t		*self_id;	/* minor needs to be updated */
5426 	md_parent_t	*parent_id;	/* parent needs to be updated */
5427 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5428 	mddb_recid_t	*optrec_id;
5429 	md_dev64_t	tmpdev;
5430 
5431 
5432 	gotsomething = 0;
5433 
5434 	typ1 = (mddb_type_t)md_getshared_key(setno,
5435 	    mirror_md_ops.md_driver.md_drivername);
5436 	recid = mddb_makerecid(setno, 0);
5437 
5438 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5439 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5440 			continue;
5441 
5442 		dep = mddb_getrecdep(recid);
5443 		rbp = dep->de_rb;
5444 
5445 		switch (rbp->rb_revision) {
5446 		case MDDB_REV_RB:
5447 		case MDDB_REV_RBFN:
5448 			/*
5449 			 * Small device
5450 			 */
5451 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5452 			self_id = &(un32->c.un_self_id);
5453 			parent_id = &(un32->c.un_parent);
5454 			record_id = &(un32->c.un_record_id);
5455 			optrec_id = &(un32->un_rr_dirty_recid);
5456 
5457 			for (i = 0; i < un32->un_nsm; i++) {
5458 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5459 				un32->un_sm[i].sm_dev = md_cmpldev(
5460 				    md_makedevice(md_major, MD_MKMIN(setno,
5461 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5462 
5463 				if (!md_update_minor(setno, mddb_getsidenum(
5464 				    setno), un32->un_sm[i].sm_key))
5465 					goto out;
5466 			}
5467 			break;
5468 		case MDDB_REV_RB64:
5469 		case MDDB_REV_RB64FN:
5470 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5471 			self_id = &(un64->c.un_self_id);
5472 			parent_id = &(un64->c.un_parent);
5473 			record_id = &(un64->c.un_record_id);
5474 			optrec_id = &(un64->un_rr_dirty_recid);
5475 
5476 			for (i = 0; i < un64->un_nsm; i++) {
5477 				tmpdev = un64->un_sm[i].sm_dev;
5478 				un64->un_sm[i].sm_dev = md_makedevice(md_major,
5479 				    MD_MKMIN(setno,
5480 				    MD_MIN2UNIT(md_getminor(tmpdev))));
5481 
5482 				if (!md_update_minor(setno, mddb_getsidenum(
5483 				    setno), un64->un_sm[i].sm_key))
5484 					goto out;
5485 			}
5486 			break;
5487 		}
5488 
5489 		/*
5490 		 * If this is a top level and a friendly name metadevice,
5491 		 * update its minor in the namespace.
5492 		 */
5493 		if ((*parent_id == MD_NO_PARENT) &&
5494 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5495 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5496 
5497 			self_devt = md_makedevice(md_major, *self_id);
5498 			if (!md_update_top_device_minor(setno,
5499 			    mddb_getsidenum(setno), self_devt))
5500 				goto out;
5501 		}
5502 
5503 		/*
5504 		 * Update unit identifiers (minor, parent, record ids) with
5505 		 * the imported setno.
5506 		 */
5507 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5508 
5509 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5510 		if (*parent_id != MD_NO_PARENT)
5511 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5512 		*record_id = MAKERECID(setno, DBID(*record_id));
5513 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5514 
5515 		gotsomething = 1;
5516 	}
5517 
5518 out:
5519 	return (gotsomething);
5520 }
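/*
 * Illustrative sketch (not part of the driver): the identifier re-basing
 * performed by mirror_imp_set() above, isolated for clarity.  MD_MIN2UNIT()
 * is assumed to strip the old set portion of a minor and MD_MKMIN() to
 * re-encode it with the destination set, with DBID()/MAKERECID() doing the
 * equivalent for database record ids, as suggested by their use above.  The
 * guard macro is never defined; this is illustration only.
 */
#ifdef	MD_MIRROR_EXAMPLE_SKETCHES
static void
mirror_imp_rebase_example(set_t setno, minor_t *mnump, mddb_recid_t *recidp)
{
	/* re-base the minor number into the destination set */
	*mnump = MD_MKMIN(setno, MD_MIN2UNIT(*mnump));

	/* re-base the record id into the destination set */
	*recidp = MAKERECID(setno, DBID(*recidp));
}
#endif	/* MD_MIRROR_EXAMPLE_SKETCHES */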
5521 
5522 /*
5523  * NAME: mirror_check_offline
5524  *
5525  * DESCRIPTION: set *offline_status to 1 if any submirror is offline
5526  *
5527  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5528  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5529  * ioctl.
5530  */
5531 int
5532 mirror_check_offline(md_dev64_t dev, int *offline_status)
5533 {
5534 	mm_unit_t		*un;
5535 	md_error_t		mde = mdnullerror;
5536 
5537 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5538 		return (EINVAL);
5539 	*offline_status = 0;
5540 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5541 		*offline_status = 1;
5542 	return (0);
5543 }
5544 
5545 /*
5546  * NAME: mirror_inc_abr_count
5547  *
5548  * DESCRIPTION: increment the count of layered soft parts with ABR set
5549  *
5550  * Called from ioctl, so access to un_abr_count is protected by the global
5551  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5552  */
5553 int
5554 mirror_inc_abr_count(md_dev64_t dev)
5555 {
5556 	mm_unit_t		*un;
5557 	md_error_t		mde = mdnullerror;
5558 
5559 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5560 		return (EINVAL);
5561 	un->un_abr_count++;
5562 	return (0);
5563 }
5564 
5565 /*
5566  * NAME: mirror_dec_abr_count
5567  *
5568  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5569  *
5570  * Called from ioctl, so access to un_abr_count is protected by the global
5571  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5572  */
5573 int
5574 mirror_dec_abr_count(md_dev64_t dev)
5575 {
5576 	mm_unit_t		*un;
5577 	md_error_t		mde = mdnullerror;
5578 
5579 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5580 		return (EINVAL);
5581 	un->un_abr_count--;
5582 	return (0);
5583 }
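/*
 * Illustrative sketch (not part of the driver): how ioctl-context code might
 * combine the three routines above, consulting the offline check first and
 * keeping the ABR count adjustments balanced.  The EBUSY return for the
 * offline case is an assumption made for the example, and the guard macro is
 * hypothetical and never defined.
 */
#ifdef	MD_MIRROR_EXAMPLE_SKETCHES
static int
mirror_abr_usage_example(md_dev64_t mir_dev)
{
	int	offline = 0;
	int	err;

	if ((err = mirror_check_offline(mir_dev, &offline)) != 0)
		return (err);
	if (offline)
		return (EBUSY);		/* a submirror is offline */

	if ((err = mirror_inc_abr_count(mir_dev)) != 0)
		return (err);

	/* ... the layered soft partition is now counted as ABR ... */

	return (mirror_dec_abr_count(mir_dev));
}
#endif	/* MD_MIRROR_EXAMPLE_SKETCHES */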
5584 
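/*
 * Named services exported by the mirror driver.  Each entry pairs a service
 * routine with the name that callers use to look it up through the md
 * named-service interface; the table is terminated by a NULL entry.
 */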
5585 static md_named_services_t mirror_named_services[] = {
5586 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5587 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5588 	{mirror_rename_check,				MDRNM_CHECK	    },
5589 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5590 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5591 			MDRNM_PARENT_UPDATE_TO},
5592 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5593 			MDRNM_SELF_UPDATE_FROM_DOWN },
5594 	{(intptr_t (*)()) mirror_probe_dev,		"probe open test"   },
5595 	{(intptr_t (*)()) mirror_check_offline,		MD_CHECK_OFFLINE    },
5596 	{(intptr_t (*)()) mirror_inc_abr_count,		MD_INC_ABR_COUNT    },
5597 	{(intptr_t (*)()) mirror_dec_abr_count,		MD_DEC_ABR_COUNT    },
5598 	{ NULL,						0		    }
5599 };
5600 
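/*
 * Entry-point vector for the mirror metadevice type.  Operations that the
 * mirror driver does not implement itself are left NULL.
 */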
5601 md_ops_t mirror_md_ops = {
5602 	mirror_open,		/* open */
5603 	mirror_close,		/* close */
5604 	md_mirror_strategy,	/* strategy */
5605 	NULL,			/* print */
5606 	mirror_dump,		/* dump */
5607 	NULL,			/* read */
5608 	NULL,			/* write */
5609 	md_mirror_ioctl,	/* ioctl */
5610 	mirror_snarf,		/* snarf */
5611 	mirror_halt,		/* halt */
5612 	NULL,			/* aread */
5613 	NULL,			/* awrite */
5614 	mirror_imp_set,		/* import set */
5615 	mirror_named_services
5616 };
5617 
5618 /* module-specific initialization */
5619 static void
5620 init_init()
5621 {
5622 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5623 
5624 	/* Initialize the parent and child save memory pools */
5625 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5626 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5627 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5628 	    0);
5629 
5630 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5631 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5632 	    mirror_child_constructor, mirror_child_destructor,
5633 	    mirror_run_queue, NULL, NULL, 0);
5634 
5635 	/*
5636 	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5637 	 * then initialize wowbuf memory pool.
5638 	 */
5639 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5640 	if (md_wowbuf_size <= 0)
5641 		md_wowbuf_size = 2 * DEV_BSIZE;
5642 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5643 		md_wowbuf_size = (32 * DEV_BSIZE);
5644 
5645 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5646 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5647 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5648 
5649 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5650 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5651 
5652 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5653 }
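/*
 * Illustrative sketch (not part of the driver): the wowbuf sizing policy
 * applied in init_init() above, expressed as a pure function.  The value is
 * rounded up to a DEV_BSIZE multiple and clamped to the range of 2 to 32
 * blocks; for example, a 1000-byte request becomes 1024 bytes and a 64 KB
 * request is clamped to 16 KB.  The guard macro below is never defined, so
 * the block is illustration only.
 */
#ifdef	MD_MIRROR_EXAMPLE_SKETCHES
static int
mirror_wowbuf_clamp_example(int requested)
{
	int	wowbuf_size = roundup(requested, DEV_BSIZE);

	if (wowbuf_size <= 0)
		wowbuf_size = 2 * DEV_BSIZE;
	if (wowbuf_size > (32 * DEV_BSIZE))
		wowbuf_size = (32 * DEV_BSIZE);

	return (wowbuf_size);
}
#endif	/* MD_MIRROR_EXAMPLE_SKETCHES */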
5654 
5655 /* module-specific uninitialization (undo init_init()) */
5656 static void
5657 fini_uninit()
5658 {
5659 	kmem_cache_destroy(mirror_parent_cache);
5660 	kmem_cache_destroy(mirror_child_cache);
5661 	kmem_cache_destroy(mirror_wowblk_cache);
5662 	mirror_parent_cache = mirror_child_cache =
5663 	    mirror_wowblk_cache = NULL;
5664 
5665 	mutex_destroy(&mirror_timeout.dr_mx);
5666 	mutex_destroy(&hotspare_request.dr_mx);
5667 	mutex_destroy(&non_ff_drv_mutex);
5668 }
5669 
5670 /* define the module linkage */
5671 MD_PLUGIN_MISC_MODULE("mirrors module %I%", init_init(), fini_uninit())
5672