xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 1366:18ae7db30fe7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/conf.h>
31 #include <sys/file.h>
32 #include <sys/user.h>
33 #include <sys/uio.h>
34 #include <sys/t_lock.h>
35 #include <sys/buf.h>
36 #include <sys/dkio.h>
37 #include <sys/vtoc.h>
38 #include <sys/kmem.h>
39 #include <vm/page.h>
40 #include <sys/cmn_err.h>
41 #include <sys/sysmacros.h>
42 #include <sys/types.h>
43 #include <sys/mkdev.h>
44 #include <sys/stat.h>
45 #include <sys/open.h>
46 #include <sys/modctl.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/dklabel.h>
51 #include <vm/hat.h>
52 #include <sys/lvm/md_mirror.h>
53 #include <sys/lvm/md_convert.h>
54 #include <sys/lvm/md_mddb.h>
55 #include <sys/esunddi.h>
56 
57 #include <sys/sysevent/eventdefs.h>
58 #include <sys/sysevent/svm.h>
59 #include <sys/lvm/mdmn_commd.h>
60 
61 md_ops_t		mirror_md_ops;
62 #ifndef	lint
63 char			_depends_on[] = "drv/md";
64 md_ops_t		*md_interface_ops = &mirror_md_ops;
65 #endif
66 
67 extern mdq_anchor_t	md_done_daemon;
68 extern mdq_anchor_t	md_mstr_daemon;
69 extern mdq_anchor_t	md_mirror_daemon;
70 extern mdq_anchor_t	md_mirror_io_daemon;
71 extern mdq_anchor_t	md_mirror_rs_daemon;
72 extern mdq_anchor_t	md_mhs_daemon;
73 
74 extern unit_t		md_nunits;
75 extern set_t		md_nsets;
76 extern md_set_t		md_set[];
77 
78 extern int		md_status;
79 extern clock_t		md_hz;
80 
81 extern md_krwlock_t	md_unit_array_rw;
82 extern kmutex_t		md_mx;
83 extern kcondvar_t	md_cv;
84 extern int		md_mtioctl_cnt;
85 
86 daemon_request_t	mirror_timeout;
87 static daemon_request_t	hotspare_request;
88 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
89 
90 int	md_mirror_mcs_buf_off;
91 
92 /* Flags for mdmn_ksend_message to allow debugging */
93 int	md_mirror_msg_flags;
94 
95 #ifdef DEBUG
96 /* Flag to switch on debug messages */
97 int	mirror_debug_flag = 0;
98 #endif
99 
100 /*
101  * Struct used to hold count of DMR reads and the timestamp of last DMR read
102  * It is used to verify, using a debugger, that the DMR read ioctl has been
103  * executed.
104  */
105 dmr_stats_t	mirror_dmr_stats = {0, 0};
106 
107 /*
108  * Mutex protecting list of non-failfast drivers.
109  */
110 static kmutex_t	non_ff_drv_mutex;
111 static char	**non_ff_drivers = NULL;
112 
113 extern major_t	md_major;
114 
115 /*
116  * Write-On-Write memory pool.
117  */
118 static void		copy_write_cont(wowhdr_t *wowhdr);
119 static kmem_cache_t	*mirror_wowblk_cache = NULL;
120 static int		md_wowbuf_size = 16384;
121 static size_t		md_wowblk_size;
122 
123 /*
124  * This is a flag that allows:
125  *	- disabling the write-on-write mechanism.
126  *	- logging occurrences of write-on-write
127  *	- switching wow handling procedure processing
128  * Counter for occurences of WOW.
129  */
130 static uint_t	md_mirror_wow_flg = 0;
131 static int	md_mirror_wow_cnt = 0;
132 
133 /*
134  * Tunable to enable/disable dirty region
135  * processing when closing down a mirror.
136  */
137 static int	new_resync = 1;
138 kmem_cache_t	*mirror_parent_cache = NULL;
139 kmem_cache_t	*mirror_child_cache = NULL;
140 
141 extern int	md_ff_disable;		/* disable failfast */
142 
143 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
144 static void	mirror_read_strategy(buf_t *, int, void *);
145 static void	mirror_write_strategy(buf_t *, int, void *);
146 static void	become_owner(daemon_queue_t *);
147 static int	mirror_done(struct buf *cb);
148 static int	mirror_done_common(struct buf *cb);
149 static void	clear_retry_error(struct buf *cb);
150 
151 /*
152  * patchables
153  */
154 int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
155 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
156 
157 /*
158  * patchable to change delay before rescheduling mirror ownership request.
159  * Value is clock ticks, default 0.5 seconds
160  */
161 clock_t	md_mirror_owner_to = 500000;
162 
163 /*ARGSUSED1*/
164 static int
165 mirror_parent_constructor(void *p, void *d1, int d2)
166 {
167 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
168 	return (0);
169 }
170 
171 static void
172 mirror_parent_init(md_mps_t *ps)
173 {
174 	bzero(ps, offsetof(md_mps_t, ps_mx));
175 }
176 
177 /*ARGSUSED1*/
178 static void
179 mirror_parent_destructor(void *p, void *d)
180 {
181 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
182 }
183 
184 /*ARGSUSED1*/
185 static int
186 mirror_child_constructor(void *p, void *d1, int d2)
187 {
188 	bioinit(&((md_mcs_t *)p)->cs_buf);
189 	return (0);
190 }
191 
192 void
193 mirror_child_init(md_mcs_t *cs)
194 {
195 	cs->cs_ps = NULL;
196 	cs->cs_mdunit = 0;
197 	md_bioreset(&cs->cs_buf);
198 }
199 
200 /*ARGSUSED1*/
201 static void
202 mirror_child_destructor(void *p, void *d)
203 {
204 	biofini(&((md_mcs_t *)p)->cs_buf);
205 }
206 
207 static void
208 mirror_wowblk_init(wowhdr_t *p)
209 {
210 	bzero(p, md_wowblk_size);
211 }
212 
213 static void
214 send_poke_hotspares_msg(daemon_request_t *drq)
215 {
216 	int			rval;
217 	md_mn_msg_pokehsp_t	pokehsp;
218 	md_mn_kresult_t		*kresult;
219 	set_t			setno = (set_t)drq->dq.qlen;
220 
221 	pokehsp.pokehsp_setno = setno;
222 
223 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
224 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
225 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
226 	    sizeof (pokehsp), kresult);
227 
228 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
229 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
230 		cmn_err(CE_PANIC,
231 		    "ksend_message failure: POKE_HOTSPARES");
232 	}
233 	kmem_free(kresult, sizeof (md_mn_kresult_t));
234 
235 	/* Allow further requests to use this set's queue structure */
236 	mutex_enter(&drq->dr_mx);
237 	drq->dr_pending = 0;
238 	mutex_exit(&drq->dr_mx);
239 }
240 
241 /*
242  * Send a poke_hotspares message to the master node. To avoid swamping the
243  * commd handler with requests we only send a message if there is not one
244  * already outstanding. We punt the request to a separate thread context as
245  * cannot afford to block waiting on the request to be serviced. This is
246  * essential when a reconfig cycle is in progress as any open() of a multinode
247  * metadevice may result in a livelock.
248  */
249 static void
250 send_poke_hotspares(set_t setno)
251 {
252 	daemon_request_t	*drq = &mn_hs_request[setno];
253 
254 	mutex_enter(&drq->dr_mx);
255 	if (drq->dr_pending == 0) {
256 		drq->dr_pending = 1;
257 		drq->dq.qlen = (int)setno;
258 		daemon_request(&md_mhs_daemon,
259 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
260 	}
261 	mutex_exit(&drq->dr_mx);
262 }
263 
264 void
265 mirror_set_sm_state(
266 	mm_submirror_t		*sm,
267 	mm_submirror_ic_t	*smic,
268 	sm_state_t		newstate,
269 	int			force)
270 {
271 	int			compcnt;
272 	int			i;
273 	int			errcnt;
274 	sm_state_t		origstate;
275 	md_m_shared_t		*shared;
276 
277 	if (force) {
278 		sm->sm_state = newstate;
279 		uniqtime32(&sm->sm_timestamp);
280 		return;
281 	}
282 
283 	origstate = newstate;
284 
285 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
286 	for (i = 0, errcnt = 0; i < compcnt; i++) {
287 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
288 		    (sm->sm_dev, sm, i);
289 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
290 			newstate |= SMS_COMP_ERRED;
291 		if (shared->ms_state & (CS_RESYNC))
292 			newstate |= SMS_COMP_RESYNC;
293 		if (shared->ms_state & CS_ERRED)
294 			errcnt++;
295 	}
296 
297 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
298 		newstate &= ~origstate;
299 
300 	if (errcnt == compcnt)
301 		newstate |= SMS_ALL_ERRED;
302 	else
303 		newstate &= ~SMS_ALL_ERRED;
304 
305 	sm->sm_state = newstate;
306 	uniqtime32(&sm->sm_timestamp);
307 }
308 
309 static int
310 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
311 							int frm_probe)
312 {
313 	mm_submirror_t		*sm;
314 	mm_submirror_ic_t	*smic;
315 	md_m_shared_t		*shared;
316 	int			ci;
317 	int			i;
318 	int			compcnt;
319 	int			open_comp; /* flag for open component */
320 
321 	for (i = *smi; i < NMIRROR; i++) {
322 		sm = &un->un_sm[i];
323 		smic = &un->un_smic[i];
324 
325 		if (!SMS_IS(sm, SMS_INUSE))
326 			continue;
327 
328 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
329 		for (ci = *cip; ci < compcnt; ci++) {
330 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
331 			    (sm->sm_dev, sm, ci);
332 			/*
333 			 * if called from any routine but probe, we check for
334 			 * MDM_S_ISOPEN flag. Since probe does a pseduo open,
335 			 * it sets MDM_S_PROBEOPEN flag and we test for this
336 			 * flag. They are both exclusive tests.
337 			 */
338 			open_comp = (frm_probe) ?
339 					(shared->ms_flags & MDM_S_PROBEOPEN):
340 					(shared->ms_flags & MDM_S_ISOPEN);
341 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
342 				((shared->ms_state == CS_OKAY) ||
343 				(shared->ms_state == CS_RESYNC))) {
344 				if (clr_error) {
345 					shared->ms_flags &= ~MDM_S_IOERR;
346 				}
347 				*cip = ci;
348 				*smi = i;
349 				return (1);
350 			}
351 
352 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
353 				shared->ms_flags &= ~MDM_S_IOERR;
354 			}
355 		}
356 
357 		*cip = 0;
358 	}
359 	return (0);
360 }
361 
362 /*ARGSUSED*/
363 static void
364 mirror_run_queue(void *d)
365 {
366 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
367 		md_daemon(1, &md_done_daemon);
368 }
369 /*
370  * check_comp_4_hotspares
371  *
372  * This function attempts to allocate a hotspare for this component if the
373  * component is in error. In a MN set, the function can be called in 2 modes.
374  * It can be called either when a component error has been detected or when a
375  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
376  * in flags and the request is sent to all nodes.
377  * The handler on each of the nodes then calls this function with
378  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
379  *
380  * For non-MN sets the function simply attempts to allocate a hotspare.
381  *
382  * On entry, the following locks are held
383  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
384  *	md_unit_writerlock
385  *
386  * Returns	0 if ok
387  *		1 if the unit containing the component has been cleared while
388  *		  the mdmn_ksend_message() was being executed
389  */
390 extern int
391 check_comp_4_hotspares(
392 	mm_unit_t	*un,
393 	int		smi,
394 	int		ci,
395 	uint_t		flags,
396 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
397 	IOLOCK		*lockp	/* can be NULL */
398 )
399 {
400 	mm_submirror_t		*sm;
401 	mm_submirror_ic_t	*smic;
402 	md_m_shared_t		*shared;
403 	mddb_recid_t		recids[6];
404 	minor_t			mnum;
405 	intptr_t		(*hs_dev)();
406 	void			(*hs_done)();
407 	void			*hs_data;
408 	md_error_t		mde = mdnullerror;
409 	set_t			setno;
410 	md_mn_msg_allochsp_t	allochspmsg;
411 	md_mn_kresult_t		*kresult;
412 	mm_unit_t		*new_un;
413 	int			rval;
414 
415 	mnum = MD_SID(un);
416 	setno = MD_UN2SET(un);
417 	sm = &un->un_sm[smi];
418 	smic = &un->un_smic[smi];
419 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
420 		(sm->sm_dev, sm, ci);
421 
422 	if (shared->ms_state != CS_ERRED)
423 		return (0);
424 
425 	/* Don't start a new component resync if a resync is already running. */
426 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
427 		return (0);
428 
429 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
430 		uint_t		msgflags;
431 		md_mn_msgtype_t	msgtype;
432 
433 		/* Send allocate hotspare message to all nodes */
434 
435 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
436 		allochspmsg.msg_allochsp_sm = smi;
437 		allochspmsg.msg_allochsp_comp = ci;
438 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
439 
440 		/*
441 		 * Before calling mdmn_ksend_message(), release locks
442 		 * Can never be in the context of an ioctl.
443 		 */
444 		md_unit_writerexit(MDI_UNIT(mnum));
445 		if (flags & MD_HOTSPARE_LINKHELD)
446 			rw_exit(&mirror_md_ops.md_link_rw.lock);
447 #ifdef DEBUG
448 		if (mirror_debug_flag)
449 		    printf("send alloc hotspare, flags=0x%x %x, %x, %x, %x\n",
450 			flags,
451 			allochspmsg.msg_allochsp_mnum,
452 			allochspmsg.msg_allochsp_sm,
453 			allochspmsg.msg_allochsp_comp,
454 			allochspmsg.msg_allochsp_hs_id);
455 #endif
456 		if (flags & MD_HOTSPARE_WMUPDATE) {
457 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
458 			/*
459 			 * When coming from an update of watermarks, there
460 			 * must already be a message logged that triggered
461 			 * this action. So, no need to log this message, too.
462 			 */
463 			msgflags = MD_MSGF_NO_LOG;
464 		} else {
465 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
466 			msgflags = MD_MSGF_DEFAULT_FLAGS;
467 		}
468 
469 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
470 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
471 		    (char *)&allochspmsg, sizeof (allochspmsg),
472 		    kresult);
473 
474 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
475 #ifdef DEBUG
476 			if (mirror_debug_flag)
477 				mdmn_ksend_show_error(rval, kresult,
478 				    "ALLOCATE HOTSPARE");
479 #endif
480 			/*
481 			 * If message is sent ok but exitval indicates an error
482 			 * it must be because the mirror has been cleared. In
483 			 * this case re-obtain lock and return an error
484 			 */
485 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
486 				if (flags & MD_HOTSPARE_LINKHELD) {
487 					rw_enter(&mirror_md_ops.md_link_rw.lock,
488 					    RW_READER);
489 				}
490 				kmem_free(kresult, sizeof (md_mn_kresult_t));
491 				return (1);
492 			}
493 			cmn_err(CE_PANIC,
494 			    "ksend_message failure: ALLOCATE_HOTSPARE");
495 		}
496 		kmem_free(kresult, sizeof (md_mn_kresult_t));
497 
498 		/*
499 		 * re-obtain the locks
500 		 */
501 		if (flags & MD_HOTSPARE_LINKHELD)
502 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
503 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
504 
505 		/*
506 		 * As we had to release the locks in order to send the
507 		 * message to all nodes, we need to check to see if the
508 		 * unit has changed. If it has we release the writerlock
509 		 * and return fail.
510 		 */
511 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
512 			md_unit_writerexit(MDI_UNIT(mnum));
513 			return (1);
514 		}
515 	} else {
516 		if (MD_MNSET_SETNO(setno)) {
517 			/*
518 			 * If 2 or more nodes simultaneously see a
519 			 * component failure, these nodes will each
520 			 * send an ALLOCATE_HOTSPARE[2] message.
521 			 * The first message will allocate the hotspare
522 			 * and the subsequent messages should do nothing.
523 			 *
524 			 * If a slave node doesn't have a hotspare allocated
525 			 * at the time the message is initiated, then the
526 			 * passed in hs_id will be 0.  If the node
527 			 * executing this routine has a component shared
528 			 * ms_hs_id of non-zero, but the message shows a
529 			 * hs_id of 0, then just return since a hotspare
530 			 * has already been allocated for this failing
531 			 * component.  When the slave node returns from
532 			 * the ksend_message the hotspare will have
533 			 * already been allocated.
534 			 *
535 			 * If the slave node does send an hs_id of non-zero,
536 			 * and the slave node's hs_id matches this node's
537 			 * ms_hs_id, then the hotspare has error'd and
538 			 * should be replaced.
539 			 *
540 			 * If the slave node sends an hs_id of non-zero and
541 			 * this node has a different shared ms_hs_id, then
542 			 * just return since this hotspare has already
543 			 * been hotspared.
544 			 */
545 			if (shared->ms_hs_id != 0) {
546 				if (hs_id == 0) {
547 #ifdef DEBUG
548 					if (mirror_debug_flag) {
549 						printf("check_comp_4_hotspares"
550 						    "(NOXMIT), short circuit "
551 						    "hs_id=0x%x, "
552 						    "ms_hs_id=0x%x\n",
553 						    hs_id, shared->ms_hs_id);
554 					}
555 #endif
556 					return (0);
557 				}
558 				if (hs_id != shared->ms_hs_id) {
559 #ifdef DEBUG
560 					if (mirror_debug_flag) {
561 						printf("check_comp_4_hotspares"
562 						    "(NOXMIT), short circuit2 "
563 						    "hs_id=0x%x, "
564 						    "ms_hs_id=0x%x\n",
565 						    hs_id, shared->ms_hs_id);
566 					}
567 #endif
568 					return (0);
569 				}
570 			}
571 		}
572 
573 		sm = &un->un_sm[smi];
574 		hs_dev = md_get_named_service(sm->sm_dev, 0,
575 		    "hotspare device", 0);
576 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
577 		    &hs_data) != 0)
578 			return (0);
579 
580 		/*
581 		 * set_sm_comp_state() commits the modified records.
582 		 * As we don't transmit the changes, no need to drop the lock.
583 		 */
584 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
585 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
586 
587 		(*hs_done)(sm->sm_dev, hs_data);
588 
589 		mirror_check_failfast(mnum);
590 
591 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
592 		    setno, MD_SID(un));
593 
594 		/*
595 		 * For a multi-node set we need to reset the un_rs_type,
596 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
597 		 * hot-spare resync must copy all applicable data.
598 		 */
599 		if (MD_MNSET_SETNO(setno)) {
600 			un->un_rs_type = MD_RS_NONE;
601 			un->un_rs_resync_done = 0;
602 			un->un_rs_resync_2_do = 0;
603 		}
604 
605 		/*
606 		 * Must drop writer lock since mirror_resync_unit will
607 		 * open devices and must be able to grab readerlock.
608 		 * Don't need to drop IOLOCK since any descendent routines
609 		 * calling ksend_messages will drop the IOLOCK as needed.
610 		 *
611 		 */
612 		if (lockp) {
613 			md_ioctl_writerexit(lockp);
614 		} else {
615 			md_unit_writerexit(MDI_UNIT(mnum));
616 		}
617 
618 		/* start resync */
619 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
620 
621 		if (lockp) {
622 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
623 		} else {
624 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
625 		}
626 	}
627 	return (0);
628 }
629 
630 /*
631  * check_unit_4_hotspares
632  *
633  * For a given mirror, allocate hotspares, if available for any components
634  * that are in error
635  *
636  * Returns	0 if ok
637  *		1 if check_comp_4_hotspares returns non-zero. This will only
638  *		  happen for a MN unit where the unit has been cleared while
639  *		  the allocate hotspare message is sent to all nodes.
640  */
641 static int
642 check_unit_4_hotspares(mm_unit_t *un, int flags)
643 {
644 	mm_submirror_t		*sm;
645 	mm_submirror_ic_t	*smic;
646 	int			ci;
647 	int			i;
648 	int			compcnt;
649 
650 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
651 		return (0);
652 
653 	for (i = 0; i < NMIRROR; i++) {
654 		sm = &un->un_sm[i];
655 		smic = &un->un_smic[i];
656 		if (!SMS_IS(sm, SMS_INUSE))
657 			continue;
658 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
659 		for (ci = 0; ci < compcnt; ci++) {
660 			md_m_shared_t		*shared;
661 
662 			shared = (md_m_shared_t *)
663 				(*(smic->sm_shared_by_indx))(sm->sm_dev,
664 				sm, ci);
665 			/*
666 			 * Never called from ioctl context, so pass in
667 			 * (IOLOCK *)NULL.  Pass through flags from calling
668 			 * routine, also setting XMIT flag.
669 			 */
670 			if (check_comp_4_hotspares(un, i, ci,
671 				(MD_HOTSPARE_XMIT | flags),
672 				shared->ms_hs_id, (IOLOCK *)NULL) != 0)
673 				return (1);
674 		}
675 	}
676 	return (0);
677 }
678 
679 static void
680 check_4_hotspares(daemon_request_t *drq)
681 {
682 	mdi_unit_t	*ui;
683 	mm_unit_t	*un;
684 	md_link_t	*next;
685 	int		x;
686 
687 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
688 	drq->dr_pending = 0;		/* again in low level routine if */
689 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
690 
691 	/*
692 	 * Used to have a problem here. The disksets weren't marked as being
693 	 * MNHOLD. This opened a window where we could be searching for
694 	 * hotspares and have the disk set unloaded (released) from under
695 	 * us causing a panic in stripe_component_count().
696 	 * The way to prevent that is to mark the set MNHOLD which prevents
697 	 * any diskset from being released while we are scanning the mirrors,
698 	 * submirrors and components.
699 	 */
700 
701 	for (x = 0; x < md_nsets; x++)
702 		md_holdset_enter(x);
703 
704 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
705 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
706 		ui = MDI_UNIT(next->ln_id);
707 
708 		un = (mm_unit_t *)md_unit_readerlock(ui);
709 
710 		/*
711 		 * Only check the unit if we are the master for this set
712 		 * For an MN set, poke_hotspares() is only effective on the
713 		 * master
714 		 */
715 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
716 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
717 			md_unit_readerexit(ui);
718 			continue;
719 		}
720 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
721 			md_unit_readerexit(ui);
722 			continue;
723 		}
724 		md_unit_readerexit(ui);
725 
726 		un = (mm_unit_t *)md_unit_writerlock(ui);
727 		/*
728 		 * check_unit_4_hotspares will exit 1 if the unit has been
729 		 * removed during the process of allocating the hotspare.
730 		 * This can only happen for a MN metadevice. If unit no longer
731 		 * exists, no need to release writerlock
732 		 */
733 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
734 			md_unit_writerexit(ui);
735 		else {
736 			/*
737 			 * If check_unit_4_hotspares failed, queue another
738 			 * request and break out of this one
739 			 */
740 			(void) poke_hotspares();
741 			break;
742 		}
743 	}
744 	rw_exit(&mirror_md_ops.md_link_rw.lock);
745 
746 	for (x = 0; x < md_nsets; x++)
747 		md_holdset_exit(x);
748 }
749 
750 /*
751  * poke_hotspares
752  *
753  * If there is not a pending poke_hotspares request pending, queue a requent
754  * to call check_4_hotspares(). This will scan all mirrors and attempt to
755  * allocate hotspares for all components in error.
756  */
757 int
758 poke_hotspares()
759 {
760 	mutex_enter(&hotspare_request.dr_mx);
761 	if (hotspare_request.dr_pending == 0) {
762 		hotspare_request.dr_pending = 1;
763 		daemon_request(&md_mhs_daemon,
764 		    check_4_hotspares,
765 				(daemon_queue_t *)&hotspare_request, REQ_OLD);
766 	}
767 	mutex_exit(&hotspare_request.dr_mx);
768 	return (0);
769 }
770 
771 static void
772 free_all_ecomps(err_comp_t *ecomp)
773 {
774 	err_comp_t	*d;
775 
776 	while (ecomp != NULL) {
777 		d = ecomp;
778 		ecomp = ecomp->ec_next;
779 		kmem_free(d, sizeof (err_comp_t));
780 	}
781 }
782 
783 /*
784  * NAME: mirror_openfail_console_info
785  *
786  * DESCRIPTION: Prints a informative message to the console when mirror
787  *		cannot be opened.
788  *
789  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
790  *	       int		smi - submirror index
791  *	       int		ci - component index
792  */
793 
794 void
795 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
796 {
797 	void (*get_dev)();
798 	ms_cd_info_t cd;
799 	md_dev64_t tmpdev;
800 
801 	tmpdev = un->un_sm[smi].sm_dev;
802 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
803 	if (get_dev != NULL) {
804 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
805 		cmn_err(CE_WARN, "md %s: open error on %s",
806 			md_shortname(MD_SID(un)),
807 			md_devname(MD_UN2SET(un), cd.cd_dev,
808 			NULL, 0));
809 	} else {
810 		cmn_err(CE_WARN, "md %s: open error",
811 			md_shortname(MD_SID(un)));
812 	}
813 }
814 
815 static int
816 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
817 {
818 	int i;
819 	md_dev64_t dev;
820 
821 	for (i = 0; i < NMIRROR; i++) {
822 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
823 			continue;
824 		dev = un->un_sm[i].sm_dev;
825 		md_layered_close(dev, md_cflags);
826 	}
827 	return (0);
828 }
829 
830 /*
831  * Keep track of drivers that don't support failfast.  We use this so that
832  * we only log one diagnostic message for each of these drivers, no matter
833  * how many times we run the mirror_check_failfast function.
834  * Return 1 if this is a new driver that does not support failfast,
835  * return 0 if we have already seen this non-failfast driver.
836  */
837 static int
838 new_non_ff_driver(const char *s)
839 {
840 	mutex_enter(&non_ff_drv_mutex);
841 	if (non_ff_drivers == NULL) {
842 	    non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
843 		KM_NOSLEEP);
844 	    if (non_ff_drivers == NULL) {
845 		mutex_exit(&non_ff_drv_mutex);
846 		return (1);
847 	    }
848 
849 	    non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
850 	    if (non_ff_drivers[0] == NULL) {
851 		kmem_free(non_ff_drivers, 2 * sizeof (char *));
852 		non_ff_drivers = NULL;
853 		mutex_exit(&non_ff_drv_mutex);
854 		return (1);
855 	    }
856 
857 	    (void) strcpy(non_ff_drivers[0], s);
858 	    non_ff_drivers[1] = NULL;
859 
860 	} else {
861 	    int i;
862 	    char **tnames;
863 	    char **tmp;
864 
865 	    for (i = 0; non_ff_drivers[i] != NULL; i++) {
866 		if (strcmp(s, non_ff_drivers[i]) == 0) {
867 		    mutex_exit(&non_ff_drv_mutex);
868 		    return (0);
869 		}
870 	    }
871 
872 	    /* allow for new element and null */
873 	    i += 2;
874 	    tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
875 	    if (tnames == NULL) {
876 		mutex_exit(&non_ff_drv_mutex);
877 		return (1);
878 	    }
879 
880 	    for (i = 0; non_ff_drivers[i] != NULL; i++)
881 		tnames[i] = non_ff_drivers[i];
882 
883 	    tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
884 	    if (tnames[i] == NULL) {
885 		/* adjust i so that it is the right count to free */
886 		kmem_free(tnames, (i + 2) * sizeof (char *));
887 		mutex_exit(&non_ff_drv_mutex);
888 		return (1);
889 	    }
890 
891 	    (void) strcpy(tnames[i++], s);
892 	    tnames[i] = NULL;
893 
894 	    tmp = non_ff_drivers;
895 	    non_ff_drivers = tnames;
896 	    /* i now represents the count we previously alloced */
897 	    kmem_free(tmp, i * sizeof (char *));
898 	}
899 	mutex_exit(&non_ff_drv_mutex);
900 
901 	return (1);
902 }
903 
904 /*
905  * Check for the "ddi-failfast-supported" devtree property on each submirror
906  * component to indicate if we should do I/O to that submirror with the
907  * B_FAILFAST flag set or not.  This check is made at various state transitions
908  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
909  * only need to check one drive (e.g. hotspare) but since the check is
910  * fast and infrequent and sometimes needs to be done on all components we
911  * just check all components on each call.
912  */
913 void
914 mirror_check_failfast(minor_t mnum)
915 {
916 	int		i;
917 	mm_unit_t	*un;
918 
919 	if (md_ff_disable)
920 	    return;
921 
922 	un = MD_UNIT(mnum);
923 
924 	for (i = 0; i < NMIRROR; i++) {
925 	    int			ci;
926 	    int			cnt;
927 	    int			ff = 1;
928 	    mm_submirror_t	*sm;
929 	    mm_submirror_ic_t	*smic;
930 	    void		(*get_dev)();
931 
932 	    if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
933 		continue;
934 
935 	    sm = &un->un_sm[i];
936 	    smic = &un->un_smic[i];
937 
938 	    get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
939 		"get device", 0);
940 
941 	    cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
942 	    for (ci = 0; ci < cnt; ci++) {
943 		int		found = 0;
944 		dev_t		ci_dev;
945 		major_t		major;
946 		dev_info_t	*devi;
947 		ms_cd_info_t	cd;
948 
949 		/* this already returns the hs dev if the device is spared */
950 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
951 
952 		ci_dev = md_dev64_to_dev(cd.cd_dev);
953 		major = getmajor(ci_dev);
954 
955 		if (major == md_major) {
956 		    /* this component must be a soft partition; get real dev */
957 		    minor_t	dev_mnum;
958 		    mdi_unit_t	*ui;
959 		    mp_unit_t	*un;
960 		    set_t	setno;
961 		    side_t	side;
962 		    md_dev64_t	tmpdev;
963 
964 		    ui = MDI_UNIT(getminor(ci_dev));
965 
966 		    /* grab necessary lock */
967 		    un = (mp_unit_t *)md_unit_readerlock(ui);
968 
969 		    dev_mnum = MD_SID(un);
970 		    setno = MD_MIN2SET(dev_mnum);
971 		    side = mddb_getsidenum(setno);
972 
973 		    tmpdev = un->un_dev;
974 
975 		    /* Get dev by device id */
976 		    if (md_devid_found(setno, side, un->un_key) == 1) {
977 			tmpdev = md_resolve_bydevid(dev_mnum, tmpdev,
978 				un->un_key);
979 		    }
980 
981 		    md_unit_readerexit(ui);
982 
983 		    ci_dev = md_dev64_to_dev(tmpdev);
984 		    major = getmajor(ci_dev);
985 		}
986 
987 		if (ci_dev != NODEV32 &&
988 		    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0)) != NULL) {
989 		    ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
990 		    int			propvalue = 0;
991 		    int			proplength = sizeof (int);
992 		    int			error;
993 		    struct cb_ops	*cb;
994 
995 		    if ((cb = devopsp[major]->devo_cb_ops) != NULL) {
996 			error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, prop_op,
997 			    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
998 			    "ddi-failfast-supported",
999 			    (caddr_t)&propvalue, &proplength);
1000 
1001 			if (error == DDI_PROP_SUCCESS)
1002 			    found = 1;
1003 		    }
1004 
1005 		    if (!found && new_non_ff_driver(ddi_driver_name(devi)))
1006 			cmn_err(CE_NOTE, "!md: B_FAILFAST I/O disabled on %s",
1007 			    ddi_driver_name(devi));
1008 
1009 		    ddi_release_devi(devi);
1010 		}
1011 
1012 		/* All components must support failfast in the submirror. */
1013 		if (!found) {
1014 		    ff = 0;
1015 		    break;
1016 		}
1017 	    }
1018 
1019 	    if (ff) {
1020 		sm->sm_flags |= MD_SM_FAILFAST;
1021 	    } else {
1022 		sm->sm_flags &= ~MD_SM_FAILFAST;
1023 	    }
1024 	}
1025 }
1026 
1027 /*
1028  * Return true if the submirror is unavailable.
1029  * If any of the submirror components are opened then the submirror cannot
1030  * be unavailable (MD_INACCESSIBLE).
1031  * If any of the components are already in the errored state, then the submirror
1032  * cannot be unavailable (MD_INACCESSIBLE).
1033  */
1034 static bool_t
1035 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1036 {
1037 	mm_submirror_t		*sm;
1038 	mm_submirror_ic_t	*smic;
1039 	md_m_shared_t		*shared;
1040 	int			ci;
1041 	int			compcnt;
1042 
1043 	sm = &un->un_sm[smi];
1044 	smic = &un->un_smic[smi];
1045 
1046 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1047 	for (ci = 0; ci < compcnt; ci++) {
1048 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1049 		    (sm->sm_dev, sm, ci);
1050 		if (from_probe) {
1051 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1052 				return (B_FALSE);
1053 		} else {
1054 			if (shared->ms_flags & MDM_S_ISOPEN)
1055 				return (B_FALSE);
1056 		}
1057 		if (shared->ms_state == CS_ERRED ||
1058 		    shared->ms_state == CS_LAST_ERRED)
1059 			return (B_FALSE);
1060 	}
1061 
1062 	return (B_TRUE);
1063 }
1064 
1065 static int
1066 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1067 {
1068 	int		i;
1069 	mm_unit_t	*un;
1070 	mdi_unit_t	*ui;
1071 	int		err;
1072 	int		smi;
1073 	int		ci;
1074 	err_comp_t	*c;
1075 	err_comp_t	*ecomps = NULL;
1076 	int		smmask = 0;
1077 	set_t		setno;
1078 	int		sm_cnt;
1079 	int		sm_unavail_cnt;
1080 
1081 	mirror_check_failfast(mnum);
1082 
1083 	un = MD_UNIT(mnum);
1084 	ui = MDI_UNIT(mnum);
1085 	setno = MD_UN2SET(un);
1086 
1087 	for (i = 0; i < NMIRROR; i++) {
1088 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1089 
1090 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1091 			continue;
1092 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1093 			smmask |= SMI2BIT(i);
1094 		un->un_sm[i].sm_dev = tmpdev;
1095 	}
1096 
1097 	/*
1098 	 * If smmask is clear, all submirrors are accessible. Clear the
1099 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1100 	 * mirror device.   If smmask is set, we have to determine which of the
1101 	 * submirrors are in error. If no submirror is accessible we mark the
1102 	 * whole mirror as MD_INACCESSIBLE.
1103 	 */
1104 	if (smmask == 0) {
1105 		if (lockp) {
1106 			md_ioctl_readerexit(lockp);
1107 			(void) md_ioctl_writerlock(lockp, ui);
1108 		} else {
1109 			md_unit_readerexit(ui);
1110 			(void) md_unit_writerlock(ui);
1111 		}
1112 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1113 		if (lockp) {
1114 			md_ioctl_writerexit(lockp);
1115 			(void) md_ioctl_readerlock(lockp, ui);
1116 		} else {
1117 			md_unit_writerexit(ui);
1118 			(void) md_unit_readerlock(ui);
1119 		}
1120 
1121 		for (i = 0; i < NMIRROR; i++) {
1122 			md_dev64_t	tmpdev;
1123 			mdi_unit_t	*sm_ui;
1124 
1125 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1126 				continue;
1127 
1128 			tmpdev = un->un_sm[i].sm_dev;
1129 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1130 			(void) md_unit_writerlock(sm_ui);
1131 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1132 			md_unit_writerexit(sm_ui);
1133 		}
1134 
1135 		return (0);
1136 	}
1137 
1138 	for (i = 0; i < NMIRROR; i++) {
1139 		md_dev64_t tmpdev;
1140 
1141 		if (!(smmask & SMI2BIT(i)))
1142 			continue;
1143 
1144 		tmpdev = un->un_sm[i].sm_dev;
1145 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1146 		un->un_sm[i].sm_dev = tmpdev;
1147 		ASSERT(err == 0);
1148 	}
1149 
1150 	if (lockp) {
1151 		md_ioctl_readerexit(lockp);
1152 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1153 	} else {
1154 		md_unit_readerexit(ui);
1155 		un = (mm_unit_t *)md_unit_writerlock(ui);
1156 	}
1157 
1158 	/*
1159 	 * We want to make sure the unavailable flag is not masking a real
1160 	 * error on the submirror.
1161 	 * For each submirror,
1162 	 *    if all of the submirror components couldn't be opened and there
1163 	 *    are no errors on the submirror, then set the unavailable flag
1164 	 *    otherwise, clear unavailable.
1165 	 */
1166 	sm_cnt = 0;
1167 	sm_unavail_cnt = 0;
1168 	for (i = 0; i < NMIRROR; i++) {
1169 		md_dev64_t	tmpdev;
1170 		mdi_unit_t	*sm_ui;
1171 
1172 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1173 			continue;
1174 
1175 		sm_cnt++;
1176 		tmpdev = un->un_sm[i].sm_dev;
1177 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1178 
1179 		(void) md_unit_writerlock(sm_ui);
1180 		if (submirror_unavailable(un, i, 0)) {
1181 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1182 			sm_unavail_cnt++;
1183 		} else {
1184 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1185 		}
1186 		md_unit_writerexit(sm_ui);
1187 	}
1188 
1189 	/*
1190 	 * If all of the submirrors are unavailable, the mirror is also
1191 	 * unavailable.
1192 	 */
1193 	if (sm_cnt == sm_unavail_cnt) {
1194 		ui->ui_tstate |= MD_INACCESSIBLE;
1195 	} else {
1196 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1197 	}
1198 
1199 	smi = 0;
1200 	ci = 0;
1201 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1202 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1203 
1204 			free_all_ecomps(ecomps);
1205 			(void) mirror_close_all_devs(un, md_oflags);
1206 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1207 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1208 			mirror_openfail_console_info(un, smi, ci);
1209 			if (lockp) {
1210 				md_ioctl_writerexit(lockp);
1211 				(void) md_ioctl_readerlock(lockp, ui);
1212 			} else {
1213 				md_unit_writerexit(ui);
1214 				(void) md_unit_readerlock(ui);
1215 			}
1216 			return (ENXIO);
1217 		}
1218 
1219 		/* track all component states that need changing */
1220 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1221 		c->ec_next = ecomps;
1222 		c->ec_smi = smi;
1223 		c->ec_ci = ci;
1224 		ecomps = c;
1225 		ci++;
1226 	}
1227 
1228 	/* Make all state changes and commit them */
1229 	for (c = ecomps; c != NULL; c = c->ec_next) {
1230 		/*
1231 		 * If lockp is set, then entering kernel through ioctl.
1232 		 * For a MN set, the only ioctl path is via a commd message
1233 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1234 		 * being sent to each node.
1235 		 * In this case, set NO_XMIT so that set_sm_comp_state
1236 		 * won't attempt to send a message on a message.
1237 		 *
1238 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1239 		 * which flag is passed.
1240 		 */
1241 		if (lockp) {
1242 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1243 			    MD_STATE_NO_XMIT, lockp);
1244 		} else {
1245 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1246 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1247 		}
1248 		/*
1249 		 * For a MN set, the NOTIFY is done when the state change is
1250 		 * processed on each node
1251 		 */
1252 		if (!MD_MNSET_SETNO(setno)) {
1253 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1254 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1255 		}
1256 	}
1257 
1258 	if (lockp) {
1259 		md_ioctl_writerexit(lockp);
1260 		(void) md_ioctl_readerlock(lockp, ui);
1261 	} else {
1262 		md_unit_writerexit(ui);
1263 		(void) md_unit_readerlock(ui);
1264 	}
1265 
1266 	free_all_ecomps(ecomps);
1267 
1268 	/* allocate hotspares for all errored components */
1269 	if (MD_MNSET_SETNO(setno)) {
1270 		/*
1271 		 * If we're called from an ioctl (lockp set) then we cannot
1272 		 * directly call send_poke_hotspares as this will block until
1273 		 * the message gets despatched to all nodes. If the cluster is
1274 		 * going through a reconfig cycle then the message will block
1275 		 * until the cycle is complete, and as we originate from a
1276 		 * service call from commd we will livelock.
1277 		 */
1278 		if (lockp == NULL) {
1279 			md_unit_readerexit(ui);
1280 			send_poke_hotspares(setno);
1281 			(void) md_unit_readerlock(ui);
1282 		}
1283 	} else {
1284 		(void) poke_hotspares();
1285 	}
1286 	return (0);
1287 }
1288 
1289 void
1290 mirror_overlap_chain_remove(md_mps_t *ps)
1291 {
1292 	mm_unit_t	*un;
1293 
1294 	if (panicstr)
1295 		return;
1296 
1297 	ASSERT(ps->ps_flags & MD_MPS_ON_OVERLAP);
1298 
1299 	un = ps->ps_un;
1300 
1301 	mutex_enter(&un->un_ovrlap_chn_mx);
1302 	if (ps->ps_ovrlap_prev != &un->un_ovrlap_chn)
1303 		ps->ps_ovrlap_prev->ps_ovrlap_next = ps->ps_ovrlap_next;
1304 	else
1305 		un->un_ovrlap_chn.ps_ovrlap_next = ps->ps_ovrlap_next;
1306 	if (ps->ps_ovrlap_next != &un->un_ovrlap_chn)
1307 		ps->ps_ovrlap_next->ps_ovrlap_prev = ps->ps_ovrlap_prev;
1308 	else
1309 		un->un_ovrlap_chn.ps_ovrlap_prev = ps->ps_ovrlap_prev;
1310 	/* Handle empty overlap chain */
1311 	if (un->un_ovrlap_chn.ps_ovrlap_prev == &un->un_ovrlap_chn) {
1312 		un->un_ovrlap_chn.ps_ovrlap_prev =
1313 		    un->un_ovrlap_chn.ps_ovrlap_next = NULL;
1314 	}
1315 	if (un->un_ovrlap_chn_flg) {
1316 		un->un_ovrlap_chn_flg = 0;
1317 		cv_broadcast(&un->un_ovrlap_chn_cv);
1318 	}
1319 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1320 	mutex_exit(&un->un_ovrlap_chn_mx);
1321 }
1322 
1323 
1324 /*
1325  * wait_for_overlaps:
1326  * -----------------
1327  * Check that given i/o request does not cause an overlap with already pending
1328  * i/o. If it does, block until the overlapped i/o completes.
1329  *
1330  * Note: the overlap chain is held as a monotonically increasing doubly-linked
1331  * list with the sentinel contained in un->un_ovrlap_chn. We avoid a linear
1332  * search of the list by the following logic:
1333  *	ps->ps_lastblk < un_ovrlap_chn.ps_ovrlap_next->ps_firstblk => No overlap
1334  *	ps->ps_firstblk > un_ovrlap_chn.ps_ovrlap_prev->ps_lastblk => No overlap
1335  * otherwise
1336  *	scan un_ovrlap_chn.ps_ovrlap_next for location where ps->ps_firstblk
1337  *	> chain->ps_lastblk. This is the insertion point. As the list is
1338  *	guaranteed to be ordered there is no need to continue scanning.
1339  *
1340  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1341  *	structure to be already on the overlap chain and MD_OVERLAP_NO_REPEAT
1342  *	if it must not already be on the chain
1343  */
1344 static void
1345 wait_for_overlaps(md_mps_t *ps, int flags)
1346 {
1347 	mm_unit_t	*un;
1348 	md_mps_t	*ps1, **head, **tail;
1349 
1350 	if (panicstr)
1351 		return;
1352 
1353 
1354 	un = ps->ps_un;
1355 
1356 	mutex_enter(&un->un_ovrlap_chn_mx);
1357 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1358 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1359 		mutex_exit(&un->un_ovrlap_chn_mx);
1360 		return;
1361 	}
1362 
1363 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1364 	head = &(un->un_ovrlap_chn.ps_ovrlap_next);
1365 	tail = &(un->un_ovrlap_chn.ps_ovrlap_prev);
1366 	ps1 = *head;
1367 	/*
1368 	 * Check for simple limit cases:
1369 	 *	*head == NULL
1370 	 *		insert ps at head of list
1371 	 *	lastblk < head->firstblk
1372 	 *		insert at head of list
1373 	 *	firstblk > tail->lastblk
1374 	 *		insert at tail of list
1375 	 */
1376 	if (ps1 == NULL) {
1377 		/* Insert at head */
1378 		ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1379 		ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1380 		*head = ps;
1381 		*tail = ps;
1382 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1383 		mutex_exit(&un->un_ovrlap_chn_mx);
1384 		return;
1385 	} else if (ps->ps_lastblk < (*head)->ps_firstblk) {
1386 		/* Insert at head */
1387 		ps->ps_ovrlap_next = (*head);
1388 		ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1389 		(*head)->ps_ovrlap_prev = ps;
1390 		*head = ps;
1391 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1392 		mutex_exit(&un->un_ovrlap_chn_mx);
1393 		return;
1394 	} else if (ps->ps_firstblk > (*tail)->ps_lastblk) {
1395 		/* Insert at tail */
1396 		ps->ps_ovrlap_prev = (*tail);
1397 		ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1398 		(*tail)->ps_ovrlap_next = ps;
1399 		*tail = ps;
1400 		ps->ps_flags |= MD_MPS_ON_OVERLAP;
1401 		mutex_exit(&un->un_ovrlap_chn_mx);
1402 		return;
1403 	}
1404 	/* Now we have to scan the list for possible overlaps */
1405 	while (ps1 != NULL) {
1406 		/*
1407 		 * If this region has been put on the chain by another thread
1408 		 * just exit
1409 		 */
1410 		if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1411 		    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1412 			mutex_exit(&un->un_ovrlap_chn_mx);
1413 			return;
1414 
1415 		}
1416 		for (ps1 = *head; ps1 && (ps1 != &un->un_ovrlap_chn);
1417 		    ps1 = ps1->ps_ovrlap_next) {
1418 			if (ps->ps_firstblk > (*tail)->ps_lastblk) {
1419 				/* Insert at tail */
1420 				ps->ps_ovrlap_prev = (*tail);
1421 				ps->ps_ovrlap_next = &un->un_ovrlap_chn;
1422 				(*tail)->ps_ovrlap_next = ps;
1423 				*tail = ps;
1424 				ps->ps_flags |= MD_MPS_ON_OVERLAP;
1425 				mutex_exit(&un->un_ovrlap_chn_mx);
1426 				return;
1427 			}
1428 			if (ps->ps_firstblk > ps1->ps_lastblk)
1429 				continue;
1430 			if (ps->ps_lastblk < ps1->ps_firstblk) {
1431 				/* Insert into list at current 'ps1' position */
1432 				ps->ps_ovrlap_next = ps1;
1433 				ps->ps_ovrlap_prev = ps1->ps_ovrlap_prev;
1434 				ps1->ps_ovrlap_prev->ps_ovrlap_next = ps;
1435 				ps1->ps_ovrlap_prev = ps;
1436 				ps->ps_flags |= MD_MPS_ON_OVERLAP;
1437 				mutex_exit(&un->un_ovrlap_chn_mx);
1438 				return;
1439 			}
1440 			break;
1441 		}
1442 		if (ps1 != NULL) {
1443 			un->un_ovrlap_chn_flg = 1;
1444 			cv_wait(&un->un_ovrlap_chn_cv, &un->un_ovrlap_chn_mx);
1445 			/*
1446 			 * Now ps1 refers to the old insertion point and we
1447 			 * have to check the whole chain to see if we're still
1448 			 * overlapping any other i/o.
1449 			 */
1450 		}
1451 	}
1452 
1453 	/*
1454 	 * Only get here if we had one overlapping i/o on the list and that
1455 	 * has now completed. In this case the list is empty so we insert <ps>
1456 	 * at the head of the chain.
1457 	 */
1458 	ASSERT(*head == NULL);
1459 	*tail = *head = ps;
1460 	ps->ps_ovrlap_next = ps->ps_ovrlap_prev = &un->un_ovrlap_chn;
1461 	ps->ps_flags |= MD_MPS_ON_OVERLAP;
1462 	mutex_exit(&un->un_ovrlap_chn_mx);
1463 }
1464 
1465 /*
1466  * This function is called from mirror_done to check whether any pages have
1467  * been modified while a mirrored write was in progress.  Returns 0 if
1468  * all pages associated with bp are clean, 1 otherwise.
1469  */
1470 static int
1471 any_pages_dirty(struct buf *bp)
1472 {
1473 	int	rval;
1474 
1475 	rval = biomodified(bp);
1476 	if (rval == -1)
1477 		rval = 0;
1478 
1479 	return (rval);
1480 }
1481 
1482 #define	MAX_EXTRAS 10
1483 
1484 void
1485 mirror_commit(
1486 	mm_unit_t	*un,
1487 	int		smmask,
1488 	mddb_recid_t	*extras
1489 )
1490 {
1491 	mm_submirror_t		*sm;
1492 	md_unit_t		*su;
1493 	int			i;
1494 
1495 	/* 2=mirror,null id */
1496 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1497 
1498 	int			ri = 0;
1499 
1500 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1501 		return;
1502 
1503 	/* Add two, this includes the mirror unit and the null recid */
1504 	if (extras != NULL) {
1505 		int	nrecids = 0;
1506 		while (extras[nrecids] != 0) {
1507 			nrecids++;
1508 		}
1509 		ASSERT(nrecids <= MAX_EXTRAS);
1510 	}
1511 
1512 	if (un != NULL)
1513 		recids[ri++] = un->c.un_record_id;
1514 	for (i = 0;  i < NMIRROR; i++) {
1515 		if (!(smmask & SMI2BIT(i)))
1516 			continue;
1517 		sm = &un->un_sm[i];
1518 		if (!SMS_IS(sm, SMS_INUSE))
1519 			continue;
1520 		if (md_getmajor(sm->sm_dev) != md_major)
1521 			continue;
1522 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1523 		recids[ri++] = su->c.un_record_id;
1524 	}
1525 
1526 	if (extras != NULL)
1527 		while (*extras != 0) {
1528 			recids[ri++] = *extras;
1529 			extras++;
1530 		}
1531 
1532 	if (ri == 0)
1533 		return;
1534 	recids[ri] = 0;
1535 
1536 	/*
1537 	 * Ok to hold ioctl lock across record commit to mddb as
1538 	 * long as the record(s) being committed aren't resync records.
1539 	 */
1540 	mddb_commitrecs_wrapper(recids);
1541 }
1542 
1543 
1544 /*
1545  * This routine is used to set a bit in the writable_bm bitmap
1546  * which represents each submirror in a metamirror which
1547  * is writable. The first writable submirror index is assigned
1548  * to the sm_index.  The number of writable submirrors are returned in nunits.
1549  *
1550  * This routine returns the submirror's unit number.
1551  */
1552 
1553 static void
1554 select_write_units(struct mm_unit *un, md_mps_t *ps)
1555 {
1556 
1557 	int		i;
1558 	unsigned	writable_bm = 0;
1559 	unsigned	nunits = 0;
1560 
1561 	for (i = 0; i < NMIRROR; i++) {
1562 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1563 			/* set bit of all writable units */
1564 			writable_bm |= SMI2BIT(i);
1565 			nunits++;
1566 		}
1567 	}
1568 	ps->ps_writable_sm = writable_bm;
1569 	ps->ps_active_cnt = nunits;
1570 	ps->ps_current_sm = 0;
1571 }
1572 
1573 static
1574 unsigned
1575 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1576 {
1577 
1578 	int		i;
1579 	unsigned	writable_bm = 0;
1580 	unsigned	nunits = 0;
1581 
1582 	for (i = 0; i < NMIRROR; i++) {
1583 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1584 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1585 			writable_bm |= SMI2BIT(i);
1586 			nunits++;
1587 		}
1588 	}
1589 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1590 		writable_bm &= ~ps->ps_allfrom_sm;
1591 		nunits--;
1592 	}
1593 	ps->ps_writable_sm = writable_bm;
1594 	ps->ps_active_cnt = nunits;
1595 	ps->ps_current_sm = 0;
1596 	return (nunits);
1597 }
1598 
1599 static md_dev64_t
1600 select_read_unit(
1601 	mm_unit_t	*un,
1602 	diskaddr_t	blkno,
1603 	u_longlong_t	reqcount,
1604 	u_longlong_t	*cando,
1605 	int		must_be_opened,
1606 	md_m_shared_t	**shared,
1607 	md_mcs_t	*cs)
1608 {
1609 	int			i;
1610 	md_m_shared_t		*s;
1611 	uint_t			lasterrcnt = 0;
1612 	md_dev64_t		dev = 0;
1613 	u_longlong_t		cnt;
1614 	u_longlong_t		mincnt;
1615 	mm_submirror_t		*sm;
1616 	mm_submirror_ic_t	*smic;
1617 	mdi_unit_t		*ui;
1618 
1619 	mincnt = reqcount;
1620 	for (i = 0; i < NMIRROR; i++) {
1621 		if (!SUBMIRROR_IS_READABLE(un, i))
1622 			continue;
1623 		sm = &un->un_sm[i];
1624 		smic = &un->un_smic[i];
1625 		cnt = reqcount;
1626 
1627 		/*
1628 		 * If the current submirror is marked as inaccessible, do not
1629 		 * try to access it.
1630 		 */
1631 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1632 		(void) md_unit_readerlock(ui);
1633 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1634 			md_unit_readerexit(ui);
1635 			continue;
1636 		}
1637 		md_unit_readerexit(ui);
1638 
1639 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1640 		    (sm->sm_dev, sm, blkno, &cnt);
1641 
1642 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1643 			continue;
1644 		if (s->ms_state == CS_OKAY) {
1645 			*cando = cnt;
1646 			if (shared != NULL)
1647 				*shared = s;
1648 
1649 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1650 			    cs != NULL) {
1651 				cs->cs_buf.b_flags |= B_FAILFAST;
1652 			}
1653 
1654 			return (un->un_sm[i].sm_dev);
1655 		}
1656 		if (s->ms_state != CS_LAST_ERRED)
1657 			continue;
1658 
1659 		/* don't use B_FAILFAST since we're Last Erred */
1660 
1661 		if (mincnt > cnt)
1662 			mincnt = cnt;
1663 		if (s->ms_lasterrcnt > lasterrcnt) {
1664 			lasterrcnt = s->ms_lasterrcnt;
1665 			if (shared != NULL)
1666 				*shared = s;
1667 			dev = un->un_sm[i].sm_dev;
1668 		}
1669 	}
1670 	*cando = mincnt;
1671 	return (dev);
1672 }
1673 
1674 /*
1675  * Given a 32-bit bitmap, this routine will return the bit number
1676  * of the nth bit set.	The nth bit set is passed via the index integer.
1677  *
1678  * This routine is used to run through the writable submirror bitmap
1679  * and starting all of the writes.  See the value returned is the
1680  * index to appropriate submirror structure, in the md_sm
1681  * array for metamirrors.
1682  */
1683 static int
1684 md_find_nth_unit(uint_t mask, int index)
1685 {
1686 	int	bit, nfound;
1687 
1688 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1689 		ASSERT(mask != 0);
1690 		nfound += (mask & 1);
1691 		mask >>= 1;
1692 	}
1693 	return (bit);
1694 }
1695 
1696 static int
1697 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1698 {
1699 	mm_unit_t	*un;
1700 	buf_t		*bp;
1701 	int		i;
1702 	unsigned	nunits = 0;
1703 	int		iunit;
1704 	uint_t		running_bm = 0;
1705 	uint_t		sm_index;
1706 
1707 	bp = &cs->cs_buf;
1708 	un = ps->ps_un;
1709 
1710 	for (i = 0; i < NMIRROR; i++) {
1711 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1712 			continue;
1713 		running_bm |= SMI2BIT(i);
1714 		nunits++;
1715 	}
1716 	if (nunits == 0)
1717 		return (1);
1718 
1719 	/*
1720 	 * For directed mirror read (DMR) we only use the specified side and
1721 	 * do not compute the source of the read.
1722 	 */
1723 	if (ps->ps_flags & MD_MPS_DMR) {
1724 		sm_index = un->un_dmr_last_read;
1725 	} else {
1726 		/* Normal (non-DMR) operation */
1727 		switch (un->un_read_option) {
1728 		case RD_GEOMETRY:
1729 			iunit = (int)(bp->b_lblkno /
1730 			    howmany(un->c.un_total_blocks, nunits));
1731 			sm_index = md_find_nth_unit(running_bm, iunit);
1732 			break;
1733 		case RD_FIRST:
1734 			sm_index = md_find_nth_unit(running_bm, 0);
1735 			break;
1736 		case RD_LOAD_BAL:
1737 			/* this is intentional to fall into the default */
1738 		default:
1739 			un->un_last_read = (un->un_last_read + 1) % nunits;
1740 			sm_index = md_find_nth_unit(running_bm,
1741 			    un->un_last_read);
1742 			break;
1743 		}
1744 	}
1745 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1746 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1747 
1748 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1749 	    bp->b_flags |= B_FAILFAST;
1750 	}
1751 
1752 	return (0);
1753 }
1754 
1755 static
1756 int
1757 mirror_are_submirrors_available(mm_unit_t *un)
1758 {
1759 	int i;
1760 	for (i = 0; i < NMIRROR; i++) {
1761 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1762 
1763 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1764 		    md_getmajor(tmpdev) != md_major)
1765 			continue;
1766 
1767 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1768 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1769 			return (0);
1770 
1771 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1772 			return (0);
1773 	}
1774 	return (1);
1775 }
1776 
1777 void
1778 build_submirror(mm_unit_t *un, int i, int snarfing)
1779 {
1780 	struct mm_submirror	*sm;
1781 	struct mm_submirror_ic	*smic;
1782 	md_unit_t		*su;
1783 	set_t			setno;
1784 
1785 	sm = &un->un_sm[i];
1786 	smic = &un->un_smic[i];
1787 
1788 	sm->sm_flags = 0; /* sometime we may need to do more here */
1789 
1790 	setno = MD_UN2SET(un);
1791 
1792 	if (!SMS_IS(sm, SMS_INUSE))
1793 		return;
1794 	if (snarfing) {
1795 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1796 						sm->sm_key, MD_NOTRUST_DEVT);
1797 	} else {
1798 		if (md_getmajor(sm->sm_dev) == md_major) {
1799 			su = MD_UNIT(md_getminor(sm->sm_dev));
1800 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1801 			/* submirror can no longer be soft partitioned */
1802 			MD_CAPAB(su) &= (~MD_CAN_SP);
1803 		}
1804 	}
1805 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1806 	    0, "shared by blk", 0);
1807 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1808 	    0, "shared by indx", 0);
1809 	smic->sm_get_component_count =
1810 	    (int (*)())md_get_named_service(sm->sm_dev, 0,
1811 		    "get component count", 0);
1812 	smic->sm_get_bcss =
1813 	    (int (*)())md_get_named_service(sm->sm_dev, 0,
1814 		    "get block count skip size", 0);
1815 	sm->sm_state &= ~SMS_IGNORE;
1816 	if (SMS_IS(sm, SMS_OFFLINE))
1817 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1818 	md_set_parent(sm->sm_dev, MD_SID(un));
1819 }
1820 
1821 static void
1822 mirror_cleanup(mm_unit_t *un)
1823 {
1824 	mddb_recid_t	recid;
1825 	int		smi;
1826 	sv_dev_t	sv[NMIRROR];
1827 	int		nsv = 0;
1828 
1829 	/*
1830 	 * If a MN diskset and this node is not the master, do
1831 	 * not delete any records on snarf of the mirror records.
1832 	 */
1833 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1834 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1835 		return;
1836 	}
1837 
1838 	for (smi = 0; smi < NMIRROR; smi++) {
1839 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1840 			continue;
1841 		sv[nsv].setno = MD_UN2SET(un);
1842 		sv[nsv++].key = un->un_sm[smi].sm_key;
1843 	}
1844 
1845 	recid = un->un_rr_dirty_recid;
1846 	mddb_deleterec_wrapper(un->c.un_record_id);
1847 	if (recid > 0)
1848 		mddb_deleterec_wrapper(recid);
1849 
1850 	md_rem_names(sv, nsv);
1851 }
1852 
1853 /* Return a -1 if optimized record unavailable and set should be released */
1854 int
1855 mirror_build_incore(mm_unit_t *un, int snarfing)
1856 {
1857 	int		i;
1858 
1859 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1860 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1861 		return (1);
1862 	}
1863 
1864 	if (mirror_are_submirrors_available(un) == 0)
1865 		return (1);
1866 
1867 	if (MD_UNIT(MD_SID(un)) != NULL)
1868 		return (0);
1869 
1870 	MD_STATUS(un) = 0;
1871 
1872 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1873 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1874 
1875 	un->un_ovrlap_chn_flg = 0;
1876 	bzero(&un->un_ovrlap_chn, sizeof (un->un_ovrlap_chn));
1877 
1878 	for (i = 0; i < NMIRROR; i++)
1879 		build_submirror(un, i, snarfing);
1880 
1881 	if (unit_setup_resync(un, snarfing) != 0) {
1882 		if (snarfing) {
1883 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1884 			/*
1885 			 * If a MN set and set is not stale, then return -1
1886 			 * which will force the caller to unload the set.
1887 			 * The MN diskset nodes will return failure if
1888 			 * unit_setup_resync fails so that nodes won't
1889 			 * get out of sync.
1890 			 *
1891 			 * If set is STALE, the master node can't allocate
1892 			 * a resync record (if needed), but node needs to
1893 			 * join the set so that user can delete broken mddbs.
1894 			 * So, if set is STALE, just continue on.
1895 			 */
1896 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1897 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1898 				return (-1);
1899 			}
1900 		} else
1901 			return (1);
1902 	}
1903 
1904 	mutex_init(&un->un_ovrlap_chn_mx, NULL, MUTEX_DEFAULT, NULL);
1905 	cv_init(&un->un_ovrlap_chn_cv, NULL, CV_DEFAULT, NULL);
1906 
1907 	un->un_suspend_wr_flag = 0;
1908 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1909 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1910 
1911 	/*
1912 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1913 	 * All references to the owner message state field must be guarded
1914 	 * by this mutex.
1915 	 */
1916 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1917 
1918 	/*
1919 	 * Allocate mutex and condvar for resync thread manipulation. These
1920 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1921 	 */
1922 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1923 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1924 
1925 	/*
1926 	 * Allocate mutex and condvar for resync progress thread manipulation.
1927 	 * This allows resyncs to be continued across an intervening reboot.
1928 	 */
1929 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1930 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1931 
1932 	/*
1933 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1934 	 * provides synchronization between a user-ioctl and the resulting
1935 	 * strategy() call that performs the read().
1936 	 */
1937 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1938 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1939 
1940 	MD_UNIT(MD_SID(un)) = un;
1941 	return (0);
1942 }
1943 
1944 
1945 void
1946 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
1947 {
1948 	mddb_recid_t	recid, vtoc_id;
1949 	size_t		bitcnt;
1950 	size_t		shortcnt;
1951 	int		smi;
1952 	sv_dev_t	sv[NMIRROR];
1953 	int		nsv = 0;
1954 	uint_t		bits = 0;
1955 	minor_t		selfid;
1956 	md_unit_t	*su;
1957 
1958 	md_destroy_unit_incore(mnum, &mirror_md_ops);
1959 
1960 	shortcnt = un->un_rrd_num * sizeof (short);
1961 	bitcnt = howmany(un->un_rrd_num, NBBY);
1962 
1963 	if (un->un_outstanding_writes)
1964 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
1965 	if (un->un_goingclean_bm)
1966 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
1967 	if (un->un_goingdirty_bm)
1968 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
1969 	if (un->un_resync_bm)
1970 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
1971 
1972 	MD_UNIT(mnum) = NULL;
1973 
1974 	if (!removing)
1975 		return;
1976 
1977 	for (smi = 0; smi < NMIRROR; smi++) {
1978 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1979 			continue;
1980 		/* reallow soft partitioning of submirror and reset parent */
1981 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
1982 		MD_CAPAB(su) |= MD_CAN_SP;
1983 		md_reset_parent(un->un_sm[smi].sm_dev);
1984 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
1985 
1986 		sv[nsv].setno = MD_MIN2SET(mnum);
1987 		sv[nsv++].key = un->un_sm[smi].sm_key;
1988 		bits |= SMI2BIT(smi);
1989 	}
1990 
1991 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1992 	recid = un->un_rr_dirty_recid;
1993 	vtoc_id = un->c.un_vtoc_id;
1994 	selfid = MD_SID(un);
1995 
1996 	mirror_commit(un, bits, 0);
1997 
1998 	/* Destroy all mutexes and condvars before returning. */
1999 	mutex_destroy(&un->un_suspend_wr_mx);
2000 	cv_destroy(&un->un_suspend_wr_cv);
2001 	mutex_destroy(&un->un_ovrlap_chn_mx);
2002 	cv_destroy(&un->un_ovrlap_chn_cv);
2003 	mutex_destroy(&un->un_owner_mx);
2004 	mutex_destroy(&un->un_rs_thread_mx);
2005 	cv_destroy(&un->un_rs_thread_cv);
2006 	mutex_destroy(&un->un_rs_progress_mx);
2007 	cv_destroy(&un->un_rs_progress_cv);
2008 	mutex_destroy(&un->un_dmr_mx);
2009 	cv_destroy(&un->un_dmr_cv);
2010 	mddb_deleterec_wrapper(un->c.un_record_id);
2011 	if (recid != 0)
2012 		mddb_deleterec_wrapper(recid);
2013 
2014 	/* Remove the vtoc, if present */
2015 	if (vtoc_id)
2016 		mddb_deleterec_wrapper(vtoc_id);
2017 
2018 	md_rem_names(sv, nsv);
2019 
2020 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2021 	    MD_MIN2SET(selfid), selfid);
2022 }
2023 
2024 int
2025 mirror_internal_open(
2026 	minor_t		mnum,
2027 	int		flag,
2028 	int		otyp,
2029 	int		md_oflags,
2030 	IOLOCK		*lockp		/* can be NULL */
2031 )
2032 {
2033 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2034 	int		err = 0;
2035 
2036 tryagain:
2037 	/* single thread */
2038 	if (lockp) {
2039 		/*
2040 		 * If ioctl lock is held, use openclose_enter
2041 		 * routine that will set the ioctl flag when
2042 		 * grabbing the readerlock.
2043 		 */
2044 		(void) md_ioctl_openclose_enter(lockp, ui);
2045 	} else {
2046 		(void) md_unit_openclose_enter(ui);
2047 	}
2048 
2049 	/*
2050 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2051 	 * message in a MN diskset and this requires that the openclose
2052 	 * lock is dropped in order to send this message.  So, another
2053 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2054 	 * attempting an open while this thread has an open in progress.
2055 	 * Call the *_lh version of the lock exit routines since the ui_mx
2056 	 * mutex must be held from checking for OPENINPROGRESS until
2057 	 * after the cv_wait call.
2058 	 */
2059 	mutex_enter(&ui->ui_mx);
2060 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2061 		if (lockp) {
2062 			(void) md_ioctl_openclose_exit_lh(lockp);
2063 		} else {
2064 			md_unit_openclose_exit_lh(ui);
2065 		}
2066 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2067 		mutex_exit(&ui->ui_mx);
2068 		goto tryagain;
2069 	}
2070 
2071 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2072 	mutex_exit(&ui->ui_mx);
2073 
2074 	/* open devices, if necessary */
2075 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2076 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2077 			goto out;
2078 	}
2079 
2080 	/* count open */
2081 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2082 		goto out;
2083 
2084 	/* unlock, return success */
2085 out:
2086 	mutex_enter(&ui->ui_mx);
2087 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2088 	mutex_exit(&ui->ui_mx);
2089 
2090 	if (lockp) {
2091 		/*
2092 		 * If ioctl lock is held, use openclose_exit
2093 		 * routine that will clear the lockp reader flag.
2094 		 */
2095 		(void) md_ioctl_openclose_exit(lockp);
2096 	} else {
2097 		md_unit_openclose_exit(ui);
2098 	}
2099 	return (err);
2100 }
2101 
2102 int
2103 mirror_internal_close(
2104 	minor_t		mnum,
2105 	int		otyp,
2106 	int		md_cflags,
2107 	IOLOCK		*lockp		/* can be NULL */
2108 )
2109 {
2110 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2111 	mm_unit_t	*un;
2112 	int		err = 0;
2113 
2114 	/* single thread */
2115 	if (lockp) {
2116 		/*
2117 		 * If ioctl lock is held, use openclose_enter
2118 		 * routine that will set the ioctl flag when
2119 		 * grabbing the readerlock.
2120 		 */
2121 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2122 	} else {
2123 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2124 	}
2125 
2126 	/* count closed */
2127 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2128 		goto out;
2129 
2130 	/* close devices, if necessary */
2131 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2132 		/*
2133 		 * Clean up dirty bitmap for this unit. Do this
2134 		 * before closing the underlying devices to avoid
2135 		 * race conditions with reset_mirror() as a
2136 		 * result of a 'metaset -r' command running in
2137 		 * parallel. This might cause deallocation of
2138 		 * dirty region bitmaps; with underlying metadevices
2139 		 * in place this can't happen.
2140 		 * Don't do this if a MN set and ABR not set
2141 		 */
2142 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2143 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2144 			    !(ui->ui_tstate & MD_ABR_CAP))
2145 				mirror_process_unit_resync(un);
2146 		}
2147 		(void) mirror_close_all_devs(un, md_cflags);
2148 
2149 		/*
2150 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2151 		 * clear these capabilities on the last open in the cluster.
2152 		 * To do this we send a message to all nodes to see of the
2153 		 * device is open.
2154 		 */
2155 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2156 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2157 			if (lockp) {
2158 				(void) md_ioctl_openclose_exit(lockp);
2159 			} else {
2160 				md_unit_openclose_exit(ui);
2161 			}
2162 
2163 			/*
2164 			 * if we are in the context of an ioctl, drop the
2165 			 * ioctl lock.
2166 			 * Otherwise, no other locks should be held.
2167 			 */
2168 			if (lockp) {
2169 				IOLOCK_RETURN_RELEASE(0, lockp);
2170 			}
2171 
2172 			mdmn_clear_all_capabilities(mnum);
2173 
2174 			/* if dropped the lock previously, regain it */
2175 			if (lockp) {
2176 				IOLOCK_RETURN_REACQUIRE(lockp);
2177 			}
2178 			return (0);
2179 		}
2180 		/* unlock and return success */
2181 	}
2182 out:
2183 	/* Call whether lockp is NULL or not. */
2184 	if (lockp) {
2185 		md_ioctl_openclose_exit(lockp);
2186 	} else {
2187 		md_unit_openclose_exit(ui);
2188 	}
2189 	return (err);
2190 }
2191 
2192 /*
2193  * When a component has completed resyncing and is now ok, check if the
2194  * corresponding component in the other submirrors is in the Last Erred
2195  * state.  If it is, we want to change that to the Erred state so we stop
2196  * using that component and start using this good component instead.
2197  *
2198  * This is called from set_sm_comp_state and recursively calls
2199  * set_sm_comp_state if it needs to change the Last Erred state.
2200  */
2201 static void
2202 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2203 	IOLOCK *lockp)
2204 {
2205 	mm_submirror_t		*sm;
2206 	mm_submirror_ic_t	*smic;
2207 	int			ci;
2208 	int			i;
2209 	int			compcnt;
2210 	int			changed = 0;
2211 
2212 	for (i = 0; i < NMIRROR; i++) {
2213 		sm = &un->un_sm[i];
2214 		smic = &un->un_smic[i];
2215 
2216 		if (!SMS_IS(sm, SMS_INUSE))
2217 			continue;
2218 
2219 		/* ignore the submirror that we just made ok */
2220 		if (i == smi)
2221 			continue;
2222 
2223 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2224 		for (ci = 0; ci < compcnt; ci++) {
2225 			md_m_shared_t	*shared;
2226 
2227 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2228 			    (sm->sm_dev, sm, ci);
2229 
2230 			if ((shared->ms_state & CS_LAST_ERRED) &&
2231 			    !mirror_other_sources(un, i, ci, 1)) {
2232 
2233 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2234 				    flags, lockp);
2235 				changed = 1;
2236 			}
2237 		}
2238 	}
2239 
2240 	/* maybe there is a hotspare for this newly erred component */
2241 	if (changed) {
2242 		set_t	setno;
2243 
2244 		setno = MD_UN2SET(un);
2245 		if (MD_MNSET_SETNO(setno)) {
2246 			send_poke_hotspares(setno);
2247 		} else {
2248 			(void) poke_hotspares();
2249 		}
2250 	}
2251 }
2252 
2253 /*
2254  * set_sm_comp_state
2255  *
2256  * Set the state of a submirror component to the specified new state.
2257  * If the mirror is in a multi-node set, send messages to all nodes to
2258  * block all writes to the mirror and then update the state and release the
2259  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2260  * MD_STATE_XMIT will be unset in 2 cases:
2261  * 1. When the state is changed to CS_RESYNC as this state change
2262  * will already have been updated on each node by the processing of the
2263  * distributed metasync command, hence no need to xmit.
2264  * 2. When the state is change to CS_OKAY after a resync has completed. Again
2265  * the resync completion will already have been processed on each node by
2266  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2267  * resync, hence no need to xmit.
2268  *
2269  * In case we are called from the updates of a watermark,
2270  * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
2271  * a metainit or similar. In this case the message that we sent to propagate
2272  * the state change must not be a class1 message as that would deadlock with
2273  * the metainit command that is still being processed.
2274  * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2275  * instead. This also makes the submessage generator to create a class2
2276  * submessage rather than a class1 (which would also block)
2277  *
2278  * On entry, unit_writerlock is held
2279  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2280  * also held.
2281  */
2282 void
2283 set_sm_comp_state(
2284 	mm_unit_t	*un,
2285 	int		smi,
2286 	int		ci,
2287 	int		newstate,
2288 	mddb_recid_t	*extras,
2289 	uint_t		flags,
2290 	IOLOCK		*lockp
2291 )
2292 {
2293 	mm_submirror_t		*sm;
2294 	mm_submirror_ic_t	*smic;
2295 	md_m_shared_t		*shared;
2296 	int			origstate;
2297 	void			(*get_dev)();
2298 	ms_cd_info_t		cd;
2299 	char			devname[MD_MAX_CTDLEN];
2300 	int			err;
2301 	set_t			setno = MD_UN2SET(un);
2302 	md_mn_msg_stch_t	stchmsg;
2303 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2304 	md_mn_kresult_t		*kresult;
2305 	int			rval;
2306 	uint_t			msgflags;
2307 	md_mn_msgtype_t		msgtype;
2308 	int			save_lock = 0;
2309 	mdi_unit_t		*ui_sm;
2310 
2311 	sm = &un->un_sm[smi];
2312 	smic = &un->un_smic[smi];
2313 
2314 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2315 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2316 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2317 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2318 	    ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2319 	}
2320 
2321 	shared = (md_m_shared_t *)
2322 		(*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
2323 	origstate = shared->ms_state;
2324 
2325 	/*
2326 	 * If the new state is an error and the old one wasn't, generate
2327 	 * a console message. We do this before we send the state to other
2328 	 * nodes in a MN set because the state change may change the component
2329 	 * name  if a hotspare is allocated.
2330 	 */
2331 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2332 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2333 
2334 		get_dev =
2335 		    (void (*)())md_get_named_service(sm->sm_dev, 0,
2336 				"get device", 0);
2337 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2338 
2339 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2340 		    cd.cd_dev, devname, sizeof (devname));
2341 
2342 		if (err == ENOENT) {
2343 			(void) md_devname(setno, cd.cd_dev, devname,
2344 				sizeof (devname));
2345 		}
2346 
2347 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2348 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2349 
2350 		if (newstate & CS_LAST_ERRED) {
2351 			cmn_err(CE_WARN, "md: %s: %s last erred",
2352 			    md_shortname(md_getminor(sm->sm_dev)),
2353 			    devname);
2354 
2355 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2356 			/*
2357 			 * Close the broken device and clear the open flag on
2358 			 * it.  Closing the device means the RCM framework will
2359 			 * be able to unconfigure the device if required.
2360 			 *
2361 			 * We have to check that the device is open, otherwise
2362 			 * the first open on it has resulted in the error that
2363 			 * is being processed and the actual cd.cd_dev will be
2364 			 * NODEV64.
2365 			 *
2366 			 * If this is a multi-node mirror, then the multinode
2367 			 * state checks following this code will cause the
2368 			 * slave nodes to close the mirror in the function
2369 			 * mirror_set_state().
2370 			 */
2371 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2372 			shared->ms_flags &= ~MDM_S_ISOPEN;
2373 		}
2374 
2375 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2376 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2377 		/*
2378 		 * Similar to logic above except no log messages since we
2379 		 * are just transitioning from Last Erred to Erred.
2380 		 */
2381 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2382 		    "get device", 0);
2383 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2384 
2385 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2386 		shared->ms_flags &= ~MDM_S_ISOPEN;
2387 	}
2388 
2389 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2390 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2391 		/*
2392 		 * For a multi-node mirror, send the state change to the
2393 		 * master, which broadcasts to all nodes, including this
2394 		 * one. Once the message is received, the state is set
2395 		 * in-core and the master commits the change to disk.
2396 		 * There is a case, comp_replace,  where this function
2397 		 * can be called from within an ioctl and therefore in this
2398 		 * case, as the ioctl will already be called on each node,
2399 		 * there is no need to xmit the state change to the master for
2400 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2401 		 * to indicate whether a xmit is required. The mirror's
2402 		 * transient state is set to MD_ERR_PENDING to avoid sending
2403 		 * multiple messages.
2404 		 */
2405 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2406 			ui->ui_tstate |= MD_ERR_PENDING;
2407 
2408 		/*
2409 		 * Send a state update message to all nodes. This message
2410 		 * will generate 2 submessages, the first one to suspend
2411 		 * all writes to the mirror and the second to update the
2412 		 * state and resume writes.
2413 		 */
2414 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2415 		stchmsg.msg_stch_sm = smi;
2416 		stchmsg.msg_stch_comp = ci;
2417 		stchmsg.msg_stch_new_state = newstate;
2418 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2419 #ifdef DEBUG
2420 		if (mirror_debug_flag)
2421 			printf("send set state, %x, %x, %x, %x, %x\n",
2422 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2423 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2424 			    stchmsg.msg_stch_hs_id);
2425 #endif
2426 		if (flags & MD_STATE_WMUPDATE) {
2427 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2428 			/*
2429 			 * When coming from an update of watermarks, there
2430 			 * must already be a message logged that triggered
2431 			 * this action. So, no need to log this message, too.
2432 			 */
2433 			msgflags = MD_MSGF_NO_LOG;
2434 		} else {
2435 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2436 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2437 		}
2438 
2439 		/*
2440 		 * If we are in the context of an ioctl, drop the ioctl lock.
2441 		 * lockp holds the list of locks held.
2442 		 *
2443 		 * Otherwise, increment the appropriate reacquire counters.
2444 		 * If openclose lock is *held, then must reacquire reader
2445 		 * lock before releasing the openclose lock.
2446 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2447 		 * to reacquire it.
2448 		 */
2449 		if (lockp) {
2450 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2451 				save_lock = MD_ARRAY_WRITER;
2452 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2453 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2454 				save_lock = MD_ARRAY_READER;
2455 				lockp->l_flags &= ~MD_ARRAY_READER;
2456 			}
2457 			IOLOCK_RETURN_RELEASE(0, lockp);
2458 		} else {
2459 			if (flags & MD_STATE_OCHELD) {
2460 				md_unit_writerexit(ui);
2461 				(void) md_unit_readerlock(ui);
2462 				md_unit_openclose_exit(ui);
2463 			} else {
2464 				md_unit_writerexit(ui);
2465 			}
2466 		}
2467 
2468 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2469 		rval = mdmn_ksend_message(setno,
2470 					msgtype,
2471 					msgflags,
2472 					(char *)&stchmsg,
2473 					sizeof (stchmsg),
2474 					kresult);
2475 
2476 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2477 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2478 			cmn_err(CE_PANIC,
2479 			    "ksend_message failure: STATE_UPDATE");
2480 		}
2481 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2482 
2483 		/* if dropped the lock previously, regain it */
2484 		if (lockp) {
2485 			IOLOCK_RETURN_REACQUIRE(lockp);
2486 			lockp->l_flags |= save_lock;
2487 		} else {
2488 			/*
2489 			 * Reacquire dropped locks and update acquirecnts
2490 			 * appropriately.
2491 			 */
2492 			if (flags & MD_STATE_OCHELD) {
2493 				/*
2494 				 * openclose also grabs readerlock.
2495 				 */
2496 				(void) md_unit_openclose_enter(ui);
2497 				md_unit_readerexit(ui);
2498 				(void) md_unit_writerlock(ui);
2499 			} else {
2500 				(void) md_unit_writerlock(ui);
2501 			}
2502 		}
2503 
2504 		ui->ui_tstate &= ~MD_ERR_PENDING;
2505 	} else {
2506 		shared->ms_state = newstate;
2507 		uniqtime32(&shared->ms_timestamp);
2508 
2509 		if (newstate == CS_ERRED)
2510 			shared->ms_flags |= MDM_S_NOWRITE;
2511 		else
2512 			shared->ms_flags &= ~MDM_S_NOWRITE;
2513 
2514 		shared->ms_flags &= ~MDM_S_IOERR;
2515 		un->un_changecnt++;
2516 		shared->ms_lasterrcnt = un->un_changecnt;
2517 
2518 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2519 		mirror_commit(un, SMI2BIT(smi), extras);
2520 	}
2521 
2522 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2523 		/*
2524 		 * Resetting the Last Erred state will recursively call back
2525 		 * into this function (set_sm_comp_state) to update the state.
2526 		 */
2527 		reset_lasterred(un, smi, extras, flags, lockp);
2528 	}
2529 }
2530 
2531 static int
2532 find_another_logical(
2533 	mm_unit_t		*un,
2534 	mm_submirror_t		*esm,
2535 	diskaddr_t		blk,
2536 	u_longlong_t		cnt,
2537 	int			must_be_open,
2538 	int			state,
2539 	int			err_cnt)
2540 {
2541 	u_longlong_t	cando;
2542 	md_dev64_t	dev;
2543 	md_m_shared_t	*s;
2544 
2545 	esm->sm_state |= SMS_IGNORE;
2546 	while (cnt != 0) {
2547 		u_longlong_t	 mcnt;
2548 
2549 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2550 
2551 		dev = select_read_unit(un, blk, mcnt, &cando, must_be_open, &s,
2552 			NULL);
2553 		if (dev == (md_dev64_t)0)
2554 			break;
2555 
2556 		if ((state == CS_LAST_ERRED) &&
2557 		    (s->ms_state == CS_LAST_ERRED) &&
2558 		    (err_cnt > s->ms_lasterrcnt))
2559 			break;
2560 
2561 		cnt -= cando;
2562 		blk += cando;
2563 	}
2564 	esm->sm_state &= ~SMS_IGNORE;
2565 	return (cnt != 0);
2566 }
2567 
2568 int
2569 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2570 {
2571 	mm_submirror_t		*sm;
2572 	mm_submirror_ic_t	*smic;
2573 	size_t			count;
2574 	diskaddr_t		block;
2575 	u_longlong_t		skip;
2576 	u_longlong_t		size;
2577 	md_dev64_t		dev;
2578 	int			cnt;
2579 	md_m_shared_t		*s;
2580 	int			not_found;
2581 
2582 	sm = &un->un_sm[smi];
2583 	smic = &un->un_smic[smi];
2584 	dev = sm->sm_dev;
2585 
2586 	/*
2587 	 * Make sure every component of the submirror
2588 	 * has other sources.
2589 	 */
2590 	if (ci < 0) {
2591 		/* Find the highest lasterrcnt */
2592 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2593 		for (ci = 0; ci < cnt; ci++) {
2594 			not_found = mirror_other_sources(un, smi, ci,
2595 			    must_be_open);
2596 			if (not_found)
2597 				return (1);
2598 		}
2599 		return (0);
2600 	}
2601 
2602 	/*
2603 	 * Make sure this component has other sources
2604 	 */
2605 	(void) (*(smic->sm_get_bcss))
2606 		(dev, sm, ci, &block, &count, &skip, &size);
2607 
2608 	if (count == 0)
2609 		return (1);
2610 
2611 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2612 
2613 	while (count--) {
2614 		if (block >= un->c.un_total_blocks)
2615 			return (0);
2616 
2617 		if ((block + size) > un->c.un_total_blocks)
2618 			size = un->c.un_total_blocks - block;
2619 
2620 		not_found = find_another_logical(un, sm, block, size,
2621 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2622 		if (not_found)
2623 			return (1);
2624 
2625 		block += size + skip;
2626 	}
2627 	return (0);
2628 }
2629 
2630 static void
2631 finish_error(md_mps_t *ps)
2632 {
2633 	struct buf	*pb;
2634 	mm_unit_t	*un;
2635 	mdi_unit_t	*ui;
2636 	uint_t		new_str_flags;
2637 
2638 	pb = ps->ps_bp;
2639 	un = ps->ps_un;
2640 	ui = ps->ps_ui;
2641 
2642 	/*
2643 	 * Must flag any error to the resync originator if we're performing
2644 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2645 	 * target device and in this case we ought to abort the resync as there
2646 	 * is nothing that can be done to recover from this without operator
2647 	 * intervention. If we don't set the B_ERROR flag we will continue
2648 	 * reading from the mirror but won't write to the target (as it will
2649 	 * have been placed into an errored state).
2650 	 * To handle the case of multiple components within a submirror we only
2651 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2652 	 * The originator of the resync read will cause this bit to be set if
2653 	 * the underlying component count is one for a submirror resync. All
2654 	 * other resync types will have the flag set as there is no underlying
2655 	 * resync which can be performed on a contained metadevice for these
2656 	 * resync types (optimized or component).
2657 	 */
2658 
2659 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2660 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2661 			pb->b_flags |= B_ERROR;
2662 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2663 		MPS_FREE(mirror_parent_cache, ps);
2664 		md_unit_readerexit(ui);
2665 		md_biodone(pb);
2666 		return;
2667 	}
2668 	/*
2669 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2670 	 * operation therefore this I/O request has already been counted,
2671 	 * the I/O count variable will be decremented by mirror_done()'s
2672 	 * call to md_biodone().
2673 	 */
2674 	if (ps->ps_changecnt != un->un_changecnt) {
2675 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2676 		if (ps->ps_flags & MD_MPS_WOW)
2677 			new_str_flags |= MD_STR_WOW;
2678 		if (ps->ps_flags & MD_MPS_MAPPED)
2679 			new_str_flags |= MD_STR_MAPPED;
2680 		/*
2681 		 * If this I/O request was a read that was part of a resync,
2682 		 * set MD_STR_WAR for the retried read to ensure that the
2683 		 * resync write (i.e. write-after-read) will be performed
2684 		 */
2685 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2686 			new_str_flags |= MD_STR_WAR;
2687 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2688 		MPS_FREE(mirror_parent_cache, ps);
2689 		md_unit_readerexit(ui);
2690 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2691 		return;
2692 	}
2693 
2694 	pb->b_flags |= B_ERROR;
2695 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2696 	MPS_FREE(mirror_parent_cache, ps);
2697 	md_unit_readerexit(ui);
2698 	md_biodone(pb);
2699 }
2700 
2701 static void
2702 error_update_unit(md_mps_t *ps)
2703 {
2704 	mm_unit_t		*un;
2705 	mdi_unit_t		*ui;
2706 	int			smi;	/* sub mirror index */
2707 	int			ci;	/* errored component */
2708 	set_t			setno;
2709 	uint_t			flags;	/* for set_sm_comp_state() */
2710 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2711 
2712 	ui = ps->ps_ui;
2713 	un = (mm_unit_t *)md_unit_writerlock(ui);
2714 	setno = MD_UN2SET(un);
2715 
2716 	/* All of these updates have to propagated in case of MN set */
2717 	flags = MD_STATE_XMIT;
2718 	hspflags = MD_HOTSPARE_XMIT;
2719 
2720 	/* special treatment if we are called during updating watermarks */
2721 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2722 		flags |= MD_STATE_WMUPDATE;
2723 		hspflags |= MD_HOTSPARE_WMUPDATE;
2724 	}
2725 	smi = 0;
2726 	ci = 0;
2727 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2728 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2729 
2730 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2731 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2732 				(IOLOCK *)NULL);
2733 			/*
2734 			 * For a MN set, the NOTIFY is done when the state
2735 			 * change is processed on each node
2736 			 */
2737 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2738 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2739 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2740 			}
2741 			continue;
2742 		}
2743 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2744 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2745 			(IOLOCK *)NULL);
2746 		/*
2747 		 * For a MN set, the NOTIFY is done when the state
2748 		 * change is processed on each node
2749 		 */
2750 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2751 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2752 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2753 		}
2754 		smi = 0;
2755 		ci = 0;
2756 	}
2757 
2758 	md_unit_writerexit(ui);
2759 	if (MD_MNSET_SETNO(setno)) {
2760 		send_poke_hotspares(setno);
2761 	} else {
2762 		(void) poke_hotspares();
2763 	}
2764 	(void) md_unit_readerlock(ui);
2765 
2766 	finish_error(ps);
2767 }
2768 
2769 /*
2770  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2771  * retry the IO without B_FAILFAST set so that we try to ensure that the
2772  * component "sees" each IO.
2773  */
2774 static void
2775 last_err_retry(md_mcs_t *cs)
2776 {
2777 	struct buf	*cb;
2778 	md_mps_t	*ps;
2779 	uint_t		flags;
2780 
2781 	cb = &cs->cs_buf;
2782 	cb->b_flags &= ~B_FAILFAST;
2783 
2784 	/* if we're panicing just let this I/O error out */
2785 	if (panicstr) {
2786 	    (void) mirror_done(cb);
2787 	    return;
2788 	}
2789 
2790 	/* reissue the I/O */
2791 
2792 	ps = cs->cs_ps;
2793 
2794 	bioerror(cb, 0);
2795 
2796 	mutex_enter(&ps->ps_mx);
2797 
2798 	flags = MD_STR_NOTTOP;
2799 	if (ps->ps_flags & MD_MPS_MAPPED)
2800 		flags |= MD_STR_MAPPED;
2801 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2802 		flags |= MD_NOBLOCK;
2803 
2804 	mutex_exit(&ps->ps_mx);
2805 
2806 	clear_retry_error(cb);
2807 
2808 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2809 		md_shortname(getminor(cb->b_edev)));
2810 
2811 	md_call_strategy(cb, flags, NULL);
2812 }
2813 
2814 static void
2815 mirror_error(md_mps_t *ps)
2816 {
2817 	int		smi;	/* sub mirror index */
2818 	int		ci;	/* errored component */
2819 
2820 	if (panicstr) {
2821 		finish_error(ps);
2822 		return;
2823 	}
2824 
2825 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2826 		mirror_overlap_chain_remove(ps);
2827 
2828 	smi = 0;
2829 	ci = 0;
2830 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2831 		md_unit_readerexit(ps->ps_ui);
2832 		daemon_request(&md_mstr_daemon, error_update_unit,
2833 		    (daemon_queue_t *)ps, REQ_OLD);
2834 		return;
2835 	}
2836 
2837 	finish_error(ps);
2838 }
2839 
2840 static int
2841 copy_write_done(struct buf *cb)
2842 {
2843 	md_mps_t	*ps;
2844 	buf_t		*pb;
2845 	char		*wowbuf;
2846 	wowhdr_t	*wowhdr;
2847 	ssize_t		wow_resid;
2848 
2849 	/* get wowbuf ans save structure */
2850 	wowbuf = cb->b_un.b_addr;
2851 	wowhdr = WOWBUF_HDR(wowbuf);
2852 	ps = wowhdr->wow_ps;
2853 	pb = ps->ps_bp;
2854 
2855 	/* Save error information, then free cb */
2856 	if (cb->b_flags & B_ERROR)
2857 		pb->b_flags |= B_ERROR;
2858 
2859 	if (cb->b_flags & B_REMAPPED)
2860 		bp_mapout(cb);
2861 
2862 	freerbuf(cb);
2863 
2864 	/* update residual and continue if needed */
2865 	if ((pb->b_flags & B_ERROR) == 0) {
2866 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2867 		pb->b_resid = wow_resid;
2868 		if (wow_resid > 0)  {
2869 			daemon_request(&md_mstr_daemon, copy_write_cont,
2870 			    (daemon_queue_t *)wowhdr, REQ_OLD);
2871 			return (1);
2872 		}
2873 	}
2874 
2875 	/* Write is complete, release resources. */
2876 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2877 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2878 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2879 	MPS_FREE(mirror_parent_cache, ps);
2880 	md_biodone(pb);
2881 	return (0);
2882 }
2883 
2884 static void
2885 copy_write_cont(wowhdr_t *wowhdr)
2886 {
2887 	buf_t		*pb;
2888 	buf_t		*cb;
2889 	char		*wowbuf;
2890 	int		wow_offset;
2891 	size_t		wow_resid;
2892 	diskaddr_t	wow_blkno;
2893 
2894 	wowbuf = WOWHDR_BUF(wowhdr);
2895 	pb = wowhdr->wow_ps->ps_bp;
2896 
2897 	/* get data on current location */
2898 	wow_offset = wowhdr->wow_offset;
2899 	wow_resid = pb->b_bcount - wow_offset;
2900 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
2901 
2902 	/* setup child buffer */
2903 	cb = getrbuf(KM_SLEEP);
2904 	cb->b_flags = B_WRITE;
2905 	cb->b_edev = pb->b_edev;
2906 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
2907 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
2908 	cb->b_iodone = copy_write_done;
2909 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
2910 	cb->b_lblkno = wow_blkno;
2911 
2912 	/* move offset to next section */
2913 	wowhdr->wow_offset += cb->b_bcount;
2914 
2915 	/* copy and setup write for current section */
2916 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
2917 
2918 	/* do it */
2919 	/*
2920 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
2921 	 * that handles the WOW condition. The resultant increment on the
2922 	 * I/O count variable is cleared by copy_write_done()'s call to
2923 	 * md_biodone().
2924 	 */
2925 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
2926 				    | MD_STR_MAPPED, NULL);
2927 }
2928 
2929 static void
2930 md_mirror_copy_write(md_mps_t *ps)
2931 {
2932 	wowhdr_t	*wowhdr;
2933 
2934 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
2935 	mirror_wowblk_init(wowhdr);
2936 	wowhdr->wow_ps = ps;
2937 	wowhdr->wow_offset = 0;
2938 	copy_write_cont(wowhdr);
2939 }
2940 
2941 static void
2942 handle_wow(md_mps_t *ps)
2943 {
2944 	buf_t		*pb;
2945 
2946 	pb = ps->ps_bp;
2947 
2948 	bp_mapin(pb);
2949 
2950 	md_mirror_wow_cnt++;
2951 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
2952 		cmn_err(CE_NOTE,
2953 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
2954 		    md_shortname(getminor(pb->b_edev)),
2955 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
2956 	}
2957 
2958 	/*
2959 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2960 	 * operation therefore this I/O request has already been counted,
2961 	 * the I/O count variable will be decremented by mirror_done()'s
2962 	 * call to md_biodone().
2963 	 */
2964 	if (md_mirror_wow_flg & WOW_NOCOPY)
2965 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
2966 					    MD_STR_MAPPED | MD_IO_COUNTED, ps);
2967 	else
2968 		md_mirror_copy_write(ps);
2969 }
2970 
2971 /*
2972  * Return true if the specified submirror is either in the Last Erred
2973  * state or is transitioning into the Last Erred state.
2974  */
2975 static bool_t
2976 submirror_is_lasterred(mm_unit_t *un, int smi)
2977 {
2978 	mm_submirror_t		*sm;
2979 	mm_submirror_ic_t	*smic;
2980 	md_m_shared_t		*shared;
2981 	int			ci;
2982 	int			compcnt;
2983 
2984 	sm = &un->un_sm[smi];
2985 	smic = &un->un_smic[smi];
2986 
2987 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2988 	for (ci = 0; ci < compcnt; ci++) {
2989 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2990 		    (sm->sm_dev, sm, ci);
2991 
2992 		if (shared->ms_state == CS_LAST_ERRED)
2993 			return (B_TRUE);
2994 
2995 		/*
2996 		 * It is not currently Last Erred, check if entering Last Erred.
2997 		 */
2998 		if ((shared->ms_flags & MDM_S_IOERR) &&
2999 		    ((shared->ms_state == CS_OKAY) ||
3000 		    (shared->ms_state == CS_RESYNC))) {
3001 			if (mirror_other_sources(un, smi, ci, 0) == 1)
3002 				return (B_TRUE);
3003 		}
3004 	}
3005 
3006 	return (B_FALSE);
3007 }
3008 
3009 
3010 static int
3011 mirror_done(struct buf *cb)
3012 {
3013 	md_mps_t	*ps;
3014 	md_mcs_t	*cs;
3015 
3016 	/*LINTED*/
3017 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3018 	ps = cs->cs_ps;
3019 
3020 	mutex_enter(&ps->ps_mx);
3021 
3022 	/* check if we need to retry an errored failfast I/O */
3023 	if (cb->b_flags & B_ERROR) {
3024 		struct buf *pb = ps->ps_bp;
3025 
3026 		if (cb->b_flags & B_FAILFAST) {
3027 			int		i;
3028 			mm_unit_t	*un = ps->ps_un;
3029 
3030 			for (i = 0; i < NMIRROR; i++) {
3031 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3032 					continue;
3033 
3034 				if (cb->b_edev ==
3035 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3036 
3037 					/*
3038 					 * This is the submirror that had the
3039 					 * error.  Check if it is Last Erred.
3040 					 */
3041 					if (submirror_is_lasterred(un, i)) {
3042 						daemon_queue_t *dqp;
3043 
3044 						mutex_exit(&ps->ps_mx);
3045 						dqp = (daemon_queue_t *)cs;
3046 						dqp->dq_prev = NULL;
3047 						dqp->dq_next = NULL;
3048 						daemon_request(&md_done_daemon,
3049 						    last_err_retry, dqp,
3050 						    REQ_OLD);
3051 						return (1);
3052 					}
3053 					break;
3054 				}
3055 			}
3056 		}
3057 
3058 		/* continue to process the buf without doing a retry */
3059 		ps->ps_flags |= MD_MPS_ERROR;
3060 		pb->b_error = cb->b_error;
3061 	}
3062 
3063 	return (mirror_done_common(cb));
3064 }
3065 
3066 /*
3067  * Split from the original mirror_done function so we can handle bufs after a
3068  * retry.
3069  * ps->ps_mx is already held in the caller of this function and the cb error
3070  * has already been checked and handled in the caller.
3071  */
3072 static int
3073 mirror_done_common(struct buf *cb)
3074 {
3075 	struct buf	*pb;
3076 	mm_unit_t	*un;
3077 	mdi_unit_t	*ui;
3078 	md_mps_t	*ps;
3079 	md_mcs_t	*cs;
3080 	size_t		end_rr, start_rr, current_rr;
3081 
3082 	/*LINTED*/
3083 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3084 	ps = cs->cs_ps;
3085 	pb = ps->ps_bp;
3086 
3087 	if (cb->b_flags & B_REMAPPED)
3088 		bp_mapout(cb);
3089 
3090 	ps->ps_frags--;
3091 	if (ps->ps_frags != 0) {
3092 		mutex_exit(&ps->ps_mx);
3093 		kmem_cache_free(mirror_child_cache, cs);
3094 		return (1);
3095 	}
3096 	un = ps->ps_un;
3097 	ui = ps->ps_ui;
3098 
3099 	/*
3100 	 * Do not update outstanding_writes if we're running with ABR
3101 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3102 	 * Also a resync initiated write() has no outstanding_writes update
3103 	 * either.
3104 	 */
3105 	if (((cb->b_flags & B_READ) == 0) &&
3106 	    (un->un_nsm >= 2) &&
3107 	    (ps->ps_call == NULL) &&
3108 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3109 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3110 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3111 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3112 		mutex_enter(&un->un_resync_mx);
3113 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3114 			un->un_outstanding_writes[current_rr]--;
3115 		mutex_exit(&un->un_resync_mx);
3116 	}
3117 	kmem_cache_free(mirror_child_cache, cs);
3118 	mutex_exit(&ps->ps_mx);
3119 
3120 	if (ps->ps_call != NULL) {
3121 		daemon_request(&md_done_daemon, ps->ps_call,
3122 		    (daemon_queue_t *)ps, REQ_OLD);
3123 		return (1);
3124 	}
3125 
3126 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3127 		daemon_request(&md_done_daemon, mirror_error,
3128 		    (daemon_queue_t *)ps, REQ_OLD);
3129 		return (1);
3130 	}
3131 
3132 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3133 		mirror_overlap_chain_remove(ps);
3134 
3135 	/*
3136 	 * Handle Write-on-Write problem.
3137 	 * Skip In case of Raw and Direct I/O as they are
3138 	 * handled earlier.
3139 	 *
3140 	 */
3141 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3142 	    !(pb->b_flags & B_READ) &&
3143 	    !(ps->ps_flags & MD_MPS_WOW) &&
3144 	    !(pb->b_flags & B_PHYS) &&
3145 	    any_pages_dirty(pb)) {
3146 		md_unit_readerexit(ps->ps_ui);
3147 		daemon_request(&md_mstr_daemon, handle_wow,
3148 		    (daemon_queue_t *)ps, REQ_OLD);
3149 		return (1);
3150 	}
3151 
3152 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3153 	MPS_FREE(mirror_parent_cache, ps);
3154 	md_unit_readerexit(ui);
3155 	md_biodone(pb);
3156 	return (0);
3157 }
3158 
3159 /*
3160  * Clear error state in submirror component if the retry worked after
3161  * a failfast error.
3162  */
3163 static void
3164 clear_retry_error(struct buf *cb)
3165 {
3166 	int			smi;
3167 	md_mcs_t		*cs;
3168 	mm_unit_t		*un;
3169 	mdi_unit_t		*ui_sm;
3170 	mm_submirror_t		*sm;
3171 	mm_submirror_ic_t	*smic;
3172 	u_longlong_t		cnt;
3173 	md_m_shared_t		*shared;
3174 
3175 	/*LINTED*/
3176 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3177 	un = cs->cs_ps->ps_un;
3178 
3179 	for (smi = 0; smi < NMIRROR; smi++) {
3180 	    if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3181 		continue;
3182 
3183 	    if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev)) {
3184 		break;
3185 	    }
3186 	}
3187 
3188 	if (smi >= NMIRROR)
3189 	    return;
3190 
3191 	sm = &un->un_sm[smi];
3192 	smic = &un->un_smic[smi];
3193 	cnt = cb->b_bcount;
3194 
3195 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3196 	(void) md_unit_writerlock(ui_sm);
3197 
3198 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3199 	    cb->b_blkno, &cnt);
3200 
3201 	if (shared->ms_flags & MDM_S_IOERR) {
3202 	    shared->ms_flags &= ~MDM_S_IOERR;
3203 
3204 	} else {
3205 	    /* the I/O buf spans components and the first one is not erred */
3206 	    int	cnt;
3207 	    int	i;
3208 
3209 	    cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3210 	    for (i = 0; i < cnt; i++) {
3211 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3212 		    (sm->sm_dev, sm, i);
3213 
3214 		if (shared->ms_flags & MDM_S_IOERR &&
3215 		    shared->ms_state == CS_OKAY) {
3216 
3217 		    shared->ms_flags &= ~MDM_S_IOERR;
3218 		    break;
3219 		}
3220 	    }
3221 	}
3222 
3223 	md_unit_writerexit(ui_sm);
3224 }
3225 
3226 static size_t
3227 mirror_map_read(
3228 	md_mps_t *ps,
3229 	md_mcs_t *cs,
3230 	diskaddr_t blkno,
3231 	u_longlong_t	count
3232 )
3233 {
3234 	mm_unit_t	*un;
3235 	buf_t		*bp;
3236 	u_longlong_t	cando;
3237 
3238 	bp = &cs->cs_buf;
3239 	un = ps->ps_un;
3240 
3241 	bp->b_lblkno = blkno;
3242 	if (fast_select_read_unit(ps, cs) == 0) {
3243 		bp->b_bcount = ldbtob(count);
3244 		return (0);
3245 	}
3246 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno, count, &cando,
3247 							0, NULL, cs));
3248 	bp->b_bcount = ldbtob(cando);
3249 	if (count != cando)
3250 		return (cando);
3251 	return (0);
3252 }
3253 
3254 static void
3255 write_after_read(md_mps_t *ps)
3256 {
3257 	struct buf	*pb;
3258 	int		flags;
3259 
3260 	if (ps->ps_flags & MD_MPS_ERROR) {
3261 		mirror_error(ps);
3262 		return;
3263 	}
3264 
3265 	pb = ps->ps_bp;
3266 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3267 	ps->ps_call = NULL;
3268 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3269 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3270 	if (ps->ps_flags & MD_MPS_MAPPED)
3271 		flags |= MD_STR_MAPPED;
3272 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3273 		flags |= MD_NOBLOCK;
3274 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3275 		flags |= MD_STR_DIRTY_RD;
3276 	(void) mirror_write_strategy(pb, flags, ps);
3277 }
3278 
3279 static void
3280 continue_serial(md_mps_t *ps)
3281 {
3282 	md_mcs_t	*cs;
3283 	buf_t		*cb;
3284 	mm_unit_t	*un;
3285 	int		flags;
3286 
3287 	un = ps->ps_un;
3288 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3289 	mirror_child_init(cs);
3290 	cb = &cs->cs_buf;
3291 	ps->ps_call = NULL;
3292 	ps->ps_frags = 1;
3293 	(void) mirror_map_write(un, cs, ps, 0);
3294 	flags = MD_STR_NOTTOP;
3295 	if (ps->ps_flags & MD_MPS_MAPPED)
3296 		flags |= MD_STR_MAPPED;
3297 	md_call_strategy(cb, flags, NULL);
3298 }
3299 
3300 static int
3301 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3302 {
3303 	int i;
3304 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3305 	buf_t		*cb;
3306 	buf_t		*pb;
3307 	diskaddr_t	blkno;
3308 	size_t		bcount;
3309 	off_t		offset;
3310 
3311 	pb = ps->ps_bp;
3312 	cb = &cs->cs_buf;
3313 	cs->cs_ps = ps;
3314 
3315 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3316 
3317 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3318 
3319 	blkno = pb->b_lblkno;
3320 	bcount = pb->b_bcount;
3321 	offset = 0;
3322 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3323 		blkno = DK_LABEL_LOC + 1;
3324 		/*
3325 		 * This handles the case where we're requesting
3326 		 * a write to block 0 on a label partition
3327 		 * and the request size was smaller than the
3328 		 * size of the label.  If this is the case
3329 		 * then we'll return -1.  Failure to do so will
3330 		 * either cause the calling thread to hang due to
3331 		 * an ssd bug, or worse if the bcount were allowed
3332 		 * to go negative (ie large).
3333 		 */
3334 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3335 			return (-1);
3336 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3337 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3338 	}
3339 
3340 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3341 	    cb, KM_NOSLEEP);
3342 	if (war)
3343 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3344 
3345 	/*
3346 	 * If the submirror is in the erred stated, check if any component is
3347 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3348 	 * flag on the IO.
3349 	 *
3350 	 * Provide a fast path for the non-erred case (which should be the
3351 	 * normal case).
3352 	 */
3353 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3354 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3355 			mm_submirror_t		*sm;
3356 			mm_submirror_ic_t	*smic;
3357 			int			ci;
3358 			int			compcnt;
3359 
3360 			sm = &un->un_sm[i];
3361 			smic = &un->un_smic[i];
3362 
3363 			compcnt = (*(smic->sm_get_component_count))
3364 			    (sm->sm_dev, un);
3365 			for (ci = 0; ci < compcnt; ci++) {
3366 				md_m_shared_t	*shared;
3367 
3368 				shared = (md_m_shared_t *)
3369 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3370 				    sm, ci);
3371 
3372 				if (shared->ms_state == CS_LAST_ERRED)
3373 					break;
3374 			}
3375 			if (ci >= compcnt)
3376 				cb->b_flags |= B_FAILFAST;
3377 
3378 		} else {
3379 			cb->b_flags |= B_FAILFAST;
3380 		}
3381 	}
3382 
3383 	ps->ps_current_sm++;
3384 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3385 		if (un->un_write_option == WR_SERIAL) {
3386 			ps->ps_call = continue_serial;
3387 			return (0);
3388 		}
3389 		return (1);
3390 	}
3391 	return (0);
3392 }
3393 
3394 /*
3395  * directed_read_done:
3396  * ------------------
3397  * Completion routine called when a DMR request has been returned from the
3398  * underlying driver. Wake-up the original ioctl() and return the data to
3399  * the user.
3400  */
3401 static void
3402 directed_read_done(md_mps_t *ps)
3403 {
3404 	mm_unit_t	*un;
3405 	mdi_unit_t	*ui;
3406 
3407 	un = ps->ps_un;
3408 	ui = ps->ps_ui;
3409 
3410 	md_unit_readerexit(ui);
3411 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3412 	ps->ps_call = NULL;
3413 
3414 	mutex_enter(&un->un_dmr_mx);
3415 	cv_signal(&un->un_dmr_cv);
3416 	mutex_exit(&un->un_dmr_mx);
3417 
3418 	/* release the parent structure */
3419 	kmem_cache_free(mirror_parent_cache, ps);
3420 }
3421 
3422 /*
3423  * daemon_io:
3424  * ------------
3425  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3426  * call from a blockable context. NOTE: no mutex can be held on entry to this
3427  * routine
3428  */
3429 static void
3430 daemon_io(daemon_queue_t *dq)
3431 {
3432 	md_mps_t	*ps = (md_mps_t *)dq;
3433 	int		flag = MD_STR_NOTTOP;
3434 	buf_t		*pb = ps->ps_bp;
3435 
3436 	if (ps->ps_flags & MD_MPS_MAPPED)
3437 		flag |= MD_STR_MAPPED;
3438 	if (ps->ps_flags & MD_MPS_WOW)
3439 		flag |= MD_STR_WOW;
3440 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3441 		flag |= MD_STR_WAR;
3442 	if (ps->ps_flags & MD_MPS_ABR)
3443 		flag |= MD_STR_ABR;
3444 
3445 	/*
3446 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3447 	 * MD_STR_WAR before calling mirror_read_strategy
3448 	 */
3449 	if (pb->b_flags & B_READ) {
3450 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3451 			flag |= MD_STR_WAR;
3452 		mirror_read_strategy(pb, flag, ps);
3453 	} else
3454 		mirror_write_strategy(pb, flag, ps);
3455 }
3456 
3457 /*
3458  * update_resync:
3459  * -------------
3460  * Called to update the in-core version of the resync record with the latest
3461  * version that was committed to disk when the previous mirror owner
3462  * relinquished ownership. This call is likely to block as we must hold-off
3463  * any current resync processing that may be occurring.
3464  * On completion of the resync record update we issue the mirror_write_strategy
3465  * call to complete the i/o that first started this sequence. To remove a race
3466  * condition between a new write() request which is submitted and the resync
3467  * record update we acquire the writerlock. This will hold off all i/o to the
3468  * mirror until the resync update has completed.
3469  * NOTE: no mutex can be held on entry to this routine
3470  */
3471 static void
3472 update_resync(daemon_queue_t *dq)
3473 {
3474 	md_mps_t	*ps = (md_mps_t *)dq;
3475 	buf_t		*pb = ps->ps_bp;
3476 	mdi_unit_t	*ui = ps->ps_ui;
3477 	mm_unit_t	*un;
3478 	set_t		setno;
3479 	int		restart_resync;
3480 
3481 	un = md_unit_writerlock(ui);
3482 	ps->ps_un = un;
3483 	setno = MD_MIN2SET(getminor(pb->b_edev));
3484 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3485 		/*
3486 		 * Synchronize our in-core view of what regions need to be
3487 		 * resync'd with the on-disk version.
3488 		 */
3489 		mutex_enter(&un->un_rrp_inflight_mx);
3490 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3491 		    un->un_dirty_bm);
3492 		mutex_exit(&un->un_rrp_inflight_mx);
3493 
3494 		/* Region dirty map is now up to date */
3495 	}
3496 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3497 	md_unit_writerexit(ui);
3498 
3499 	/* Restart the resync thread if it was previously blocked */
3500 	if (restart_resync) {
3501 		mutex_enter(&un->un_rs_thread_mx);
3502 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3503 		cv_signal(&un->un_rs_thread_cv);
3504 		mutex_exit(&un->un_rs_thread_mx);
3505 	}
3506 	/* Continue with original deferred i/o */
3507 	daemon_io(dq);
3508 }
3509 
3510 /*
3511  * owner_timeout:
3512  * -------------
3513  * Called if the original mdmn_ksend_message() failed and the request is to be
3514  * retried. Reattempt the original ownership change.
3515  *
3516  * NOTE: called at interrupt context (see timeout(9f)).
3517  */
3518 static void
3519 owner_timeout(void *arg)
3520 {
3521 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3522 
3523 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3524 }
3525 
3526 /*
3527  * become_owner:
3528  * ------------
3529  * Called to issue RPC request to become the owner of the mirror
3530  * associated with this i/o request. We assume that the ownership request
3531  * is synchronous, so if it succeeds we will issue the request via
3532  * mirror_write_strategy().
3533  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3534  * service thread.
3535  * NOTE: no mutex should be held on entry to this routine.
3536  */
3537 static void
3538 become_owner(daemon_queue_t *dq)
3539 {
3540 	md_mps_t	*ps = (md_mps_t *)dq;
3541 	mm_unit_t	*un = ps->ps_un;
3542 	buf_t		*pb = ps->ps_bp;
3543 	set_t		setno;
3544 	md_mn_kresult_t	*kres;
3545 	int		msg_flags = md_mirror_msg_flags;
3546 	md_mps_t	*ps1;
3547 
3548 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3549 
3550 	/*
3551 	 * If we're already the mirror owner we do not need to send a message
3552 	 * but can simply process the i/o request immediately.
3553 	 * If we've already sent the request to become owner we requeue the
3554 	 * request as we're waiting for the synchronous ownership message to
3555 	 * be processed.
3556 	 */
3557 	if (MD_MN_MIRROR_OWNER(un)) {
3558 		/*
3559 		 * As the strategy() call will potentially block we need to
3560 		 * punt this to a separate thread and complete this request
3561 		 * as quickly as possible. Note: if we're a read request
3562 		 * this must be a resync, we cannot afford to be queued
3563 		 * behind any intervening i/o requests. In this case we put the
3564 		 * request on the md_mirror_rs_daemon queue.
3565 		 */
3566 		if (pb->b_flags & B_READ) {
3567 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3568 			    REQ_OLD);
3569 		} else {
3570 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3571 			    REQ_OLD);
3572 		}
3573 	} else {
3574 		mutex_enter(&un->un_owner_mx);
3575 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3576 			md_mn_req_owner_t	*msg;
3577 			int			rval = 0;
3578 
3579 			/*
3580 			 * Check to see that we haven't exceeded the maximum
3581 			 * retry count. If we have we fail the i/o as the
3582 			 * comms mechanism has become wedged beyond recovery.
3583 			 */
3584 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3585 				mutex_exit(&un->un_owner_mx);
3586 				cmn_err(CE_WARN,
3587 				    "md_mirror: Request exhausted ownership "
3588 				    "retry limit of %d attempts", dq->qlen);
3589 				pb->b_error = EIO;
3590 				pb->b_flags |= B_ERROR;
3591 				pb->b_resid = pb->b_bcount;
3592 				kmem_cache_free(mirror_parent_cache, ps);
3593 				md_biodone(pb);
3594 				return;
3595 			}
3596 
3597 			/*
3598 			 * Issue request to change ownership. The call is
3599 			 * synchronous so when it returns we can complete the
3600 			 * i/o (if successful), or enqueue it again so that
3601 			 * the operation will be retried.
3602 			 */
3603 			un->un_owner_state |= MM_MN_OWNER_SENT;
3604 			mutex_exit(&un->un_owner_mx);
3605 
3606 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3607 			setno = MD_MIN2SET(getminor(pb->b_edev));
3608 			msg->mnum = MD_SID(un);
3609 			msg->owner = md_mn_mynode_id;
3610 			msg_flags |= MD_MSGF_NO_LOG;
3611 			/*
3612 			 * If this IO is triggered by updating a watermark,
3613 			 * it might be issued by the creation of a softpartition
3614 			 * while the commd subsystem is suspended.
3615 			 * We don't want this message to block.
3616 			 */
3617 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3618 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3619 			}
3620 
3621 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3622 			rval = mdmn_ksend_message(setno,
3623 						MD_MN_MSG_REQUIRE_OWNER,
3624 						msg_flags, /* flags */
3625 						(char *)msg,
3626 						sizeof (md_mn_req_owner_t),
3627 						kres);
3628 
3629 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3630 
3631 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3632 				dq->qlen = 0;
3633 				/*
3634 				 * Successfully changed owner, reread the
3635 				 * resync record so that we have a valid idea of
3636 				 * any previously committed incomplete write()s.
3637 				 * NOTE: As we need to acquire the resync mutex
3638 				 * this may block, so we defer it to a separate
3639 				 * thread handler. This makes us (effectively)
3640 				 * non-blocking once the ownership message
3641 				 * handling has completed.
3642 				 */
3643 				mutex_enter(&un->un_owner_mx);
3644 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3645 					un->un_mirror_owner = md_mn_mynode_id;
3646 					/* Sets owner of un_rr_dirty record */
3647 					if (un->un_rr_dirty_recid)
3648 						(void) mddb_setowner(
3649 						    un->un_rr_dirty_recid,
3650 						    md_mn_mynode_id);
3651 					un->un_owner_state &=
3652 					    ~MM_MN_BECOME_OWNER;
3653 					/*
3654 					 * Release the block on the current
3655 					 * resync region if it is blocked
3656 					 */
3657 					ps1 = un->un_rs_prev_ovrlap;
3658 					if ((ps1 != NULL) &&
3659 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3660 						mirror_overlap_chain_remove(
3661 						    ps1);
3662 					mutex_exit(&un->un_owner_mx);
3663 
3664 					/*
3665 					 * If we're a read, this must be a
3666 					 * resync request, issue
3667 					 * the i/o request on the
3668 					 * md_mirror_rs_daemon queue. This is
3669 					 * to avoid a deadlock between the
3670 					 * resync_unit thread and
3671 					 * subsequent i/o requests that may
3672 					 * block on the resync region.
3673 					 */
3674 					if (pb->b_flags & B_READ) {
3675 						daemon_request(
3676 						    &md_mirror_rs_daemon,
3677 						    update_resync, dq, REQ_OLD);
3678 					} else {
3679 						daemon_request(
3680 						    &md_mirror_io_daemon,
3681 						    update_resync, dq, REQ_OLD);
3682 					}
3683 					kmem_free(kres,
3684 					    sizeof (md_mn_kresult_t));
3685 					return;
3686 				} else {
3687 					/*
3688 					 * Some other node has beaten us to
3689 					 * obtain ownership. We need to
3690 					 * reschedule our ownership request
3691 					 */
3692 					mutex_exit(&un->un_owner_mx);
3693 				}
3694 			} else {
3695 				mdmn_ksend_show_error(rval, kres,
3696 				    "MD_MN_MSG_REQUIRE_OWNER");
3697 				/*
3698 				 * Message transport failure is handled by the
3699 				 * comms layer. If the ownership change request
3700 				 * does not succeed we need to flag the error to
3701 				 * the initiator of the i/o. This is handled by
3702 				 * the retry logic above. As the request failed
3703 				 * we do not know _who_ the owner of the mirror
3704 				 * currently is. We reset our idea of the owner
3705 				 * to None so that any further write()s will
3706 				 * attempt to become the owner again. This stops
3707 				 * multiple nodes writing to the same mirror
3708 				 * simultaneously.
3709 				 */
3710 				mutex_enter(&un->un_owner_mx);
3711 				un->un_owner_state &=
3712 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3713 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3714 				mutex_exit(&un->un_owner_mx);
3715 			}
3716 			kmem_free(kres, sizeof (md_mn_kresult_t));
3717 		} else
3718 			mutex_exit(&un->un_owner_mx);
3719 
3720 		/*
3721 		 * Re-enqueue this request on the deferred i/o list. Delay the
3722 		 * request for md_mirror_owner_to usecs to stop thrashing.
3723 		 */
3724 		(void) timeout(owner_timeout, dq,
3725 		    drv_usectohz(md_mirror_owner_to));
3726 	}
3727 }
3728 
3729 static void
3730 mirror_write_strategy(buf_t *pb, int flag, void *private)
3731 {
3732 	md_mps_t	*ps;
3733 	md_mcs_t	*cs;
3734 	int		more;
3735 	mm_unit_t	*un;
3736 	mdi_unit_t	*ui;
3737 	buf_t		*cb;		/* child buf pointer */
3738 	set_t		setno;
3739 	int		rs_on_overlap = 0;
3740 
3741 	ui = MDI_UNIT(getminor(pb->b_edev));
3742 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3743 
3744 
3745 	md_kstat_waitq_enter(ui);
3746 
3747 	/*
3748 	 * If a state change is in progress for this mirror in a MN set,
3749 	 * suspend all non-resync writes until the state change is complete.
3750 	 * The objective of this suspend is to ensure that it is not
3751 	 * possible for one node to read data from a submirror that another node
3752 	 * has not written to because of the state change. Therefore we
3753 	 * suspend all writes until the state change has been made. As it is
3754 	 * not possible to read from the target of a resync, there is no need
3755 	 * to suspend resync writes.
3756 	 */
3757 
3758 	if (!(flag & MD_STR_WAR)) {
3759 		mutex_enter(&un->un_suspend_wr_mx);
3760 		while (un->un_suspend_wr_flag) {
3761 			cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx);
3762 		}
3763 		mutex_exit(&un->un_suspend_wr_mx);
3764 		(void) md_unit_readerlock(ui);
3765 	}
3766 
3767 	if (!(flag & MD_STR_NOTTOP)) {
3768 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3769 			md_kstat_waitq_exit(ui);
3770 			return;
3771 		}
3772 	}
3773 
3774 	setno = MD_MIN2SET(getminor(pb->b_edev));
3775 
3776 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3777 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3778 		flag |= MD_STR_ABR;
3779 
3780 	if (private == NULL) {
3781 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3782 		mirror_parent_init(ps);
3783 	} else {
3784 		ps = private;
3785 		private = NULL;
3786 	}
3787 	if (flag & MD_STR_MAPPED)
3788 		ps->ps_flags |= MD_MPS_MAPPED;
3789 
3790 	if (flag & MD_STR_WOW)
3791 		ps->ps_flags |= MD_MPS_WOW;
3792 
3793 	if (flag & MD_STR_ABR)
3794 		ps->ps_flags |= MD_MPS_ABR;
3795 
3796 	if (flag & MD_STR_WMUPDATE)
3797 		ps->ps_flags |= MD_MPS_WMUPDATE;
3798 
3799 	/*
3800 	 * Save essential information from the original buffhdr
3801 	 * in the md_save structure.
3802 	 */
3803 	ps->ps_un = un;
3804 	ps->ps_ui = ui;
3805 	ps->ps_bp = pb;
3806 	ps->ps_addr = pb->b_un.b_addr;
3807 	ps->ps_firstblk = pb->b_lblkno;
3808 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3809 	ps->ps_changecnt = un->un_changecnt;
3810 
3811 	/*
3812 	 * If not MN owner and this is an ABR write, make sure the current
3813 	 * resync region is on the overlaps chain
3814 	 */
3815 	mutex_enter(&un->un_owner_mx);
3816 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3817 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3818 		md_mps_t	*ps1;
3819 		/* Block the current resync region, if not already blocked */
3820 		ps1 = un->un_rs_prev_ovrlap;
3821 
3822 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3823 		    (ps1->ps_lastblk != 0))) {
3824 			/* Drop locks to avoid deadlock */
3825 			mutex_exit(&un->un_owner_mx);
3826 			md_unit_readerexit(ui);
3827 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3828 			rs_on_overlap = 1;
3829 			(void) md_unit_readerlock(ui);
3830 			mutex_enter(&un->un_owner_mx);
3831 			/*
3832 			 * Check to see if we have obtained ownership
3833 			 * while waiting for overlaps. If we have, remove
3834 			 * the resync_region entry from the overlap chain
3835 			 */
3836 			if (MD_MN_MIRROR_OWNER(un) &&
3837 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3838 				mirror_overlap_chain_remove(ps1);
3839 				rs_on_overlap = 0;
3840 			}
3841 		}
3842 	}
3843 	mutex_exit(&un->un_owner_mx);
3844 
3845 
3846 	/*
3847 	 * following keep write after read from writing to the
3848 	 * source in the case where it all came from one place
3849 	 */
3850 	if (flag & MD_STR_WAR) {
3851 		int	abort_write = 0;
3852 		/*
3853 		 * We are perfoming a write-after-read. This is either as a
3854 		 * result of a resync read or as a result of a read in a
3855 		 * dirty resync region when the optimized resync is not
3856 		 * complete. If in a MN set and a resync generated i/o,
3857 		 * if the current block is not in the current
3858 		 * resync region terminate the write as another node must have
3859 		 * completed this resync region
3860 		 */
3861 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3862 		    (!flag & MD_STR_DIRTY_RD)) {
3863 			if (!IN_RESYNC_REGION(un, ps))
3864 				abort_write = 1;
3865 		}
3866 		if ((select_write_after_read_units(un, ps) == 0) ||
3867 		    (abort_write)) {
3868 #ifdef DEBUG
3869 			if (mirror_debug_flag)
3870 				printf("Abort resync write on %x, block %lld\n",
3871 				    MD_SID(un), ps->ps_firstblk);
3872 #endif
3873 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3874 				mirror_overlap_chain_remove(ps);
3875 			kmem_cache_free(mirror_parent_cache, ps);
3876 			md_kstat_waitq_exit(ui);
3877 			md_unit_readerexit(ui);
3878 			md_biodone(pb);
3879 			return;
3880 		}
3881 	} else {
3882 		select_write_units(un, ps);
3883 
3884 		/* Drop readerlock to avoid deadlock */
3885 		md_unit_readerexit(ui);
3886 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3887 		un = md_unit_readerlock(ui);
3888 		/*
3889 		 * For a MN set with an ABR write, if we are now the
3890 		 * owner and we have a resync region on the overlap
3891 		 * chain, remove the entry from overlaps and retry the write.
3892 		 */
3893 
3894 		if (MD_MNSET_SETNO(setno) &&
3895 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3896 			mutex_enter(&un->un_owner_mx);
3897 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3898 				mirror_overlap_chain_remove(ps);
3899 				md_kstat_waitq_exit(ui);
3900 				mutex_exit(&un->un_owner_mx);
3901 				md_unit_readerexit(ui);
3902 				daemon_request(&md_mirror_daemon, daemon_io,
3903 				    (daemon_queue_t *)ps, REQ_OLD);
3904 				return;
3905 			}
3906 			mutex_exit(&un->un_owner_mx);
3907 		}
3908 	}
3909 
3910 	/*
3911 	 * For Multinode mirrors with a Resync Region (not ABR) we need to
3912 	 * become the mirror owner before continuing with the write(). For ABR
3913 	 * mirrors we check that we 'own' the resync if we're in
3914 	 * write-after-read mode. We do this _after_ ensuring that there are no
3915 	 * overlaps to ensure that the once we know that we are the owner, the
3916 	 * readerlock will not released until the write is complete. As a
3917 	 * change of ownership in a MN set requires the writerlock, this
3918 	 * ensures that ownership cannot be changed until the write is
3919 	 * complete
3920 	 */
3921 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3922 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3923 		if (!MD_MN_MIRROR_OWNER(un))  {
3924 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3925 				mirror_overlap_chain_remove(ps);
3926 			md_kstat_waitq_exit(ui);
3927 			ASSERT(!(flag & MD_STR_WAR));
3928 			md_unit_readerexit(ui);
3929 			daemon_request(&md_mirror_daemon, become_owner,
3930 			    (daemon_queue_t *)ps, REQ_OLD);
3931 			return;
3932 		}
3933 	}
3934 
3935 	/*
3936 	 * Mark resync region if mirror has a Resync Region _and_ we are not
3937 	 * a resync initiated write(). Don't mark region if we're flagged as
3938 	 * an ABR write.
3939 	 */
3940 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
3941 	    !(flag & MD_STR_WAR)) {
3942 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
3943 		    ps->ps_lastblk)) {
3944 			pb->b_flags |= B_ERROR;
3945 			pb->b_resid = pb->b_bcount;
3946 			ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3947 			kmem_cache_free(mirror_parent_cache, ps);
3948 			md_kstat_waitq_exit(ui);
3949 			md_unit_readerexit(ui);
3950 			md_biodone(pb);
3951 			return;
3952 		}
3953 	}
3954 
3955 	ps->ps_childbflags = pb->b_flags | B_WRITE;
3956 	ps->ps_childbflags &= ~B_READ;
3957 	if (flag & MD_STR_MAPPED)
3958 		ps->ps_childbflags &= ~B_PAGEIO;
3959 
3960 	if (!(flag & MD_STR_NOTTOP) && panicstr)
3961 		/* Disable WOW and don't free ps */
3962 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
3963 
3964 	md_kstat_waitq_to_runq(ui);
3965 
3966 	/*
3967 	 * Treat Raw and Direct I/O as Write-on-Write always
3968 	 */
3969 
3970 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3971 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
3972 	    (pb->b_flags & B_PHYS) &&
3973 	    !(ps->ps_flags & MD_MPS_WOW)) {
3974 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3975 			mirror_overlap_chain_remove(ps);
3976 		md_unit_readerexit(ui);
3977 		daemon_request(&md_mstr_daemon, handle_wow,
3978 			(daemon_queue_t *)ps, REQ_OLD);
3979 		return;
3980 	}
3981 
3982 	ps->ps_frags = 1;
3983 	do {
3984 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3985 		mirror_child_init(cs);
3986 		cb = &cs->cs_buf;
3987 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
3988 
3989 		/*
3990 		 * This handles the case where we're requesting
3991 		 * a write to block 0 on a label partition.  (more < 0)
3992 		 * means that the request size was smaller than the
3993 		 * size of the label.  If so this request is done.
3994 		 */
3995 		if (more < 0) {
3996 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3997 				mirror_overlap_chain_remove(ps);
3998 			md_kstat_runq_exit(ui);
3999 			kmem_cache_free(mirror_child_cache, cs);
4000 			kmem_cache_free(mirror_parent_cache, ps);
4001 			md_unit_readerexit(ui);
4002 			md_biodone(pb);
4003 			return;
4004 		}
4005 		if (more) {
4006 			mutex_enter(&ps->ps_mx);
4007 			ps->ps_frags++;
4008 			mutex_exit(&ps->ps_mx);
4009 		}
4010 		md_call_strategy(cb, flag, private);
4011 	} while (more);
4012 
4013 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4014 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4015 			md_daemon(1, &md_done_daemon);
4016 			drv_usecwait(10);
4017 		}
4018 		kmem_cache_free(mirror_parent_cache, ps);
4019 	}
4020 }
4021 
4022 static void
4023 mirror_read_strategy(buf_t *pb, int flag, void *private)
4024 {
4025 	md_mps_t	*ps;
4026 	md_mcs_t	*cs;
4027 	size_t		more;
4028 	mm_unit_t	*un;
4029 	mdi_unit_t	*ui;
4030 	size_t		current_count;
4031 	diskaddr_t	current_blkno;
4032 	off_t		current_offset;
4033 	buf_t		*cb;		/* child buf pointer */
4034 	set_t		setno;
4035 
4036 	ui = MDI_UNIT(getminor(pb->b_edev));
4037 
4038 	md_kstat_waitq_enter(ui);
4039 
4040 	un = (mm_unit_t *)md_unit_readerlock(ui);
4041 
4042 	if (!(flag & MD_STR_NOTTOP)) {
4043 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4044 			md_kstat_waitq_exit(ui);
4045 			return;
4046 		}
4047 	}
4048 
4049 	if (private == NULL) {
4050 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4051 		mirror_parent_init(ps);
4052 	} else {
4053 		ps = private;
4054 		private = NULL;
4055 	}
4056 
4057 	if (flag & MD_STR_MAPPED)
4058 		ps->ps_flags |= MD_MPS_MAPPED;
4059 	if (flag & MD_NOBLOCK)
4060 		ps->ps_flags |= MD_MPS_NOBLOCK;
4061 	if (flag & MD_STR_WMUPDATE)
4062 		ps->ps_flags |= MD_MPS_WMUPDATE;
4063 
4064 	/*
4065 	 * Check to see if this is a DMR driven read. If so we need to use the
4066 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4067 	 */
4068 	if (flag & MD_STR_DMR)
4069 		ps->ps_flags |= MD_MPS_DMR;
4070 
4071 	/*
4072 	 * Save essential information from the original buffhdr
4073 	 * in the md_save structure.
4074 	 */
4075 	ps->ps_un = un;
4076 	ps->ps_ui = ui;
4077 	ps->ps_bp = pb;
4078 	ps->ps_addr = pb->b_un.b_addr;
4079 	ps->ps_firstblk = pb->b_lblkno;
4080 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4081 	ps->ps_changecnt = un->un_changecnt;
4082 
4083 	current_count = btodb(pb->b_bcount);
4084 	current_blkno = pb->b_lblkno;
4085 	current_offset = 0;
4086 
4087 	/*
4088 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4089 	 * resync thread which may or may not be an optimised resync.
4090 	 *
4091 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4092 	 * code has not completed; either a resync has not started since snarf,
4093 	 * or there is an optimized resync in progress.
4094 	 *
4095 	 * We need to generate a write after this read in the following two
4096 	 * cases,
4097 	 *
4098 	 * 1. Any Resync-Generated read
4099 	 *
4100 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4101 	 *    pending or in progress.
4102 	 *
4103 	 * The write after read is done in these cases to ensure that all sides
4104 	 * of the mirror are in sync with the read data and that it is not
4105 	 * possible for an application to read the same block multiple times
4106 	 * and get different data.
4107 	 *
4108 	 * This would be possible if the block was in a dirty region.
4109 	 *
4110 	 * If we're performing a directed read we don't write the data out as
4111 	 * the application is responsible for restoring the mirror to a known
4112 	 * state.
4113 	 */
4114 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4115 	    !(flag & MD_STR_DMR)) {
4116 		size_t	start_rr, i, end_rr;
4117 		int	region_dirty = 1;
4118 
4119 		/*
4120 		 * We enter here under three circumstances,
4121 		 *
4122 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4123 		 * 0			1
4124 		 * 1			0
4125 		 * 1			1
4126 		 *
4127 		 * To be optimal we only care to explicitly check for dirty
4128 		 * regions in the second case since if MD_STR_WAR is set we
4129 		 * always do the write after read.
4130 		 */
4131 		if (!(flag & MD_STR_WAR)) {
4132 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4133 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4134 
4135 			for (i = start_rr; i <= end_rr; i++)
4136 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4137 					break;
4138 		}
4139 
4140 		if ((region_dirty) &&
4141 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4142 			ps->ps_call = write_after_read;
4143 			/*
4144 			 * Mark this as a RESYNC_READ in ps_flags.
4145 			 * This is used if the read fails during a
4146 			 * resync of a 3-way mirror to ensure that
4147 			 * the retried read to the remaining
4148 			 * good submirror has MD_STR_WAR set. This
4149 			 * is needed to ensure that the resync write
4150 			 * (write-after-read) takes place.
4151 			 */
4152 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4153 
4154 			/*
4155 			 * If MD_STR_FLAG_ERR is set in the flags we
4156 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4157 			 * write (issued by write_after_read) will be flagged
4158 			 * to the biowait'ing resync thread. This allows us to
4159 			 * avoid issuing further resync requests to a device
4160 			 * that has had a write failure.
4161 			 */
4162 			if (flag & MD_STR_FLAG_ERR)
4163 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4164 
4165 			setno = MD_UN2SET(un);
4166 			/*
4167 			 * Drop the readerlock to avoid
4168 			 * deadlock
4169 			 */
4170 			md_unit_readerexit(ui);
4171 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4172 			un = md_unit_readerlock(ui);
4173 			/*
4174 			 * Ensure that we are owner
4175 			 */
4176 			if (MD_MNSET_SETNO(setno)) {
4177 				/*
4178 				 * For a non-resync read that requires a
4179 				 * write-after-read to be done, set a flag
4180 				 * in the parent structure, so that the
4181 				 * write_strategy routine can omit the
4182 				 * test that the write is still within the
4183 				 * resync region
4184 				 */
4185 				if (!(flag & MD_STR_WAR))
4186 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4187 
4188 				/*
4189 				 * Before reading the buffer, see if
4190 				 * we are the owner
4191 				 */
4192 				if (!MD_MN_MIRROR_OWNER(un))  {
4193 					ps->ps_call = NULL;
4194 					mirror_overlap_chain_remove(ps);
4195 					md_kstat_waitq_exit(ui);
4196 					md_unit_readerexit(ui);
4197 					daemon_request(
4198 					    &md_mirror_daemon,
4199 					    become_owner,
4200 					    (daemon_queue_t *)ps,
4201 					    REQ_OLD);
4202 					return;
4203 				}
4204 				/*
4205 				 * For a resync read, check to see if I/O is
4206 				 * outside of the current resync region, or
4207 				 * the resync has finished. If so
4208 				 * just terminate the I/O
4209 				 */
4210 				if ((flag & MD_STR_WAR) &&
4211 				    (!(un->c.un_status & MD_UN_WAR) ||
4212 				    (!IN_RESYNC_REGION(un, ps)))) {
4213 #ifdef DEBUG
4214 					if (mirror_debug_flag)
4215 						printf("Abort resync read "
4216 						    "%x: %lld\n",
4217 						    MD_SID(un),
4218 						    ps->ps_firstblk);
4219 #endif
4220 					mirror_overlap_chain_remove(ps);
4221 					kmem_cache_free(mirror_parent_cache,
4222 					    ps);
4223 					md_kstat_waitq_exit(ui);
4224 					md_unit_readerexit(ui);
4225 					md_biodone(pb);
4226 					return;
4227 				}
4228 			}
4229 		}
4230 	}
4231 
4232 	if (flag & MD_STR_DMR) {
4233 		ps->ps_call = directed_read_done;
4234 	}
4235 
4236 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4237 		ps->ps_flags |= MD_MPS_DONTFREE;
4238 
4239 	md_kstat_waitq_to_runq(ui);
4240 
4241 	ps->ps_frags++;
4242 	do {
4243 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4244 		mirror_child_init(cs);
4245 		cb = &cs->cs_buf;
4246 		cs->cs_ps = ps;
4247 
4248 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4249 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4250 
4251 		more = mirror_map_read(ps, cs, current_blkno,
4252 				(u_longlong_t)current_count);
4253 		if (more) {
4254 			mutex_enter(&ps->ps_mx);
4255 			ps->ps_frags++;
4256 			mutex_exit(&ps->ps_mx);
4257 		}
4258 
4259 		/*
4260 		 * Do these calculations now,
4261 		 *  so that we pickup a valid b_bcount from the chld_bp.
4262 		 */
4263 		current_count -= more;
4264 		current_offset += cb->b_bcount;
4265 		current_blkno +=  more;
4266 		md_call_strategy(cb, flag, private);
4267 	} while (more);
4268 
4269 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4270 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4271 			md_daemon(1, &md_done_daemon);
4272 			drv_usecwait(10);
4273 		}
4274 		kmem_cache_free(mirror_parent_cache, ps);
4275 	}
4276 }
4277 
4278 void
4279 md_mirror_strategy(buf_t *bp, int flag, void *private)
4280 {
4281 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4282 
4283 	/*
4284 	 * When doing IO to a multi owner meta device, check if set is halted.
4285 	 * We do this check without the needed lock held, for performance
4286 	 * reasons.
4287 	 * If an IO just slips through while the set is locked via an
4288 	 * MD_MN_SUSPEND_SET, we don't care about it.
4289 	 * Only check for suspension if we are a top-level i/o request
4290 	 * (MD_STR_NOTTOP is cleared in 'flag').
4291 	 */
4292 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4293 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4294 		if ((flag & MD_STR_NOTTOP) == 0) {
4295 			mutex_enter(&md_mx);
4296 			/* Here we loop until the set is no longer halted */
4297 			while (md_set[setno].s_status & MD_SET_HALTED) {
4298 				cv_wait(&md_cv, &md_mx);
4299 			}
4300 			mutex_exit(&md_mx);
4301 		}
4302 	}
4303 
4304 	if ((flag & MD_IO_COUNTED) == 0) {
4305 		if ((flag & MD_NOBLOCK) == 0) {
4306 			if (md_inc_iocount(setno) != 0) {
4307 				bp->b_flags |= B_ERROR;
4308 				bp->b_error = ENXIO;
4309 				bp->b_resid = bp->b_bcount;
4310 				biodone(bp);
4311 				return;
4312 			}
4313 		} else {
4314 			md_inc_iocount_noblock(setno);
4315 		}
4316 	}
4317 
4318 	if (bp->b_flags & B_READ)
4319 		mirror_read_strategy(bp, flag, private);
4320 	else
4321 		mirror_write_strategy(bp, flag, private);
4322 }
4323 
4324 /*
4325  * mirror_directed_read:
4326  * --------------------
4327  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4328  * so that the application can determine what (if any) resync needs to be
4329  * performed. The data is copied out to the user-supplied buffer.
4330  *
4331  * Parameters:
4332  *	mdev	- dev_t for the mirror device
4333  *	vdr	- directed read parameters specifying location and submirror
4334  *		  to perform the read from
4335  *	mode	- used to ddi_copyout() any resulting data from the read
4336  *
4337  * Returns:
4338  *	0	success
4339  *	!0	error code
4340  *		EINVAL - invalid request format
4341  */
4342 int
4343 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4344 {
4345 	buf_t		*bp;
4346 	minor_t		mnum = getminor(mdev);
4347 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4348 	mm_unit_t	*un;
4349 	mm_submirror_t	*sm;
4350 	char		*sm_nm;
4351 	size_t		namelen;
4352 	uint_t		next_side;
4353 	void		*kbuffer;
4354 
4355 	if (ui == NULL)
4356 		return (ENXIO);
4357 
4358 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4359 		return (EINVAL);
4360 	}
4361 
4362 	/* Check for aligned block access. We disallow non-aligned requests. */
4363 	if (vdr->vdr_offset % DEV_BSIZE) {
4364 		return (EINVAL);
4365 	}
4366 
4367 	/*
4368 	 * Allocate kernel buffer for target of read(). If we had a reliable
4369 	 * (sorry functional) DDI this wouldn't be needed.
4370 	 */
4371 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4372 	if (kbuffer == NULL) {
4373 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4374 		    " bytes\n", vdr->vdr_nbytes);
4375 		return (ENOMEM);
4376 	}
4377 
4378 	bp = getrbuf(KM_SLEEP);
4379 
4380 	bp->b_un.b_addr = kbuffer;
4381 	bp->b_flags = B_READ;
4382 	bp->b_bcount = vdr->vdr_nbytes;
4383 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4384 	bp->b_edev = mdev;
4385 
4386 	un = md_unit_readerlock(ui);
4387 
4388 	/*
4389 	 * If DKV_SIDE_INIT is set we need to determine the first available
4390 	 * side to start reading from. If it isn't set we increment to the
4391 	 * next readable submirror.
4392 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4393 	 * Note: we check for a readable submirror on completion of the i/o so
4394 	 * we should _always_ have one available. If this becomes unavailable
4395 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4396 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4397 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4398 	 * The chance of this is small, but not non-existent.
4399 	 */
4400 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4401 		next_side = 0;
4402 	} else {
4403 		next_side = vdr->vdr_side + 1;
4404 	}
4405 	while ((next_side < NMIRROR) &&
4406 	    !SUBMIRROR_IS_READABLE(un, next_side))
4407 		next_side++;
4408 	if (next_side >= NMIRROR) {
4409 		vdr->vdr_flags |= DKV_DMR_ERROR;
4410 		freerbuf(bp);
4411 		vdr->vdr_bytesread = 0;
4412 		md_unit_readerexit(ui);
4413 		return (0);
4414 	}
4415 
4416 	/* Set the side to read from */
4417 	un->un_dmr_last_read = next_side;
4418 
4419 	md_unit_readerexit(ui);
4420 
4421 	/*
4422 	 * Save timestamp for verification purposes. Can be read by debugger
4423 	 * to verify that this ioctl has been executed and to find the number
4424 	 * of DMR reads and the time of the last DMR read.
4425 	 */
4426 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4427 	mirror_dmr_stats.dmr_count++;
4428 
4429 	/* Issue READ request and wait for completion */
4430 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4431 
4432 	mutex_enter(&un->un_dmr_mx);
4433 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4434 	mutex_exit(&un->un_dmr_mx);
4435 
4436 	/*
4437 	 * Check to see if we encountered an error during the read. If so we
4438 	 * can make no guarantee about any possibly returned data.
4439 	 */
4440 	if ((bp->b_flags & B_ERROR) == 0) {
4441 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4442 		if (bp->b_resid) {
4443 			vdr->vdr_flags |= DKV_DMR_SHORT;
4444 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4445 		} else {
4446 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4447 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4448 		}
4449 		/* Copy the data read back out to the user supplied buffer */
4450 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4451 		    mode)) {
4452 			kmem_free(kbuffer, vdr->vdr_nbytes);
4453 			return (EFAULT);
4454 		}
4455 
4456 	} else {
4457 		/* Error out with DKV_DMR_ERROR */
4458 		vdr->vdr_flags |= DKV_DMR_ERROR;
4459 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4460 	}
4461 	/*
4462 	 * Update the DMR parameters with the side and name of submirror that
4463 	 * we have just read from (un->un_dmr_last_read)
4464 	 */
4465 	un = md_unit_readerlock(ui);
4466 
4467 	vdr->vdr_side = un->un_dmr_last_read;
4468 	sm = &un->un_sm[un->un_dmr_last_read];
4469 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4470 
4471 	namelen = MIN(MD_MAX_SIDENAME_LEN, VOL_SIDENAME);
4472 	(void) strncpy(vdr->vdr_side_name, sm_nm, namelen);
4473 
4474 	/*
4475 	 * Determine if we've completed the read cycle. This is true iff the
4476 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4477 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4478 	 * can occur if a submirror is metadetached).
4479 	 */
4480 	next_side = un->un_dmr_last_read + 1;
4481 	while ((next_side < NMIRROR) &&
4482 	    !SUBMIRROR_IS_READABLE(un, next_side))
4483 		next_side++;
4484 	if (next_side >= NMIRROR) {
4485 		/* We've finished */
4486 		vdr->vdr_flags |= DKV_DMR_DONE;
4487 	}
4488 
4489 	md_unit_readerexit(ui);
4490 	freerbuf(bp);
4491 	kmem_free(kbuffer, vdr->vdr_nbytes);
4492 
4493 	return (0);
4494 }
4495 
4496 /*
4497  * mirror_resync_message:
4498  * ---------------------
4499  * Handle the multi-node resync messages that keep all nodes within a given
4500  * disk-set in sync with their view of a mirror's resync status.
4501  *
4502  * The message types dealt with are:
4503  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4504  * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
4505  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4506  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4507  *
4508  * Returns:
4509  *	0	Success
4510  *	>0	Failure error number
4511  */
4512 int
4513 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4514 {
4515 	mdi_unit_t		*ui;
4516 	mm_unit_t		*un;
4517 	set_t			setno;
4518 	int			is_ABR;
4519 	int			smi;
4520 	int			ci;
4521 	sm_state_t		state;
4522 	int			broke_out;
4523 	mm_submirror_t		*sm;
4524 	mm_submirror_ic_t	*smic;
4525 	md_m_shared_t		*shared;
4526 	md_error_t		mde = mdnullerror;
4527 	md_mps_t		*ps;
4528 	int			rs_active;
4529 
4530 	/* Check that the given device is part of a multi-node set */
4531 	setno = MD_MIN2SET(p->mnum);
4532 	if (setno >= md_nsets) {
4533 		return (ENXIO);
4534 	}
4535 	if (!MD_MNSET_SETNO(setno)) {
4536 		return (EINVAL);
4537 	}
4538 
4539 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4540 		return (EINVAL);
4541 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4542 		return (EINVAL);
4543 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4544 
4545 	/* Obtain the current resync status */
4546 	(void) md_ioctl_readerlock(lockp, ui);
4547 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4548 	md_ioctl_readerexit(lockp);
4549 
4550 	switch ((md_mn_msgtype_t)p->msg_type) {
4551 	case MD_MN_MSG_RESYNC_STARTING:
4552 		/* Start the resync thread for the mirror */
4553 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4554 		break;
4555 
4556 	case MD_MN_MSG_RESYNC_NEXT:
4557 		/*
4558 		 * We have to release any previously marked overlap regions
4559 		 * so that i/o can resume. Then we need to block the region
4560 		 * from [rs_start..rs_start+rs_size) * so that no i/o is issued.
4561 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4562 		 */
4563 		(void) md_ioctl_readerlock(lockp, ui);
4564 		/*
4565 		 * Ignore the message if there is no active resync thread or
4566 		 * if it is for a resync type that we have already completed.
4567 		 * un_resync_completed is set to the last resync completed
4568 		 * when processing a PHASE_DONE message.
4569 		 */
4570 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4571 			break;
4572 		/*
4573 		 * If this message is for the same resync and is for an earlier
4574 		 * resync region, just ignore it. This can only occur if this
4575 		 * node has progressed on to the next resync region before
4576 		 * we receive this message. This can occur if the class for
4577 		 * this message is busy and the originator has to retry thus
4578 		 * allowing this node to move onto the next resync_region.
4579 		 */
4580 		if ((p->rs_type == un->un_rs_type) &&
4581 		    (p->rs_start < un->un_resync_startbl))
4582 			break;
4583 		ps = un->un_rs_prev_ovrlap;
4584 
4585 		/* Allocate previous overlap reference if needed */
4586 		if (ps == NULL) {
4587 			ps = kmem_cache_alloc(mirror_parent_cache,
4588 				MD_ALLOCFLAGS);
4589 			ps->ps_un = un;
4590 			ps->ps_ui = ui;
4591 			ps->ps_firstblk = 0;
4592 			ps->ps_lastblk = 0;
4593 			ps->ps_flags = 0;
4594 			md_ioctl_readerexit(lockp);
4595 			(void) md_ioctl_writerlock(lockp, ui);
4596 			un->un_rs_prev_ovrlap = ps;
4597 			md_ioctl_writerexit(lockp);
4598 		} else
4599 			md_ioctl_readerexit(lockp);
4600 
4601 		if (p->rs_originator != md_mn_mynode_id) {
4602 			/*
4603 			 * On all but the originating node, first update
4604 			 * the resync state, then unblock the previous
4605 			 * region and block the next one. No need
4606 			 * to do this if the region is already blocked.
4607 			 * Update the submirror state and flags from the
4608 			 * originator. This keeps the cluster in sync with
4609 			 * regards to the resync status.
4610 			 */
4611 
4612 			(void) md_ioctl_writerlock(lockp, ui);
4613 			un->un_rs_resync_done = p->rs_done;
4614 			un->un_rs_resync_2_do = p->rs_2_do;
4615 			un->un_rs_type = p->rs_type;
4616 			un->un_resync_startbl = p->rs_start;
4617 			md_ioctl_writerexit(lockp);
4618 			/*
4619 			 * Use un_owner_mx to ensure that an ownership change
4620 			 * cannot happen at the same time as this message
4621 			 */
4622 			mutex_enter(&un->un_owner_mx);
4623 			if (MD_MN_MIRROR_OWNER(un)) {
4624 				ps->ps_firstblk = p->rs_start;
4625 				ps->ps_lastblk = ps->ps_firstblk +
4626 				    p->rs_size - 1;
4627 			} else {
4628 				if ((ps->ps_firstblk != p->rs_start) ||
4629 				    (ps->ps_lastblk != p->rs_start +
4630 				    p->rs_size - 1)) {
4631 					/* Remove previous overlap range */
4632 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4633 						mirror_overlap_chain_remove(ps);
4634 
4635 					ps->ps_firstblk = p->rs_start;
4636 					ps->ps_lastblk = ps->ps_firstblk +
4637 					    p->rs_size - 1;
4638 
4639 					mutex_exit(&un->un_owner_mx);
4640 					/* Block this range from all i/o. */
4641 					if (ps->ps_firstblk != 0 ||
4642 					    ps->ps_lastblk != 0)
4643 						wait_for_overlaps(ps,
4644 						    MD_OVERLAP_ALLOW_REPEAT);
4645 					mutex_enter(&un->un_owner_mx);
4646 					/*
4647 					 * Check to see if we have obtained
4648 					 * ownership while waiting for
4649 					 * overlaps. If we have, remove
4650 					 * the resync_region entry from the
4651 					 * overlap chain
4652 					 */
4653 					if (MD_MN_MIRROR_OWNER(un) &&
4654 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4655 						mirror_overlap_chain_remove(ps);
4656 				}
4657 			}
4658 			mutex_exit(&un->un_owner_mx);
4659 
4660 			/*
4661 			 * If this is the first RESYNC_NEXT message (i.e.
4662 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4663 			 * issue RESYNC_START NOTIFY event
4664 			 */
4665 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4666 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4667 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4668 				    MD_SID(un));
4669 			}
4670 
4671 			/* Ensure that our local resync thread is running */
4672 			if (un->un_rs_thread == NULL) {
4673 				(void) mirror_resync_unit(p->mnum, NULL,
4674 				    &p->mde, lockp);
4675 			}
4676 		}
4677 		break;
4678 	case MD_MN_MSG_RESYNC_FINISH:
4679 		/*
4680 		 * Complete the resync by stopping the resync thread.
4681 		 * Also release the previous overlap region field.
4682 		 * Update the resync_progress_thread by cv_signal'ing it so
4683 		 * that we mark the end of the resync as soon as possible. This
4684 		 * stops an unnecessary delay should be panic after resync
4685 		 * completion.
4686 		 */
4687 #ifdef DEBUG
4688 		if (!rs_active) {
4689 			if (mirror_debug_flag)
4690 				printf("RESYNC_FINISH (mnum = %x), "
4691 				    "Resync *NOT* active",
4692 				    p->mnum);
4693 		}
4694 #endif
4695 
4696 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4697 		    (p->rs_originator != md_mn_mynode_id)) {
4698 			mutex_enter(&un->un_rs_thread_mx);
4699 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4700 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4701 			un->un_rs_thread_flags &=
4702 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4703 			cv_signal(&un->un_rs_thread_cv);
4704 			mutex_exit(&un->un_rs_thread_mx);
4705 		}
4706 		if (is_ABR) {
4707 			/* Resync finished, if ABR set owner to NULL */
4708 			mutex_enter(&un->un_owner_mx);
4709 			un->un_mirror_owner = 0;
4710 			mutex_exit(&un->un_owner_mx);
4711 		}
4712 		(void) md_ioctl_writerlock(lockp, ui);
4713 		ps = un->un_rs_prev_ovrlap;
4714 		if (ps != NULL) {
4715 			/* Remove previous overlap range */
4716 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4717 				mirror_overlap_chain_remove(ps);
4718 			/*
4719 			 * Release the overlap range reference
4720 			 */
4721 			un->un_rs_prev_ovrlap = NULL;
4722 			kmem_cache_free(mirror_parent_cache,
4723 			    ps);
4724 		}
4725 		md_ioctl_writerexit(lockp);
4726 
4727 		/* Mark the resync as complete in the metadb */
4728 		un->un_rs_resync_done = p->rs_done;
4729 		un->un_rs_resync_2_do = p->rs_2_do;
4730 		un->un_rs_type = p->rs_type;
4731 		mutex_enter(&un->un_rs_progress_mx);
4732 		cv_signal(&un->un_rs_progress_cv);
4733 		mutex_exit(&un->un_rs_progress_mx);
4734 
4735 		un = md_ioctl_writerlock(lockp, ui);
4736 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4737 		/* Deal with any pending grow_unit */
4738 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4739 			if ((mirror_grow_unit(un, &mde) != 0) ||
4740 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4741 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4742 			}
4743 		}
4744 		md_ioctl_writerexit(lockp);
4745 		break;
4746 
4747 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4748 		/*
4749 		 * A phase of the resync, optimized. component or
4750 		 * submirror is complete. Update mirror status.
4751 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4752 		 * mirror owner is peforming a resync. If we have just snarfed
4753 		 * this set, then we must clear any of the flags set at snarf
4754 		 * time by unit_setup_resync().
4755 		 * Note that unit_setup_resync() sets up these flags to
4756 		 * indicate that an optimized resync is required. These flags
4757 		 * need to be reset because if we get here,  the mirror owner
4758 		 * will have handled the optimized resync.
4759 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4760 		 * MD_UN_WAR. In addition, for each submirror,
4761 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4762 		 * set to SMS_OFFLINE.
4763 		 */
4764 #ifdef DEBUG
4765 		if (mirror_debug_flag)
4766 			printf("phase done mess received from %d, mnum=%x,"
4767 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4768 			    p->rs_type, p->rs_flags);
4769 #endif
4770 		/*
4771 		 * Ignore the message if there is no active resync thread.
4772 		 */
4773 		if (!rs_active)
4774 			break;
4775 
4776 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4777 		switch (RS_TYPE(p->rs_type)) {
4778 		case MD_RS_OPTIMIZED:
4779 			un = md_ioctl_writerlock(lockp, ui);
4780 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4781 				/* If we are originator, just clear rs_type */
4782 				if (p->rs_originator == md_mn_mynode_id) {
4783 					SET_RS_TYPE_NONE(un->un_rs_type);
4784 					md_ioctl_writerexit(lockp);
4785 					break;
4786 				}
4787 				/*
4788 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4789 				 * flags if OPT_NOT_DONE is set *and* rs_type
4790 				 * is MD_RS_NONE.
4791 				 */
4792 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4793 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4794 					/* No resync in progress */
4795 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4796 					un->c.un_status &= ~MD_UN_WAR;
4797 				} else {
4798 					/*
4799 					 * We are in the middle of an
4800 					 * optimized resync and this message
4801 					 * should be ignored.
4802 					 */
4803 					md_ioctl_writerexit(lockp);
4804 					break;
4805 				}
4806 			} else {
4807 				/*
4808 				 * This is the end of an optimized resync,
4809 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4810 				 */
4811 
4812 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4813 				if (!broke_out)
4814 					un->c.un_status &= ~MD_UN_WAR;
4815 			}
4816 
4817 			/*
4818 			 * Set resync_completed to last resync type and then
4819 			 * clear resync_type to indicate no resync in progress
4820 			 */
4821 			un->un_resync_completed = un->un_rs_type;
4822 			SET_RS_TYPE_NONE(un->un_rs_type);
4823 
4824 			/*
4825 			 * If resync is as a result of a submirror ONLINE,
4826 			 * reset the submirror state to SMS_RUNNING if the
4827 			 * resync was ok else set back to SMS_OFFLINE.
4828 			 */
4829 			for (smi = 0; smi < NMIRROR; smi++) {
4830 				un->un_sm[smi].sm_flags &=
4831 				    ~MD_SM_RESYNC_TARGET;
4832 				if (SMS_BY_INDEX_IS(un, smi,
4833 				    SMS_OFFLINE_RESYNC)) {
4834 					if (p->rs_flags &
4835 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4836 						state = SMS_OFFLINE;
4837 					} else {
4838 						state = (broke_out ?
4839 						    SMS_OFFLINE : SMS_RUNNING);
4840 					}
4841 					mirror_set_sm_state(
4842 					    &un->un_sm[smi],
4843 					    &un->un_smic[smi], state,
4844 					    broke_out);
4845 					mirror_commit(un, NO_SUBMIRRORS,
4846 					    0);
4847 				}
4848 				/*
4849 				 * If we still have an offline submirror, reset
4850 				 * the OFFLINE_SM flag in the mirror status
4851 				 */
4852 				if (SMS_BY_INDEX_IS(un, smi,
4853 				    SMS_OFFLINE))
4854 					un->c.un_status |=
4855 					    MD_UN_OFFLINE_SM;
4856 			}
4857 			md_ioctl_writerexit(lockp);
4858 			break;
4859 		case MD_RS_SUBMIRROR:
4860 			un = md_ioctl_writerlock(lockp, ui);
4861 			smi = RS_SMI(p->rs_type);
4862 			sm = &un->un_sm[smi];
4863 			smic = &un->un_smic[smi];
4864 			/* Clear RESYNC target */
4865 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4866 			/*
4867 			 * Set resync_completed to last resync type and then
4868 			 * clear resync_type to indicate no resync in progress
4869 			 */
4870 			un->un_resync_completed = un->un_rs_type;
4871 			SET_RS_TYPE_NONE(un->un_rs_type);
4872 			/*
4873 			 * If the resync completed ok reset the submirror
4874 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
4875 			 */
4876 			state = (broke_out ?
4877 			    SMS_ATTACHED : SMS_RUNNING);
4878 			mirror_set_sm_state(sm, smic, state, broke_out);
4879 			un->c.un_status &= ~MD_UN_WAR;
4880 			mirror_commit(un, SMI2BIT(smi), 0);
4881 			md_ioctl_writerexit(lockp);
4882 			break;
4883 		case MD_RS_COMPONENT:
4884 			un = md_ioctl_writerlock(lockp, ui);
4885 			smi = RS_SMI(p->rs_type);
4886 			ci = RS_CI(p->rs_type);
4887 			sm = &un->un_sm[smi];
4888 			smic = &un->un_smic[smi];
4889 			shared = (md_m_shared_t *)
4890 			    (*(smic->sm_shared_by_indx))
4891 			    (sm->sm_dev, sm, ci);
4892 			un->c.un_status &= ~MD_UN_WAR;
4893 			/* Clear RESYNC target */
4894 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4895 			/*
4896 			 * Set resync_completed to last resync type and then
4897 			 * clear resync_type to indicate no resync in progress
4898 			 */
4899 			un->un_resync_completed = un->un_rs_type;
4900 			SET_RS_TYPE_NONE(un->un_rs_type);
4901 
4902 			/*
4903 			 * If the resync completed ok, set the component state
4904 			 * to CS_OKAY.
4905 			 */
4906 			if (broke_out)
4907 				shared->ms_flags |= MDM_S_RS_TRIED;
4908 			else {
4909 				/*
4910 				 * As we don't transmit the changes,
4911 				 * no need to drop the lock.
4912 				 */
4913 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
4914 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
4915 			}
4916 			md_ioctl_writerexit(lockp);
4917 		default:
4918 			break;
4919 		}
4920 		/*
4921 		 * If the purpose of this PHASE_DONE message is just to
4922 		 * indicate to all other nodes that the optimized resync
4923 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
4924 		 * no need to generate a notify event as there has not
4925 		 * actually been a resync.
4926 		 */
4927 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
4928 			if (broke_out) {
4929 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
4930 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4931 				    MD_SID(un));
4932 			} else {
4933 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
4934 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4935 				    MD_SID(un));
4936 			}
4937 		}
4938 		break;
4939 
4940 	default:
4941 #ifdef DEBUG
4942 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
4943 		    " %x\n", p->msg_type);
4944 #endif
4945 		return (EINVAL);
4946 	}
4947 	return (0);
4948 }
4949 
4950 /* Return a -1 if snarf of optimized record failed and set should be released */
4951 static int
4952 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
4953 {
4954 	mddb_recid_t	recid;
4955 	int		gotsomething;
4956 	int		all_mirrors_gotten;
4957 	mm_unit_t	*un;
4958 	mddb_type_t	typ1;
4959 	mddb_de_ic_t    *dep;
4960 	mddb_rb32_t	*rbp;
4961 	size_t		newreqsize;
4962 	mm_unit_t	*big_un;
4963 	mm_unit32_od_t	*small_un;
4964 	int		retval;
4965 	mdi_unit_t	*ui;
4966 
4967 	if (cmd == MD_SNARF_CLEANUP) {
4968 		if (md_get_setstatus(setno) & MD_SET_STALE)
4969 			return (0);
4970 
4971 		recid = mddb_makerecid(setno, 0);
4972 		typ1 = (mddb_type_t)md_getshared_key(setno,
4973 		    mirror_md_ops.md_driver.md_drivername);
4974 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4975 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
4976 				un = (mm_unit_t *)mddb_getrecaddr(recid);
4977 				mirror_cleanup(un);
4978 				recid = mddb_makerecid(setno, 0);
4979 			}
4980 		}
4981 		return (0);
4982 	}
4983 
4984 	all_mirrors_gotten = 1;
4985 	gotsomething = 0;
4986 
4987 	recid = mddb_makerecid(setno, 0);
4988 	typ1 = (mddb_type_t)md_getshared_key(setno,
4989 	    mirror_md_ops.md_driver.md_drivername);
4990 
4991 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4992 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
4993 			continue;
4994 
4995 		dep = mddb_getrecdep(recid);
4996 		dep->de_flags = MDDB_F_MIRROR;
4997 		rbp = dep->de_rb;
4998 
4999 		if ((rbp->rb_revision == MDDB_REV_RB) &&
5000 		    ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
5001 			/*
5002 			 * This means, we have an old and small record
5003 			 * and this record hasn't already been converted.
5004 			 * Before we create an incore metadevice from this
5005 			 * we have to convert it to a big record.
5006 			 */
5007 			small_un = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5008 			newreqsize = sizeof (mm_unit_t);
5009 			big_un = (mm_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
5010 			mirror_convert((caddr_t)small_un, (caddr_t)big_un,
5011 			    SMALL_2_BIG);
5012 			kmem_free(small_un, dep->de_reqsize);
5013 
5014 			/*
5015 			 * Update userdata and incore userdata
5016 			 * incores are at the end of un
5017 			 */
5018 			dep->de_rb_userdata_ic = big_un;
5019 			dep->de_rb_userdata = big_un;
5020 			dep->de_icreqsize = newreqsize;
5021 			un = big_un;
5022 			rbp->rb_private |= MD_PRV_CONVD;
5023 		} else {
5024 			/* Big device */
5025 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5026 				sizeof (*un), 0);
5027 		}
5028 
5029 		/* Set revision and flag accordingly */
5030 		if (rbp->rb_revision == MDDB_REV_RB) {
5031 			un->c.un_revision = MD_32BIT_META_DEV;
5032 		} else {
5033 			un->c.un_revision = MD_64BIT_META_DEV;
5034 			un->c.un_flag |= MD_EFILABEL;
5035 		}
5036 
5037 		/*
5038 		 * Create minor device node for snarfed entry.
5039 		 */
5040 		(void) md_create_minor_node(setno, MD_SID(un));
5041 
5042 		if (MD_UNIT(MD_SID(un)) != NULL) {
5043 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5044 			continue;
5045 		}
5046 		all_mirrors_gotten = 0;
5047 		retval = mirror_build_incore(un, 1);
5048 		if (retval == 0) {
5049 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5050 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5051 			resync_start_timeout(setno);
5052 			gotsomething = 1;
5053 		} else if (retval == -1) {
5054 			return (-1);
5055 		}
5056 		/*
5057 		 * Set flag to indicate that the mirror has not yet
5058 		 * been through a reconfig. This flag is used for MN sets
5059 		 * when determining whether to update the mirror state from
5060 		 * the Master node.
5061 		 */
5062 		if (MD_MNSET_SETNO(setno)) {
5063 			ui = MDI_UNIT(MD_SID(un));
5064 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5065 		}
5066 	}
5067 
5068 	if (!all_mirrors_gotten)
5069 		return (gotsomething);
5070 
5071 	recid = mddb_makerecid(setno, 0);
5072 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5073 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5074 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5075 
5076 	return (0);
5077 }
5078 
5079 static int
5080 mirror_halt(md_haltcmd_t cmd, set_t setno)
5081 {
5082 	unit_t		i;
5083 	mdi_unit_t	*ui;
5084 	minor_t		mnum;
5085 	int		reset_mirror_flag = 0;
5086 
5087 	if (cmd == MD_HALT_CLOSE)
5088 		return (0);
5089 
5090 	if (cmd == MD_HALT_OPEN)
5091 		return (0);
5092 
5093 	if (cmd == MD_HALT_UNLOAD)
5094 		return (0);
5095 
5096 	if (cmd == MD_HALT_CHECK) {
5097 		for (i = 0; i < md_nunits; i++) {
5098 			mnum = MD_MKMIN(setno, i);
5099 			if ((ui = MDI_UNIT(mnum)) == NULL)
5100 				continue;
5101 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5102 				continue;
5103 			if (md_unit_isopen(ui))
5104 				return (1);
5105 		}
5106 		return (0);
5107 	}
5108 
5109 	if (cmd != MD_HALT_DOIT)
5110 		return (1);
5111 
5112 	for (i = 0; i < md_nunits; i++) {
5113 		mnum = MD_MKMIN(setno, i);
5114 		if ((ui = MDI_UNIT(mnum)) == NULL)
5115 			continue;
5116 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5117 			continue;
5118 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5119 
5120 		/* Set a flag if there is at least one mirror metadevice. */
5121 		reset_mirror_flag = 1;
5122 	}
5123 
5124 	/*
5125 	 * Only wait for the global dr_timeout to finish
5126 	 *  - if there are mirror metadevices in this diskset or
5127 	 *  - if this is the local set since an unload of the md_mirror
5128 	 *    driver could follow a successful mirror halt in the local set.
5129 	 */
5130 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5131 		while ((mirror_md_ops.md_head == NULL) &&
5132 		    (mirror_timeout.dr_timeout_id != 0))
5133 			delay(md_hz);
5134 	}
5135 
5136 	return (0);
5137 }
5138 
5139 /*ARGSUSED3*/
5140 static int
5141 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5142 {
5143 	IOLOCK	lock;
5144 	minor_t		mnum = getminor(*dev);
5145 	set_t		setno;
5146 
5147 	/*
5148 	 * When doing an open of a multi owner metadevice, check to see if this
5149 	 * node is a starting node and if a reconfig cycle is underway.
5150 	 * If so, the system isn't sufficiently set up enough to handle the
5151 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5152 	 */
5153 	setno = MD_MIN2SET(mnum);
5154 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5155 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5156 			return (ENXIO);
5157 	}
5158 
5159 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5160 		/*
5161 		 * This indicates that the caller is an ioctl service routine.
5162 		 * In this case we initialise our stack-based IOLOCK and pass
5163 		 * this into the internal open routine. This allows multi-owner
5164 		 * metadevices to avoid deadlocking if an error is encountered
5165 		 * during the open() attempt. The failure case is:
5166 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5167 		 * this configuration would deadlock as the mirror code has to
5168 		 * send a state-update to the other nodes when it detects the
5169 		 * failure of the underlying submirror with an errored soft-part
5170 		 * on it. As there is a class1 message in progress (metaclear)
5171 		 * set_sm_comp_state() cannot send another class1 message;
5172 		 * instead we do not send a state_update message as the
5173 		 * metaclear is distributed and the failed submirror will be
5174 		 * cleared from the configuration by the metaclear.
5175 		 */
5176 		IOLOCK_INIT(&lock);
5177 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5178 		    md_oflags, &lock));
5179 	} else {
5180 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5181 		    md_oflags, (IOLOCK *)NULL));
5182 	}
5183 }
5184 
5185 
5186 /*ARGSUSED1*/
5187 static int
5188 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5189 {
5190 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5191 		(IOLOCK *)NULL));
5192 }
5193 
5194 
5195 /*
5196  * This routine dumps memory to the disk.  It assumes that the memory has
5197  * already been mapped into mainbus space.  It is called at disk interrupt
5198  * priority when the system is in trouble.
5199  *
5200  */
5201 static int
5202 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5203 {
5204 	mm_unit_t	*un;
5205 	dev_t		mapdev;
5206 	int		result;
5207 	int		smi;
5208 	int		any_succeed = 0;
5209 	int		save_result = 0;
5210 
5211 	/*
5212 	 * Don't need to grab the unit lock.
5213 	 * Cause nothing else is suppose to be happenning.
5214 	 * Also dump is not suppose to sleep.
5215 	 */
5216 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5217 
5218 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5219 		return (EINVAL);
5220 
5221 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5222 		return (EINVAL);
5223 
5224 	for (smi = 0; smi < NMIRROR; smi++) {
5225 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5226 			continue;
5227 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5228 		result = bdev_dump(mapdev, addr, blkno, nblk);
5229 		if (result)
5230 			save_result = result;
5231 
5232 		if (result == 0)
5233 			any_succeed++;
5234 	}
5235 
5236 	if (any_succeed)
5237 		return (0);
5238 
5239 	return (save_result);
5240 }
5241 
5242 /*
5243  * NAME: mirror_probe_dev
5244  *
5245  * DESCRITPION: force opens every component of a mirror.
5246  *
5247  * On entry the unit writerlock is held
5248  */
5249 static int
5250 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5251 {
5252 	int		i;
5253 	int		smi;
5254 	int		ci;
5255 	mm_unit_t	*un;
5256 	int		md_devopen = 0;
5257 	set_t		setno;
5258 	int		sm_cnt;
5259 	int		sm_unavail_cnt;
5260 
5261 	if (md_unit_isopen(ui))
5262 		md_devopen++;
5263 
5264 	un = MD_UNIT(mnum);
5265 	setno = MD_UN2SET(un);
5266 
5267 	sm_cnt = 0;
5268 	sm_unavail_cnt = 0;
5269 	for (i = 0; i < NMIRROR; i++) {
5270 		md_dev64_t tmpdev;
5271 		mdi_unit_t	*sm_ui;
5272 
5273 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5274 			continue;
5275 		}
5276 
5277 		sm_cnt++;
5278 		tmpdev = un->un_sm[i].sm_dev;
5279 		(void) md_layered_open(mnum, &tmpdev,
5280 				MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5281 		un->un_sm[i].sm_dev = tmpdev;
5282 
5283 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5284 
5285 		/*
5286 		 * Logic similar to that in mirror_open_all_devs.  We set or
5287 		 * clear the submirror Unavailable bit.
5288 		 */
5289 		(void) md_unit_writerlock(sm_ui);
5290 		if (submirror_unavailable(un, i, 1)) {
5291 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5292 			sm_unavail_cnt++;
5293 		} else {
5294 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5295 		}
5296 		md_unit_writerexit(sm_ui);
5297 	}
5298 
5299 	/*
5300 	 * If all of the submirrors are unavailable, the mirror is also
5301 	 * unavailable.
5302 	 */
5303 	if (sm_cnt == sm_unavail_cnt) {
5304 		ui->ui_tstate |= MD_INACCESSIBLE;
5305 	} else {
5306 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5307 	}
5308 
5309 	/*
5310 	 * Start checking from probe failures. If failures occur we
5311 	 * set the appropriate erred state only if the metadevice is in
5312 	 * use. This is specifically to prevent unnecessary resyncs.
5313 	 * For instance if the disks were accidentally disconnected when
5314 	 * the system booted up then until the metadevice is accessed
5315 	 * (like file system mount) the user can shutdown, recable and
5316 	 * reboot w/o incurring a potentially huge resync.
5317 	 */
5318 
5319 	smi = 0;
5320 	ci = 0;
5321 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5322 
5323 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5324 			/*
5325 			 * Note that for a MN set, there is no need to call
5326 			 * SE_NOTIFY as that is done when processing the
5327 			 * state change
5328 			 */
5329 			if (md_devopen) {
5330 				/*
5331 				 * Never called from ioctl context,
5332 				 * so (IOLOCK *)NULL
5333 				 */
5334 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5335 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5336 				if (!MD_MNSET_SETNO(setno)) {
5337 					SE_NOTIFY(EC_SVM_STATE,
5338 					    ESC_SVM_LASTERRED,
5339 					    SVM_TAG_METADEVICE, setno,
5340 					    MD_SID(un));
5341 				}
5342 				continue;
5343 			} else {
5344 				(void) mirror_close_all_devs(un,
5345 				    MD_OFLG_PROBEDEV);
5346 				if (!MD_MNSET_SETNO(setno)) {
5347 					SE_NOTIFY(EC_SVM_STATE,
5348 					    ESC_SVM_OPEN_FAIL,
5349 					    SVM_TAG_METADEVICE, setno,
5350 					    MD_SID(un));
5351 				}
5352 				mirror_openfail_console_info(un, smi, ci);
5353 				return (ENXIO);
5354 			}
5355 		}
5356 
5357 		/*
5358 		 * Note that for a MN set, there is no need to call
5359 		 * SE_NOTIFY as that is done when processing the
5360 		 * state change
5361 		 */
5362 		if (md_devopen) {
5363 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5364 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5365 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5366 			if (!MD_MNSET_SETNO(setno)) {
5367 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5368 				    SVM_TAG_METADEVICE, setno,
5369 				    MD_SID(un));
5370 			}
5371 		}
5372 		mirror_openfail_console_info(un, smi, ci);
5373 		ci++;
5374 	}
5375 
5376 	if (MD_MNSET_SETNO(setno)) {
5377 		send_poke_hotspares(setno);
5378 	} else {
5379 		(void) poke_hotspares();
5380 	}
5381 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5382 
5383 	return (0);
5384 }
5385 
5386 
5387 static int
5388 mirror_imp_set(
5389 	set_t	setno
5390 )
5391 {
5392 
5393 	mddb_recid_t	recid;
5394 	int		gotsomething, i;
5395 	mddb_type_t	typ1;
5396 	mddb_de_ic_t	*dep;
5397 	mddb_rb32_t	*rbp;
5398 	mm_unit32_od_t	*un32;
5399 	mm_unit_t	*un64;
5400 	minor_t		*self_id;	/* minor needs to be updated */
5401 	md_parent_t	*parent_id;	/* parent needs to be updated */
5402 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5403 	mddb_recid_t	*optrec_id;
5404 	md_dev64_t	tmpdev;
5405 
5406 
5407 	gotsomething = 0;
5408 
5409 	typ1 = (mddb_type_t)md_getshared_key(setno,
5410 	    mirror_md_ops.md_driver.md_drivername);
5411 	recid = mddb_makerecid(setno, 0);
5412 
5413 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5414 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5415 			continue;
5416 
5417 		dep = mddb_getrecdep(recid);
5418 		rbp = dep->de_rb;
5419 
5420 		if (rbp->rb_revision == MDDB_REV_RB) {
5421 			/*
5422 			 * Small device
5423 			 */
5424 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5425 			self_id = &(un32->c.un_self_id);
5426 			parent_id = &(un32->c.un_parent);
5427 			record_id = &(un32->c.un_record_id);
5428 			optrec_id = &(un32->un_rr_dirty_recid);
5429 
5430 			for (i = 0; i < un32->un_nsm; i++) {
5431 			    tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5432 			    un32->un_sm[i].sm_dev = md_cmpldev
5433 				(md_makedevice(md_major, MD_MKMIN(setno,
5434 				MD_MIN2UNIT(md_getminor(tmpdev)))));
5435 
5436 			    if (!md_update_minor(setno, mddb_getsidenum
5437 				(setno), un32->un_sm[i].sm_key))
5438 				goto out;
5439 			}
5440 		} else {
5441 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5442 			self_id = &(un64->c.un_self_id);
5443 			parent_id = &(un64->c.un_parent);
5444 			record_id = &(un64->c.un_record_id);
5445 			optrec_id = &(un64->un_rr_dirty_recid);
5446 
5447 			for (i = 0; i < un64->un_nsm; i++) {
5448 			    tmpdev = un64->un_sm[i].sm_dev;
5449 			    un64->un_sm[i].sm_dev = md_makedevice
5450 				(md_major, MD_MKMIN(setno, MD_MIN2UNIT
5451 				(md_getminor(tmpdev))));
5452 
5453 			    if (!md_update_minor(setno, mddb_getsidenum
5454 				(setno), un64->un_sm[i].sm_key))
5455 				goto out;
5456 			}
5457 		}
5458 
5459 		/*
5460 		 * Update unit with the imported setno
5461 		 *
5462 		 */
5463 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5464 
5465 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5466 		if (*parent_id != MD_NO_PARENT)
5467 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5468 		*record_id = MAKERECID(setno, DBID(*record_id));
5469 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5470 
5471 		gotsomething = 1;
5472 	}
5473 
5474 out:
5475 	return (gotsomething);
5476 }
5477 
5478 /*
5479  * NAME: mirror_check_offline
5480  *
5481  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5482  *
5483  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5484  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5485  * ioctl.
5486  */
5487 int
5488 mirror_check_offline(md_dev64_t dev, int *offline_status)
5489 {
5490 	mm_unit_t		*un;
5491 	md_error_t		mde = mdnullerror;
5492 
5493 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5494 		return (EINVAL);
5495 	*offline_status = 0;
5496 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5497 		*offline_status = 1;
5498 	return (0);
5499 }
5500 
5501 /*
5502  * NAME: mirror_inc_abr_count
5503  *
5504  * DESCRIPTION: increment the count of layered soft parts with ABR set
5505  *
5506  * Called from ioctl, so access to un_abr_count is protected by the global
5507  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5508  */
5509 int
5510 mirror_inc_abr_count(md_dev64_t dev)
5511 {
5512 	mm_unit_t		*un;
5513 	md_error_t		mde = mdnullerror;
5514 
5515 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5516 		return (EINVAL);
5517 	un->un_abr_count++;
5518 	return (0);
5519 }
5520 
5521 /*
5522  * NAME: mirror_dec_abr_count
5523  *
5524  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5525  *
5526  * Called from ioctl, so access to un_abr_count is protected by the global
5527  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5528  */
5529 int
5530 mirror_dec_abr_count(md_dev64_t dev)
5531 {
5532 	mm_unit_t		*un;
5533 	md_error_t		mde = mdnullerror;
5534 
5535 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5536 		return (EINVAL);
5537 	un->un_abr_count--;
5538 	return (0);
5539 }
5540 
5541 static md_named_services_t mirror_named_services[] = {
5542 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5543 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5544 	{mirror_rename_check,				MDRNM_CHECK	    },
5545 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5546 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5547 			MDRNM_PARENT_UPDATE_TO},
5548 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5549 			MDRNM_SELF_UPDATE_FROM_DOWN },
5550 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5551 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5552 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5553 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5554 	{ NULL,						0		    }
5555 };
5556 
5557 md_ops_t mirror_md_ops = {
5558 	mirror_open,		/* open */
5559 	mirror_close,		/* close */
5560 	md_mirror_strategy,	/* strategy */
5561 	NULL,			/* print */
5562 	mirror_dump,		/* dump */
5563 	NULL,			/* read */
5564 	NULL,			/* write */
5565 	md_mirror_ioctl,	/* mirror_ioctl, */
5566 	mirror_snarf,		/* mirror_snarf */
5567 	mirror_halt,		/* mirror_halt */
5568 	NULL,			/* aread */
5569 	NULL,			/* awrite */
5570 	mirror_imp_set,		/* import set */
5571 	mirror_named_services
5572 };
5573 
5574 /* module specific initilization */
5575 static void
5576 init_init()
5577 {
5578 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5579 
5580 	/* Initialize the parent and child save memory pools */
5581 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5582 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5583 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5584 	    0);
5585 
5586 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5587 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5588 	    mirror_child_constructor, mirror_child_destructor,
5589 	    mirror_run_queue, NULL, NULL, 0);
5590 
5591 	/*
5592 	 * Insure wowbuf_size is a multiple of DEV_BSIZE,
5593 	 * then initialize wowbuf memory pool.
5594 	 */
5595 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5596 	if (md_wowbuf_size <= 0)
5597 		md_wowbuf_size = 2 * DEV_BSIZE;
5598 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5599 		md_wowbuf_size = (32 * DEV_BSIZE);
5600 
5601 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5602 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5603 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5604 
5605 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5606 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5607 
5608 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5609 }
5610 
5611 /* module specific uninitilization (undo init_init()) */
5612 static void
5613 fini_uninit()
5614 {
5615 	kmem_cache_destroy(mirror_parent_cache);
5616 	kmem_cache_destroy(mirror_child_cache);
5617 	kmem_cache_destroy(mirror_wowblk_cache);
5618 	mirror_parent_cache = mirror_child_cache =
5619 	    mirror_wowblk_cache = NULL;
5620 
5621 	mutex_destroy(&mirror_timeout.dr_mx);
5622 	mutex_destroy(&hotspare_request.dr_mx);
5623 	mutex_destroy(&non_ff_drv_mutex);
5624 }
5625 
5626 /* define the module linkage */
5627 MD_PLUGIN_MISC_MODULE("mirrors module %I%", init_init(), fini_uninit())
5628