xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 7627:8599a7568728)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/modctl.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/debug.h>
49 #include <sys/dklabel.h>
50 #include <vm/hat.h>
51 #include <sys/lvm/mdvar.h>
52 #include <sys/lvm/md_mirror.h>
53 #include <sys/lvm/md_convert.h>
54 #include <sys/lvm/md_mddb.h>
55 #include <sys/esunddi.h>
56 
57 #include <sys/sysevent/eventdefs.h>
58 #include <sys/sysevent/svm.h>
59 #include <sys/lvm/mdmn_commd.h>
60 #include <sys/avl.h>
61 
62 md_ops_t		mirror_md_ops;
63 #ifndef	lint
64 char			_depends_on[] = "drv/md";
65 md_ops_t		*md_interface_ops = &mirror_md_ops;
66 #endif
67 
68 extern mdq_anchor_t	md_done_daemon;
69 extern mdq_anchor_t	md_mstr_daemon;
70 extern mdq_anchor_t	md_mirror_daemon;
71 extern mdq_anchor_t	md_mirror_io_daemon;
72 extern mdq_anchor_t	md_mirror_rs_daemon;
73 extern mdq_anchor_t	md_mhs_daemon;
74 
75 extern unit_t		md_nunits;
76 extern set_t		md_nsets;
77 extern md_set_t		md_set[];
78 
79 extern int		md_status;
80 extern clock_t		md_hz;
81 
82 extern md_krwlock_t	md_unit_array_rw;
83 extern kmutex_t		md_mx;
84 extern kcondvar_t	md_cv;
85 extern int		md_mtioctl_cnt;
86 
87 daemon_request_t	mirror_timeout;
88 static daemon_request_t	hotspare_request;
89 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
90 
91 int	md_mirror_mcs_buf_off;
92 
93 /* Flags for mdmn_ksend_message to allow debugging */
94 int	md_mirror_msg_flags;
95 
96 #ifdef DEBUG
97 /* Flag to switch on debug messages */
98 int	mirror_debug_flag = 0;
99 #endif
100 
101 /*
102  * Struct used to hold the count of DMR reads and the timestamp of the
103  * last DMR read.  It is used to verify, using a debugger, that the DMR
104  * read ioctl has been executed.
105  */
106 dmr_stats_t	mirror_dmr_stats = {0, 0};
107 
108 /*
109  * Mutex protecting list of non-failfast drivers.
110  */
111 static kmutex_t	non_ff_drv_mutex;
112 extern char	**non_ff_drivers;
113 
114 extern major_t	md_major;
115 
116 /*
117  * Write-On-Write memory pool.
118  */
119 static void		copy_write_cont(wowhdr_t *wowhdr);
120 static kmem_cache_t	*mirror_wowblk_cache = NULL;
121 static int		md_wowbuf_size = 16384;
122 static size_t		md_wowblk_size;
123 
124 /*
125  * This is a flag that allows:
126  *	- disabling the write-on-write mechanism.
127  *	- logging occurrences of write-on-write
128  *	- switching how write-on-write requests are processed
129  * md_mirror_wow_cnt counts occurrences of write-on-write.
130  */
131 static uint_t	md_mirror_wow_flg = 0;
132 static int	md_mirror_wow_cnt = 0;
133 
134 /*
135  * Tunable to enable/disable dirty region
136  * processing when closing down a mirror.
137  */
138 static int	new_resync = 1;
139 kmem_cache_t	*mirror_parent_cache = NULL;
140 kmem_cache_t	*mirror_child_cache = NULL;
141 
142 extern int	md_ff_disable;		/* disable failfast */
143 
144 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
145 static void	mirror_read_strategy(buf_t *, int, void *);
146 static void	mirror_write_strategy(buf_t *, int, void *);
147 static void	become_owner(daemon_queue_t *);
148 static int	mirror_done(struct buf *cb);
149 static int	mirror_done_common(struct buf *cb);
150 static void	clear_retry_error(struct buf *cb);
151 
152 /*
153  * patchables
154  */
155 int	md_min_rr_size	= 200;	/* 200 blocks, or 100k */
156 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
157 
158 /*
159  * Patchable to change the delay before rescheduling a mirror ownership
160  * request.  Value is in microseconds; the default is 0.5 seconds.
161  */
162 clock_t	md_mirror_owner_to = 500000;
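
/*
 * Editorial note (hedged, not part of the original source): patchables such
 * as md_min_rr_size, md_def_num_rr and md_mirror_owner_to are typically set
 * from /etc/system, for example:
 *
 *	set md_mirror:md_mirror_owner_to = 1000000
 *
 * The module name "md_mirror" and this syntax are assumptions based on the
 * usual SVM packaging, not something stated in this file.
 */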
163 
164 /*ARGSUSED1*/
165 static int
166 mirror_parent_constructor(void *p, void *d1, int d2)
167 {
168 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
169 	return (0);
170 }
171 
172 static void
173 mirror_parent_init(md_mps_t *ps)
174 {
175 	bzero(ps, offsetof(md_mps_t, ps_mx));
176 }
177 
178 /*ARGSUSED1*/
179 static void
180 mirror_parent_destructor(void *p, void *d)
181 {
182 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
183 }
184 
185 /*ARGSUSED1*/
186 static int
187 mirror_child_constructor(void *p, void *d1, int d2)
188 {
189 	bioinit(&((md_mcs_t *)p)->cs_buf);
190 	return (0);
191 }
192 
193 void
194 mirror_child_init(md_mcs_t *cs)
195 {
196 	cs->cs_ps = NULL;
197 	cs->cs_mdunit = 0;
198 	md_bioreset(&cs->cs_buf);
199 }
200 
201 /*ARGSUSED1*/
202 static void
203 mirror_child_destructor(void *p, void *d)
204 {
205 	biofini(&((md_mcs_t *)p)->cs_buf);
206 }
207 
208 static void
209 mirror_wowblk_init(wowhdr_t *p)
210 {
211 	bzero(p, md_wowblk_size);
212 }
213 
214 static void
215 send_poke_hotspares_msg(daemon_request_t *drq)
216 {
217 	int			rval;
218 	md_mn_msg_pokehsp_t	pokehsp;
219 	md_mn_kresult_t		*kresult;
220 	set_t			setno = (set_t)drq->dq.qlen;
221 
222 	pokehsp.pokehsp_setno = setno;
223 
224 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
225 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
226 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
227 	    sizeof (pokehsp), kresult);
228 
229 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
230 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
231 		cmn_err(CE_PANIC,
232 		    "ksend_message failure: POKE_HOTSPARES");
233 	}
234 	kmem_free(kresult, sizeof (md_mn_kresult_t));
235 
236 	/* Allow further requests to use this set's queue structure */
237 	mutex_enter(&drq->dr_mx);
238 	drq->dr_pending = 0;
239 	mutex_exit(&drq->dr_mx);
240 }
241 
242 /*
243  * Send a poke_hotspares message to the master node. To avoid swamping the
244  * commd handler with requests, we only send a message if there is not one
245  * already outstanding. We punt the request to a separate thread context as
246  * we cannot afford to block waiting on the request to be serviced. This is
247  * essential when a reconfig cycle is in progress as any open() of a multinode
248  * metadevice may result in a livelock.
249  */
250 static void
251 send_poke_hotspares(set_t setno)
252 {
253 	daemon_request_t	*drq = &mn_hs_request[setno];
254 
255 	mutex_enter(&drq->dr_mx);
256 	if (drq->dr_pending == 0) {
257 		drq->dr_pending = 1;
258 		drq->dq.qlen = (int)setno;
259 		daemon_request(&md_mhs_daemon,
260 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
261 	}
262 	mutex_exit(&drq->dr_mx);
263 }
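
/*
 * Editorial sketch (hedged, not part of the original source): the
 * dedup-and-defer pattern used by send_poke_hotspares()/poke_hotspares()
 * generalizes to any request that must not block its caller:
 *
 *	mutex_enter(&drq->dr_mx);
 *	if (drq->dr_pending == 0) {
 *		drq->dr_pending = 1;
 *		daemon_request(&md_mhs_daemon, handler,
 *		    (daemon_queue_t *)drq, REQ_OLD);
 *	}
 *	mutex_exit(&drq->dr_mx);
 *
 * Here "handler" is a placeholder; the real handler clears dr_pending under
 * dr_mx once it has finished, as send_poke_hotspares_msg() does above, and
 * that is what re-arms the queue for the next request.
 */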
264 
265 void
266 mirror_set_sm_state(
267 	mm_submirror_t		*sm,
268 	mm_submirror_ic_t	*smic,
269 	sm_state_t		newstate,
270 	int			force)
271 {
272 	int			compcnt;
273 	int			i;
274 	int			errcnt;
275 	sm_state_t		origstate;
276 	md_m_shared_t		*shared;
277 
278 	if (force) {
279 		sm->sm_state = newstate;
280 		uniqtime32(&sm->sm_timestamp);
281 		return;
282 	}
283 
284 	origstate = newstate;
285 
286 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
287 	for (i = 0, errcnt = 0; i < compcnt; i++) {
288 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
289 		    (sm->sm_dev, sm, i);
290 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
291 			newstate |= SMS_COMP_ERRED;
292 		if (shared->ms_state & (CS_RESYNC))
293 			newstate |= SMS_COMP_RESYNC;
294 		if (shared->ms_state & CS_ERRED)
295 			errcnt++;
296 	}
297 
298 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
299 		newstate &= ~origstate;
300 
301 	if (errcnt == compcnt)
302 		newstate |= SMS_ALL_ERRED;
303 	else
304 		newstate &= ~SMS_ALL_ERRED;
305 
306 	sm->sm_state = newstate;
307 	uniqtime32(&sm->sm_timestamp);
308 }
309 
310 static int
311 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
312 							int frm_probe)
313 {
314 	mm_submirror_t		*sm;
315 	mm_submirror_ic_t	*smic;
316 	md_m_shared_t		*shared;
317 	int			ci;
318 	int			i;
319 	int			compcnt;
320 	int			open_comp; /* flag for open component */
321 
322 	for (i = *smi; i < NMIRROR; i++) {
323 		sm = &un->un_sm[i];
324 		smic = &un->un_smic[i];
325 
326 		if (!SMS_IS(sm, SMS_INUSE))
327 			continue;
328 
329 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
330 		for (ci = *cip; ci < compcnt; ci++) {
331 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
332 			    (sm->sm_dev, sm, ci);
333 			/*
334 			 * If called from any routine but probe, we check for the
335 			 * MDM_S_ISOPEN flag. Since probe does a pseudo open, it
336 			 * sets the MDM_S_PROBEOPEN flag and we test for that flag
337 			 * instead. The two tests are mutually exclusive.
338 			 */
339 			open_comp = (frm_probe) ?
340 			    (shared->ms_flags & MDM_S_PROBEOPEN):
341 			    (shared->ms_flags & MDM_S_ISOPEN);
342 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
343 			    ((shared->ms_state == CS_OKAY) ||
344 			    (shared->ms_state == CS_RESYNC))) {
345 				if (clr_error) {
346 					shared->ms_flags &= ~MDM_S_IOERR;
347 				}
348 				*cip = ci;
349 				*smi = i;
350 				return (1);
351 			}
352 
353 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
354 				shared->ms_flags &= ~MDM_S_IOERR;
355 			}
356 		}
357 
358 		*cip = 0;
359 	}
360 	return (0);
361 }
362 
363 /*ARGSUSED*/
364 static void
365 mirror_run_queue(void *d)
366 {
367 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
368 		md_daemon(1, &md_done_daemon);
369 }
370 /*
371  * check_comp_4_hotspares
372  *
373  * This function attempts to allocate a hotspare for this component if the
374  * component is in error. In a MN set, the function can be called in 2 modes.
375  * It can be called either when a component error has been detected or when a
376  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
377  * in flags and the request is sent to all nodes.
378  * The handler on each of the nodes then calls this function with
379  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
380  *
381  * For non-MN sets the function simply attempts to allocate a hotspare.
382  *
383  * On entry, the following locks are held
384  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
385  *	md_unit_writerlock
386  *
387  * Returns	0 if ok
388  *		1 if the unit containing the component has been cleared while
389  *		  the mdmn_ksend_message() was being executed
390  */
391 extern int
392 check_comp_4_hotspares(
393 	mm_unit_t	*un,
394 	int		smi,
395 	int		ci,
396 	uint_t		flags,
397 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
398 	IOLOCK		*lockp	/* can be NULL */
399 )
400 {
401 	mm_submirror_t		*sm;
402 	mm_submirror_ic_t	*smic;
403 	md_m_shared_t		*shared;
404 	mddb_recid_t		recids[6];
405 	minor_t			mnum;
406 	intptr_t		(*hs_dev)();
407 	void			(*hs_done)();
408 	void			*hs_data;
409 	md_error_t		mde = mdnullerror;
410 	set_t			setno;
411 	md_mn_msg_allochsp_t	allochspmsg;
412 	md_mn_kresult_t		*kresult;
413 	mm_unit_t		*new_un;
414 	int			rval;
415 
416 	mnum = MD_SID(un);
417 	setno = MD_UN2SET(un);
418 	sm = &un->un_sm[smi];
419 	smic = &un->un_smic[smi];
420 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
421 	    (sm->sm_dev, sm, ci);
422 
423 	if (shared->ms_state != CS_ERRED)
424 		return (0);
425 
426 	/* Don't start a new component resync if a resync is already running. */
427 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
428 		return (0);
429 
430 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
431 		uint_t		msgflags;
432 		md_mn_msgtype_t	msgtype;
433 
434 		/* Send allocate hotspare message to all nodes */
435 
436 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
437 		allochspmsg.msg_allochsp_sm = smi;
438 		allochspmsg.msg_allochsp_comp = ci;
439 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
440 
441 		/*
442 		 * Before calling mdmn_ksend_message(), release locks
443 		 * Can never be in the context of an ioctl.
444 		 */
445 		md_unit_writerexit(MDI_UNIT(mnum));
446 		if (flags & MD_HOTSPARE_LINKHELD)
447 			rw_exit(&mirror_md_ops.md_link_rw.lock);
448 #ifdef DEBUG
449 		if (mirror_debug_flag)
450 			printf("send alloc hotspare, flags="
451 			    "0x%x %x, %x, %x, %x\n", flags,
452 			    allochspmsg.msg_allochsp_mnum,
453 			    allochspmsg.msg_allochsp_sm,
454 			    allochspmsg.msg_allochsp_comp,
455 			    allochspmsg.msg_allochsp_hs_id);
456 #endif
457 		if (flags & MD_HOTSPARE_WMUPDATE) {
458 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
459 			/*
460 			 * When coming from an update of watermarks, there
461 			 * must already be a message logged that triggered
462 			 * this action. So, no need to log this message, too.
463 			 */
464 			msgflags = MD_MSGF_NO_LOG;
465 		} else {
466 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
467 			msgflags = MD_MSGF_DEFAULT_FLAGS;
468 		}
469 
470 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
471 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
472 		    (char *)&allochspmsg, sizeof (allochspmsg),
473 		    kresult);
474 
475 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
476 #ifdef DEBUG
477 			if (mirror_debug_flag)
478 				mdmn_ksend_show_error(rval, kresult,
479 				    "ALLOCATE HOTSPARE");
480 #endif
481 			/*
482 			 * If message is sent ok but exitval indicates an error
483 			 * it must be because the mirror has been cleared. In
484 			 * this case re-obtain lock and return an error
485 			 */
486 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
487 				if (flags & MD_HOTSPARE_LINKHELD) {
488 					rw_enter(&mirror_md_ops.md_link_rw.lock,
489 					    RW_READER);
490 				}
491 				kmem_free(kresult, sizeof (md_mn_kresult_t));
492 				return (1);
493 			}
494 			cmn_err(CE_PANIC,
495 			    "ksend_message failure: ALLOCATE_HOTSPARE");
496 		}
497 		kmem_free(kresult, sizeof (md_mn_kresult_t));
498 
499 		/*
500 		 * re-obtain the locks
501 		 */
502 		if (flags & MD_HOTSPARE_LINKHELD)
503 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
504 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
505 
506 		/*
507 		 * As we had to release the locks in order to send the
508 		 * message to all nodes, we need to check to see if the
509 		 * unit has changed. If it has we release the writerlock
510 		 * and return fail.
511 		 */
512 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
513 			md_unit_writerexit(MDI_UNIT(mnum));
514 			return (1);
515 		}
516 	} else {
517 		if (MD_MNSET_SETNO(setno)) {
518 			/*
519 			 * If 2 or more nodes simultaneously see a
520 			 * component failure, these nodes will each
521 			 * send an ALLOCATE_HOTSPARE[2] message.
522 			 * The first message will allocate the hotspare
523 			 * and the subsequent messages should do nothing.
524 			 *
525 			 * If a slave node doesn't have a hotspare allocated
526 			 * at the time the message is initiated, then the
527 			 * passed in hs_id will be 0.  If the node
528 			 * executing this routine has a component shared
529 			 * ms_hs_id of non-zero, but the message shows a
530 			 * hs_id of 0, then just return since a hotspare
531 			 * has already been allocated for this failing
532 			 * component.  When the slave node returns from
533 			 * the ksend_message the hotspare will have
534 			 * already been allocated.
535 			 *
536 			 * If the slave node does send an hs_id of non-zero,
537 			 * and the slave node's hs_id matches this node's
538 			 * ms_hs_id, then the hotspare has error'd and
539 			 * should be replaced.
540 			 *
541 			 * If the slave node sends an hs_id of non-zero and
542 			 * this node has a different shared ms_hs_id, then
543 			 * just return since this hotspare has already
544 			 * been hotspared.
545 			 */
546 			if (shared->ms_hs_id != 0) {
547 				if (hs_id == 0) {
548 #ifdef DEBUG
549 					if (mirror_debug_flag) {
550 						printf("check_comp_4_hotspares"
551 						    "(NOXMIT), short circuit "
552 						    "hs_id=0x%x, "
553 						    "ms_hs_id=0x%x\n",
554 						    hs_id, shared->ms_hs_id);
555 					}
556 #endif
557 					return (0);
558 				}
559 				if (hs_id != shared->ms_hs_id) {
560 #ifdef DEBUG
561 					if (mirror_debug_flag) {
562 						printf("check_comp_4_hotspares"
563 						    "(NOXMIT), short circuit2 "
564 						    "hs_id=0x%x, "
565 						    "ms_hs_id=0x%x\n",
566 						    hs_id, shared->ms_hs_id);
567 					}
568 #endif
569 					return (0);
570 				}
571 			}
572 		}
573 
574 		sm = &un->un_sm[smi];
575 		hs_dev = md_get_named_service(sm->sm_dev, 0,
576 		    "hotspare device", 0);
577 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
578 		    &hs_data) != 0)
579 			return (0);
580 
581 		/*
582 		 * set_sm_comp_state() commits the modified records.
583 		 * As we don't transmit the changes, no need to drop the lock.
584 		 */
585 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
586 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
587 
588 		(*hs_done)(sm->sm_dev, hs_data);
589 
590 		mirror_check_failfast(mnum);
591 
592 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
593 		    setno, MD_SID(un));
594 
595 		/*
596 		 * For a multi-node set we need to reset the un_rs_type,
597 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
598 		 * hot-spare resync must copy all applicable data.
599 		 */
600 		if (MD_MNSET_SETNO(setno)) {
601 			un->un_rs_type = MD_RS_NONE;
602 			un->un_rs_resync_done = 0;
603 			un->un_rs_resync_2_do = 0;
604 		}
605 
606 		/*
607 		 * Must drop writer lock since mirror_resync_unit will
608 		 * open devices and must be able to grab readerlock.
609 		 * Don't need to drop IOLOCK since any descendent routines
610 		 * calling ksend_messages will drop the IOLOCK as needed.
611 		 *
612 		 */
613 		if (lockp) {
614 			md_ioctl_writerexit(lockp);
615 		} else {
616 			md_unit_writerexit(MDI_UNIT(mnum));
617 		}
618 
619 		/* start resync */
620 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
621 
622 		if (lockp) {
623 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
624 		} else {
625 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
626 		}
627 	}
628 	return (0);
629 }
630 
631 /*
632  * check_unit_4_hotspares
633  *
634  * For a given mirror, allocate hotspares, if available for any components
635  * that are in error
636  *
637  * Returns	0 if ok
638  *		1 if check_comp_4_hotspares returns non-zero. This will only
639  *		  happen for a MN unit where the unit has been cleared while
640  *		  the allocate hotspare message is sent to all nodes.
641  */
642 static int
643 check_unit_4_hotspares(mm_unit_t *un, int flags)
644 {
645 	mm_submirror_t		*sm;
646 	mm_submirror_ic_t	*smic;
647 	int			ci;
648 	int			i;
649 	int			compcnt;
650 
651 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
652 		return (0);
653 
654 	for (i = 0; i < NMIRROR; i++) {
655 		sm = &un->un_sm[i];
656 		smic = &un->un_smic[i];
657 		if (!SMS_IS(sm, SMS_INUSE))
658 			continue;
659 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
660 		for (ci = 0; ci < compcnt; ci++) {
661 			md_m_shared_t		*shared;
662 
663 			shared = (md_m_shared_t *)
664 			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
665 			/*
666 			 * Never called from ioctl context, so pass in
667 			 * (IOLOCK *)NULL.  Pass through flags from calling
668 			 * routine, also setting XMIT flag.
669 			 */
670 			if (check_comp_4_hotspares(un, i, ci,
671 			    (MD_HOTSPARE_XMIT | flags),
672 			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
673 				return (1);
674 		}
675 	}
676 	return (0);
677 }
678 
679 static void
680 check_4_hotspares(daemon_request_t *drq)
681 {
682 	mdi_unit_t	*ui;
683 	mm_unit_t	*un;
684 	md_link_t	*next;
685 	int		x;
686 
687 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
688 	drq->dr_pending = 0;		/* again in low level routine if */
689 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
690 
691 	/*
692 	 * Used to have a problem here. The disksets weren't marked as being
693 	 * MNHOLD. This opened a window where we could be searching for
694 	 * hotspares and have the disk set unloaded (released) from under
695 	 * us causing a panic in stripe_component_count().
696 	 * The way to prevent that is to mark the set MNHOLD which prevents
697 	 * any diskset from being released while we are scanning the mirrors,
698 	 * submirrors and components.
699 	 */
700 
701 	for (x = 0; x < md_nsets; x++)
702 		md_holdset_enter(x);
703 
704 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
705 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
706 		ui = MDI_UNIT(next->ln_id);
707 
708 		un = (mm_unit_t *)md_unit_readerlock(ui);
709 
710 		/*
711 		 * Only check the unit if we are the master for this set
712 		 * For an MN set, poke_hotspares() is only effective on the
713 		 * master
714 		 */
715 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
716 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
717 			md_unit_readerexit(ui);
718 			continue;
719 		}
720 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
721 			md_unit_readerexit(ui);
722 			continue;
723 		}
724 		md_unit_readerexit(ui);
725 
726 		un = (mm_unit_t *)md_unit_writerlock(ui);
727 		/*
728 		 * check_unit_4_hotspares will exit 1 if the unit has been
729 		 * removed during the process of allocating the hotspare.
730 		 * This can only happen for a MN metadevice. If unit no longer
731 		 * exists, no need to release writerlock
732 		 */
733 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
734 			md_unit_writerexit(ui);
735 		else {
736 			/*
737 			 * If check_unit_4_hotspares failed, queue another
738 			 * request and break out of this one
739 			 */
740 			(void) poke_hotspares();
741 			break;
742 		}
743 	}
744 	rw_exit(&mirror_md_ops.md_link_rw.lock);
745 
746 	for (x = 0; x < md_nsets; x++)
747 		md_holdset_exit(x);
748 }
749 
750 /*
751  * poke_hotspares
752  *
753  * If there is not a poke_hotspares request already pending, queue a request
754  * to call check_4_hotspares(). This will scan all mirrors and attempt to
755  * allocate hotspares for all components in error.
756  */
757 int
758 poke_hotspares()
759 {
760 	mutex_enter(&hotspare_request.dr_mx);
761 	if (hotspare_request.dr_pending == 0) {
762 		hotspare_request.dr_pending = 1;
763 		daemon_request(&md_mhs_daemon,
764 		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
765 		    REQ_OLD);
766 	}
767 	mutex_exit(&hotspare_request.dr_mx);
768 	return (0);
769 }
770 
771 static void
772 free_all_ecomps(err_comp_t *ecomp)
773 {
774 	err_comp_t	*d;
775 
776 	while (ecomp != NULL) {
777 		d = ecomp;
778 		ecomp = ecomp->ec_next;
779 		kmem_free(d, sizeof (err_comp_t));
780 	}
781 }
782 
783 /*
784  * NAME: mirror_openfail_console_info
785  *
786  * DESCRIPTION: Prints an informative message to the console when the
787  *		mirror cannot be opened.
788  *
789  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
790  *	       int		smi - submirror index
791  *	       int		ci - component index
792  */
793 
794 void
795 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
796 {
797 	void (*get_dev)();
798 	ms_cd_info_t cd;
799 	md_dev64_t tmpdev;
800 
801 	tmpdev = un->un_sm[smi].sm_dev;
802 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
803 	if (get_dev != NULL) {
804 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
805 		cmn_err(CE_WARN, "md %s: open error on %s",
806 		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
807 		    cd.cd_dev, NULL, 0));
808 	} else {
809 		cmn_err(CE_WARN, "md %s: open error",
810 		    md_shortname(MD_SID(un)));
811 	}
812 }
813 
814 static int
815 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
816 {
817 	int i;
818 	md_dev64_t dev;
819 
820 	for (i = 0; i < NMIRROR; i++) {
821 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
822 			continue;
823 		dev = un->un_sm[i].sm_dev;
824 		md_layered_close(dev, md_cflags);
825 	}
826 	return (0);
827 }
828 
829 /*
830  * Keep track of drivers that don't support failfast.  We use this so that
831  * we only log one diagnostic message for each of these drivers, no matter
832  * how many times we run the mirror_check_failfast function.
833  * Return 1 if this is a new driver that does not support failfast,
834  * return 0 if we have already seen this non-failfast driver.
835  */
836 static int
837 new_non_ff_driver(const char *s)
838 {
839 	mutex_enter(&non_ff_drv_mutex);
840 	if (non_ff_drivers == NULL) {
841 		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
842 		    KM_NOSLEEP);
843 		if (non_ff_drivers == NULL) {
844 			mutex_exit(&non_ff_drv_mutex);
845 			return (1);
846 		}
847 
848 		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
849 		    KM_NOSLEEP);
850 		if (non_ff_drivers[0] == NULL) {
851 			kmem_free(non_ff_drivers, 2 * sizeof (char *));
852 			non_ff_drivers = NULL;
853 			mutex_exit(&non_ff_drv_mutex);
854 			return (1);
855 		}
856 
857 		(void) strcpy(non_ff_drivers[0], s);
858 		non_ff_drivers[1] = NULL;
859 
860 	} else {
861 		int i;
862 		char **tnames;
863 		char **tmp;
864 
865 		for (i = 0; non_ff_drivers[i] != NULL; i++) {
866 			if (strcmp(s, non_ff_drivers[i]) == 0) {
867 				mutex_exit(&non_ff_drv_mutex);
868 				return (0);
869 			}
870 		}
871 
872 		/* allow for new element and null */
873 		i += 2;
874 		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
875 		if (tnames == NULL) {
876 			mutex_exit(&non_ff_drv_mutex);
877 			return (1);
878 		}
879 
880 		for (i = 0; non_ff_drivers[i] != NULL; i++)
881 			tnames[i] = non_ff_drivers[i];
882 
883 		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
884 		if (tnames[i] == NULL) {
885 			/* adjust i so that it is the right count to free */
886 			kmem_free(tnames, (i + 2) * sizeof (char *));
887 			mutex_exit(&non_ff_drv_mutex);
888 			return (1);
889 		}
890 
891 		(void) strcpy(tnames[i++], s);
892 		tnames[i] = NULL;
893 
894 		tmp = non_ff_drivers;
895 		non_ff_drivers = tnames;
896 		/* i now represents the count we previously alloced */
897 		kmem_free(tmp, i * sizeof (char *));
898 	}
899 	mutex_exit(&non_ff_drv_mutex);
900 
901 	return (1);
902 }
903 
904 /*
905  * Check for the "ddi-failfast-supported" devtree property on each submirror
906  * component to indicate if we should do I/O to that submirror with the
907  * B_FAILFAST flag set or not.  This check is made at various state transitions
908  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
909  * only need to check one drive (e.g. hotspare) but since the check is
910  * fast and infrequent and sometimes needs to be done on all components we
911  * just check all components on each call.
912  */
913 void
914 mirror_check_failfast(minor_t mnum)
915 {
916 	int		i;
917 	mm_unit_t	*un;
918 
919 	if (md_ff_disable)
920 		return;
921 
922 	un = MD_UNIT(mnum);
923 
924 	for (i = 0; i < NMIRROR; i++) {
925 		int			ci;
926 		int			cnt;
927 		int			ff = 1;
928 		mm_submirror_t		*sm;
929 		mm_submirror_ic_t	*smic;
930 		void			(*get_dev)();
931 
932 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
933 			continue;
934 
935 		sm = &un->un_sm[i];
936 		smic = &un->un_smic[i];
937 
938 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
939 		    "get device", 0);
940 
941 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
942 		for (ci = 0; ci < cnt; ci++) {
943 			int		found = 0;
944 			dev_t		ci_dev;
945 			major_t		major;
946 			dev_info_t	*devi;
947 			ms_cd_info_t	cd;
948 
949 			/*
950 			 * this already returns the hs
951 			 * dev if the device is spared
952 			 */
953 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
954 
955 			ci_dev = md_dev64_to_dev(cd.cd_dev);
956 			major = getmajor(ci_dev);
957 
958 			if (major == md_major) {
959 				/*
960 				 * this component must be a soft
961 				 * partition; get the real dev
962 				 */
963 				minor_t	dev_mnum;
964 				mdi_unit_t	*ui;
965 				mp_unit_t	*un;
966 				set_t	setno;
967 				side_t	side;
968 				md_dev64_t	tmpdev;
969 
970 				ui = MDI_UNIT(getminor(ci_dev));
971 
972 				/* grab necessary lock */
973 				un = (mp_unit_t *)md_unit_readerlock(ui);
974 
975 				dev_mnum = MD_SID(un);
976 				setno = MD_MIN2SET(dev_mnum);
977 				side = mddb_getsidenum(setno);
978 
979 				tmpdev = un->un_dev;
980 
981 				/* Get dev by device id */
982 				if (md_devid_found(setno, side,
983 				    un->un_key) == 1) {
984 					tmpdev = md_resolve_bydevid(dev_mnum,
985 					    tmpdev, un->un_key);
986 				}
987 
988 				md_unit_readerexit(ui);
989 
990 				ci_dev = md_dev64_to_dev(tmpdev);
991 				major = getmajor(ci_dev);
992 			}
993 
994 			if (ci_dev != NODEV32 &&
995 			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
996 			    != NULL) {
997 				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
998 				int		propvalue = 0;
999 				int		proplength = sizeof (int);
1000 				int		error;
1001 				struct cb_ops	*cb;
1002 
1003 				if ((cb = devopsp[major]->devo_cb_ops) !=
1004 				    NULL) {
1005 					error = (*cb->cb_prop_op)
1006 					    (DDI_DEV_T_ANY, devi, prop_op,
1007 					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1008 					    "ddi-failfast-supported",
1009 					    (caddr_t)&propvalue, &proplength);
1010 
1011 					if (error == DDI_PROP_SUCCESS)
1012 						found = 1;
1013 				}
1014 
1015 				if (!found && new_non_ff_driver(
1016 				    ddi_driver_name(devi))) {
1017 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
1018 					    "disabled on %s",
1019 					    ddi_driver_name(devi));
1020 				}
1021 
1022 				ddi_release_devi(devi);
1023 			}
1024 
1025 			/*
1026 			 * All components must support
1027 			 * failfast in the submirror.
1028 			 */
1029 			if (!found) {
1030 				ff = 0;
1031 				break;
1032 			}
1033 		}
1034 
1035 		if (ff) {
1036 			sm->sm_flags |= MD_SM_FAILFAST;
1037 		} else {
1038 			sm->sm_flags &= ~MD_SM_FAILFAST;
1039 		}
1040 	}
1041 }
1042 
1043 /*
1044  * Return true if the submirror is unavailable.
1045  * If any of the submirror components are opened then the submirror cannot
1046  * be unavailable (MD_INACCESSIBLE).
1047  * If any of the components are already in the errored state, then the submirror
1048  * cannot be unavailable (MD_INACCESSIBLE).
1049  */
1050 static bool_t
1051 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1052 {
1053 	mm_submirror_t		*sm;
1054 	mm_submirror_ic_t	*smic;
1055 	md_m_shared_t		*shared;
1056 	int			ci;
1057 	int			compcnt;
1058 
1059 	sm = &un->un_sm[smi];
1060 	smic = &un->un_smic[smi];
1061 
1062 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1063 	for (ci = 0; ci < compcnt; ci++) {
1064 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1065 		    (sm->sm_dev, sm, ci);
1066 		if (from_probe) {
1067 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1068 				return (B_FALSE);
1069 		} else {
1070 			if (shared->ms_flags & MDM_S_ISOPEN)
1071 				return (B_FALSE);
1072 		}
1073 		if (shared->ms_state == CS_ERRED ||
1074 		    shared->ms_state == CS_LAST_ERRED)
1075 			return (B_FALSE);
1076 	}
1077 
1078 	return (B_TRUE);
1079 }
1080 
1081 static int
1082 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1083 {
1084 	int		i;
1085 	mm_unit_t	*un;
1086 	mdi_unit_t	*ui;
1087 	int		err;
1088 	int		smi;
1089 	int		ci;
1090 	err_comp_t	*c;
1091 	err_comp_t	*ecomps = NULL;
1092 	int		smmask = 0;
1093 	set_t		setno;
1094 	int		sm_cnt;
1095 	int		sm_unavail_cnt;
1096 
1097 	mirror_check_failfast(mnum);
1098 
1099 	un = MD_UNIT(mnum);
1100 	ui = MDI_UNIT(mnum);
1101 	setno = MD_UN2SET(un);
1102 
1103 	for (i = 0; i < NMIRROR; i++) {
1104 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1105 
1106 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1107 			continue;
1108 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1109 			smmask |= SMI2BIT(i);
1110 		un->un_sm[i].sm_dev = tmpdev;
1111 	}
1112 
1113 	/*
1114 	 * If smmask is clear, all submirrors are accessible. Clear the
1115 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1116 	 * mirror device.   If smmask is set, we have to determine which of the
1117 	 * submirrors are in error. If no submirror is accessible we mark the
1118 	 * whole mirror as MD_INACCESSIBLE.
1119 	 */
1120 	if (smmask == 0) {
1121 		if (lockp) {
1122 			md_ioctl_readerexit(lockp);
1123 			(void) md_ioctl_writerlock(lockp, ui);
1124 		} else {
1125 			md_unit_readerexit(ui);
1126 			(void) md_unit_writerlock(ui);
1127 		}
1128 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1129 		if (lockp) {
1130 			md_ioctl_writerexit(lockp);
1131 			(void) md_ioctl_readerlock(lockp, ui);
1132 		} else {
1133 			md_unit_writerexit(ui);
1134 			(void) md_unit_readerlock(ui);
1135 		}
1136 
1137 		for (i = 0; i < NMIRROR; i++) {
1138 			md_dev64_t	tmpdev;
1139 			mdi_unit_t	*sm_ui;
1140 
1141 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1142 				continue;
1143 
1144 			tmpdev = un->un_sm[i].sm_dev;
1145 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1146 			(void) md_unit_writerlock(sm_ui);
1147 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1148 			md_unit_writerexit(sm_ui);
1149 		}
1150 
1151 		return (0);
1152 	}
1153 
1154 	for (i = 0; i < NMIRROR; i++) {
1155 		md_dev64_t tmpdev;
1156 
1157 		if (!(smmask & SMI2BIT(i)))
1158 			continue;
1159 
1160 		tmpdev = un->un_sm[i].sm_dev;
1161 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1162 		un->un_sm[i].sm_dev = tmpdev;
1163 		ASSERT(err == 0);
1164 	}
1165 
1166 	if (lockp) {
1167 		md_ioctl_readerexit(lockp);
1168 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1169 	} else {
1170 		md_unit_readerexit(ui);
1171 		un = (mm_unit_t *)md_unit_writerlock(ui);
1172 	}
1173 
1174 	/*
1175 	 * We want to make sure the unavailable flag is not masking a real
1176 	 * error on the submirror.
1177 	 * For each submirror,
1178 	 *    if all of the submirror components couldn't be opened and there
1179 	 *    are no errors on the submirror, then set the unavailable flag
1180 	 *    otherwise, clear unavailable.
1181 	 */
1182 	sm_cnt = 0;
1183 	sm_unavail_cnt = 0;
1184 	for (i = 0; i < NMIRROR; i++) {
1185 		md_dev64_t	tmpdev;
1186 		mdi_unit_t	*sm_ui;
1187 
1188 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1189 			continue;
1190 
1191 		sm_cnt++;
1192 		tmpdev = un->un_sm[i].sm_dev;
1193 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1194 
1195 		(void) md_unit_writerlock(sm_ui);
1196 		if (submirror_unavailable(un, i, 0)) {
1197 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1198 			sm_unavail_cnt++;
1199 		} else {
1200 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1201 		}
1202 		md_unit_writerexit(sm_ui);
1203 	}
1204 
1205 	/*
1206 	 * If all of the submirrors are unavailable, the mirror is also
1207 	 * unavailable.
1208 	 */
1209 	if (sm_cnt == sm_unavail_cnt) {
1210 		ui->ui_tstate |= MD_INACCESSIBLE;
1211 	} else {
1212 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1213 	}
1214 
1215 	smi = 0;
1216 	ci = 0;
1217 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1218 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1219 
1220 			free_all_ecomps(ecomps);
1221 			(void) mirror_close_all_devs(un, md_oflags);
1222 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1223 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1224 			mirror_openfail_console_info(un, smi, ci);
1225 			if (lockp) {
1226 				md_ioctl_writerexit(lockp);
1227 				(void) md_ioctl_readerlock(lockp, ui);
1228 			} else {
1229 				md_unit_writerexit(ui);
1230 				(void) md_unit_readerlock(ui);
1231 			}
1232 			return (ENXIO);
1233 		}
1234 
1235 		/* track all component states that need changing */
1236 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1237 		c->ec_next = ecomps;
1238 		c->ec_smi = smi;
1239 		c->ec_ci = ci;
1240 		ecomps = c;
1241 		ci++;
1242 	}
1243 
1244 	/* Make all state changes and commit them */
1245 	for (c = ecomps; c != NULL; c = c->ec_next) {
1246 		/*
1247 		 * If lockp is set, then entering kernel through ioctl.
1248 		 * For a MN set, the only ioctl path is via a commd message
1249 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1250 		 * being sent to each node.
1251 		 * In this case, set NO_XMIT so that set_sm_comp_state
1252 		 * won't attempt to send a message from within a message handler.
1253 		 *
1254 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1255 		 * which flag is passed.
1256 		 */
1257 		if (lockp) {
1258 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1259 			    MD_STATE_NO_XMIT, lockp);
1260 		} else {
1261 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1262 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1263 		}
1264 		/*
1265 		 * For a MN set, the NOTIFY is done when the state change is
1266 		 * processed on each node
1267 		 */
1268 		if (!MD_MNSET_SETNO(setno)) {
1269 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1270 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1271 		}
1272 	}
1273 
1274 	if (lockp) {
1275 		md_ioctl_writerexit(lockp);
1276 		(void) md_ioctl_readerlock(lockp, ui);
1277 	} else {
1278 		md_unit_writerexit(ui);
1279 		(void) md_unit_readerlock(ui);
1280 	}
1281 
1282 	free_all_ecomps(ecomps);
1283 
1284 	/* allocate hotspares for all errored components */
1285 	if (MD_MNSET_SETNO(setno)) {
1286 		/*
1287 		 * If we're called from an ioctl (lockp set) then we cannot
1288 		 * directly call send_poke_hotspares as this will block until
1289 		 * the message gets dispatched to all nodes. If the cluster is
1290 		 * going through a reconfig cycle then the message will block
1291 		 * until the cycle is complete, and as we originate from a
1292 		 * service call from commd we will livelock.
1293 		 */
1294 		if (lockp == NULL) {
1295 			md_unit_readerexit(ui);
1296 			send_poke_hotspares(setno);
1297 			(void) md_unit_readerlock(ui);
1298 		}
1299 	} else {
1300 		(void) poke_hotspares();
1301 	}
1302 	return (0);
1303 }
1304 
1305 void
1306 mirror_overlap_tree_remove(md_mps_t *ps)
1307 {
1308 	mm_unit_t	*un;
1309 
1310 	if (panicstr)
1311 		return;
1312 
1313 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1314 	un = ps->ps_un;
1315 
1316 	mutex_enter(&un->un_overlap_tree_mx);
1317 	avl_remove(&un->un_overlap_root, ps);
1318 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1319 	if (un->un_overlap_tree_flag != 0) {
1320 		un->un_overlap_tree_flag = 0;
1321 		cv_broadcast(&un->un_overlap_tree_cv);
1322 	}
1323 	mutex_exit(&un->un_overlap_tree_mx);
1324 }
1325 
1326 
1327 /*
1328  * wait_for_overlaps:
1329  * -----------------
1330  * Check that given i/o request does not cause an overlap with already pending
1331  * i/o. If it does, block until the overlapped i/o completes.
1332  *
1333  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1334  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1335  * it must not already be in the tree.
1336  */
1337 static void
1338 wait_for_overlaps(md_mps_t *ps, int flags)
1339 {
1340 	mm_unit_t	*un;
1341 	avl_index_t	where;
1342 	md_mps_t	*ps1;
1343 
1344 	if (panicstr)
1345 		return;
1346 
1347 	un = ps->ps_un;
1348 	mutex_enter(&un->un_overlap_tree_mx);
1349 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1350 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1351 		mutex_exit(&un->un_overlap_tree_mx);
1352 		return;
1353 	}
1354 
1355 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1356 
1357 	do {
1358 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
1359 		if (ps1 == NULL) {
1360 			/*
1361 			 * The candidate range does not overlap with any
1362 			 * range in the tree.  Insert it and be done.
1363 			 */
1364 			avl_insert(&un->un_overlap_root, ps, where);
1365 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
1366 		} else {
1367 			/*
1368 			 * The candidate range would overlap.  Set the flag
1369 			 * indicating we need to be woken up, and sleep
1370 			 * until another thread removes a range.  If upon
1371 			 * waking up we find this mps was put on the tree
1372 			 * by another thread, the loop terminates.
1373 			 */
1374 			un->un_overlap_tree_flag = 1;
1375 			cv_wait(&un->un_overlap_tree_cv,
1376 			    &un->un_overlap_tree_mx);
1377 		}
1378 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1379 	mutex_exit(&un->un_overlap_tree_mx);
1380 }
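
/*
 * Editorial sketch (hedged, not part of the original source): callers
 * bracket a mirrored i/o with the overlap-tree calls, roughly:
 *
 *	ps->ps_firstblk = pb->b_lblkno;
 *	ps->ps_lastblk = ps->ps_firstblk + nblks - 1;
 *	wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
 *	    (issue the child i/o and wait for it to complete)
 *	mirror_overlap_tree_remove(ps);
 *
 * Here "nblks" is a placeholder for the block count of the parent buf; the
 * real call sites live in the read/write strategy and resync code later in
 * this file.
 */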
1381 
1382 /*
1383  * This function is called from mirror_done to check whether any pages have
1384  * been modified while a mirrored write was in progress.  Returns 0 if
1385  * all pages associated with bp are clean, 1 otherwise.
1386  */
1387 static int
1388 any_pages_dirty(struct buf *bp)
1389 {
1390 	int	rval;
1391 
1392 	rval = biomodified(bp);
1393 	if (rval == -1)
1394 		rval = 0;
1395 
1396 	return (rval);
1397 }
1398 
1399 #define	MAX_EXTRAS 10
1400 
1401 void
1402 mirror_commit(
1403 	mm_unit_t	*un,
1404 	int		smmask,
1405 	mddb_recid_t	*extras
1406 )
1407 {
1408 	mm_submirror_t		*sm;
1409 	md_unit_t		*su;
1410 	int			i;
1411 
1412 	/* 2=mirror,null id */
1413 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1414 
1415 	int			ri = 0;
1416 
1417 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1418 		return;
1419 
1420 	/* Add two, this includes the mirror unit and the null recid */
1421 	if (extras != NULL) {
1422 		int	nrecids = 0;
1423 		while (extras[nrecids] != 0) {
1424 			nrecids++;
1425 		}
1426 		ASSERT(nrecids <= MAX_EXTRAS);
1427 	}
1428 
1429 	if (un != NULL)
1430 		recids[ri++] = un->c.un_record_id;
1431 	for (i = 0;  i < NMIRROR; i++) {
1432 		if (!(smmask & SMI2BIT(i)))
1433 			continue;
1434 		sm = &un->un_sm[i];
1435 		if (!SMS_IS(sm, SMS_INUSE))
1436 			continue;
1437 		if (md_getmajor(sm->sm_dev) != md_major)
1438 			continue;
1439 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1440 		recids[ri++] = su->c.un_record_id;
1441 	}
1442 
1443 	if (extras != NULL)
1444 		while (*extras != 0) {
1445 			recids[ri++] = *extras;
1446 			extras++;
1447 		}
1448 
1449 	if (ri == 0)
1450 		return;
1451 	recids[ri] = 0;
1452 
1453 	/*
1454 	 * Ok to hold ioctl lock across record commit to mddb as
1455 	 * long as the record(s) being committed aren't resync records.
1456 	 */
1457 	mddb_commitrecs_wrapper(recids);
1458 }
1459 
1460 
1461 /*
1462  * This routine is used to set a bit in the writable_bm bitmap for
1463  * each submirror in a metamirror that is writable.  The number of
1464  * writable submirrors is counted in nunits.  The bitmap, the count
1465  * and the starting submirror index are recorded in the parent save
1466  * structure (ps_writable_sm, ps_active_cnt and ps_current_sm).
1468  */
1469 
1470 static void
1471 select_write_units(struct mm_unit *un, md_mps_t *ps)
1472 {
1473 
1474 	int		i;
1475 	unsigned	writable_bm = 0;
1476 	unsigned	nunits = 0;
1477 
1478 	for (i = 0; i < NMIRROR; i++) {
1479 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1480 			/* set bit of all writable units */
1481 			writable_bm |= SMI2BIT(i);
1482 			nunits++;
1483 		}
1484 	}
1485 	ps->ps_writable_sm = writable_bm;
1486 	ps->ps_active_cnt = nunits;
1487 	ps->ps_current_sm = 0;
1488 }
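
/*
 * Editorial example (hedged, not part of the original source): if submirrors
 * 0 and 2 are writable, writable_bm becomes SMI2BIT(0) | SMI2BIT(2), i.e.
 * 0x5 assuming SMI2BIT(i) is (1 << i), and ps_active_cnt is 2.
 * md_find_nth_unit() below is what maps the running count of issued writes
 * back to the submirror indices 0 and 2.
 */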
1489 
1490 static
1491 unsigned
1492 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1493 {
1494 
1495 	int		i;
1496 	unsigned	writable_bm = 0;
1497 	unsigned	nunits = 0;
1498 
1499 	for (i = 0; i < NMIRROR; i++) {
1500 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1501 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1502 			writable_bm |= SMI2BIT(i);
1503 			nunits++;
1504 		}
1505 	}
1506 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1507 		writable_bm &= ~ps->ps_allfrom_sm;
1508 		nunits--;
1509 	}
1510 	ps->ps_writable_sm = writable_bm;
1511 	ps->ps_active_cnt = nunits;
1512 	ps->ps_current_sm = 0;
1513 	return (nunits);
1514 }
1515 
1516 static md_dev64_t
1517 select_read_unit(
1518 	mm_unit_t	*un,
1519 	diskaddr_t	blkno,
1520 	u_longlong_t	reqcount,
1521 	u_longlong_t	*cando,
1522 	int		must_be_opened,
1523 	md_m_shared_t	**shared,
1524 	md_mcs_t	*cs)
1525 {
1526 	int			i;
1527 	md_m_shared_t		*s;
1528 	uint_t			lasterrcnt = 0;
1529 	md_dev64_t		dev = 0;
1530 	u_longlong_t		cnt;
1531 	u_longlong_t		mincnt;
1532 	mm_submirror_t		*sm;
1533 	mm_submirror_ic_t	*smic;
1534 	mdi_unit_t		*ui;
1535 
1536 	mincnt = reqcount;
1537 	for (i = 0; i < NMIRROR; i++) {
1538 		if (!SUBMIRROR_IS_READABLE(un, i))
1539 			continue;
1540 		sm = &un->un_sm[i];
1541 		smic = &un->un_smic[i];
1542 		cnt = reqcount;
1543 
1544 		/*
1545 		 * If the current submirror is marked as inaccessible, do not
1546 		 * try to access it.
1547 		 */
1548 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1549 		(void) md_unit_readerlock(ui);
1550 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1551 			md_unit_readerexit(ui);
1552 			continue;
1553 		}
1554 		md_unit_readerexit(ui);
1555 
1556 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1557 		    (sm->sm_dev, sm, blkno, &cnt);
1558 
1559 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1560 			continue;
1561 		if (s->ms_state == CS_OKAY) {
1562 			*cando = cnt;
1563 			if (shared != NULL)
1564 				*shared = s;
1565 
1566 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1567 			    cs != NULL) {
1568 				cs->cs_buf.b_flags |= B_FAILFAST;
1569 			}
1570 
1571 			return (un->un_sm[i].sm_dev);
1572 		}
1573 		if (s->ms_state != CS_LAST_ERRED)
1574 			continue;
1575 
1576 		/* don't use B_FAILFAST since we're Last Erred */
1577 
1578 		if (mincnt > cnt)
1579 			mincnt = cnt;
1580 		if (s->ms_lasterrcnt > lasterrcnt) {
1581 			lasterrcnt = s->ms_lasterrcnt;
1582 			if (shared != NULL)
1583 				*shared = s;
1584 			dev = un->un_sm[i].sm_dev;
1585 		}
1586 	}
1587 	*cando = mincnt;
1588 	return (dev);
1589 }
1590 
1591 /*
1592  * Given a 32-bit bitmap, this routine will return the bit number
1593  * of the nth (zero-based) bit set; n is passed via the index integer.
1594  *
1595  * This routine is used to run through the writable submirror bitmap
1596  * when starting all of the writes.  The value returned is the index
1597  * of the appropriate submirror structure in the mirror unit's un_sm
1598  * array.
1599  */
1600 static int
1601 md_find_nth_unit(uint_t mask, int index)
1602 {
1603 	int	bit, nfound;
1604 
1605 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1606 		ASSERT(mask != 0);
1607 		nfound += (mask & 1);
1608 		mask >>= 1;
1609 	}
1610 	return (bit);
1611 }
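
/*
 * Editorial example (hedged, not part of the original source): for
 * mask == 0x1a (bits 1, 3 and 4 set), md_find_nth_unit(0x1a, 0) returns 1,
 * md_find_nth_unit(0x1a, 1) returns 3 and md_find_nth_unit(0x1a, 2)
 * returns 4.  The caller must guarantee that at least index + 1 bits are
 * set, which is what the ASSERT above checks (on DEBUG kernels) as the
 * mask is consumed.
 */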
1612 
1613 static int
1614 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1615 {
1616 	mm_unit_t	*un;
1617 	buf_t		*bp;
1618 	int		i;
1619 	unsigned	nunits = 0;
1620 	int		iunit;
1621 	uint_t		running_bm = 0;
1622 	uint_t		sm_index;
1623 
1624 	bp = &cs->cs_buf;
1625 	un = ps->ps_un;
1626 
1627 	for (i = 0; i < NMIRROR; i++) {
1628 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1629 			continue;
1630 		running_bm |= SMI2BIT(i);
1631 		nunits++;
1632 	}
1633 	if (nunits == 0)
1634 		return (1);
1635 
1636 	/*
1637 	 * For directed mirror read (DMR) we only use the specified side and
1638 	 * do not compute the source of the read.
1639 	 */
1640 	if (ps->ps_flags & MD_MPS_DMR) {
1641 		sm_index = un->un_dmr_last_read;
1642 	} else {
1643 		/* Normal (non-DMR) operation */
1644 		switch (un->un_read_option) {
1645 		case RD_GEOMETRY:
1646 			iunit = (int)(bp->b_lblkno /
1647 			    howmany(un->c.un_total_blocks, nunits));
1648 			sm_index = md_find_nth_unit(running_bm, iunit);
1649 			break;
1650 		case RD_FIRST:
1651 			sm_index = md_find_nth_unit(running_bm, 0);
1652 			break;
1653 		case RD_LOAD_BAL:
1654 			/* intentional fall-through into the default case */
1655 		default:
1656 			un->un_last_read = (un->un_last_read + 1) % nunits;
1657 			sm_index = md_find_nth_unit(running_bm,
1658 			    un->un_last_read);
1659 			break;
1660 		}
1661 	}
1662 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1663 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1664 
1665 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1666 		bp->b_flags |= B_FAILFAST;
1667 	}
1668 
1669 	return (0);
1670 }
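
/*
 * Editorial example (hedged, not part of the original source): with two
 * running submirrors (nunits == 2) and un->c.un_total_blocks == 1000,
 * RD_GEOMETRY splits the address space into bands of howmany(1000, 2) == 500
 * blocks, so a read at b_lblkno 250 computes iunit 0 (the first running
 * submirror) and a read at b_lblkno 750 computes iunit 1.  RD_LOAD_BAL
 * instead rotates un_last_read through 0, 1, 0, 1, ... independent of the
 * block address.
 */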
1671 
1672 static
1673 int
1674 mirror_are_submirrors_available(mm_unit_t *un)
1675 {
1676 	int i;
1677 	for (i = 0; i < NMIRROR; i++) {
1678 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1679 
1680 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1681 		    md_getmajor(tmpdev) != md_major)
1682 			continue;
1683 
1684 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1685 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1686 			return (0);
1687 
1688 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1689 			return (0);
1690 	}
1691 	return (1);
1692 }
1693 
1694 void
1695 build_submirror(mm_unit_t *un, int i, int snarfing)
1696 {
1697 	struct mm_submirror	*sm;
1698 	struct mm_submirror_ic	*smic;
1699 	md_unit_t		*su;
1700 	set_t			setno;
1701 
1702 	sm = &un->un_sm[i];
1703 	smic = &un->un_smic[i];
1704 
1705 	sm->sm_flags = 0; /* sometime we may need to do more here */
1706 
1707 	setno = MD_UN2SET(un);
1708 
1709 	if (!SMS_IS(sm, SMS_INUSE))
1710 		return;
1711 	if (snarfing) {
1712 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1713 		    sm->sm_key, MD_NOTRUST_DEVT);
1714 	} else {
1715 		if (md_getmajor(sm->sm_dev) == md_major) {
1716 			su = MD_UNIT(md_getminor(sm->sm_dev));
1717 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1718 			/* submirror can no longer be soft partitioned */
1719 			MD_CAPAB(su) &= (~MD_CAN_SP);
1720 		}
1721 	}
1722 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1723 	    0, "shared by blk", 0);
1724 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1725 	    0, "shared by indx", 0);
1726 	smic->sm_get_component_count = (int (*)())md_get_named_service(
1727 	    sm->sm_dev, 0, "get component count", 0);
1728 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1729 	    "get block count skip size", 0);
1730 	sm->sm_state &= ~SMS_IGNORE;
1731 	if (SMS_IS(sm, SMS_OFFLINE))
1732 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1733 	md_set_parent(sm->sm_dev, MD_SID(un));
1734 }
1735 
1736 static void
1737 mirror_cleanup(mm_unit_t *un)
1738 {
1739 	mddb_recid_t	recid;
1740 	int		smi;
1741 	sv_dev_t	sv[NMIRROR];
1742 	int		nsv = 0;
1743 
1744 	/*
1745 	 * If a MN diskset and this node is not the master, do
1746 	 * not delete any records on snarf of the mirror records.
1747 	 */
1748 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1749 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1750 		return;
1751 	}
1752 
1753 	for (smi = 0; smi < NMIRROR; smi++) {
1754 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1755 			continue;
1756 		sv[nsv].setno = MD_UN2SET(un);
1757 		sv[nsv++].key = un->un_sm[smi].sm_key;
1758 	}
1759 
1760 	recid = un->un_rr_dirty_recid;
1761 	mddb_deleterec_wrapper(un->c.un_record_id);
1762 	if (recid > 0)
1763 		mddb_deleterec_wrapper(recid);
1764 
1765 	md_rem_names(sv, nsv);
1766 }
1767 
1768 /*
1769  * Comparison function for the avl tree which tracks
1770  * outstanding writes on submirrors.
1771  *
1772  * Returns:
1773  *	-1: ps1 < ps2
1774  *	 0: ps1 and ps2 overlap
1775  *	 1: ps1 > ps2
1776  */
1777 static int
1778 mirror_overlap_compare(const void *p1, const void *p2)
1779 {
1780 	const md_mps_t *ps1 = (md_mps_t *)p1;
1781 	const md_mps_t *ps2 = (md_mps_t *)p2;
1782 
1783 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
1784 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
1785 			return (0);
1786 		return (-1);
1787 	}
1788 
1789 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
1790 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
1791 			return (0);
1792 		return (1);
1793 	}
1794 
1795 	return (0);
1796 }
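
/*
 * Editorial example (hedged, not part of the original source): two parent
 * structures with (ps_firstblk, ps_lastblk) of (100, 199) and (150, 249)
 * compare as 0, so avl_find() in wait_for_overlaps() treats them as a
 * conflict and the second writer blocks; (100, 199) and (200, 299) compare
 * as -1/1 and are allowed to proceed concurrently.
 */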
1797 
1798 /* Return -1 if the optimized record is unavailable and the set should be released */
1799 int
1800 mirror_build_incore(mm_unit_t *un, int snarfing)
1801 {
1802 	int		i;
1803 
1804 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1805 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1806 		return (1);
1807 	}
1808 
1809 	if (mirror_are_submirrors_available(un) == 0)
1810 		return (1);
1811 
1812 	if (MD_UNIT(MD_SID(un)) != NULL)
1813 		return (0);
1814 
1815 	MD_STATUS(un) = 0;
1816 
1817 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1818 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1819 
1820 	un->un_overlap_tree_flag = 0;
1821 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
1822 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1823 
1824 	for (i = 0; i < NMIRROR; i++)
1825 		build_submirror(un, i, snarfing);
1826 
1827 	if (unit_setup_resync(un, snarfing) != 0) {
1828 		if (snarfing) {
1829 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1830 			/*
1831 			 * If a MN set and set is not stale, then return -1
1832 			 * which will force the caller to unload the set.
1833 			 * The MN diskset nodes will return failure if
1834 			 * unit_setup_resync fails so that nodes won't
1835 			 * get out of sync.
1836 			 *
1837 			 * If set is STALE, the master node can't allocate
1838 			 * a resync record (if needed), but node needs to
1839 			 * join the set so that user can delete broken mddbs.
1840 			 * So, if set is STALE, just continue on.
1841 			 */
1842 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1843 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1844 				return (-1);
1845 			}
1846 		} else
1847 			return (1);
1848 	}
1849 
1850 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1851 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1852 
1853 	un->un_suspend_wr_flag = 0;
1854 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1855 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1856 
1857 	/*
1858 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1859 	 * All references to the owner message state field must be guarded
1860 	 * by this mutex.
1861 	 */
1862 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1863 
1864 	/*
1865 	 * Allocate mutex and condvar for resync thread manipulation. These
1866 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1867 	 */
1868 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1869 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1870 
1871 	/*
1872 	 * Allocate mutex and condvar for resync progress thread manipulation.
1873 	 * This allows resyncs to be continued across an intervening reboot.
1874 	 */
1875 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1876 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1877 
1878 	/*
1879 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1880 	 * provides synchronization between a user-ioctl and the resulting
1881 	 * strategy() call that performs the read().
1882 	 */
1883 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1884 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1885 
1886 	/* place various information in the in-core data structures */
1887 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
1888 	MD_UNIT(MD_SID(un)) = un;
1889 
1890 	return (0);
1891 }
1892 
1893 
1894 void
1895 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
1896 {
1897 	mddb_recid_t	recid, vtoc_id;
1898 	size_t		bitcnt;
1899 	size_t		shortcnt;
1900 	int		smi;
1901 	sv_dev_t	sv[NMIRROR];
1902 	int		nsv = 0;
1903 	uint_t		bits = 0;
1904 	minor_t		selfid;
1905 	md_unit_t	*su;
1906 
1907 	md_destroy_unit_incore(mnum, &mirror_md_ops);
1908 
1909 	shortcnt = un->un_rrd_num * sizeof (short);
1910 	bitcnt = howmany(un->un_rrd_num, NBBY);
1911 
1912 	if (un->un_outstanding_writes)
1913 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
1914 	if (un->un_goingclean_bm)
1915 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
1916 	if (un->un_goingdirty_bm)
1917 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
1918 	if (un->un_resync_bm)
1919 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
1920 
1921 	md_nblocks_set(mnum, -1ULL);
1922 	MD_UNIT(mnum) = NULL;
1923 
1924 	/*
1925 	 * Attempt release of its minor node
1926 	 */
1927 	md_remove_minor_node(mnum);
1928 
1929 	if (!removing)
1930 		return;
1931 
1932 	for (smi = 0; smi < NMIRROR; smi++) {
1933 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1934 			continue;
1935 		/* reallow soft partitioning of submirror and reset parent */
1936 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
1937 		MD_CAPAB(su) |= MD_CAN_SP;
1938 		md_reset_parent(un->un_sm[smi].sm_dev);
1939 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
1940 
1941 		sv[nsv].setno = MD_MIN2SET(mnum);
1942 		sv[nsv++].key = un->un_sm[smi].sm_key;
1943 		bits |= SMI2BIT(smi);
1944 	}
1945 
1946 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1947 	recid = un->un_rr_dirty_recid;
1948 	vtoc_id = un->c.un_vtoc_id;
1949 	selfid = MD_SID(un);
1950 
1951 	mirror_commit(un, bits, 0);
1952 
1953 	avl_destroy(&un->un_overlap_root);
1954 
1955 	/* Destroy all mutexes and condvars before returning. */
1956 	mutex_destroy(&un->un_suspend_wr_mx);
1957 	cv_destroy(&un->un_suspend_wr_cv);
1958 	mutex_destroy(&un->un_overlap_tree_mx);
1959 	cv_destroy(&un->un_overlap_tree_cv);
1960 	mutex_destroy(&un->un_owner_mx);
1961 	mutex_destroy(&un->un_rs_thread_mx);
1962 	cv_destroy(&un->un_rs_thread_cv);
1963 	mutex_destroy(&un->un_rs_progress_mx);
1964 	cv_destroy(&un->un_rs_progress_cv);
1965 	mutex_destroy(&un->un_dmr_mx);
1966 	cv_destroy(&un->un_dmr_cv);
1967 
1968 	/*
1969 	 * Remove self from the namespace
1970 	 */
1971 	if (un->c.un_revision & MD_FN_META_DEV) {
1972 		(void) md_rem_selfname(un->c.un_self_id);
1973 	}
1974 
1975 	mddb_deleterec_wrapper(un->c.un_record_id);
1976 	if (recid != 0)
1977 		mddb_deleterec_wrapper(recid);
1978 
1979 	/* Remove the vtoc, if present */
1980 	if (vtoc_id)
1981 		mddb_deleterec_wrapper(vtoc_id);
1982 
1983 	md_rem_names(sv, nsv);
1984 
1985 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
1986 	    MD_MIN2SET(selfid), selfid);
1987 }
1988 
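/*
 * Open a mirror metadevice. Opens are single-threaded against other
 * open/close operations by the openclose lock and the MD_UL_OPENINPROGRESS
 * flag, since the lock may have to be dropped while mirror_open_all_devs()
 * sends a STATE_UPDATE message in a MN diskset. The underlying devices are
 * opened on the first open or if the unit is marked inaccessible, and the
 * open is then counted. 'lockp' is non-NULL when called from ioctl context.
 * Returns 0 on success, otherwise an error code.
 */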
1989 int
1990 mirror_internal_open(
1991 	minor_t		mnum,
1992 	int		flag,
1993 	int		otyp,
1994 	int		md_oflags,
1995 	IOLOCK		*lockp		/* can be NULL */
1996 )
1997 {
1998 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1999 	int		err = 0;
2000 
2001 tryagain:
2002 	/* single thread */
2003 	if (lockp) {
2004 		/*
2005 		 * If ioctl lock is held, use openclose_enter
2006 		 * routine that will set the ioctl flag when
2007 		 * grabbing the readerlock.
2008 		 */
2009 		(void) md_ioctl_openclose_enter(lockp, ui);
2010 	} else {
2011 		(void) md_unit_openclose_enter(ui);
2012 	}
2013 
2014 	/*
2015 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2016 	 * message in a MN diskset and this requires that the openclose
2017 	 * lock is dropped in order to send this message.  So, another
2018 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2019 	 * attempting an open while this thread has an open in progress.
2020 	 * Call the *_lh version of the lock exit routines since the ui_mx
2021 	 * mutex must be held from checking for OPENINPROGRESS until
2022 	 * after the cv_wait call.
2023 	 */
2024 	mutex_enter(&ui->ui_mx);
2025 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2026 		if (lockp) {
2027 			(void) md_ioctl_openclose_exit_lh(lockp);
2028 		} else {
2029 			md_unit_openclose_exit_lh(ui);
2030 		}
2031 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2032 		mutex_exit(&ui->ui_mx);
2033 		goto tryagain;
2034 	}
2035 
2036 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2037 	mutex_exit(&ui->ui_mx);
2038 
2039 	/* open devices, if necessary */
2040 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2041 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2042 			goto out;
2043 	}
2044 
2045 	/* count open */
2046 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2047 		goto out;
2048 
2049 	/* unlock, return success */
2050 out:
2051 	mutex_enter(&ui->ui_mx);
2052 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2053 	mutex_exit(&ui->ui_mx);
2054 
2055 	if (lockp) {
2056 		/*
2057 		 * If ioctl lock is held, use openclose_exit
2058 		 * routine that will clear the lockp reader flag.
2059 		 */
2060 		(void) md_ioctl_openclose_exit(lockp);
2061 	} else {
2062 		md_unit_openclose_exit(ui);
2063 	}
2064 	return (err);
2065 }
2066 
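/*
 * Close a mirror metadevice. The close is counted and, once the unit is no
 * longer open (or for a probe close), the dirty region bitmap is cleaned up
 * and the underlying devices are closed. For a MN set with the transient
 * ABR/DMR capabilities set, the capabilities are cleared across the cluster
 * on the last close. Returns 0 on success, otherwise an error code.
 */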
2067 int
2068 mirror_internal_close(
2069 	minor_t		mnum,
2070 	int		otyp,
2071 	int		md_cflags,
2072 	IOLOCK		*lockp		/* can be NULL */
2073 )
2074 {
2075 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2076 	mm_unit_t	*un;
2077 	int		err = 0;
2078 
2079 	/* single thread */
2080 	if (lockp) {
2081 		/*
2082 		 * If ioctl lock is held, use openclose_enter
2083 		 * routine that will set the ioctl flag when
2084 		 * grabbing the readerlock.
2085 		 */
2086 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2087 	} else {
2088 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2089 	}
2090 
2091 	/* count closed */
2092 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2093 		goto out;
2094 
2095 	/* close devices, if necessary */
2096 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2097 		/*
2098 		 * Clean up dirty bitmap for this unit. Do this
2099 		 * before closing the underlying devices to avoid
2100 		 * race conditions with reset_mirror() as a
2101 		 * result of a 'metaset -r' command running in
2102 		 * parallel. This might cause deallocation of
2103 		 * dirty region bitmaps; with underlying metadevices
2104 		 * in place this can't happen.
2105 		 * Don't do this for a MN set with ABR set.
2106 		 */
2107 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2108 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2109 			    !(ui->ui_tstate & MD_ABR_CAP))
2110 				mirror_process_unit_resync(un);
2111 		}
2112 		(void) mirror_close_all_devs(un, md_cflags);
2113 
2114 		/*
2115 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2116 		 * clear these capabilities on the last open in the cluster.
2117 		 * To do this we send a message to all nodes to see if the
2118 		 * device is open.
2119 		 */
2120 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2121 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2122 			if (lockp) {
2123 				(void) md_ioctl_openclose_exit(lockp);
2124 			} else {
2125 				md_unit_openclose_exit(ui);
2126 			}
2127 
2128 			/*
2129 			 * if we are in the context of an ioctl, drop the
2130 			 * ioctl lock.
2131 			 * Otherwise, no other locks should be held.
2132 			 */
2133 			if (lockp) {
2134 				IOLOCK_RETURN_RELEASE(0, lockp);
2135 			}
2136 
2137 			mdmn_clear_all_capabilities(mnum);
2138 
2139 			/* if dropped the lock previously, regain it */
2140 			if (lockp) {
2141 				IOLOCK_RETURN_REACQUIRE(lockp);
2142 			}
2143 			return (0);
2144 		}
2145 		/* unlock and return success */
2146 	}
2147 out:
2148 	/* Call whether lockp is NULL or not. */
2149 	if (lockp) {
2150 		md_ioctl_openclose_exit(lockp);
2151 	} else {
2152 		md_unit_openclose_exit(ui);
2153 	}
2154 	return (err);
2155 }
2156 
2157 /*
2158  * When a component has completed resyncing and is now ok, check if the
2159  * corresponding component in the other submirrors is in the Last Erred
2160  * state.  If it is, we want to change that to the Erred state so we stop
2161  * using that component and start using this good component instead.
2162  *
2163  * This is called from set_sm_comp_state and recursively calls
2164  * set_sm_comp_state if it needs to change the Last Erred state.
2165  */
2166 static void
2167 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2168 	IOLOCK *lockp)
2169 {
2170 	mm_submirror_t		*sm;
2171 	mm_submirror_ic_t	*smic;
2172 	int			ci;
2173 	int			i;
2174 	int			compcnt;
2175 	int			changed = 0;
2176 
2177 	for (i = 0; i < NMIRROR; i++) {
2178 		sm = &un->un_sm[i];
2179 		smic = &un->un_smic[i];
2180 
2181 		if (!SMS_IS(sm, SMS_INUSE))
2182 			continue;
2183 
2184 		/* ignore the submirror that we just made ok */
2185 		if (i == smi)
2186 			continue;
2187 
2188 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2189 		for (ci = 0; ci < compcnt; ci++) {
2190 			md_m_shared_t	*shared;
2191 
2192 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2193 			    (sm->sm_dev, sm, ci);
2194 
2195 			if ((shared->ms_state & CS_LAST_ERRED) &&
2196 			    !mirror_other_sources(un, i, ci, 1)) {
2197 
2198 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2199 				    flags, lockp);
2200 				changed = 1;
2201 			}
2202 		}
2203 	}
2204 
2205 	/* maybe there is a hotspare for this newly erred component */
2206 	if (changed) {
2207 		set_t	setno;
2208 
2209 		setno = MD_UN2SET(un);
2210 		if (MD_MNSET_SETNO(setno)) {
2211 			send_poke_hotspares(setno);
2212 		} else {
2213 			(void) poke_hotspares();
2214 		}
2215 	}
2216 }
2217 
2218 /*
2219  * set_sm_comp_state
2220  *
2221  * Set the state of a submirror component to the specified new state.
2222  * If the mirror is in a multi-node set, send messages to all nodes to
2223  * block all writes to the mirror and then update the state and release the
2224  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2225  * MD_STATE_XMIT will be unset in 2 cases:
2226  * 1. When the state is changed to CS_RESYNC as this state change
2227  * will already have been updated on each node by the processing of the
2228  * distributed metasync command, hence no need to xmit.
2229  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
2230  * the resync completion will already have been processed on each node by
2231  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2232  * resync, hence no need to xmit.
2233  *
2234  * In case we are called from the updates of a watermark,
2235  * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
2236  * a metainit or similar. In this case the message that we sent to propagate
2237  * the state change must not be a class1 message as that would deadlock with
2238  * the metainit command that is still being processed.
2239  * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2240  * instead. This also makes the submessage generator create a class2
2241  * submessage rather than a class1 (which would also block).
2242  *
2243  * On entry, unit_writerlock is held
2244  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2245  * also held.
2246  */
2247 void
2248 set_sm_comp_state(
2249 	mm_unit_t	*un,
2250 	int		smi,
2251 	int		ci,
2252 	int		newstate,
2253 	mddb_recid_t	*extras,
2254 	uint_t		flags,
2255 	IOLOCK		*lockp
2256 )
2257 {
2258 	mm_submirror_t		*sm;
2259 	mm_submirror_ic_t	*smic;
2260 	md_m_shared_t		*shared;
2261 	int			origstate;
2262 	void			(*get_dev)();
2263 	ms_cd_info_t		cd;
2264 	char			devname[MD_MAX_CTDLEN];
2265 	int			err;
2266 	set_t			setno = MD_UN2SET(un);
2267 	md_mn_msg_stch_t	stchmsg;
2268 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2269 	md_mn_kresult_t		*kresult;
2270 	int			rval;
2271 	uint_t			msgflags;
2272 	md_mn_msgtype_t		msgtype;
2273 	int			save_lock = 0;
2274 	mdi_unit_t		*ui_sm;
2275 
2276 	sm = &un->un_sm[smi];
2277 	smic = &un->un_smic[smi];
2278 
2279 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2280 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2281 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2282 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2283 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2284 	}
2285 
2286 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2287 	    (sm->sm_dev, sm, ci);
2288 	origstate = shared->ms_state;
2289 
2290 	/*
2291 	 * If the new state is an error and the old one wasn't, generate
2292 	 * a console message. We do this before we send the state to other
2293 	 * nodes in a MN set because the state change may change the component
2294 	 * name if a hotspare is allocated.
2295 	 */
2296 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2297 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2298 
2299 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2300 		    "get device", 0);
2301 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2302 
2303 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2304 		    cd.cd_dev, devname, sizeof (devname));
2305 
2306 		if (err == ENOENT) {
2307 			(void) md_devname(setno, cd.cd_dev, devname,
2308 			    sizeof (devname));
2309 		}
2310 
2311 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2312 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2313 
2314 		if (newstate & CS_LAST_ERRED) {
2315 			cmn_err(CE_WARN, "md: %s: %s last erred",
2316 			    md_shortname(md_getminor(sm->sm_dev)),
2317 			    devname);
2318 
2319 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2320 			/*
2321 			 * Close the broken device and clear the open flag on
2322 			 * it.  Closing the device means the RCM framework will
2323 			 * be able to unconfigure the device if required.
2324 			 *
2325 			 * We have to check that the device is open, otherwise
2326 			 * the first open on it has resulted in the error that
2327 			 * is being processed and the actual cd.cd_dev will be
2328 			 * NODEV64.
2329 			 *
2330 			 * If this is a multi-node mirror, then the multinode
2331 			 * state checks following this code will cause the
2332 			 * slave nodes to close the mirror in the function
2333 			 * mirror_set_state().
2334 			 */
2335 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2336 			shared->ms_flags &= ~MDM_S_ISOPEN;
2337 		}
2338 
2339 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2340 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2341 		/*
2342 		 * Similar to logic above except no log messages since we
2343 		 * are just transitioning from Last Erred to Erred.
2344 		 */
2345 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2346 		    "get device", 0);
2347 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2348 
2349 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2350 		shared->ms_flags &= ~MDM_S_ISOPEN;
2351 	}
2352 
2353 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2354 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2355 		/*
2356 		 * For a multi-node mirror, send the state change to the
2357 		 * master, which broadcasts to all nodes, including this
2358 		 * one. Once the message is received, the state is set
2359 		 * There is a case, comp_replace, where this function
2360 		 * There is a case, comp_replace,  where this function
2361 		 * can be called from within an ioctl and therefore in this
2362 		 * case, as the ioctl will already be called on each node,
2363 		 * there is no need to xmit the state change to the master for
2364 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2365 		 * to indicate whether a xmit is required. The mirror's
2366 		 * transient state is set to MD_ERR_PENDING to avoid sending
2367 		 * multiple messages.
2368 		 */
2369 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2370 			ui->ui_tstate |= MD_ERR_PENDING;
2371 
2372 		/*
2373 		 * Send a state update message to all nodes. This message
2374 		 * will generate 2 submessages, the first one to suspend
2375 		 * all writes to the mirror and the second to update the
2376 		 * state and resume writes.
2377 		 */
2378 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2379 		stchmsg.msg_stch_sm = smi;
2380 		stchmsg.msg_stch_comp = ci;
2381 		stchmsg.msg_stch_new_state = newstate;
2382 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2383 #ifdef DEBUG
2384 		if (mirror_debug_flag)
2385 			printf("send set state, %x, %x, %x, %x, %x\n",
2386 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2387 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2388 			    stchmsg.msg_stch_hs_id);
2389 #endif
2390 		if (flags & MD_STATE_WMUPDATE) {
2391 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2392 			/*
2393 			 * When coming from an update of watermarks, there
2394 			 * must already be a message logged that triggered
2395 			 * this action. So, no need to log this message, too.
2396 			 */
2397 			msgflags = MD_MSGF_NO_LOG;
2398 		} else {
2399 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2400 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2401 		}
2402 
2403 		/*
2404 		 * If we are in the context of an ioctl, drop the ioctl lock.
2405 		 * lockp holds the list of locks held.
2406 		 *
2407 		 * Otherwise, increment the appropriate reacquire counters.
2408 		 * If the openclose lock is held, then we must reacquire the reader
2409 		 * lock before releasing the openclose lock.
2410 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2411 		 * to reacquire it.
2412 		 */
2413 		if (lockp) {
2414 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2415 				save_lock = MD_ARRAY_WRITER;
2416 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2417 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2418 				save_lock = MD_ARRAY_READER;
2419 				lockp->l_flags &= ~MD_ARRAY_READER;
2420 			}
2421 			IOLOCK_RETURN_RELEASE(0, lockp);
2422 		} else {
2423 			if (flags & MD_STATE_OCHELD) {
2424 				md_unit_writerexit(ui);
2425 				(void) md_unit_readerlock(ui);
2426 				md_unit_openclose_exit(ui);
2427 			} else {
2428 				md_unit_writerexit(ui);
2429 			}
2430 		}
2431 
2432 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2433 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
2434 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
2435 
2436 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2437 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2438 			cmn_err(CE_PANIC,
2439 			    "ksend_message failure: STATE_UPDATE");
2440 		}
2441 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2442 
2443 		/* if dropped the lock previously, regain it */
2444 		if (lockp) {
2445 			IOLOCK_RETURN_REACQUIRE(lockp);
2446 			lockp->l_flags |= save_lock;
2447 		} else {
2448 			/*
2449 			 * Reacquire dropped locks and update acquirecnts
2450 			 * appropriately.
2451 			 */
2452 			if (flags & MD_STATE_OCHELD) {
2453 				/*
2454 				 * openclose also grabs readerlock.
2455 				 */
2456 				(void) md_unit_openclose_enter(ui);
2457 				md_unit_readerexit(ui);
2458 				(void) md_unit_writerlock(ui);
2459 			} else {
2460 				(void) md_unit_writerlock(ui);
2461 			}
2462 		}
2463 
2464 		ui->ui_tstate &= ~MD_ERR_PENDING;
2465 	} else {
2466 		shared->ms_state = newstate;
2467 		uniqtime32(&shared->ms_timestamp);
2468 
2469 		if (newstate == CS_ERRED)
2470 			shared->ms_flags |= MDM_S_NOWRITE;
2471 		else
2472 			shared->ms_flags &= ~MDM_S_NOWRITE;
2473 
2474 		shared->ms_flags &= ~MDM_S_IOERR;
2475 		un->un_changecnt++;
2476 		shared->ms_lasterrcnt = un->un_changecnt;
2477 
2478 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2479 		mirror_commit(un, SMI2BIT(smi), extras);
2480 	}
2481 
2482 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2483 		/*
2484 		 * Resetting the Last Erred state will recursively call back
2485 		 * into this function (set_sm_comp_state) to update the state.
2486 		 */
2487 		reset_lasterred(un, smi, extras, flags, lockp);
2488 	}
2489 }
2490 
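/*
 * Check whether blocks [blk, blk + cnt) can be read from a submirror other
 * than 'esm'. The errored submirror is temporarily flagged SMS_IGNORE so
 * that select_read_unit() will not choose it. Returns non-zero if some part
 * of the range has no alternative source.
 */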
2491 static int
2492 find_another_logical(
2493 	mm_unit_t		*un,
2494 	mm_submirror_t		*esm,
2495 	diskaddr_t		blk,
2496 	u_longlong_t		cnt,
2497 	int			must_be_open,
2498 	int			state,
2499 	int			err_cnt)
2500 {
2501 	u_longlong_t	cando;
2502 	md_dev64_t	dev;
2503 	md_m_shared_t	*s;
2504 
2505 	esm->sm_state |= SMS_IGNORE;
2506 	while (cnt != 0) {
2507 		u_longlong_t	 mcnt;
2508 
2509 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2510 
2511 		dev = select_read_unit(un, blk, mcnt, &cando,
2512 		    must_be_open, &s, NULL);
2513 		if (dev == (md_dev64_t)0)
2514 			break;
2515 
2516 		if ((state == CS_LAST_ERRED) &&
2517 		    (s->ms_state == CS_LAST_ERRED) &&
2518 		    (err_cnt > s->ms_lasterrcnt))
2519 			break;
2520 
2521 		cnt -= cando;
2522 		blk += cando;
2523 	}
2524 	esm->sm_state &= ~SMS_IGNORE;
2525 	return (cnt != 0);
2526 }
2527 
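/*
 * Determine whether the data on component 'ci' of submirror 'smi' (or on
 * every component of the submirror when ci < 0) is also available from
 * another submirror. Returns 0 if other sources exist, 1 if some of the
 * data has no other source.
 */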
2528 int
2529 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2530 {
2531 	mm_submirror_t		*sm;
2532 	mm_submirror_ic_t	*smic;
2533 	size_t			count;
2534 	diskaddr_t		block;
2535 	u_longlong_t		skip;
2536 	u_longlong_t		size;
2537 	md_dev64_t		dev;
2538 	int			cnt;
2539 	md_m_shared_t		*s;
2540 	int			not_found;
2541 
2542 	sm = &un->un_sm[smi];
2543 	smic = &un->un_smic[smi];
2544 	dev = sm->sm_dev;
2545 
2546 	/*
2547 	 * Make sure every component of the submirror
2548 	 * has other sources.
2549 	 */
2550 	if (ci < 0) {
2551 		/* Find the highest lasterrcnt */
2552 		/* Check each component of the submirror in turn */
2553 		for (ci = 0; ci < cnt; ci++) {
2554 			not_found = mirror_other_sources(un, smi, ci,
2555 			    must_be_open);
2556 			if (not_found)
2557 				return (1);
2558 		}
2559 		return (0);
2560 	}
2561 
2562 	/*
2563 	 * Make sure this component has other sources
2564 	 */
2565 	(void) (*(smic->sm_get_bcss))
2566 	    (dev, sm, ci, &block, &count, &skip, &size);
2567 
2568 	if (count == 0)
2569 		return (1);
2570 
2571 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2572 
2573 	while (count--) {
2574 		if (block >= un->c.un_total_blocks)
2575 			return (0);
2576 
2577 		if ((block + size) > un->c.un_total_blocks)
2578 			size = un->c.un_total_blocks - block;
2579 
2580 		not_found = find_another_logical(un, sm, block, size,
2581 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2582 		if (not_found)
2583 			return (1);
2584 
2585 		block += size + skip;
2586 	}
2587 	return (0);
2588 }
2589 
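/*
 * Complete an errored parent request: for a write-after-read the error is
 * flagged back to the resync originator; otherwise the original request is
 * retried if the unit has changed since it was issued, or failed back to
 * the caller.
 */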
2590 static void
2591 finish_error(md_mps_t *ps)
2592 {
2593 	struct buf	*pb;
2594 	mm_unit_t	*un;
2595 	mdi_unit_t	*ui;
2596 	uint_t		new_str_flags;
2597 
2598 	pb = ps->ps_bp;
2599 	un = ps->ps_un;
2600 	ui = ps->ps_ui;
2601 
2602 	/*
2603 	 * Must flag any error to the resync originator if we're performing
2604 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2605 	 * target device and in this case we ought to abort the resync as there
2606 	 * is nothing that can be done to recover from this without operator
2607 	 * intervention. If we don't set the B_ERROR flag we will continue
2608 	 * reading from the mirror but won't write to the target (as it will
2609 	 * have been placed into an errored state).
2610 	 * To handle the case of multiple components within a submirror we only
2611 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2612 	 * The originator of the resync read will cause this bit to be set if
2613 	 * the underlying component count is one for a submirror resync. All
2614 	 * other resync types will have the flag set as there is no underlying
2615 	 * resync which can be performed on a contained metadevice for these
2616 	 * resync types (optimized or component).
2617 	 */
2618 
2619 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2620 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2621 			pb->b_flags |= B_ERROR;
2622 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2623 		MPS_FREE(mirror_parent_cache, ps);
2624 		md_unit_readerexit(ui);
2625 		md_biodone(pb);
2626 		return;
2627 	}
2628 	/*
2629 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2630 	 * operation therefore this I/O request has already been counted,
2631 	 * the I/O count variable will be decremented by mirror_done()'s
2632 	 * call to md_biodone().
2633 	 */
2634 	if (ps->ps_changecnt != un->un_changecnt) {
2635 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2636 		if (ps->ps_flags & MD_MPS_WOW)
2637 			new_str_flags |= MD_STR_WOW;
2638 		if (ps->ps_flags & MD_MPS_MAPPED)
2639 			new_str_flags |= MD_STR_MAPPED;
2640 		/*
2641 		 * If this I/O request was a read that was part of a resync,
2642 		 * set MD_STR_WAR for the retried read to ensure that the
2643 		 * resync write (i.e. write-after-read) will be performed
2644 		 */
2645 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2646 			new_str_flags |= MD_STR_WAR;
2647 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2648 		MPS_FREE(mirror_parent_cache, ps);
2649 		md_unit_readerexit(ui);
2650 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2651 		return;
2652 	}
2653 
2654 	pb->b_flags |= B_ERROR;
2655 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2656 	MPS_FREE(mirror_parent_cache, ps);
2657 	md_unit_readerexit(ui);
2658 	md_biodone(pb);
2659 }
2660 
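/*
 * Daemon routine run when a component error has been detected. Each errored
 * component is moved to the Erred state, or to Last Erred if no other source
 * for its data exists. The hotspare daemon is then poked and the original
 * request is completed via finish_error().
 */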
2661 static void
2662 error_update_unit(md_mps_t *ps)
2663 {
2664 	mm_unit_t		*un;
2665 	mdi_unit_t		*ui;
2666 	int			smi;	/* sub mirror index */
2667 	int			ci;	/* errored component */
2668 	set_t			setno;
2669 	uint_t			flags;	/* for set_sm_comp_state() */
2670 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2671 
2672 	ui = ps->ps_ui;
2673 	un = (mm_unit_t *)md_unit_writerlock(ui);
2674 	setno = MD_UN2SET(un);
2675 
2676 	/* All of these updates have to be propagated in the case of a MN set */
2677 	flags = MD_STATE_XMIT;
2678 	hspflags = MD_HOTSPARE_XMIT;
2679 
2680 	/* special treatment if we are called during updating watermarks */
2681 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2682 		flags |= MD_STATE_WMUPDATE;
2683 		hspflags |= MD_HOTSPARE_WMUPDATE;
2684 	}
2685 	smi = 0;
2686 	ci = 0;
2687 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2688 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2689 
2690 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2691 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2692 			    (IOLOCK *)NULL);
2693 			/*
2694 			 * For a MN set, the NOTIFY is done when the state
2695 			 * change is processed on each node
2696 			 */
2697 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2698 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2699 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2700 			}
2701 			continue;
2702 		}
2703 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2704 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2705 		    (IOLOCK *)NULL);
2706 		/*
2707 		 * For a MN set, the NOTIFY is done when the state
2708 		 * change is processed on each node
2709 		 */
2710 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2711 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2712 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2713 		}
2714 		smi = 0;
2715 		ci = 0;
2716 	}
2717 
2718 	md_unit_writerexit(ui);
2719 	if (MD_MNSET_SETNO(setno)) {
2720 		send_poke_hotspares(setno);
2721 	} else {
2722 		(void) poke_hotspares();
2723 	}
2724 	(void) md_unit_readerlock(ui);
2725 
2726 	finish_error(ps);
2727 }
2728 
2729 /*
2730  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2731  * retry the IO without B_FAILFAST set so that we try to ensure that the
2732  * component "sees" each IO.
2733  */
2734 static void
2735 last_err_retry(md_mcs_t *cs)
2736 {
2737 	struct buf	*cb;
2738 	md_mps_t	*ps;
2739 	uint_t		flags;
2740 
2741 	cb = &cs->cs_buf;
2742 	cb->b_flags &= ~B_FAILFAST;
2743 
2744 	/* if we're panicking just let this I/O error out */
2745 	if (panicstr) {
2746 		(void) mirror_done(cb);
2747 		return;
2748 	}
2749 
2750 	/* reissue the I/O */
2751 
2752 	ps = cs->cs_ps;
2753 
2754 	bioerror(cb, 0);
2755 
2756 	mutex_enter(&ps->ps_mx);
2757 
2758 	flags = MD_STR_NOTTOP;
2759 	if (ps->ps_flags & MD_MPS_MAPPED)
2760 		flags |= MD_STR_MAPPED;
2761 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2762 		flags |= MD_NOBLOCK;
2763 
2764 	mutex_exit(&ps->ps_mx);
2765 
2766 	clear_retry_error(cb);
2767 
2768 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2769 	    md_shortname(getminor(cb->b_edev)));
2770 
2771 	md_call_strategy(cb, flags, NULL);
2772 }
2773 
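/*
 * Handle an errored parent request. If a component state change is needed,
 * queue error_update_unit() on the master daemon; otherwise complete the
 * request via finish_error().
 */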
2774 static void
2775 mirror_error(md_mps_t *ps)
2776 {
2777 	int		smi;	/* sub mirror index */
2778 	int		ci;	/* errored component */
2779 
2780 	if (panicstr) {
2781 		finish_error(ps);
2782 		return;
2783 	}
2784 
2785 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2786 		mirror_overlap_tree_remove(ps);
2787 
2788 	smi = 0;
2789 	ci = 0;
2790 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2791 		md_unit_readerexit(ps->ps_ui);
2792 		daemon_request(&md_mstr_daemon, error_update_unit,
2793 		    (daemon_queue_t *)ps, REQ_OLD);
2794 		return;
2795 	}
2796 
2797 	finish_error(ps);
2798 }
2799 
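/*
 * iodone routine for the child buf used by the write-on-write copy path.
 * Any error is propagated to the parent buf; otherwise the next chunk is
 * scheduled via copy_write_cont() until the whole write has been copied,
 * at which point the parent request is completed.
 */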
2800 static int
2801 copy_write_done(struct buf *cb)
2802 {
2803 	md_mps_t	*ps;
2804 	buf_t		*pb;
2805 	char		*wowbuf;
2806 	wowhdr_t	*wowhdr;
2807 	ssize_t		wow_resid;
2808 
2809 	/* get wowbuf and save structure */
2810 	wowbuf = cb->b_un.b_addr;
2811 	wowhdr = WOWBUF_HDR(wowbuf);
2812 	ps = wowhdr->wow_ps;
2813 	pb = ps->ps_bp;
2814 
2815 	/* Save error information, then free cb */
2816 	if (cb->b_flags & B_ERROR)
2817 		pb->b_flags |= B_ERROR;
2818 
2819 	if (cb->b_flags & B_REMAPPED)
2820 		bp_mapout(cb);
2821 
2822 	freerbuf(cb);
2823 
2824 	/* update residual and continue if needed */
2825 	if ((pb->b_flags & B_ERROR) == 0) {
2826 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2827 		pb->b_resid = wow_resid;
2828 		if (wow_resid > 0)  {
2829 			daemon_request(&md_mstr_daemon, copy_write_cont,
2830 			    (daemon_queue_t *)wowhdr, REQ_OLD);
2831 			return (1);
2832 		}
2833 	}
2834 
2835 	/* Write is complete, release resources. */
2836 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2837 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2838 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2839 	MPS_FREE(mirror_parent_cache, ps);
2840 	md_biodone(pb);
2841 	return (0);
2842 }
2843 
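/*
 * Copy the next chunk (at most md_wowbuf_size bytes) of the parent write
 * into the private WOW buffer and issue it with md_mirror_strategy();
 * copy_write_done() continues the sequence.
 */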
2844 static void
2845 copy_write_cont(wowhdr_t *wowhdr)
2846 {
2847 	buf_t		*pb;
2848 	buf_t		*cb;
2849 	char		*wowbuf;
2850 	int		wow_offset;
2851 	size_t		wow_resid;
2852 	diskaddr_t	wow_blkno;
2853 
2854 	wowbuf = WOWHDR_BUF(wowhdr);
2855 	pb = wowhdr->wow_ps->ps_bp;
2856 
2857 	/* get data on current location */
2858 	wow_offset = wowhdr->wow_offset;
2859 	wow_resid = pb->b_bcount - wow_offset;
2860 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
2861 
2862 	/* setup child buffer */
2863 	cb = getrbuf(KM_SLEEP);
2864 	cb->b_flags = B_WRITE;
2865 	cb->b_edev = pb->b_edev;
2866 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
2867 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
2868 	cb->b_iodone = copy_write_done;
2869 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
2870 	cb->b_lblkno = wow_blkno;
2871 
2872 	/* move offset to next section */
2873 	wowhdr->wow_offset += cb->b_bcount;
2874 
2875 	/* copy and setup write for current section */
2876 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
2877 
2878 	/* do it */
2879 	/*
2880 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
2881 	 * that handles the WOW condition. The resultant increment on the
2882 	 * I/O count variable is cleared by copy_write_done()'s call to
2883 	 * md_biodone().
2884 	 */
2885 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
2886 	    | MD_STR_MAPPED, NULL);
2887 }
2888 
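/*
 * Start a copy-based write-on-write: allocate and initialize a wowhdr for
 * the parent request and issue the first chunk.
 */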
2889 static void
2890 md_mirror_copy_write(md_mps_t *ps)
2891 {
2892 	wowhdr_t	*wowhdr;
2893 
2894 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
2895 	mirror_wowblk_init(wowhdr);
2896 	wowhdr->wow_ps = ps;
2897 	wowhdr->wow_offset = 0;
2898 	copy_write_cont(wowhdr);
2899 }
2900 
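/*
 * Handle a detected write-on-write condition. Depending on md_mirror_wow_flg
 * the original buf is either re-issued directly (WOW_NOCOPY) or written from
 * a private copy of the data via md_mirror_copy_write().
 */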
2901 static void
2902 handle_wow(md_mps_t *ps)
2903 {
2904 	buf_t		*pb;
2905 
2906 	pb = ps->ps_bp;
2907 
2908 	bp_mapin(pb);
2909 
2910 	md_mirror_wow_cnt++;
2911 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
2912 		cmn_err(CE_NOTE,
2913 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
2914 		    md_shortname(getminor(pb->b_edev)),
2915 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
2916 	}
2917 
2918 	/*
2919 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2920 	 * operation therefore this I/O request has already been counted,
2921 	 * the I/O count variable will be decremented by mirror_done()'s
2922 	 * call to md_biodone().
2923 	 */
2924 	if (md_mirror_wow_flg & WOW_NOCOPY)
2925 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
2926 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
2927 	else
2928 		md_mirror_copy_write(ps);
2929 }
2930 
2931 /*
2932  * Return true if the specified submirror is either in the Last Erred
2933  * state or is transitioning into the Last Erred state.
2934  */
2935 static bool_t
2936 submirror_is_lasterred(mm_unit_t *un, int smi)
2937 {
2938 	mm_submirror_t		*sm;
2939 	mm_submirror_ic_t	*smic;
2940 	md_m_shared_t		*shared;
2941 	int			ci;
2942 	int			compcnt;
2943 
2944 	sm = &un->un_sm[smi];
2945 	smic = &un->un_smic[smi];
2946 
2947 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2948 	for (ci = 0; ci < compcnt; ci++) {
2949 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2950 		    (sm->sm_dev, sm, ci);
2951 
2952 		if (shared->ms_state == CS_LAST_ERRED)
2953 			return (B_TRUE);
2954 
2955 		/*
2956 		 * It is not currently Last Erred, check if entering Last Erred.
2957 		 */
2958 		if ((shared->ms_flags & MDM_S_IOERR) &&
2959 		    ((shared->ms_state == CS_OKAY) ||
2960 		    (shared->ms_state == CS_RESYNC))) {
2961 			if (mirror_other_sources(un, smi, ci, 0) == 1)
2962 				return (B_TRUE);
2963 		}
2964 	}
2965 
2966 	return (B_FALSE);
2967 }
2968 
2969 
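/*
 * iodone routine for a mirror child buf. If an errored B_FAILFAST request
 * was directed at a submirror that is in, or entering, the Last Erred state,
 * the request is re-driven without B_FAILFAST via last_err_retry(); otherwise
 * any error is recorded and the buf is passed on to mirror_done_common().
 */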
2970 static int
2971 mirror_done(struct buf *cb)
2972 {
2973 	md_mps_t	*ps;
2974 	md_mcs_t	*cs;
2975 
2976 	/*LINTED*/
2977 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
2978 	ps = cs->cs_ps;
2979 
2980 	mutex_enter(&ps->ps_mx);
2981 
2982 	/* check if we need to retry an errored failfast I/O */
2983 	if (cb->b_flags & B_ERROR) {
2984 		struct buf *pb = ps->ps_bp;
2985 
2986 		if (cb->b_flags & B_FAILFAST) {
2987 			int		i;
2988 			mm_unit_t	*un = ps->ps_un;
2989 
2990 			for (i = 0; i < NMIRROR; i++) {
2991 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
2992 					continue;
2993 
2994 				if (cb->b_edev ==
2995 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
2996 
2997 					/*
2998 					 * This is the submirror that had the
2999 					 * error.  Check if it is Last Erred.
3000 					 */
3001 					if (submirror_is_lasterred(un, i)) {
3002 						daemon_queue_t *dqp;
3003 
3004 						mutex_exit(&ps->ps_mx);
3005 						dqp = (daemon_queue_t *)cs;
3006 						dqp->dq_prev = NULL;
3007 						dqp->dq_next = NULL;
3008 						daemon_request(&md_done_daemon,
3009 						    last_err_retry, dqp,
3010 						    REQ_OLD);
3011 						return (1);
3012 					}
3013 					break;
3014 				}
3015 			}
3016 		}
3017 
3018 		/* continue to process the buf without doing a retry */
3019 		ps->ps_flags |= MD_MPS_ERROR;
3020 		pb->b_error = cb->b_error;
3021 	}
3022 
3023 	return (mirror_done_common(cb));
3024 }
3025 
3026 /*
3027  * Split from the original mirror_done function so we can handle bufs after a
3028  * retry.
3029  * ps->ps_mx is already held in the caller of this function and the cb error
3030  * has already been checked and handled in the caller.
3031  */
3032 static int
3033 mirror_done_common(struct buf *cb)
3034 {
3035 	struct buf	*pb;
3036 	mm_unit_t	*un;
3037 	mdi_unit_t	*ui;
3038 	md_mps_t	*ps;
3039 	md_mcs_t	*cs;
3040 	size_t		end_rr, start_rr, current_rr;
3041 
3042 	/*LINTED*/
3043 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3044 	ps = cs->cs_ps;
3045 	pb = ps->ps_bp;
3046 
3047 	if (cb->b_flags & B_REMAPPED)
3048 		bp_mapout(cb);
3049 
3050 	ps->ps_frags--;
3051 	if (ps->ps_frags != 0) {
3052 		mutex_exit(&ps->ps_mx);
3053 		kmem_cache_free(mirror_child_cache, cs);
3054 		return (1);
3055 	}
3056 	un = ps->ps_un;
3057 	ui = ps->ps_ui;
3058 
3059 	/*
3060 	 * Do not update outstanding_writes if we're running with ABR
3061 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3062 	 * Also a resync initiated write() has no outstanding_writes update
3063 	 * either.
3064 	 */
3065 	if (((cb->b_flags & B_READ) == 0) &&
3066 	    (un->un_nsm >= 2) &&
3067 	    (ps->ps_call == NULL) &&
3068 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3069 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3070 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3071 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3072 		mutex_enter(&un->un_resync_mx);
3073 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3074 			un->un_outstanding_writes[current_rr]--;
3075 		mutex_exit(&un->un_resync_mx);
3076 	}
3077 	kmem_cache_free(mirror_child_cache, cs);
3078 	mutex_exit(&ps->ps_mx);
3079 
3080 	if (ps->ps_call != NULL) {
3081 		daemon_request(&md_done_daemon, ps->ps_call,
3082 		    (daemon_queue_t *)ps, REQ_OLD);
3083 		return (1);
3084 	}
3085 
3086 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3087 		daemon_request(&md_done_daemon, mirror_error,
3088 		    (daemon_queue_t *)ps, REQ_OLD);
3089 		return (1);
3090 	}
3091 
3092 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3093 		mirror_overlap_tree_remove(ps);
3094 
3095 	/*
3096 	 * Handle Write-on-Write problem.
3097 	 * Skip in case of Raw and Direct I/O as they are
3098 	 * handled earlier.
3099 	 *
3100 	 */
3101 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3102 	    !(pb->b_flags & B_READ) &&
3103 	    !(ps->ps_flags & MD_MPS_WOW) &&
3104 	    !(pb->b_flags & B_PHYS) &&
3105 	    any_pages_dirty(pb)) {
3106 		md_unit_readerexit(ps->ps_ui);
3107 		daemon_request(&md_mstr_daemon, handle_wow,
3108 		    (daemon_queue_t *)ps, REQ_OLD);
3109 		return (1);
3110 	}
3111 
3112 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3113 	MPS_FREE(mirror_parent_cache, ps);
3114 	md_unit_readerexit(ui);
3115 	md_biodone(pb);
3116 	return (0);
3117 }
3118 
3119 /*
3120  * Clear error state in submirror component if the retry worked after
3121  * a failfast error.
3122  */
3123 static void
3124 clear_retry_error(struct buf *cb)
3125 {
3126 	int			smi;
3127 	md_mcs_t		*cs;
3128 	mm_unit_t		*un;
3129 	mdi_unit_t		*ui_sm;
3130 	mm_submirror_t		*sm;
3131 	mm_submirror_ic_t	*smic;
3132 	u_longlong_t		cnt;
3133 	md_m_shared_t		*shared;
3134 
3135 	/*LINTED*/
3136 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3137 	un = cs->cs_ps->ps_un;
3138 
3139 	for (smi = 0; smi < NMIRROR; smi++) {
3140 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3141 			continue;
3142 
3143 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3144 			break;
3145 	}
3146 
3147 	if (smi >= NMIRROR)
3148 		return;
3149 
3150 	sm = &un->un_sm[smi];
3151 	smic = &un->un_smic[smi];
3152 	cnt = cb->b_bcount;
3153 
3154 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3155 	(void) md_unit_writerlock(ui_sm);
3156 
3157 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3158 	    cb->b_blkno, &cnt);
3159 
3160 	if (shared->ms_flags & MDM_S_IOERR) {
3161 		shared->ms_flags &= ~MDM_S_IOERR;
3162 
3163 	} else {
3164 		/* the buf spans components and the first one is not erred */
3165 		int	cnt;
3166 		int	i;
3167 
3168 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3169 		for (i = 0; i < cnt; i++) {
3170 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3171 			    (sm->sm_dev, sm, i);
3172 
3173 			if (shared->ms_flags & MDM_S_IOERR &&
3174 			    shared->ms_state == CS_OKAY) {
3175 
3176 				shared->ms_flags &= ~MDM_S_IOERR;
3177 				break;
3178 			}
3179 		}
3180 	}
3181 
3182 	md_unit_writerexit(ui_sm);
3183 }
3184 
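/*
 * Set up the child buf for a read by selecting a submirror to read from.
 * Returns 0 if the chosen submirror can satisfy the whole range, otherwise
 * the number of blocks that it can supply.
 */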
3185 static size_t
3186 mirror_map_read(
3187 	md_mps_t *ps,
3188 	md_mcs_t *cs,
3189 	diskaddr_t blkno,
3190 	u_longlong_t	count
3191 )
3192 {
3193 	mm_unit_t	*un;
3194 	buf_t		*bp;
3195 	u_longlong_t	cando;
3196 
3197 	bp = &cs->cs_buf;
3198 	un = ps->ps_un;
3199 
3200 	bp->b_lblkno = blkno;
3201 	if (fast_select_read_unit(ps, cs) == 0) {
3202 		bp->b_bcount = ldbtob(count);
3203 		return (0);
3204 	}
3205 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3206 	    count, &cando, 0, NULL, cs));
3207 	bp->b_bcount = ldbtob(cando);
3208 	if (count != cando)
3209 		return (cando);
3210 	return (0);
3211 }
3212 
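/*
 * ps_call routine run once a resync or dirty-region read has completed.
 * If the read failed, hand the request to mirror_error(); otherwise re-issue
 * the parent buf as a write-after-read via mirror_write_strategy().
 */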
3213 static void
3214 write_after_read(md_mps_t *ps)
3215 {
3216 	struct buf	*pb;
3217 	int		flags;
3218 
3219 	if (ps->ps_flags & MD_MPS_ERROR) {
3220 		mirror_error(ps);
3221 		return;
3222 	}
3223 
3224 	pb = ps->ps_bp;
3225 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3226 	ps->ps_call = NULL;
3227 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3228 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3229 	if (ps->ps_flags & MD_MPS_MAPPED)
3230 		flags |= MD_STR_MAPPED;
3231 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3232 		flags |= MD_NOBLOCK;
3233 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3234 		flags |= MD_STR_DIRTY_RD;
3235 	(void) mirror_write_strategy(pb, flags, ps);
3236 }
3237 
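/*
 * ps_call routine used when the mirror write option is WR_SERIAL: map and
 * issue the write to the next submirror once the previous one has completed.
 */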
3238 static void
3239 continue_serial(md_mps_t *ps)
3240 {
3241 	md_mcs_t	*cs;
3242 	buf_t		*cb;
3243 	mm_unit_t	*un;
3244 	int		flags;
3245 
3246 	un = ps->ps_un;
3247 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3248 	mirror_child_init(cs);
3249 	cb = &cs->cs_buf;
3250 	ps->ps_call = NULL;
3251 	ps->ps_frags = 1;
3252 	(void) mirror_map_write(un, cs, ps, 0);
3253 	flags = MD_STR_NOTTOP;
3254 	if (ps->ps_flags & MD_MPS_MAPPED)
3255 		flags |= MD_STR_MAPPED;
3256 	md_call_strategy(cb, flags, NULL);
3257 }
3258 
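/*
 * Clone the parent buf onto the next writable submirror. B_FAILFAST is set
 * when the submirror allows it and no component is in the Last Erred state.
 * Returns 1 if further submirrors remain to be mapped, 0 when this was the
 * last one (or writes are serialized), and -1 if a write-after-read to the
 * label area of a labeled metadevice must be rejected.
 */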
3259 static int
3260 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3261 {
3262 	int i;
3263 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3264 	buf_t		*cb;
3265 	buf_t		*pb;
3266 	diskaddr_t	blkno;
3267 	size_t		bcount;
3268 	off_t		offset;
3269 
3270 	pb = ps->ps_bp;
3271 	cb = &cs->cs_buf;
3272 	cs->cs_ps = ps;
3273 
3274 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3275 
3276 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3277 
3278 	blkno = pb->b_lblkno;
3279 	bcount = pb->b_bcount;
3280 	offset = 0;
3281 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3282 		blkno = DK_LABEL_LOC + 1;
3283 		/*
3284 		 * This handles the case where we're requesting
3285 		 * a write to block 0 on a label partition
3286 		 * and the request size was smaller than the
3287 		 * size of the label.  If this is the case
3288 		 * then we'll return -1.  Failure to do so will
3289 		 * either cause the calling thread to hang due to
3290 		 * an ssd bug, or worse if the bcount were allowed
3291 		 * to go negative (ie large).
3292 		 */
3293 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3294 			return (-1);
3295 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3296 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3297 	}
3298 
3299 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3300 	    cb, KM_NOSLEEP);
3301 	if (war)
3302 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3303 
3304 	/*
3305 	 * If the submirror is in the erred state, check if any component is
3306 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3307 	 * flag on the IO.
3308 	 *
3309 	 * Provide a fast path for the non-erred case (which should be the
3310 	 * normal case).
3311 	 */
3312 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3313 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3314 			mm_submirror_t		*sm;
3315 			mm_submirror_ic_t	*smic;
3316 			int			ci;
3317 			int			compcnt;
3318 
3319 			sm = &un->un_sm[i];
3320 			smic = &un->un_smic[i];
3321 
3322 			compcnt = (*(smic->sm_get_component_count))
3323 			    (sm->sm_dev, un);
3324 			for (ci = 0; ci < compcnt; ci++) {
3325 				md_m_shared_t	*shared;
3326 
3327 				shared = (md_m_shared_t *)
3328 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3329 				    sm, ci);
3330 
3331 				if (shared->ms_state == CS_LAST_ERRED)
3332 					break;
3333 			}
3334 			if (ci >= compcnt)
3335 				cb->b_flags |= B_FAILFAST;
3336 
3337 		} else {
3338 			cb->b_flags |= B_FAILFAST;
3339 		}
3340 	}
3341 
3342 	ps->ps_current_sm++;
3343 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3344 		if (un->un_write_option == WR_SERIAL) {
3345 			ps->ps_call = continue_serial;
3346 			return (0);
3347 		}
3348 		return (1);
3349 	}
3350 	return (0);
3351 }
3352 
3353 /*
3354  * directed_read_done:
3355  * ------------------
3356  * Completion routine called when a DMR request has been returned from the
3357  * underlying driver. Wake-up the original ioctl() and return the data to
3358  * the user.
3359  */
3360 static void
3361 directed_read_done(md_mps_t *ps)
3362 {
3363 	mm_unit_t	*un;
3364 	mdi_unit_t	*ui;
3365 
3366 	un = ps->ps_un;
3367 	ui = ps->ps_ui;
3368 
3369 	md_unit_readerexit(ui);
3370 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3371 	ps->ps_call = NULL;
3372 
3373 	mutex_enter(&un->un_dmr_mx);
3374 	cv_signal(&un->un_dmr_cv);
3375 	mutex_exit(&un->un_dmr_mx);
3376 
3377 	/* release the parent structure */
3378 	kmem_cache_free(mirror_parent_cache, ps);
3379 }
3380 
3381 /*
3382  * daemon_io:
3383  * ------------
3384  * Called to issue a mirror_write_strategy() or mirror_read_strategy()
3385  * call from a blockable context. NOTE: no mutex can be held on entry to this
3386  * routine.
3387  */
3388 static void
3389 daemon_io(daemon_queue_t *dq)
3390 {
3391 	md_mps_t	*ps = (md_mps_t *)dq;
3392 	int		flag = MD_STR_NOTTOP;
3393 	buf_t		*pb = ps->ps_bp;
3394 
3395 	if (ps->ps_flags & MD_MPS_MAPPED)
3396 		flag |= MD_STR_MAPPED;
3397 	if (ps->ps_flags & MD_MPS_WOW)
3398 		flag |= MD_STR_WOW;
3399 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3400 		flag |= MD_STR_WAR;
3401 	if (ps->ps_flags & MD_MPS_ABR)
3402 		flag |= MD_STR_ABR;
3403 
3404 	/*
3405 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3406 	 * MD_STR_WAR before calling mirror_read_strategy
3407 	 */
3408 	if (pb->b_flags & B_READ) {
3409 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3410 			flag |= MD_STR_WAR;
3411 		mirror_read_strategy(pb, flag, ps);
3412 	} else
3413 		mirror_write_strategy(pb, flag, ps);
3414 }
3415 
3416 /*
3417  * update_resync:
3418  * -------------
3419  * Called to update the in-core version of the resync record with the latest
3420  * version that was committed to disk when the previous mirror owner
3421  * relinquished ownership. This call is likely to block as we must hold-off
3422  * any current resync processing that may be occurring.
3423  * On completion of the resync record update we issue the mirror_write_strategy
3424  * call to complete the i/o that first started this sequence. To remove a race
3425  * condition between a new write() request which is submitted and the resync
3426  * record update we acquire the writerlock. This will hold off all i/o to the
3427  * mirror until the resync update has completed.
3428  * NOTE: no mutex can be held on entry to this routine
3429  */
3430 static void
3431 update_resync(daemon_queue_t *dq)
3432 {
3433 	md_mps_t	*ps = (md_mps_t *)dq;
3434 	buf_t		*pb = ps->ps_bp;
3435 	mdi_unit_t	*ui = ps->ps_ui;
3436 	mm_unit_t	*un;
3437 	set_t		setno;
3438 	int		restart_resync;
3439 
3440 	un = md_unit_writerlock(ui);
3441 	ps->ps_un = un;
3442 	setno = MD_MIN2SET(getminor(pb->b_edev));
3443 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3444 		/*
3445 		 * Synchronize our in-core view of what regions need to be
3446 		 * resync'd with the on-disk version.
3447 		 */
3448 		mutex_enter(&un->un_rrp_inflight_mx);
3449 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3450 		    un->un_dirty_bm);
3451 		mutex_exit(&un->un_rrp_inflight_mx);
3452 
3453 		/* Region dirty map is now up to date */
3454 	}
3455 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3456 	md_unit_writerexit(ui);
3457 
3458 	/* Restart the resync thread if it was previously blocked */
3459 	if (restart_resync) {
3460 		mutex_enter(&un->un_rs_thread_mx);
3461 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3462 		cv_signal(&un->un_rs_thread_cv);
3463 		mutex_exit(&un->un_rs_thread_mx);
3464 	}
3465 	/* Continue with original deferred i/o */
3466 	daemon_io(dq);
3467 }
3468 
3469 /*
3470  * owner_timeout:
3471  * -------------
3472  * Called if the original mdmn_ksend_message() failed and the request is to be
3473  * retried. Reattempt the original ownership change.
3474  *
3475  * NOTE: called at interrupt context (see timeout(9f)).
3476  */
3477 static void
3478 owner_timeout(void *arg)
3479 {
3480 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3481 
3482 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3483 }
3484 
3485 /*
3486  * become_owner:
3487  * ------------
3488  * Called to issue RPC request to become the owner of the mirror
3489  * associated with this i/o request. We assume that the ownership request
3490  * is synchronous, so if it succeeds we will issue the request via
3491  * mirror_write_strategy().
3492  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3493  * service thread.
3494  * NOTE: no mutex should be held on entry to this routine.
3495  */
3496 static void
3497 become_owner(daemon_queue_t *dq)
3498 {
3499 	md_mps_t	*ps = (md_mps_t *)dq;
3500 	mm_unit_t	*un = ps->ps_un;
3501 	buf_t		*pb = ps->ps_bp;
3502 	set_t		setno;
3503 	md_mn_kresult_t	*kres;
3504 	int		msg_flags = md_mirror_msg_flags;
3505 	md_mps_t	*ps1;
3506 
3507 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3508 
3509 	/*
3510 	 * If we're already the mirror owner we do not need to send a message
3511 	 * but can simply process the i/o request immediately.
3512 	 * If we've already sent the request to become owner we requeue the
3513 	 * request as we're waiting for the synchronous ownership message to
3514 	 * be processed.
3515 	 */
3516 	if (MD_MN_MIRROR_OWNER(un)) {
3517 		/*
3518 		 * As the strategy() call will potentially block we need to
3519 		 * punt this to a separate thread and complete this request
3520 		 * as quickly as possible. Note: if we're a read request
3521 		 * this must be a resync, we cannot afford to be queued
3522 		 * behind any intervening i/o requests. In this case we put the
3523 		 * request on the md_mirror_rs_daemon queue.
3524 		 */
3525 		if (pb->b_flags & B_READ) {
3526 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3527 			    REQ_OLD);
3528 		} else {
3529 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3530 			    REQ_OLD);
3531 		}
3532 	} else {
3533 		mutex_enter(&un->un_owner_mx);
3534 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3535 			md_mn_req_owner_t	*msg;
3536 			int			rval = 0;
3537 
3538 			/*
3539 			 * Check to see that we haven't exceeded the maximum
3540 			 * retry count. If we have we fail the i/o as the
3541 			 * comms mechanism has become wedged beyond recovery.
3542 			 */
3543 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3544 				mutex_exit(&un->un_owner_mx);
3545 				cmn_err(CE_WARN,
3546 				    "md_mirror: Request exhausted ownership "
3547 				    "retry limit of %d attempts", dq->qlen);
3548 				pb->b_error = EIO;
3549 				pb->b_flags |= B_ERROR;
3550 				pb->b_resid = pb->b_bcount;
3551 				kmem_cache_free(mirror_parent_cache, ps);
3552 				md_biodone(pb);
3553 				return;
3554 			}
3555 
3556 			/*
3557 			 * Issue request to change ownership. The call is
3558 			 * synchronous so when it returns we can complete the
3559 			 * i/o (if successful), or enqueue it again so that
3560 			 * the operation will be retried.
3561 			 */
3562 			un->un_owner_state |= MM_MN_OWNER_SENT;
3563 			mutex_exit(&un->un_owner_mx);
3564 
3565 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3566 			setno = MD_MIN2SET(getminor(pb->b_edev));
3567 			msg->mnum = MD_SID(un);
3568 			msg->owner = md_mn_mynode_id;
3569 			msg_flags |= MD_MSGF_NO_LOG;
3570 			/*
3571 			 * If this IO is triggered by updating a watermark,
3572 			 * it might be issued by the creation of a softpartition
3573 			 * while the commd subsystem is suspended.
3574 			 * We don't want this message to block.
3575 			 */
3576 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3577 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3578 			}
3579 
3580 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3581 			rval = mdmn_ksend_message(setno,
3582 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags,
3583 			    /* flags */ (char *)msg,
3584 			    sizeof (md_mn_req_owner_t), kres);
3585 
3586 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3587 
3588 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3589 				dq->qlen = 0;
3590 				/*
3591 				 * Successfully changed owner, reread the
3592 				 * resync record so that we have a valid idea of
3593 				 * any previously committed incomplete write()s.
3594 				 * NOTE: As we need to acquire the resync mutex
3595 				 * this may block, so we defer it to a separate
3596 				 * thread handler. This makes us (effectively)
3597 				 * non-blocking once the ownership message
3598 				 * handling has completed.
3599 				 */
3600 				mutex_enter(&un->un_owner_mx);
3601 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3602 					un->un_mirror_owner = md_mn_mynode_id;
3603 					/* Sets owner of un_rr_dirty record */
3604 					if (un->un_rr_dirty_recid)
3605 						(void) mddb_setowner(
3606 						    un->un_rr_dirty_recid,
3607 						    md_mn_mynode_id);
3608 					un->un_owner_state &=
3609 					    ~MM_MN_BECOME_OWNER;
3610 					/*
3611 					 * Release the block on the current
3612 					 * resync region if it is blocked
3613 					 */
3614 					ps1 = un->un_rs_prev_overlap;
3615 					if ((ps1 != NULL) &&
3616 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3617 						mirror_overlap_tree_remove(ps1);
3618 					mutex_exit(&un->un_owner_mx);
3619 
3620 					/*
3621 					 * If we're a read, this must be a
3622 					 * resync request, issue
3623 					 * the i/o request on the
3624 					 * md_mirror_rs_daemon queue. This is
3625 					 * to avoid a deadlock between the
3626 					 * resync_unit thread and
3627 					 * subsequent i/o requests that may
3628 					 * block on the resync region.
3629 					 */
3630 					if (pb->b_flags & B_READ) {
3631 						daemon_request(
3632 						    &md_mirror_rs_daemon,
3633 						    update_resync, dq, REQ_OLD);
3634 					} else {
3635 						daemon_request(
3636 						    &md_mirror_io_daemon,
3637 						    update_resync, dq, REQ_OLD);
3638 					}
3639 					kmem_free(kres,
3640 					    sizeof (md_mn_kresult_t));
3641 					return;
3642 				} else {
3643 					/*
3644 					 * Some other node has beaten us to
3645 					 * obtain ownership. We need to
3646 					 * reschedule our ownership request
3647 					 */
3648 					mutex_exit(&un->un_owner_mx);
3649 				}
3650 			} else {
3651 				mdmn_ksend_show_error(rval, kres,
3652 				    "MD_MN_MSG_REQUIRE_OWNER");
3653 				/*
3654 				 * Message transport failure is handled by the
3655 				 * comms layer. If the ownership change request
3656 				 * does not succeed we need to flag the error to
3657 				 * the initiator of the i/o. This is handled by
3658 				 * the retry logic above. As the request failed
3659 				 * we do not know _who_ the owner of the mirror
3660 				 * currently is. We reset our idea of the owner
3661 				 * to None so that any further write()s will
3662 				 * attempt to become the owner again. This stops
3663 				 * multiple nodes writing to the same mirror
3664 				 * simultaneously.
3665 				 */
3666 				mutex_enter(&un->un_owner_mx);
3667 				un->un_owner_state &=
3668 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3669 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3670 				mutex_exit(&un->un_owner_mx);
3671 			}
3672 			kmem_free(kres, sizeof (md_mn_kresult_t));
3673 		} else
3674 			mutex_exit(&un->un_owner_mx);
3675 
3676 		/*
3677 		 * Re-enqueue this request on the deferred i/o list. Delay the
3678 		 * request for md_mirror_owner_to usecs to stop thrashing.
3679 		 */
3680 		(void) timeout(owner_timeout, dq,
3681 		    drv_usectohz(md_mirror_owner_to));
3682 	}
3683 }
3684 
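/*
 * Top-level write routine for the mirror. Non-resync writes are held off
 * while a MN state change is in progress, the parent save structure is set
 * up and, for a MN set, mirror ownership and the current resync region are
 * dealt with before the write is issued.
 */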
3685 static void
3686 mirror_write_strategy(buf_t *pb, int flag, void *private)
3687 {
3688 	md_mps_t	*ps;
3689 	md_mcs_t	*cs;
3690 	int		more;
3691 	mm_unit_t	*un;
3692 	mdi_unit_t	*ui;
3693 	buf_t		*cb;		/* child buf pointer */
3694 	set_t		setno;
3695 	int		rs_on_overlap = 0;
3696 
3697 	ui = MDI_UNIT(getminor(pb->b_edev));
3698 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3699 
3700 
3701 	md_kstat_waitq_enter(ui);
3702 
3703 	/*
3704 	 * If a state change is in progress for this mirror in a MN set,
3705 	 * suspend all non-resync writes until the state change is complete.
3706 	 * The objective of this suspend is to ensure that it is not
3707 	 * possible for one node to read data from a submirror that another node
3708 	 * has not written to because of the state change. Therefore we
3709 	 * suspend all writes until the state change has been made. As it is
3710 	 * not possible to read from the target of a resync, there is no need
3711 	 * to suspend resync writes.
3712 	 */
3713 
3714 	if (!(flag & MD_STR_WAR)) {
3715 		mutex_enter(&un->un_suspend_wr_mx);
3716 		while (un->un_suspend_wr_flag) {
3717 			cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx);
3718 		}
3719 		mutex_exit(&un->un_suspend_wr_mx);
3720 		(void) md_unit_readerlock(ui);
3721 	}
3722 
3723 	if (!(flag & MD_STR_NOTTOP)) {
3724 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3725 			md_kstat_waitq_exit(ui);
3726 			return;
3727 		}
3728 	}
3729 
3730 	setno = MD_MIN2SET(getminor(pb->b_edev));
3731 
3732 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3733 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3734 		flag |= MD_STR_ABR;
3735 
3736 	if (private == NULL) {
3737 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3738 		mirror_parent_init(ps);
3739 	} else {
3740 		ps = private;
3741 		private = NULL;
3742 	}
3743 	if (flag & MD_STR_MAPPED)
3744 		ps->ps_flags |= MD_MPS_MAPPED;
3745 
3746 	if (flag & MD_STR_WOW)
3747 		ps->ps_flags |= MD_MPS_WOW;
3748 
3749 	if (flag & MD_STR_ABR)
3750 		ps->ps_flags |= MD_MPS_ABR;
3751 
3752 	if (flag & MD_STR_WMUPDATE)
3753 		ps->ps_flags |= MD_MPS_WMUPDATE;
3754 
3755 	/*
3756 	 * Save essential information from the original buffhdr
3757 	 * in the md_save structure.
3758 	 */
3759 	ps->ps_un = un;
3760 	ps->ps_ui = ui;
3761 	ps->ps_bp = pb;
3762 	ps->ps_addr = pb->b_un.b_addr;
3763 	ps->ps_firstblk = pb->b_lblkno;
3764 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3765 	ps->ps_changecnt = un->un_changecnt;
3766 
3767 	/*
3768 	 * If not MN owner and this is an ABR write, make sure the current
3769 	 * resync region is in the overlaps tree
3770 	 */
3771 	mutex_enter(&un->un_owner_mx);
3772 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3773 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3774 		md_mps_t	*ps1;
3775 		/* Block the current resync region, if not already blocked */
3776 		ps1 = un->un_rs_prev_overlap;
3777 
3778 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3779 		    (ps1->ps_lastblk != 0))) {
3780 			/* Drop locks to avoid deadlock */
3781 			mutex_exit(&un->un_owner_mx);
3782 			md_unit_readerexit(ui);
3783 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3784 			rs_on_overlap = 1;
3785 			(void) md_unit_readerlock(ui);
3786 			mutex_enter(&un->un_owner_mx);
3787 			/*
3788 			 * Check to see if we have obtained ownership
3789 			 * while waiting for overlaps. If we have, remove
3790 			 * the resync_region entry from the overlap tree
3791 			 */
3792 			if (MD_MN_MIRROR_OWNER(un) &&
3793 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3794 				mirror_overlap_tree_remove(ps1);
3795 				rs_on_overlap = 0;
3796 			}
3797 		}
3798 	}
3799 	mutex_exit(&un->un_owner_mx);
3800 
3801 
3802 	/*
3803 	 * following keep write after read from writing to the
3804 	 * source in the case where it all came from one place
3805 	 */
3806 	if (flag & MD_STR_WAR) {
3807 		int	abort_write = 0;
3808 		/*
3809 		 * We are performing a write-after-read, either as the result
3810 		 * of a resync read or of a read in a dirty resync region
3811 		 * when the optimized resync is not complete. If this is a MN
3812 		 * set and the i/o was generated by a resync (not by a
3813 		 * dirty-region read), terminate the write if the current
3814 		 * block is no longer in the current resync region, as
3815 		 * another node must have completed this resync region.
3816 		 */
3817 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3818 		    !(flag & MD_STR_DIRTY_RD)) {
3819 			if (!IN_RESYNC_REGION(un, ps))
3820 				abort_write = 1;
3821 		}
3822 		if ((select_write_after_read_units(un, ps) == 0) ||
3823 		    (abort_write)) {
3824 #ifdef DEBUG
3825 			if (mirror_debug_flag)
3826 				printf("Abort resync write on %x, block %lld\n",
3827 				    MD_SID(un), ps->ps_firstblk);
3828 #endif
3829 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3830 				mirror_overlap_tree_remove(ps);
3831 			kmem_cache_free(mirror_parent_cache, ps);
3832 			md_kstat_waitq_exit(ui);
3833 			md_unit_readerexit(ui);
3834 			md_biodone(pb);
3835 			return;
3836 		}
3837 	} else {
3838 		select_write_units(un, ps);
3839 
3840 		/* Drop readerlock to avoid deadlock */
3841 		md_unit_readerexit(ui);
3842 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3843 		un = md_unit_readerlock(ui);
3844 		/*
3845 		 * For a MN set with an ABR write, if we are now the
3846 		 * owner and we have a resync region in the overlap
3847 		 * tree, remove the entry from overlaps and retry the write.
3848 		 */
3849 
3850 		if (MD_MNSET_SETNO(setno) &&
3851 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3852 			mutex_enter(&un->un_owner_mx);
3853 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3854 				mirror_overlap_tree_remove(ps);
3855 				md_kstat_waitq_exit(ui);
3856 				mutex_exit(&un->un_owner_mx);
3857 				md_unit_readerexit(ui);
3858 				daemon_request(&md_mirror_daemon, daemon_io,
3859 				    (daemon_queue_t *)ps, REQ_OLD);
3860 				return;
3861 			}
3862 			mutex_exit(&un->un_owner_mx);
3863 		}
3864 	}
3865 
3866 	/*
3867 	 * For Multinode mirrors with a Resync Region (not ABR) we need to
3868 	 * become the mirror owner before continuing with the write(). For ABR
3869 	 * mirrors we check that we 'own' the resync if we're in
3870 	 * write-after-read mode. We do this _after_ ensuring that there are no
3871 	 * overlaps to ensure that once we know that we are the owner, the
3872 	 * readerlock will not be released until the write is complete. As a
3873 	 * change of ownership in a MN set requires the writerlock, this
3874 	 * ensures that ownership cannot be changed until the write is
3875 	 * complete.
3876 	 */
3877 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3878 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3879 		if (!MD_MN_MIRROR_OWNER(un))  {
3880 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3881 				mirror_overlap_tree_remove(ps);
3882 			md_kstat_waitq_exit(ui);
3883 			ASSERT(!(flag & MD_STR_WAR));
3884 			md_unit_readerexit(ui);
3885 			daemon_request(&md_mirror_daemon, become_owner,
3886 			    (daemon_queue_t *)ps, REQ_OLD);
3887 			return;
3888 		}
3889 	}
3890 
3891 	/*
3892 	 * Mark resync region if mirror has a Resync Region _and_ we are not
3893 	 * a resync initiated write(). Don't mark region if we're flagged as
3894 	 * an ABR write.
3895 	 */
3896 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
3897 	    !(flag & MD_STR_WAR)) {
3898 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
3899 		    ps->ps_lastblk)) {
3900 			pb->b_flags |= B_ERROR;
3901 			pb->b_resid = pb->b_bcount;
3902 			ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3903 			kmem_cache_free(mirror_parent_cache, ps);
3904 			md_kstat_waitq_exit(ui);
3905 			md_unit_readerexit(ui);
3906 			md_biodone(pb);
3907 			return;
3908 		}
3909 	}
3910 
3911 	ps->ps_childbflags = pb->b_flags | B_WRITE;
3912 	ps->ps_childbflags &= ~B_READ;
3913 	if (flag & MD_STR_MAPPED)
3914 		ps->ps_childbflags &= ~B_PAGEIO;
3915 
3916 	if (!(flag & MD_STR_NOTTOP) && panicstr)
3917 		/* Disable WOW and don't free ps */
3918 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
3919 
3920 	md_kstat_waitq_to_runq(ui);
3921 
3922 	/*
3923 	 * Treat Raw and Direct I/O as Write-on-Write always
3924 	 */
3925 
3926 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3927 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
3928 	    (pb->b_flags & B_PHYS) &&
3929 	    !(ps->ps_flags & MD_MPS_WOW)) {
3930 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3931 			mirror_overlap_tree_remove(ps);
3932 		md_unit_readerexit(ui);
3933 		daemon_request(&md_mstr_daemon, handle_wow,
3934 		    (daemon_queue_t *)ps, REQ_OLD);
3935 		return;
3936 	}
3937 
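	/*
	 * Issue the write one fragment at a time. mirror_map_write() maps
	 * the next fragment onto the submirrors; a positive return means
	 * that more fragments (and hence more child bufs) are required to
	 * cover the remainder of the parent request.
	 */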
3938 	ps->ps_frags = 1;
3939 	do {
3940 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3941 		mirror_child_init(cs);
3942 		cb = &cs->cs_buf;
3943 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
3944 
3945 		/*
3946 		 * This handles the case where we're requesting
3947 		 * a write to block 0 on a label partition.  (more < 0)
3948 		 * means that the request size was smaller than the
3949 		 * size of the label.  If so this request is done.
3950 		 */
3951 		if (more < 0) {
3952 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3953 				mirror_overlap_tree_remove(ps);
3954 			md_kstat_runq_exit(ui);
3955 			kmem_cache_free(mirror_child_cache, cs);
3956 			kmem_cache_free(mirror_parent_cache, ps);
3957 			md_unit_readerexit(ui);
3958 			md_biodone(pb);
3959 			return;
3960 		}
3961 		if (more) {
3962 			mutex_enter(&ps->ps_mx);
3963 			ps->ps_frags++;
3964 			mutex_exit(&ps->ps_mx);
3965 		}
3966 		md_call_strategy(cb, flag, private);
3967 	} while (more);
3968 
3969 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
3970 		while (!(ps->ps_flags & MD_MPS_DONE)) {
3971 			md_daemon(1, &md_done_daemon);
3972 			drv_usecwait(10);
3973 		}
3974 		kmem_cache_free(mirror_parent_cache, ps);
3975 	}
3976 }
3977 
3978 static void
3979 mirror_read_strategy(buf_t *pb, int flag, void *private)
3980 {
3981 	md_mps_t	*ps;
3982 	md_mcs_t	*cs;
3983 	size_t		more;
3984 	mm_unit_t	*un;
3985 	mdi_unit_t	*ui;
3986 	size_t		current_count;
3987 	diskaddr_t	current_blkno;
3988 	off_t		current_offset;
3989 	buf_t		*cb;		/* child buf pointer */
3990 	set_t		setno;
3991 
3992 	ui = MDI_UNIT(getminor(pb->b_edev));
3993 
3994 	md_kstat_waitq_enter(ui);
3995 
3996 	un = (mm_unit_t *)md_unit_readerlock(ui);
3997 
3998 	if (!(flag & MD_STR_NOTTOP)) {
3999 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4000 			md_kstat_waitq_exit(ui);
4001 			return;
4002 		}
4003 	}
4004 
4005 	if (private == NULL) {
4006 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4007 		mirror_parent_init(ps);
4008 	} else {
4009 		ps = private;
4010 		private = NULL;
4011 	}
4012 
4013 	if (flag & MD_STR_MAPPED)
4014 		ps->ps_flags |= MD_MPS_MAPPED;
4015 	if (flag & MD_NOBLOCK)
4016 		ps->ps_flags |= MD_MPS_NOBLOCK;
4017 	if (flag & MD_STR_WMUPDATE)
4018 		ps->ps_flags |= MD_MPS_WMUPDATE;
4019 
4020 	/*
4021 	 * Check to see if this is a DMR driven read. If so we need to use the
4022 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4023 	 */
4024 	if (flag & MD_STR_DMR)
4025 		ps->ps_flags |= MD_MPS_DMR;
4026 
4027 	/*
4028 	 * Save essential information from the original buffhdr
4029 	 * in the md_save structure.
4030 	 */
4031 	ps->ps_un = un;
4032 	ps->ps_ui = ui;
4033 	ps->ps_bp = pb;
4034 	ps->ps_addr = pb->b_un.b_addr;
4035 	ps->ps_firstblk = pb->b_lblkno;
4036 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4037 	ps->ps_changecnt = un->un_changecnt;
4038 
4039 	current_count = btodb(pb->b_bcount);
4040 	current_blkno = pb->b_lblkno;
4041 	current_offset = 0;
4042 
4043 	/*
4044 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4045 	 * resync thread; the resync may or may not be an optimized resync.
4046 	 *
4047 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4048 	 * code has not completed; either a resync has not started since snarf,
4049 	 * or there is an optimized resync in progress.
4050 	 *
4051 	 * We need to generate a write after this read in the following two
4052 	 * cases,
4053 	 *
4054 	 * 1. Any Resync-Generated read
4055 	 *
4056 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4057 	 *    pending or in progress.
4058 	 *
4059 	 * The write after read is done in these cases to ensure that all sides
4060 	 * of the mirror are in sync with the read data and that it is not
4061 	 * possible for an application to read the same block multiple times
4062 	 * and get different data.
4063 	 *
4064 	 * This would be possible if the block was in a dirty region.
4065 	 *
4066 	 * If we're performing a directed read we don't write the data out as
4067 	 * the application is responsible for restoring the mirror to a known
4068 	 * state.
4069 	 */
4070 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4071 	    !(flag & MD_STR_DMR)) {
4072 		size_t	start_rr, i, end_rr;
4073 		int	region_dirty = 1;
4074 
4075 		/*
4076 		 * We enter here under three circumstances,
4077 		 *
4078 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4079 		 * 0			1
4080 		 * 1			0
4081 		 * 1			1
4082 		 *
4083 		 * To be optimal we only care to explicitly check for dirty
4084 		 * regions in the second case since if MD_STR_WAR is set we
4085 		 * always do the write after read.
4086 		 */
4087 		if (!(flag & MD_STR_WAR)) {
4088 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4089 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4090 
4091 			for (i = start_rr; i <= end_rr; i++)
4092 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4093 					break;
4094 		}
4095 
4096 		if ((region_dirty) &&
4097 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4098 			ps->ps_call = write_after_read;
4099 			/*
4100 			 * Mark this as a RESYNC_READ in ps_flags.
4101 			 * This is used if the read fails during a
4102 			 * resync of a 3-way mirror to ensure that
4103 			 * the retried read to the remaining
4104 			 * good submirror has MD_STR_WAR set. This
4105 			 * is needed to ensure that the resync write
4106 			 * (write-after-read) takes place.
4107 			 */
4108 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4109 
4110 			/*
4111 			 * If MD_STR_FLAG_ERR is set in the flags we
4112 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4113 			 * write (issued by write_after_read) will be flagged
4114 			 * to the biowait'ing resync thread. This allows us to
4115 			 * avoid issuing further resync requests to a device
4116 			 * that has had a write failure.
4117 			 */
4118 			if (flag & MD_STR_FLAG_ERR)
4119 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4120 
4121 			setno = MD_UN2SET(un);
4122 			/*
4123 			 * Drop the readerlock to avoid
4124 			 * deadlock
4125 			 */
4126 			md_unit_readerexit(ui);
4127 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4128 			un = md_unit_readerlock(ui);
4129 			/*
4130 			 * Ensure that we are owner
4131 			 */
4132 			if (MD_MNSET_SETNO(setno)) {
4133 				/*
4134 				 * For a non-resync read that requires a
4135 				 * write-after-read to be done, set a flag
4136 				 * in the parent structure, so that the
4137 				 * write_strategy routine can omit the
4138 				 * test that the write is still within the
4139 				 * resync region
4140 				 */
4141 				if (!(flag & MD_STR_WAR))
4142 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4143 
4144 				/*
4145 				 * Before reading the buffer, see if
4146 				 * we are the owner
4147 				 */
4148 				if (!MD_MN_MIRROR_OWNER(un))  {
4149 					ps->ps_call = NULL;
4150 					mirror_overlap_tree_remove(ps);
4151 					md_kstat_waitq_exit(ui);
4152 					md_unit_readerexit(ui);
4153 					daemon_request(
4154 					    &md_mirror_daemon,
4155 					    become_owner,
4156 					    (daemon_queue_t *)ps,
4157 					    REQ_OLD);
4158 					return;
4159 				}
4160 				/*
4161 				 * For a resync read, check to see if I/O is
4162 				 * outside of the current resync region, or
4163 				 * the resync has finished. If so
4164 				 * just terminate the I/O
4165 				 */
4166 				if ((flag & MD_STR_WAR) &&
4167 				    (!(un->c.un_status & MD_UN_WAR) ||
4168 				    (!IN_RESYNC_REGION(un, ps)))) {
4169 #ifdef DEBUG
4170 					if (mirror_debug_flag)
4171 						printf("Abort resync read "
4172 						    "%x: %lld\n",
4173 						    MD_SID(un),
4174 						    ps->ps_firstblk);
4175 #endif
4176 					mirror_overlap_tree_remove(ps);
4177 					kmem_cache_free(mirror_parent_cache,
4178 					    ps);
4179 					md_kstat_waitq_exit(ui);
4180 					md_unit_readerexit(ui);
4181 					md_biodone(pb);
4182 					return;
4183 				}
4184 			}
4185 		}
4186 	}
4187 
4188 	if (flag & MD_STR_DMR) {
4189 		ps->ps_call = directed_read_done;
4190 	}
4191 
4192 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4193 		ps->ps_flags |= MD_MPS_DONTFREE;
4194 
4195 	md_kstat_waitq_to_runq(ui);
4196 
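	/*
	 * Issue the read one fragment at a time: clone the parent buf,
	 * map the fragment onto a submirror via mirror_map_read(), and
	 * keep going while a non-zero 'more' indicates that part of the
	 * request is still outstanding.
	 */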
4197 	ps->ps_frags++;
4198 	do {
4199 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4200 		mirror_child_init(cs);
4201 		cb = &cs->cs_buf;
4202 		cs->cs_ps = ps;
4203 
4204 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4205 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4206 
4207 		more = mirror_map_read(ps, cs, current_blkno,
4208 		    (u_longlong_t)current_count);
4209 		if (more) {
4210 			mutex_enter(&ps->ps_mx);
4211 			ps->ps_frags++;
4212 			mutex_exit(&ps->ps_mx);
4213 		}
4214 
4215 		/*
4216 		 * Do these calculations now,
4217 		 *  so that we pick up a valid b_bcount from the child bp.
4218 		 */
4219 		current_count -= more;
4220 		current_offset += cb->b_bcount;
4221 		current_blkno +=  more;
4222 		md_call_strategy(cb, flag, private);
4223 	} while (more);
4224 
4225 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4226 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4227 			md_daemon(1, &md_done_daemon);
4228 			drv_usecwait(10);
4229 		}
4230 		kmem_cache_free(mirror_parent_cache, ps);
4231 	}
4232 }
4233 
4234 void
4235 md_mirror_strategy(buf_t *bp, int flag, void *private)
4236 {
4237 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4238 
4239 	/*
4240 	 * When doing IO to a multi owner meta device, check if set is halted.
4241 	 * We do this check without the needed lock held, for performance
4242 	 * reasons.
4243 	 * If an IO just slips through while the set is locked via an
4244 	 * MD_MN_SUSPEND_SET, we don't care about it.
4245 	 * Only check for suspension if we are a top-level i/o request
4246 	 * (MD_STR_NOTTOP is cleared in 'flag').
4247 	 */
4248 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4249 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4250 		if ((flag & MD_STR_NOTTOP) == 0) {
4251 			mutex_enter(&md_mx);
4252 			/* Here we loop until the set is no longer halted */
4253 			while (md_set[setno].s_status & MD_SET_HALTED) {
4254 				cv_wait(&md_cv, &md_mx);
4255 			}
4256 			mutex_exit(&md_mx);
4257 		}
4258 	}
4259 
4260 	if ((flag & MD_IO_COUNTED) == 0) {
4261 		if ((flag & MD_NOBLOCK) == 0) {
4262 			if (md_inc_iocount(setno) != 0) {
4263 				bp->b_flags |= B_ERROR;
4264 				bp->b_error = ENXIO;
4265 				bp->b_resid = bp->b_bcount;
4266 				biodone(bp);
4267 				return;
4268 			}
4269 		} else {
4270 			md_inc_iocount_noblock(setno);
4271 		}
4272 	}
4273 
4274 	if (bp->b_flags & B_READ)
4275 		mirror_read_strategy(bp, flag, private);
4276 	else
4277 		mirror_write_strategy(bp, flag, private);
4278 }
4279 
4280 /*
4281  * mirror_directed_read:
4282  * --------------------
4283  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4284  * so that the application can determine what (if any) resync needs to be
4285  * performed. The data is copied out to the user-supplied buffer.
4286  *
4287  * Parameters:
4288  *	mdev	- dev_t for the mirror device
4289  *	vdr	- directed read parameters specifying location and submirror
4290  *		  to perform the read from
4291  *	mode	- used to ddi_copyout() any resulting data from the read
4292  *
4293  * Returns:
4294  *	0	success
4295  *	!0	error code
4296  *		EINVAL - invalid request format
4297  */
4298 int
4299 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4300 {
4301 	buf_t		*bp;
4302 	minor_t		mnum = getminor(mdev);
4303 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4304 	mm_unit_t	*un;
4305 	mm_submirror_t	*sm;
4306 	char		*sm_nm;
4307 	uint_t		next_side;
4308 	void		*kbuffer;
4309 
4310 	if (ui == NULL)
4311 		return (ENXIO);
4312 
4313 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4314 		return (EINVAL);
4315 	}
4316 
4317 	/* Check for aligned block access. We disallow non-aligned requests. */
4318 	if (vdr->vdr_offset % DEV_BSIZE) {
4319 		return (EINVAL);
4320 	}
4321 
4322 	/*
4323 	 * Allocate kernel buffer for target of read(). If we had a reliable
4324 	 * (sorry functional) DDI this wouldn't be needed.
4325 	 */
4326 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4327 	if (kbuffer == NULL) {
4328 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4329 		    " bytes\n", vdr->vdr_nbytes);
4330 		return (ENOMEM);
4331 	}
4332 
4333 	bp = getrbuf(KM_SLEEP);
4334 
4335 	bp->b_un.b_addr = kbuffer;
4336 	bp->b_flags = B_READ;
4337 	bp->b_bcount = vdr->vdr_nbytes;
4338 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4339 	bp->b_edev = mdev;
4340 
4341 	un = md_unit_readerlock(ui);
4342 
4343 	/*
4344 	 * If DKV_SIDE_INIT is set we need to determine the first available
4345 	 * side to start reading from. If it isn't set we increment to the
4346 	 * next readable submirror.
4347 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4348 	 * Note: we check for a readable submirror on completion of the i/o so
4349 	 * we should _always_ have one available. If this becomes unavailable
4350 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4351 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4352 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4353 	 * The chance of this is small, but not non-existent.
4354 	 */
4355 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4356 		next_side = 0;
4357 	} else {
4358 		next_side = vdr->vdr_side + 1;
4359 	}
4360 	while ((next_side < NMIRROR) &&
4361 	    !SUBMIRROR_IS_READABLE(un, next_side))
4362 		next_side++;
4363 	if (next_side >= NMIRROR) {
4364 		vdr->vdr_flags |= DKV_DMR_ERROR;
4365 		freerbuf(bp);
4366 		vdr->vdr_bytesread = 0;
4367 		md_unit_readerexit(ui);
4368 		return (0);
4369 	}
4370 
4371 	/* Set the side to read from */
4372 	un->un_dmr_last_read = next_side;
4373 
4374 	md_unit_readerexit(ui);
4375 
4376 	/*
4377 	 * Save timestamp for verification purposes. Can be read by debugger
4378 	 * to verify that this ioctl has been executed and to find the number
4379 	 * of DMR reads and the time of the last DMR read.
4380 	 */
4381 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4382 	mirror_dmr_stats.dmr_count++;
4383 
4384 	/* Issue READ request and wait for completion */
4385 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4386 
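	/*
	 * Block until the directed read completes. The completion callback
	 * (directed_read_done, installed as ps_call for MD_STR_DMR reads)
	 * is expected to signal un_dmr_cv once the child i/o has finished.
	 */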
4387 	mutex_enter(&un->un_dmr_mx);
4388 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4389 	mutex_exit(&un->un_dmr_mx);
4390 
4391 	/*
4392 	 * Check to see if we encountered an error during the read. If so we
4393 	 * can make no guarantee about any possibly returned data.
4394 	 */
4395 	if ((bp->b_flags & B_ERROR) == 0) {
4396 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4397 		if (bp->b_resid) {
4398 			vdr->vdr_flags |= DKV_DMR_SHORT;
4399 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4400 		} else {
4401 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4402 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4403 		}
4404 		/* Copy the data read back out to the user supplied buffer */
4405 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4406 		    mode)) {
4407 			kmem_free(kbuffer, vdr->vdr_nbytes);
			freerbuf(bp);
4408 			return (EFAULT);
4409 		}
4410 
4411 	} else {
4412 		/* Error out with DKV_DMR_ERROR */
4413 		vdr->vdr_flags |= DKV_DMR_ERROR;
4414 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4415 	}
4416 	/*
4417 	 * Update the DMR parameters with the side and name of submirror that
4418 	 * we have just read from (un->un_dmr_last_read)
4419 	 */
4420 	un = md_unit_readerlock(ui);
4421 
4422 	vdr->vdr_side = un->un_dmr_last_read;
4423 	sm = &un->un_sm[un->un_dmr_last_read];
4424 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4425 
4426 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4427 
4428 	/*
4429 	 * Determine if we've completed the read cycle. This is true iff the
4430 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4431 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4432 	 * can occur if a submirror is metadetached).
4433 	 */
4434 	next_side = un->un_dmr_last_read + 1;
4435 	while ((next_side < NMIRROR) &&
4436 	    !SUBMIRROR_IS_READABLE(un, next_side))
4437 		next_side++;
4438 	if (next_side >= NMIRROR) {
4439 		/* We've finished */
4440 		vdr->vdr_flags |= DKV_DMR_DONE;
4441 	}
4442 
4443 	md_unit_readerexit(ui);
4444 	freerbuf(bp);
4445 	kmem_free(kbuffer, vdr->vdr_nbytes);
4446 
4447 	return (0);
4448 }
4449 
4450 /*
4451  * mirror_resync_message:
4452  * ---------------------
4453  * Handle the multi-node resync messages that keep all nodes within a given
4454  * disk-set in sync with their view of a mirror's resync status.
4455  *
4456  * The message types dealt with are:
4457  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4458  * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
4459  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4460  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4461  *
4462  * Returns:
4463  *	0	Success
4464  *	>0	Failure error number
4465  */
4466 int
4467 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4468 {
4469 	mdi_unit_t		*ui;
4470 	mm_unit_t		*un;
4471 	set_t			setno;
4472 	int			is_ABR;
4473 	int			smi;
4474 	int			ci;
4475 	sm_state_t		state;
4476 	int			broke_out;
4477 	mm_submirror_t		*sm;
4478 	mm_submirror_ic_t	*smic;
4479 	md_m_shared_t		*shared;
4480 	md_error_t		mde = mdnullerror;
4481 	md_mps_t		*ps;
4482 	int			rs_active;
4483 
4484 	/* Check that the given device is part of a multi-node set */
4485 	setno = MD_MIN2SET(p->mnum);
4486 	if (setno >= md_nsets) {
4487 		return (ENXIO);
4488 	}
4489 	if (!MD_MNSET_SETNO(setno)) {
4490 		return (EINVAL);
4491 	}
4492 
4493 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4494 		return (EINVAL);
4495 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4496 		return (EINVAL);
4497 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4498 
4499 	/* Obtain the current resync status */
4500 	(void) md_ioctl_readerlock(lockp, ui);
4501 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4502 	md_ioctl_readerexit(lockp);
4503 
4504 	switch ((md_mn_msgtype_t)p->msg_type) {
4505 	case MD_MN_MSG_RESYNC_STARTING:
4506 		/* Start the resync thread for the mirror */
4507 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4508 		break;
4509 
4510 	case MD_MN_MSG_RESYNC_NEXT:
4511 		/*
4512 		 * We have to release any previously marked overlap regions
4513 		 * so that i/o can resume. Then we need to block the region
4514 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4515 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4516 		 */
4517 		(void) md_ioctl_readerlock(lockp, ui);
4518 		/*
4519 		 * Ignore the message if there is no active resync thread or
4520 		 * if it is for a resync type that we have already completed.
4521 		 * un_resync_completed is set to the last resync completed
4522 		 * when processing a PHASE_DONE message.
4523 		 */
4524 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4525 			break;
4526 		/*
4527 		 * If this message is for the same resync and is for an earlier
4528 		 * resync region, just ignore it. This can only occur if this
4529 		 * node has progressed on to the next resync region before
4530 		 * we receive this message. This can occur if the class for
4531 		 * this message is busy and the originator has to retry thus
4532 		 * allowing this node to move onto the next resync_region.
4533 		 */
4534 		if ((p->rs_type == un->un_rs_type) &&
4535 		    (p->rs_start < un->un_resync_startbl))
4536 			break;
4537 		ps = un->un_rs_prev_overlap;
4538 
4539 		/* Allocate previous overlap reference if needed */
4540 		if (ps == NULL) {
4541 			ps = kmem_cache_alloc(mirror_parent_cache,
4542 			    MD_ALLOCFLAGS);
4543 			ps->ps_un = un;
4544 			ps->ps_ui = ui;
4545 			ps->ps_firstblk = 0;
4546 			ps->ps_lastblk = 0;
4547 			ps->ps_flags = 0;
4548 			md_ioctl_readerexit(lockp);
4549 			(void) md_ioctl_writerlock(lockp, ui);
4550 			un->un_rs_prev_overlap = ps;
4551 			md_ioctl_writerexit(lockp);
4552 		} else
4553 			md_ioctl_readerexit(lockp);
4554 
4555 		if (p->rs_originator != md_mn_mynode_id) {
4556 			/*
4557 			 * On all but the originating node, first update
4558 			 * the resync state, then unblock the previous
4559 			 * region and block the next one. No need
4560 			 * to do this if the region is already blocked.
4561 			 * Update the submirror state and flags from the
4562 			 * originator. This keeps the cluster in sync with
4563 			 * regards to the resync status.
4564 			 */
4565 
4566 			(void) md_ioctl_writerlock(lockp, ui);
4567 			un->un_rs_resync_done = p->rs_done;
4568 			un->un_rs_resync_2_do = p->rs_2_do;
4569 			un->un_rs_type = p->rs_type;
4570 			un->un_resync_startbl = p->rs_start;
4571 			md_ioctl_writerexit(lockp);
4572 			/*
4573 			 * Use un_owner_mx to ensure that an ownership change
4574 			 * cannot happen at the same time as this message
4575 			 */
4576 			mutex_enter(&un->un_owner_mx);
4577 			if (MD_MN_MIRROR_OWNER(un)) {
4578 				ps->ps_firstblk = p->rs_start;
4579 				ps->ps_lastblk = ps->ps_firstblk +
4580 				    p->rs_size - 1;
4581 			} else {
4582 				if ((ps->ps_firstblk != p->rs_start) ||
4583 				    (ps->ps_lastblk != p->rs_start +
4584 				    p->rs_size - 1)) {
4585 					/* Remove previous overlap range */
4586 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4587 						mirror_overlap_tree_remove(ps);
4588 
4589 					ps->ps_firstblk = p->rs_start;
4590 					ps->ps_lastblk = ps->ps_firstblk +
4591 					    p->rs_size - 1;
4592 
4593 					mutex_exit(&un->un_owner_mx);
4594 					/* Block this range from all i/o. */
4595 					if (ps->ps_firstblk != 0 ||
4596 					    ps->ps_lastblk != 0)
4597 						wait_for_overlaps(ps,
4598 						    MD_OVERLAP_ALLOW_REPEAT);
4599 					mutex_enter(&un->un_owner_mx);
4600 					/*
4601 					 * Check to see if we have obtained
4602 					 * ownership while waiting for
4603 					 * overlaps. If we have, remove
4604 					 * the resync_region entry from the
4605 					 * overlap tree
4606 					 */
4607 					if (MD_MN_MIRROR_OWNER(un) &&
4608 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4609 						mirror_overlap_tree_remove(ps);
4610 				}
4611 			}
4612 			mutex_exit(&un->un_owner_mx);
4613 
4614 			/*
4615 			 * If this is the first RESYNC_NEXT message (i.e.
4616 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4617 			 * issue RESYNC_START NOTIFY event
4618 			 */
4619 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4620 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4621 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4622 				    MD_SID(un));
4623 			}
4624 
4625 			/* Ensure that our local resync thread is running */
4626 			if (un->un_rs_thread == NULL) {
4627 				(void) mirror_resync_unit(p->mnum, NULL,
4628 				    &p->mde, lockp);
4629 			}
4630 		}
4631 		break;
4632 	case MD_MN_MSG_RESYNC_FINISH:
4633 		/*
4634 		 * Complete the resync by stopping the resync thread.
4635 		 * Also release the previous overlap region field.
4636 		 * Update the resync_progress_thread by cv_signal'ing it so
4637 		 * that we mark the end of the resync as soon as possible. This
4638 		 * avoids an unnecessary delay should we panic after resync
4639 		 * completion.
4640 		 */
4641 #ifdef DEBUG
4642 		if (!rs_active) {
4643 			if (mirror_debug_flag)
4644 				printf("RESYNC_FINISH (mnum = %x), "
4645 				    "Resync *NOT* active",
4646 				    p->mnum);
4647 		}
4648 #endif
4649 
4650 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4651 		    (p->rs_originator != md_mn_mynode_id)) {
4652 			mutex_enter(&un->un_rs_thread_mx);
4653 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4654 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4655 			un->un_rs_thread_flags &=
4656 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4657 			cv_signal(&un->un_rs_thread_cv);
4658 			mutex_exit(&un->un_rs_thread_mx);
4659 		}
4660 		if (is_ABR) {
4661 			/* Resync finished, if ABR set owner to NULL */
4662 			mutex_enter(&un->un_owner_mx);
4663 			un->un_mirror_owner = 0;
4664 			mutex_exit(&un->un_owner_mx);
4665 		}
4666 		(void) md_ioctl_writerlock(lockp, ui);
4667 		ps = un->un_rs_prev_overlap;
4668 		if (ps != NULL) {
4669 			/* Remove previous overlap range */
4670 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4671 				mirror_overlap_tree_remove(ps);
4672 			/*
4673 			 * Release the overlap range reference
4674 			 */
4675 			un->un_rs_prev_overlap = NULL;
4676 			kmem_cache_free(mirror_parent_cache,
4677 			    ps);
4678 		}
4679 		md_ioctl_writerexit(lockp);
4680 
4681 		/* Mark the resync as complete in the metadb */
4682 		un->un_rs_resync_done = p->rs_done;
4683 		un->un_rs_resync_2_do = p->rs_2_do;
4684 		un->un_rs_type = p->rs_type;
4685 		mutex_enter(&un->un_rs_progress_mx);
4686 		cv_signal(&un->un_rs_progress_cv);
4687 		mutex_exit(&un->un_rs_progress_mx);
4688 
4689 		un = md_ioctl_writerlock(lockp, ui);
4690 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4691 		/* Deal with any pending grow_unit */
4692 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4693 			if ((mirror_grow_unit(un, &mde) != 0) ||
4694 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4695 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4696 			}
4697 		}
4698 		md_ioctl_writerexit(lockp);
4699 		break;
4700 
4701 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4702 		/*
4703 		 * A phase of the resync (optimized, component or
4704 		 * submirror) is complete. Update mirror status.
4705 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4706 		 * mirror owner is performing a resync. If we have just snarfed
4707 		 * this set, then we must clear any of the flags set at snarf
4708 		 * time by unit_setup_resync().
4709 		 * Note that unit_setup_resync() sets up these flags to
4710 		 * indicate that an optimized resync is required. These flags
4711 		 * need to be reset because if we get here, the mirror owner
4712 		 * will have handled the optimized resync.
4713 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4714 		 * MD_UN_WAR. In addition, for each submirror,
4715 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4716 		 * set to SMS_OFFLINE.
4717 		 */
4718 #ifdef DEBUG
4719 		if (mirror_debug_flag)
4720 			printf("phase done mess received from %d, mnum=%x,"
4721 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4722 			    p->rs_type, p->rs_flags);
4723 #endif
4724 		/*
4725 		 * Ignore the message if there is no active resync thread.
4726 		 */
4727 		if (!rs_active)
4728 			break;
4729 
4730 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4731 		switch (RS_TYPE(p->rs_type)) {
4732 		case MD_RS_OPTIMIZED:
4733 			un = md_ioctl_writerlock(lockp, ui);
4734 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4735 				/* If we are originator, just clear rs_type */
4736 				if (p->rs_originator == md_mn_mynode_id) {
4737 					SET_RS_TYPE_NONE(un->un_rs_type);
4738 					md_ioctl_writerexit(lockp);
4739 					break;
4740 				}
4741 				/*
4742 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4743 				 * flags if OPT_NOT_DONE is set *and* rs_type
4744 				 * is MD_RS_NONE.
4745 				 */
4746 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4747 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4748 					/* No resync in progress */
4749 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4750 					un->c.un_status &= ~MD_UN_WAR;
4751 				} else {
4752 					/*
4753 					 * We are in the middle of an
4754 					 * optimized resync and this message
4755 					 * should be ignored.
4756 					 */
4757 					md_ioctl_writerexit(lockp);
4758 					break;
4759 				}
4760 			} else {
4761 				/*
4762 				 * This is the end of an optimized resync,
4763 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4764 				 */
4765 
4766 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4767 				if (!broke_out)
4768 					un->c.un_status &= ~MD_UN_WAR;
4769 			}
4770 
4771 			/*
4772 			 * Set resync_completed to last resync type and then
4773 			 * clear resync_type to indicate no resync in progress
4774 			 */
4775 			un->un_resync_completed = un->un_rs_type;
4776 			SET_RS_TYPE_NONE(un->un_rs_type);
4777 
4778 			/*
4779 			 * If resync is as a result of a submirror ONLINE,
4780 			 * reset the submirror state to SMS_RUNNING if the
4781 			 * resync was ok else set back to SMS_OFFLINE.
4782 			 */
4783 			for (smi = 0; smi < NMIRROR; smi++) {
4784 				un->un_sm[smi].sm_flags &=
4785 				    ~MD_SM_RESYNC_TARGET;
4786 				if (SMS_BY_INDEX_IS(un, smi,
4787 				    SMS_OFFLINE_RESYNC)) {
4788 					if (p->rs_flags &
4789 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4790 						state = SMS_OFFLINE;
4791 					} else {
4792 						state = (broke_out ?
4793 						    SMS_OFFLINE : SMS_RUNNING);
4794 					}
4795 					mirror_set_sm_state(
4796 					    &un->un_sm[smi],
4797 					    &un->un_smic[smi], state,
4798 					    broke_out);
4799 					mirror_commit(un, NO_SUBMIRRORS,
4800 					    0);
4801 				}
4802 				/*
4803 				 * If we still have an offline submirror, reset
4804 				 * the OFFLINE_SM flag in the mirror status
4805 				 */
4806 				if (SMS_BY_INDEX_IS(un, smi,
4807 				    SMS_OFFLINE))
4808 					un->c.un_status |=
4809 					    MD_UN_OFFLINE_SM;
4810 			}
4811 			md_ioctl_writerexit(lockp);
4812 			break;
4813 		case MD_RS_SUBMIRROR:
4814 			un = md_ioctl_writerlock(lockp, ui);
4815 			smi = RS_SMI(p->rs_type);
4816 			sm = &un->un_sm[smi];
4817 			smic = &un->un_smic[smi];
4818 			/* Clear RESYNC target */
4819 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4820 			/*
4821 			 * Set resync_completed to last resync type and then
4822 			 * clear resync_type to indicate no resync in progress
4823 			 */
4824 			un->un_resync_completed = un->un_rs_type;
4825 			SET_RS_TYPE_NONE(un->un_rs_type);
4826 			/*
4827 			 * If the resync completed ok reset the submirror
4828 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
4829 			 */
4830 			state = (broke_out ?
4831 			    SMS_ATTACHED : SMS_RUNNING);
4832 			mirror_set_sm_state(sm, smic, state, broke_out);
4833 			un->c.un_status &= ~MD_UN_WAR;
4834 			mirror_commit(un, SMI2BIT(smi), 0);
4835 			md_ioctl_writerexit(lockp);
4836 			break;
4837 		case MD_RS_COMPONENT:
4838 			un = md_ioctl_writerlock(lockp, ui);
4839 			smi = RS_SMI(p->rs_type);
4840 			ci = RS_CI(p->rs_type);
4841 			sm = &un->un_sm[smi];
4842 			smic = &un->un_smic[smi];
4843 			shared = (md_m_shared_t *)
4844 			    (*(smic->sm_shared_by_indx))
4845 			    (sm->sm_dev, sm, ci);
4846 			un->c.un_status &= ~MD_UN_WAR;
4847 			/* Clear RESYNC target */
4848 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4849 			/*
4850 			 * Set resync_completed to last resync type and then
4851 			 * clear resync_type to indicate no resync in progress
4852 			 */
4853 			un->un_resync_completed = un->un_rs_type;
4854 			SET_RS_TYPE_NONE(un->un_rs_type);
4855 
4856 			/*
4857 			 * If the resync completed ok, set the component state
4858 			 * to CS_OKAY.
4859 			 */
4860 			if (broke_out)
4861 				shared->ms_flags |= MDM_S_RS_TRIED;
4862 			else {
4863 				/*
4864 				 * As we don't transmit the changes,
4865 				 * no need to drop the lock.
4866 				 */
4867 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
4868 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
4869 			}
4870 			md_ioctl_writerexit(lockp);
4871 		default:
4872 			break;
4873 		}
4874 		/*
4875 		 * If the purpose of this PHASE_DONE message is just to
4876 		 * indicate to all other nodes that the optimized resync
4877 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
4878 		 * no need to generate a notify event as there has not
4879 		 * actually been a resync.
4880 		 */
4881 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
4882 			if (broke_out) {
4883 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
4884 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4885 				    MD_SID(un));
4886 			} else {
4887 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
4888 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4889 				    MD_SID(un));
4890 			}
4891 		}
4892 		break;
4893 
4894 	default:
4895 #ifdef DEBUG
4896 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
4897 		    " %x\n", p->msg_type);
4898 #endif
4899 		return (EINVAL);
4900 	}
4901 	return (0);
4902 }
4903 
4904 /* Return a -1 if snarf of optimized record failed and set should be released */
4905 static int
4906 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
4907 {
4908 	mddb_recid_t	recid;
4909 	int		gotsomething;
4910 	int		all_mirrors_gotten;
4911 	mm_unit_t	*un;
4912 	mddb_type_t	typ1;
4913 	mddb_de_ic_t    *dep;
4914 	mddb_rb32_t	*rbp;
4915 	size_t		newreqsize;
4916 	mm_unit_t	*big_un;
4917 	mm_unit32_od_t	*small_un;
4918 	int		retval;
4919 	mdi_unit_t	*ui;
4920 
4921 	if (cmd == MD_SNARF_CLEANUP) {
4922 		if (md_get_setstatus(setno) & MD_SET_STALE)
4923 			return (0);
4924 
4925 		recid = mddb_makerecid(setno, 0);
4926 		typ1 = (mddb_type_t)md_getshared_key(setno,
4927 		    mirror_md_ops.md_driver.md_drivername);
4928 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4929 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
4930 				un = (mm_unit_t *)mddb_getrecaddr(recid);
4931 				mirror_cleanup(un);
4932 				recid = mddb_makerecid(setno, 0);
4933 			}
4934 		}
4935 		return (0);
4936 	}
4937 
4938 	all_mirrors_gotten = 1;
4939 	gotsomething = 0;
4940 
4941 	recid = mddb_makerecid(setno, 0);
4942 	typ1 = (mddb_type_t)md_getshared_key(setno,
4943 	    mirror_md_ops.md_driver.md_drivername);
4944 
4945 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4946 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
4947 			continue;
4948 
4949 		dep = mddb_getrecdep(recid);
4950 		dep->de_flags = MDDB_F_MIRROR;
4951 		rbp = dep->de_rb;
4952 
4953 		switch (rbp->rb_revision) {
4954 		case MDDB_REV_RB:
4955 		case MDDB_REV_RBFN:
4956 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
4957 				/*
4958 				 * This means, we have an old and small
4959 				 * record and this record hasn't already
4960 				 * been converted.  Before we create an
4961 				 * incore metadevice from this we have to
4962 				 * convert it to a big record.
4963 				 */
4964 				small_un =
4965 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
4966 				newreqsize = sizeof (mm_unit_t);
4967 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
4968 				    KM_SLEEP);
4969 				mirror_convert((caddr_t)small_un,
4970 				    (caddr_t)big_un, SMALL_2_BIG);
4971 				kmem_free(small_un, dep->de_reqsize);
4972 
4973 				/*
4974 				 * Update userdata and incore userdata;
4975 				 * incores are at the end of un.
4976 				 */
4977 				dep->de_rb_userdata_ic = big_un;
4978 				dep->de_rb_userdata = big_un;
4979 				dep->de_icreqsize = newreqsize;
4980 				un = big_un;
4981 				rbp->rb_private |= MD_PRV_CONVD;
4982 			} else {
4983 				/*
4984 				 * Unit already converted, just get the
4985 				 * record address.
4986 				 */
4987 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
4988 				    sizeof (*un), 0);
4989 			}
4990 			un->c.un_revision &= ~MD_64BIT_META_DEV;
4991 			break;
4992 		case MDDB_REV_RB64:
4993 		case MDDB_REV_RB64FN:
4994 			/* Big device */
4995 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
4996 			    sizeof (*un), 0);
4997 			un->c.un_revision |= MD_64BIT_META_DEV;
4998 			un->c.un_flag |= MD_EFILABEL;
4999 			break;
5000 		}
5001 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5002 
5003 		/*
5004 		 * Create minor device node for snarfed entry.
5005 		 */
5006 		(void) md_create_minor_node(setno, MD_SID(un));
5007 
5008 		if (MD_UNIT(MD_SID(un)) != NULL) {
5009 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5010 			continue;
5011 		}
5012 		all_mirrors_gotten = 0;
5013 		retval = mirror_build_incore(un, 1);
5014 		if (retval == 0) {
5015 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5016 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5017 			resync_start_timeout(setno);
5018 			gotsomething = 1;
5019 		} else {
5020 			return (retval);
5021 		}
5022 		/*
5023 		 * Set flag to indicate that the mirror has not yet
5024 		 * been through a reconfig. This flag is used for MN sets
5025 		 * when determining whether to update the mirror state from
5026 		 * the Master node.
5027 		 */
5028 		if (MD_MNSET_SETNO(setno)) {
5029 			ui = MDI_UNIT(MD_SID(un));
5030 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5031 		}
5032 	}
5033 
5034 	if (!all_mirrors_gotten)
5035 		return (gotsomething);
5036 
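	/*
	 * All mirrors are now incore; mark any resync records that were
	 * not picked up (MD_PRV_GOTIT not set) for deletion.
	 */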
5037 	recid = mddb_makerecid(setno, 0);
5038 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5039 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5040 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5041 
5042 	return (0);
5043 }
5044 
5045 static int
5046 mirror_halt(md_haltcmd_t cmd, set_t setno)
5047 {
5048 	unit_t		i;
5049 	mdi_unit_t	*ui;
5050 	minor_t		mnum;
5051 	int		reset_mirror_flag = 0;
5052 
5053 	if (cmd == MD_HALT_CLOSE)
5054 		return (0);
5055 
5056 	if (cmd == MD_HALT_OPEN)
5057 		return (0);
5058 
5059 	if (cmd == MD_HALT_UNLOAD)
5060 		return (0);
5061 
5062 	if (cmd == MD_HALT_CHECK) {
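		/*
		 * Refuse the halt if any mirror unit in this set is
		 * currently open.
		 */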
5063 		for (i = 0; i < md_nunits; i++) {
5064 			mnum = MD_MKMIN(setno, i);
5065 			if ((ui = MDI_UNIT(mnum)) == NULL)
5066 				continue;
5067 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5068 				continue;
5069 			if (md_unit_isopen(ui))
5070 				return (1);
5071 		}
5072 		return (0);
5073 	}
5074 
5075 	if (cmd != MD_HALT_DOIT)
5076 		return (1);
5077 
5078 	for (i = 0; i < md_nunits; i++) {
5079 		mnum = MD_MKMIN(setno, i);
5080 		if ((ui = MDI_UNIT(mnum)) == NULL)
5081 			continue;
5082 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5083 			continue;
5084 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5085 
5086 		/* Set a flag if there is at least one mirror metadevice. */
5087 		reset_mirror_flag = 1;
5088 	}
5089 
5090 	/*
5091 	 * Only wait for the global dr_timeout to finish
5092 	 *  - if there are mirror metadevices in this diskset or
5093 	 *  - if this is the local set since an unload of the md_mirror
5094 	 *    driver could follow a successful mirror halt in the local set.
5095 	 */
5096 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5097 		while ((mirror_md_ops.md_head == NULL) &&
5098 		    (mirror_timeout.dr_timeout_id != 0))
5099 			delay(md_hz);
5100 	}
5101 
5102 	return (0);
5103 }
5104 
5105 /*ARGSUSED3*/
5106 static int
5107 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5108 {
5109 	IOLOCK	lock;
5110 	minor_t		mnum = getminor(*dev);
5111 	set_t		setno;
5112 
5113 	/*
5114 	 * When doing an open of a multi owner metadevice, check to see if this
5115 	 * node is a starting node and if a reconfig cycle is underway.
5116 	 * If so, the system isn't sufficiently set up to handle the
5117 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5118 	 */
5119 	setno = MD_MIN2SET(mnum);
5120 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5121 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5122 		return (ENXIO);
5123 	}
5124 
5125 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5126 		/*
5127 		 * This indicates that the caller is an ioctl service routine.
5128 		 * In this case we initialise our stack-based IOLOCK and pass
5129 		 * this into the internal open routine. This allows multi-owner
5130 		 * metadevices to avoid deadlocking if an error is encountered
5131 		 * during the open() attempt. The failure case is:
5132 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5133 		 * this configuration would deadlock as the mirror code has to
5134 		 * send a state-update to the other nodes when it detects the
5135 		 * failure of the underlying submirror with an errored soft-part
5136 		 * on it. As there is a class1 message in progress (metaclear)
5137 		 * set_sm_comp_state() cannot send another class1 message;
5138 		 * instead we do not send a state_update message as the
5139 		 * metaclear is distributed and the failed submirror will be
5140 		 * cleared from the configuration by the metaclear.
5141 		 */
5142 		IOLOCK_INIT(&lock);
5143 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5144 		    md_oflags, &lock));
5145 	} else {
5146 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5147 		    md_oflags, (IOLOCK *)NULL));
5148 	}
5149 }
5150 
5151 
5152 /*ARGSUSED1*/
5153 static int
5154 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5155 {
5156 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5157 	    (IOLOCK *)NULL));
5158 }
5159 
5160 
5161 /*
5162  * This routine dumps memory to the disk.  It assumes that the memory has
5163  * already been mapped into mainbus space.  It is called at disk interrupt
5164  * priority when the system is in trouble.
5165  *
5166  */
5167 static int
5168 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5169 {
5170 	mm_unit_t	*un;
5171 	dev_t		mapdev;
5172 	int		result;
5173 	int		smi;
5174 	int		any_succeed = 0;
5175 	int		save_result = 0;
5176 
5177 	/*
5178 	 * Don't need to grab the unit lock.
5179 	 * Nothing else is supposed to be happening, and dump is not
5180 	 * supposed to sleep.
5181 	 */
5182 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5183 
5184 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5185 		return (EINVAL);
5186 
5187 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5188 		return (EINVAL);
5189 
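	/*
	 * Write the dump image to every writeable submirror. Treat the
	 * dump as successful if at least one submirror accepted it;
	 * otherwise return the last error seen.
	 */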
5190 	for (smi = 0; smi < NMIRROR; smi++) {
5191 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5192 			continue;
5193 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5194 		result = bdev_dump(mapdev, addr, blkno, nblk);
5195 		if (result)
5196 			save_result = result;
5197 
5198 		if (result == 0)
5199 			any_succeed++;
5200 	}
5201 
5202 	if (any_succeed)
5203 		return (0);
5204 
5205 	return (save_result);
5206 }
5207 
5208 /*
5209  * NAME: mirror_probe_dev
5210  *
5211  * DESCRIPTION: force opens every component of a mirror.
5212  *
5213  * On entry the unit writerlock is held
5214  */
5215 static int
5216 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5217 {
5218 	int		i;
5219 	int		smi;
5220 	int		ci;
5221 	mm_unit_t	*un;
5222 	int		md_devopen = 0;
5223 	set_t		setno;
5224 	int		sm_cnt;
5225 	int		sm_unavail_cnt;
5226 
5227 	if (md_unit_isopen(ui))
5228 		md_devopen++;
5229 
5230 	un = MD_UNIT(mnum);
5231 	setno = MD_UN2SET(un);
5232 
5233 	sm_cnt = 0;
5234 	sm_unavail_cnt = 0;
5235 	for (i = 0; i < NMIRROR; i++) {
5236 		md_dev64_t tmpdev;
5237 		mdi_unit_t	*sm_ui;
5238 
5239 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5240 			continue;
5241 		}
5242 
5243 		sm_cnt++;
5244 		tmpdev = un->un_sm[i].sm_dev;
5245 		(void) md_layered_open(mnum, &tmpdev,
5246 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5247 		un->un_sm[i].sm_dev = tmpdev;
5248 
5249 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5250 
5251 		/*
5252 		 * Logic similar to that in mirror_open_all_devs.  We set or
5253 		 * clear the submirror Unavailable bit.
5254 		 */
5255 		(void) md_unit_writerlock(sm_ui);
5256 		if (submirror_unavailable(un, i, 1)) {
5257 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5258 			sm_unavail_cnt++;
5259 		} else {
5260 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5261 		}
5262 		md_unit_writerexit(sm_ui);
5263 	}
5264 
5265 	/*
5266 	 * If all of the submirrors are unavailable, the mirror is also
5267 	 * unavailable.
5268 	 */
5269 	if (sm_cnt == sm_unavail_cnt) {
5270 		ui->ui_tstate |= MD_INACCESSIBLE;
5271 	} else {
5272 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5273 	}
5274 
5275 	/*
5276 	 * Start checking for probe failures. If failures occur we
5277 	 * set the appropriate erred state only if the metadevice is in
5278 	 * use. This is specifically to prevent unnecessary resyncs.
5279 	 * For instance if the disks were accidentally disconnected when
5280 	 * the system booted up then until the metadevice is accessed
5281 	 * (like file system mount) the user can shutdown, recable and
5282 	 * reboot w/o incurring a potentially huge resync.
5283 	 */
5284 
5285 	smi = 0;
5286 	ci = 0;
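	/*
	 * Walk every errored component reported by mirror_geterror() and
	 * move it to CS_ERRED, or to CS_LAST_ERRED when no other source
	 * for the data remains.
	 */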
5287 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5288 
5289 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5290 			/*
5291 			 * Note that for a MN set, there is no need to call
5292 			 * SE_NOTIFY as that is done when processing the
5293 			 * state change
5294 			 */
5295 			if (md_devopen) {
5296 				/*
5297 				 * Never called from ioctl context,
5298 				 * so (IOLOCK *)NULL
5299 				 */
5300 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5301 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5302 				if (!MD_MNSET_SETNO(setno)) {
5303 					SE_NOTIFY(EC_SVM_STATE,
5304 					    ESC_SVM_LASTERRED,
5305 					    SVM_TAG_METADEVICE, setno,
5306 					    MD_SID(un));
5307 				}
5308 				continue;
5309 			} else {
5310 				(void) mirror_close_all_devs(un,
5311 				    MD_OFLG_PROBEDEV);
5312 				if (!MD_MNSET_SETNO(setno)) {
5313 					SE_NOTIFY(EC_SVM_STATE,
5314 					    ESC_SVM_OPEN_FAIL,
5315 					    SVM_TAG_METADEVICE, setno,
5316 					    MD_SID(un));
5317 				}
5318 				mirror_openfail_console_info(un, smi, ci);
5319 				return (ENXIO);
5320 			}
5321 		}
5322 
5323 		/*
5324 		 * Note that for a MN set, there is no need to call
5325 		 * SE_NOTIFY as that is done when processing the
5326 		 * state change
5327 		 */
5328 		if (md_devopen) {
5329 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5330 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5331 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5332 			if (!MD_MNSET_SETNO(setno)) {
5333 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5334 				    SVM_TAG_METADEVICE, setno,
5335 				    MD_SID(un));
5336 			}
5337 		}
5338 		mirror_openfail_console_info(un, smi, ci);
5339 		ci++;
5340 	}
5341 
5342 	if (MD_MNSET_SETNO(setno)) {
5343 		send_poke_hotspares(setno);
5344 	} else {
5345 		(void) poke_hotspares();
5346 	}
5347 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5348 
5349 	return (0);
5350 }
5351 
5352 
5353 static int
5354 mirror_imp_set(
5355 	set_t	setno
5356 )
5357 {
5358 
5359 	mddb_recid_t	recid;
5360 	int		gotsomething, i;
5361 	mddb_type_t	typ1;
5362 	mddb_de_ic_t	*dep;
5363 	mddb_rb32_t	*rbp;
5364 	mm_unit32_od_t	*un32;
5365 	mm_unit_t	*un64;
5366 	md_dev64_t	self_devt;
5367 	minor_t		*self_id;	/* minor needs to be updated */
5368 	md_parent_t	*parent_id;	/* parent needs to be updated */
5369 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5370 	mddb_recid_t	*optrec_id;
5371 	md_dev64_t	tmpdev;
5372 
5373 
5374 	gotsomething = 0;
5375 
5376 	typ1 = (mddb_type_t)md_getshared_key(setno,
5377 	    mirror_md_ops.md_driver.md_drivername);
5378 	recid = mddb_makerecid(setno, 0);
5379 
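	/*
	 * Walk every mirror record being imported and rewrite its submirror
	 * device numbers, self/parent minors and record ids so that they
	 * refer to the set number being imported.
	 */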
5380 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5381 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5382 			continue;
5383 
5384 		dep = mddb_getrecdep(recid);
5385 		rbp = dep->de_rb;
5386 
5387 		switch (rbp->rb_revision) {
5388 		case MDDB_REV_RB:
5389 		case MDDB_REV_RBFN:
5390 			/*
5391 			 * Small device
5392 			 */
5393 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5394 			self_id = &(un32->c.un_self_id);
5395 			parent_id = &(un32->c.un_parent);
5396 			record_id = &(un32->c.un_record_id);
5397 			optrec_id = &(un32->un_rr_dirty_recid);
5398 
5399 			for (i = 0; i < un32->un_nsm; i++) {
5400 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5401 				un32->un_sm[i].sm_dev = md_cmpldev
5402 				    (md_makedevice(md_major, MD_MKMIN(setno,
5403 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5404 
5405 				if (!md_update_minor(setno, mddb_getsidenum
5406 				    (setno), un32->un_sm[i].sm_key))
5407 					goto out;
5408 			}
5409 			break;
5410 		case MDDB_REV_RB64:
5411 		case MDDB_REV_RB64FN:
5412 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5413 			self_id = &(un64->c.un_self_id);
5414 			parent_id = &(un64->c.un_parent);
5415 			record_id = &(un64->c.un_record_id);
5416 			optrec_id = &(un64->un_rr_dirty_recid);
5417 
5418 			for (i = 0; i < un64->un_nsm; i++) {
5419 				tmpdev = un64->un_sm[i].sm_dev;
5420 				un64->un_sm[i].sm_dev = md_makedevice
5421 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5422 				    (md_getminor(tmpdev))));
5423 
5424 				if (!md_update_minor(setno, mddb_getsidenum
5425 				    (setno), un64->un_sm[i].sm_key))
5426 					goto out;
5427 			}
5428 			break;
5429 		}
5430 
5431 		/*
5432 		 * If this is a top level and a friendly name metadevice,
5433 		 * update its minor in the namespace.
5434 		 */
5435 		if ((*parent_id == MD_NO_PARENT) &&
5436 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5437 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5438 
5439 			self_devt = md_makedevice(md_major, *self_id);
5440 			if (!md_update_top_device_minor(setno,
5441 			    mddb_getsidenum(setno), self_devt))
5442 				goto out;
5443 		}
5444 
5445 		/*
5446 		 * Update unit with the imported setno
5447 		 *
5448 		 */
5449 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5450 
5451 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5452 		if (*parent_id != MD_NO_PARENT)
5453 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5454 		*record_id = MAKERECID(setno, DBID(*record_id));
5455 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5456 
5457 		gotsomething = 1;
5458 	}
5459 
5460 out:
5461 	return (gotsomething);
5462 }
5463 
5464 /*
5465  * NAME: mirror_check_offline
5466  *
5467  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5468  *
5469  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5470  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5471  * ioctl.
5472  */
5473 int
5474 mirror_check_offline(md_dev64_t dev, int *offline_status)
5475 {
5476 	mm_unit_t		*un;
5477 	md_error_t		mde = mdnullerror;
5478 
5479 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5480 		return (EINVAL);
5481 	*offline_status = 0;
5482 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5483 		*offline_status = 1;
5484 	return (0);
5485 }
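
/*
 * Usage sketch (editorial addition, not original source): a caller checks
 * the return value before trusting the out parameter, e.g.:
 *
 *	int	offline = 0;
 *
 *	if (mirror_check_offline(dev, &offline) == 0 && offline)
 *		... at least one submirror is offline ...
 *
 * A non-zero return (EINVAL) means the minor number did not resolve to a
 * mirror unit.
 */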
5486 
5487 /*
5488  * NAME: mirror_inc_abr_count
5489  *
5490  * DESCRIPTION: increment the count of layered soft parts with ABR set
5491  *
5492  * Called from ioctl context, so access to un_abr_count is protected by the
5493  * global ioctl lock; the count is only referenced in the MD_IOCOFFLINE ioctl.
5494  */
5495 int
5496 mirror_inc_abr_count(md_dev64_t dev)
5497 {
5498 	mm_unit_t		*un;
5499 	md_error_t		mde = mdnullerror;
5500 
5501 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5502 		return (EINVAL);
5503 	un->un_abr_count++;
5504 	return (0);
5505 }
5506 
5507 /*
5508  * NAME: mirror_dec_abr_count
5509  *
5510  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5511  *
5512  * Called from ioctl context, so access to un_abr_count is protected by the
5513  * global ioctl lock; the count is only referenced in the MD_IOCOFFLINE ioctl.
5514  */
5515 int
5516 mirror_dec_abr_count(md_dev64_t dev)
5517 {
5518 	mm_unit_t		*un;
5519 	md_error_t		mde = mdnullerror;
5520 
5521 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5522 		return (EINVAL);
5523 	un->un_abr_count--;
5524 	return (0);
5525 }
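
/*
 * Editorial note: mirror_inc_abr_count() and mirror_dec_abr_count() are
 * assumed to be kept balanced by the layered consumer (the driver that owns
 * the soft parts with ABR set), so that un_abr_count always reflects the
 * current number of such soft parts: one mirror_inc_abr_count(dev) call when
 * an ABR soft part starts using the mirror and one mirror_dec_abr_count(dev)
 * call when it stops.  Both return EINVAL if the minor number is not a
 * mirror unit.
 */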
5526 
5527 static md_named_services_t mirror_named_services[] = {
5528 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5529 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5530 	{mirror_rename_check,				MDRNM_CHECK	    },
5531 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5532 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5533 			MDRNM_PARENT_UPDATE_TO},
5534 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5535 			MDRNM_SELF_UPDATE_FROM_DOWN },
5536 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5537 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5538 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5539 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5540 	{ NULL,						0		    }
5541 };
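
/*
 * Editorial sketch: layered md modules are expected to resolve the services
 * above by name rather than by calling this module directly.  Assuming the
 * md_get_named_service() lookup helper used elsewhere in SVM (its exact
 * signature is not shown here and should be checked against mdvar.h), the
 * lookup would be roughly:
 *
 *	intptr_t	(*inc_abr)();
 *
 *	inc_abr = md_get_named_service(mirror_dev, 0, MD_INC_ABR_COUNT, 0);
 *	if (inc_abr != NULL)
 *		(void) (*inc_abr)(mirror_dev);
 *
 * where mirror_dev is the md_dev64_t of the underlying mirror.
 */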
5542 
5543 md_ops_t mirror_md_ops = {
5544 	mirror_open,		/* open */
5545 	mirror_close,		/* close */
5546 	md_mirror_strategy,	/* strategy */
5547 	NULL,			/* print */
5548 	mirror_dump,		/* dump */
5549 	NULL,			/* read */
5550 	NULL,			/* write */
5551 	md_mirror_ioctl,	/* ioctl */
5552 	mirror_snarf,		/* snarf */
5553 	mirror_halt,		/* halt */
5554 	NULL,			/* aread */
5555 	NULL,			/* awrite */
5556 	mirror_imp_set,		/* import set */
5557 	mirror_named_services
5558 };
5559 
5560 /* module-specific initialization */
5561 static void
5562 init_init()
5563 {
5564 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5565 
5566 	/* Initialize the parent and child save memory pools */
5567 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5568 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5569 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5570 	    0);
5571 
5572 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5573 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5574 	    mirror_child_constructor, mirror_child_destructor,
5575 	    mirror_run_queue, NULL, NULL, 0);
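
	/*
	 * Editorial note: the child cache object size uses biosize() rather
	 * than sizeof (buf_t) on the assumption that the buf embedded at the
	 * end of md_mcs_t must be as large as a fully initialized system
	 * buf; md_mirror_mcs_buf_off (set above) records the offset of that
	 * embedded buf within the structure.
	 */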
5576 
5577 	/*
5578 	 * Ensure md_wowbuf_size is a multiple of DEV_BSIZE and within sane
5579 	 * bounds, then initialize the wowbuf memory pool.
5580 	 */
5581 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5582 	if (md_wowbuf_size <= 0)
5583 		md_wowbuf_size = 2 * DEV_BSIZE;
5584 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5585 		md_wowbuf_size = (32 * DEV_BSIZE);
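
	/*
	 * Worked example (assuming DEV_BSIZE is 512): a tuned md_wowbuf_size
	 * of 5000 rounds up to 5120 and is accepted; any value above
	 * 32 * DEV_BSIZE (16384) is clamped to that maximum.
	 */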
5586 
5587 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5588 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5589 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5590 
5591 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5592 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5593 
5594 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5595 }
5596 
5597 /* module-specific uninitialization (undo init_init()) */
5598 static void
5599 fini_uninit()
5600 {
5601 	kmem_cache_destroy(mirror_parent_cache);
5602 	kmem_cache_destroy(mirror_child_cache);
5603 	kmem_cache_destroy(mirror_wowblk_cache);
5604 	mirror_parent_cache = mirror_child_cache =
5605 	    mirror_wowblk_cache = NULL;
5606 
5607 	mutex_destroy(&mirror_timeout.dr_mx);
5608 	mutex_destroy(&hotspare_request.dr_mx);
5609 	mutex_destroy(&non_ff_drv_mutex);
5610 }
5611 
5612 /* define the module linkage */
5613 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5614