xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 6901:307e592cef33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/conf.h>
31 #include <sys/file.h>
32 #include <sys/user.h>
33 #include <sys/uio.h>
34 #include <sys/t_lock.h>
35 #include <sys/buf.h>
36 #include <sys/dkio.h>
37 #include <sys/vtoc.h>
38 #include <sys/kmem.h>
39 #include <vm/page.h>
40 #include <sys/cmn_err.h>
41 #include <sys/sysmacros.h>
42 #include <sys/types.h>
43 #include <sys/mkdev.h>
44 #include <sys/stat.h>
45 #include <sys/open.h>
46 #include <sys/modctl.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/dklabel.h>
51 #include <vm/hat.h>
52 #include <sys/lvm/mdvar.h>
53 #include <sys/lvm/md_mirror.h>
54 #include <sys/lvm/md_convert.h>
55 #include <sys/lvm/md_mddb.h>
56 #include <sys/esunddi.h>
57 
58 #include <sys/sysevent/eventdefs.h>
59 #include <sys/sysevent/svm.h>
60 #include <sys/lvm/mdmn_commd.h>
61 #include <sys/avl.h>
62 
63 md_ops_t		mirror_md_ops;
64 #ifndef	lint
65 char			_depends_on[] = "drv/md";
66 md_ops_t		*md_interface_ops = &mirror_md_ops;
67 #endif
68 
69 extern mdq_anchor_t	md_done_daemon;
70 extern mdq_anchor_t	md_mstr_daemon;
71 extern mdq_anchor_t	md_mirror_daemon;
72 extern mdq_anchor_t	md_mirror_io_daemon;
73 extern mdq_anchor_t	md_mirror_rs_daemon;
74 extern mdq_anchor_t	md_mhs_daemon;
75 
76 extern unit_t		md_nunits;
77 extern set_t		md_nsets;
78 extern md_set_t		md_set[];
79 
80 extern int		md_status;
81 extern clock_t		md_hz;
82 
83 extern md_krwlock_t	md_unit_array_rw;
84 extern kmutex_t		md_mx;
85 extern kcondvar_t	md_cv;
86 extern int		md_mtioctl_cnt;
87 
88 daemon_request_t	mirror_timeout;
89 static daemon_request_t	hotspare_request;
90 static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */
91 
92 int	md_mirror_mcs_buf_off;
93 
94 /* Flags for mdmn_ksend_message to allow debugging */
95 int	md_mirror_msg_flags;
96 
97 #ifdef DEBUG
98 /* Flag to switch on debug messages */
99 int	mirror_debug_flag = 0;
100 #endif
101 
102 /*
103  * Struct used to hold count of DMR reads and the timestamp of last DMR read
104  * It is used to verify, using a debugger, that the DMR read ioctl has been
105  * executed.
106  */
107 dmr_stats_t	mirror_dmr_stats = {0, 0};
108 
109 /*
110  * Mutex protecting list of non-failfast drivers.
111  */
112 static kmutex_t	non_ff_drv_mutex;
113 extern char	**non_ff_drivers;
114 
115 extern major_t	md_major;
116 
117 /*
118  * Write-On-Write memory pool.
119  */
120 static void		copy_write_cont(wowhdr_t *wowhdr);
121 static kmem_cache_t	*mirror_wowblk_cache = NULL;
122 static int		md_wowbuf_size = 16384;
123 static size_t		md_wowblk_size;
124 
125 /*
126  * This is a flag that allows:
127  *	- disabling the write-on-write mechanism.
128  *	- logging occurrences of write-on-write
129  *	- switching wow handling procedure processing
130  * Counter for occurences of WOW.
131  */
132 static uint_t	md_mirror_wow_flg = 0;
133 static int	md_mirror_wow_cnt = 0;
134 
135 /*
136  * Tunable to enable/disable dirty region
137  * processing when closing down a mirror.
138  */
139 static int	new_resync = 1;
140 kmem_cache_t	*mirror_parent_cache = NULL;
141 kmem_cache_t	*mirror_child_cache = NULL;
142 
143 extern int	md_ff_disable;		/* disable failfast */
144 
145 static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
146 static void	mirror_read_strategy(buf_t *, int, void *);
147 static void	mirror_write_strategy(buf_t *, int, void *);
148 static void	become_owner(daemon_queue_t *);
149 static int	mirror_done(struct buf *cb);
150 static int	mirror_done_common(struct buf *cb);
151 static void	clear_retry_error(struct buf *cb);
152 
153 /*
154  * patchables
155  */
156 int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
157 int	md_def_num_rr	= 1000;	/* Default number of dirty regions */
158 
159 /*
160  * patchable to change delay before rescheduling mirror ownership request.
161  * Value is clock ticks, default 0.5 seconds
162  */
163 clock_t	md_mirror_owner_to = 500000;
164 
165 /*ARGSUSED1*/
166 static int
167 mirror_parent_constructor(void *p, void *d1, int d2)
168 {
169 	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
170 	return (0);
171 }
172 
173 static void
174 mirror_parent_init(md_mps_t *ps)
175 {
176 	bzero(ps, offsetof(md_mps_t, ps_mx));
177 }
178 
179 /*ARGSUSED1*/
180 static void
181 mirror_parent_destructor(void *p, void *d)
182 {
183 	mutex_destroy(&((md_mps_t *)p)->ps_mx);
184 }
185 
186 /*ARGSUSED1*/
187 static int
188 mirror_child_constructor(void *p, void *d1, int d2)
189 {
190 	bioinit(&((md_mcs_t *)p)->cs_buf);
191 	return (0);
192 }
193 
194 void
195 mirror_child_init(md_mcs_t *cs)
196 {
197 	cs->cs_ps = NULL;
198 	cs->cs_mdunit = 0;
199 	md_bioreset(&cs->cs_buf);
200 }
201 
202 /*ARGSUSED1*/
203 static void
204 mirror_child_destructor(void *p, void *d)
205 {
206 	biofini(&((md_mcs_t *)p)->cs_buf);
207 }
208 
209 static void
210 mirror_wowblk_init(wowhdr_t *p)
211 {
212 	bzero(p, md_wowblk_size);
213 }
214 
215 static void
216 send_poke_hotspares_msg(daemon_request_t *drq)
217 {
218 	int			rval;
219 	md_mn_msg_pokehsp_t	pokehsp;
220 	md_mn_kresult_t		*kresult;
221 	set_t			setno = (set_t)drq->dq.qlen;
222 
223 	pokehsp.pokehsp_setno = setno;
224 
225 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
226 	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
227 	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
228 	    sizeof (pokehsp), kresult);
229 
230 	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
231 		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
232 		cmn_err(CE_PANIC,
233 		    "ksend_message failure: POKE_HOTSPARES");
234 	}
235 	kmem_free(kresult, sizeof (md_mn_kresult_t));
236 
237 	/* Allow further requests to use this set's queue structure */
238 	mutex_enter(&drq->dr_mx);
239 	drq->dr_pending = 0;
240 	mutex_exit(&drq->dr_mx);
241 }
242 
243 /*
244  * Send a poke_hotspares message to the master node. To avoid swamping the
245  * commd handler with requests we only send a message if there is not one
246  * already outstanding. We punt the request to a separate thread context as
247  * cannot afford to block waiting on the request to be serviced. This is
248  * essential when a reconfig cycle is in progress as any open() of a multinode
249  * metadevice may result in a livelock.
250  */
251 static void
252 send_poke_hotspares(set_t setno)
253 {
254 	daemon_request_t	*drq = &mn_hs_request[setno];
255 
256 	mutex_enter(&drq->dr_mx);
257 	if (drq->dr_pending == 0) {
258 		drq->dr_pending = 1;
259 		drq->dq.qlen = (int)setno;
260 		daemon_request(&md_mhs_daemon,
261 		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
262 	}
263 	mutex_exit(&drq->dr_mx);
264 }
265 
266 void
267 mirror_set_sm_state(
268 	mm_submirror_t		*sm,
269 	mm_submirror_ic_t	*smic,
270 	sm_state_t		newstate,
271 	int			force)
272 {
273 	int			compcnt;
274 	int			i;
275 	int			errcnt;
276 	sm_state_t		origstate;
277 	md_m_shared_t		*shared;
278 
279 	if (force) {
280 		sm->sm_state = newstate;
281 		uniqtime32(&sm->sm_timestamp);
282 		return;
283 	}
284 
285 	origstate = newstate;
286 
287 	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
288 	for (i = 0, errcnt = 0; i < compcnt; i++) {
289 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
290 		    (sm->sm_dev, sm, i);
291 		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
292 			newstate |= SMS_COMP_ERRED;
293 		if (shared->ms_state & (CS_RESYNC))
294 			newstate |= SMS_COMP_RESYNC;
295 		if (shared->ms_state & CS_ERRED)
296 			errcnt++;
297 	}
298 
299 	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
300 		newstate &= ~origstate;
301 
302 	if (errcnt == compcnt)
303 		newstate |= SMS_ALL_ERRED;
304 	else
305 		newstate &= ~SMS_ALL_ERRED;
306 
307 	sm->sm_state = newstate;
308 	uniqtime32(&sm->sm_timestamp);
309 }
310 
311 static int
312 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
313 							int frm_probe)
314 {
315 	mm_submirror_t		*sm;
316 	mm_submirror_ic_t	*smic;
317 	md_m_shared_t		*shared;
318 	int			ci;
319 	int			i;
320 	int			compcnt;
321 	int			open_comp; /* flag for open component */
322 
323 	for (i = *smi; i < NMIRROR; i++) {
324 		sm = &un->un_sm[i];
325 		smic = &un->un_smic[i];
326 
327 		if (!SMS_IS(sm, SMS_INUSE))
328 			continue;
329 
330 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
331 		for (ci = *cip; ci < compcnt; ci++) {
332 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
333 			    (sm->sm_dev, sm, ci);
334 			/*
335 			 * if called from any routine but probe, we check for
336 			 * MDM_S_ISOPEN flag. Since probe does a pseduo open,
337 			 * it sets MDM_S_PROBEOPEN flag and we test for this
338 			 * flag. They are both exclusive tests.
339 			 */
340 			open_comp = (frm_probe) ?
341 			    (shared->ms_flags & MDM_S_PROBEOPEN):
342 			    (shared->ms_flags & MDM_S_ISOPEN);
343 			if ((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
344 			    ((shared->ms_state == CS_OKAY) ||
345 			    (shared->ms_state == CS_RESYNC))) {
346 				if (clr_error) {
347 					shared->ms_flags &= ~MDM_S_IOERR;
348 				}
349 				*cip = ci;
350 				*smi = i;
351 				return (1);
352 			}
353 
354 			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
355 				shared->ms_flags &= ~MDM_S_IOERR;
356 			}
357 		}
358 
359 		*cip = 0;
360 	}
361 	return (0);
362 }
363 
364 /*ARGSUSED*/
365 static void
366 mirror_run_queue(void *d)
367 {
368 	if (!(md_status & MD_GBL_DAEMONS_LIVE))
369 		md_daemon(1, &md_done_daemon);
370 }
371 /*
372  * check_comp_4_hotspares
373  *
374  * This function attempts to allocate a hotspare for this component if the
375  * component is in error. In a MN set, the function can be called in 2 modes.
376  * It can be called either when a component error has been detected or when a
377  * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
378  * in flags and the request is sent to all nodes.
379  * The handler on each of the nodes then calls this function with
380  * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
381  *
382  * For non-MN sets the function simply attempts to allocate a hotspare.
383  *
384  * On entry, the following locks are held
385  *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
386  *	md_unit_writerlock
387  *
388  * Returns	0 if ok
389  *		1 if the unit containing the component has been cleared while
390  *		  the mdmn_ksend_message() was being executed
391  */
392 extern int
393 check_comp_4_hotspares(
394 	mm_unit_t	*un,
395 	int		smi,
396 	int		ci,
397 	uint_t		flags,
398 	mddb_recid_t	hs_id,	/* Only used by MN disksets */
399 	IOLOCK		*lockp	/* can be NULL */
400 )
401 {
402 	mm_submirror_t		*sm;
403 	mm_submirror_ic_t	*smic;
404 	md_m_shared_t		*shared;
405 	mddb_recid_t		recids[6];
406 	minor_t			mnum;
407 	intptr_t		(*hs_dev)();
408 	void			(*hs_done)();
409 	void			*hs_data;
410 	md_error_t		mde = mdnullerror;
411 	set_t			setno;
412 	md_mn_msg_allochsp_t	allochspmsg;
413 	md_mn_kresult_t		*kresult;
414 	mm_unit_t		*new_un;
415 	int			rval;
416 
417 	mnum = MD_SID(un);
418 	setno = MD_UN2SET(un);
419 	sm = &un->un_sm[smi];
420 	smic = &un->un_smic[smi];
421 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
422 	    (sm->sm_dev, sm, ci);
423 
424 	if (shared->ms_state != CS_ERRED)
425 		return (0);
426 
427 	/* Don't start a new component resync if a resync is already running. */
428 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
429 		return (0);
430 
431 	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
432 		uint_t		msgflags;
433 		md_mn_msgtype_t	msgtype;
434 
435 		/* Send allocate hotspare message to all nodes */
436 
437 		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
438 		allochspmsg.msg_allochsp_sm = smi;
439 		allochspmsg.msg_allochsp_comp = ci;
440 		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
441 
442 		/*
443 		 * Before calling mdmn_ksend_message(), release locks
444 		 * Can never be in the context of an ioctl.
445 		 */
446 		md_unit_writerexit(MDI_UNIT(mnum));
447 		if (flags & MD_HOTSPARE_LINKHELD)
448 			rw_exit(&mirror_md_ops.md_link_rw.lock);
449 #ifdef DEBUG
450 		if (mirror_debug_flag)
451 			printf("send alloc hotspare, flags="
452 			    "0x%x %x, %x, %x, %x\n", flags,
453 			    allochspmsg.msg_allochsp_mnum,
454 			    allochspmsg.msg_allochsp_sm,
455 			    allochspmsg.msg_allochsp_comp,
456 			    allochspmsg.msg_allochsp_hs_id);
457 #endif
458 		if (flags & MD_HOTSPARE_WMUPDATE) {
459 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
460 			/*
461 			 * When coming from an update of watermarks, there
462 			 * must already be a message logged that triggered
463 			 * this action. So, no need to log this message, too.
464 			 */
465 			msgflags = MD_MSGF_NO_LOG;
466 		} else {
467 			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
468 			msgflags = MD_MSGF_DEFAULT_FLAGS;
469 		}
470 
471 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
472 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
473 		    (char *)&allochspmsg, sizeof (allochspmsg),
474 		    kresult);
475 
476 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
477 #ifdef DEBUG
478 			if (mirror_debug_flag)
479 				mdmn_ksend_show_error(rval, kresult,
480 				    "ALLOCATE HOTSPARE");
481 #endif
482 			/*
483 			 * If message is sent ok but exitval indicates an error
484 			 * it must be because the mirror has been cleared. In
485 			 * this case re-obtain lock and return an error
486 			 */
487 			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
488 				if (flags & MD_HOTSPARE_LINKHELD) {
489 					rw_enter(&mirror_md_ops.md_link_rw.lock,
490 					    RW_READER);
491 				}
492 				kmem_free(kresult, sizeof (md_mn_kresult_t));
493 				return (1);
494 			}
495 			cmn_err(CE_PANIC,
496 			    "ksend_message failure: ALLOCATE_HOTSPARE");
497 		}
498 		kmem_free(kresult, sizeof (md_mn_kresult_t));
499 
500 		/*
501 		 * re-obtain the locks
502 		 */
503 		if (flags & MD_HOTSPARE_LINKHELD)
504 			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
505 		new_un = md_unit_writerlock(MDI_UNIT(mnum));
506 
507 		/*
508 		 * As we had to release the locks in order to send the
509 		 * message to all nodes, we need to check to see if the
510 		 * unit has changed. If it has we release the writerlock
511 		 * and return fail.
512 		 */
513 		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
514 			md_unit_writerexit(MDI_UNIT(mnum));
515 			return (1);
516 		}
517 	} else {
518 		if (MD_MNSET_SETNO(setno)) {
519 			/*
520 			 * If 2 or more nodes simultaneously see a
521 			 * component failure, these nodes will each
522 			 * send an ALLOCATE_HOTSPARE[2] message.
523 			 * The first message will allocate the hotspare
524 			 * and the subsequent messages should do nothing.
525 			 *
526 			 * If a slave node doesn't have a hotspare allocated
527 			 * at the time the message is initiated, then the
528 			 * passed in hs_id will be 0.  If the node
529 			 * executing this routine has a component shared
530 			 * ms_hs_id of non-zero, but the message shows a
531 			 * hs_id of 0, then just return since a hotspare
532 			 * has already been allocated for this failing
533 			 * component.  When the slave node returns from
534 			 * the ksend_message the hotspare will have
535 			 * already been allocated.
536 			 *
537 			 * If the slave node does send an hs_id of non-zero,
538 			 * and the slave node's hs_id matches this node's
539 			 * ms_hs_id, then the hotspare has error'd and
540 			 * should be replaced.
541 			 *
542 			 * If the slave node sends an hs_id of non-zero and
543 			 * this node has a different shared ms_hs_id, then
544 			 * just return since this hotspare has already
545 			 * been hotspared.
546 			 */
547 			if (shared->ms_hs_id != 0) {
548 				if (hs_id == 0) {
549 #ifdef DEBUG
550 					if (mirror_debug_flag) {
551 						printf("check_comp_4_hotspares"
552 						    "(NOXMIT), short circuit "
553 						    "hs_id=0x%x, "
554 						    "ms_hs_id=0x%x\n",
555 						    hs_id, shared->ms_hs_id);
556 					}
557 #endif
558 					return (0);
559 				}
560 				if (hs_id != shared->ms_hs_id) {
561 #ifdef DEBUG
562 					if (mirror_debug_flag) {
563 						printf("check_comp_4_hotspares"
564 						    "(NOXMIT), short circuit2 "
565 						    "hs_id=0x%x, "
566 						    "ms_hs_id=0x%x\n",
567 						    hs_id, shared->ms_hs_id);
568 					}
569 #endif
570 					return (0);
571 				}
572 			}
573 		}
574 
575 		sm = &un->un_sm[smi];
576 		hs_dev = md_get_named_service(sm->sm_dev, 0,
577 		    "hotspare device", 0);
578 		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
579 		    &hs_data) != 0)
580 			return (0);
581 
582 		/*
583 		 * set_sm_comp_state() commits the modified records.
584 		 * As we don't transmit the changes, no need to drop the lock.
585 		 */
586 		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
587 		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
588 
589 		(*hs_done)(sm->sm_dev, hs_data);
590 
591 		mirror_check_failfast(mnum);
592 
593 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
594 		    setno, MD_SID(un));
595 
596 		/*
597 		 * For a multi-node set we need to reset the un_rs_type,
598 		 * un_rs_resync_done and un_rs_resync_2_do fields as the
599 		 * hot-spare resync must copy all applicable data.
600 		 */
601 		if (MD_MNSET_SETNO(setno)) {
602 			un->un_rs_type = MD_RS_NONE;
603 			un->un_rs_resync_done = 0;
604 			un->un_rs_resync_2_do = 0;
605 		}
606 
607 		/*
608 		 * Must drop writer lock since mirror_resync_unit will
609 		 * open devices and must be able to grab readerlock.
610 		 * Don't need to drop IOLOCK since any descendent routines
611 		 * calling ksend_messages will drop the IOLOCK as needed.
612 		 *
613 		 */
614 		if (lockp) {
615 			md_ioctl_writerexit(lockp);
616 		} else {
617 			md_unit_writerexit(MDI_UNIT(mnum));
618 		}
619 
620 		/* start resync */
621 		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);
622 
623 		if (lockp) {
624 			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
625 		} else {
626 			new_un = md_unit_writerlock(MDI_UNIT(mnum));
627 		}
628 	}
629 	return (0);
630 }
631 
632 /*
633  * check_unit_4_hotspares
634  *
635  * For a given mirror, allocate hotspares, if available for any components
636  * that are in error
637  *
638  * Returns	0 if ok
639  *		1 if check_comp_4_hotspares returns non-zero. This will only
640  *		  happen for a MN unit where the unit has been cleared while
641  *		  the allocate hotspare message is sent to all nodes.
642  */
643 static int
644 check_unit_4_hotspares(mm_unit_t *un, int flags)
645 {
646 	mm_submirror_t		*sm;
647 	mm_submirror_ic_t	*smic;
648 	int			ci;
649 	int			i;
650 	int			compcnt;
651 
652 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
653 		return (0);
654 
655 	for (i = 0; i < NMIRROR; i++) {
656 		sm = &un->un_sm[i];
657 		smic = &un->un_smic[i];
658 		if (!SMS_IS(sm, SMS_INUSE))
659 			continue;
660 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
661 		for (ci = 0; ci < compcnt; ci++) {
662 			md_m_shared_t		*shared;
663 
664 			shared = (md_m_shared_t *)
665 			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
666 			/*
667 			 * Never called from ioctl context, so pass in
668 			 * (IOLOCK *)NULL.  Pass through flags from calling
669 			 * routine, also setting XMIT flag.
670 			 */
671 			if (check_comp_4_hotspares(un, i, ci,
672 			    (MD_HOTSPARE_XMIT | flags),
673 			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
674 				return (1);
675 		}
676 	}
677 	return (0);
678 }
679 
680 static void
681 check_4_hotspares(daemon_request_t *drq)
682 {
683 	mdi_unit_t	*ui;
684 	mm_unit_t	*un;
685 	md_link_t	*next;
686 	int		x;
687 
688 	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
689 	drq->dr_pending = 0;		/* again in low level routine if */
690 	mutex_exit(&drq->dr_mx);	/* something found to do	*/
691 
692 	/*
693 	 * Used to have a problem here. The disksets weren't marked as being
694 	 * MNHOLD. This opened a window where we could be searching for
695 	 * hotspares and have the disk set unloaded (released) from under
696 	 * us causing a panic in stripe_component_count().
697 	 * The way to prevent that is to mark the set MNHOLD which prevents
698 	 * any diskset from being released while we are scanning the mirrors,
699 	 * submirrors and components.
700 	 */
701 
702 	for (x = 0; x < md_nsets; x++)
703 		md_holdset_enter(x);
704 
705 	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
706 	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
707 		ui = MDI_UNIT(next->ln_id);
708 
709 		un = (mm_unit_t *)md_unit_readerlock(ui);
710 
711 		/*
712 		 * Only check the unit if we are the master for this set
713 		 * For an MN set, poke_hotspares() is only effective on the
714 		 * master
715 		 */
716 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
717 		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
718 			md_unit_readerexit(ui);
719 			continue;
720 		}
721 		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
722 			md_unit_readerexit(ui);
723 			continue;
724 		}
725 		md_unit_readerexit(ui);
726 
727 		un = (mm_unit_t *)md_unit_writerlock(ui);
728 		/*
729 		 * check_unit_4_hotspares will exit 1 if the unit has been
730 		 * removed during the process of allocating the hotspare.
731 		 * This can only happen for a MN metadevice. If unit no longer
732 		 * exists, no need to release writerlock
733 		 */
734 		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
735 			md_unit_writerexit(ui);
736 		else {
737 			/*
738 			 * If check_unit_4_hotspares failed, queue another
739 			 * request and break out of this one
740 			 */
741 			(void) poke_hotspares();
742 			break;
743 		}
744 	}
745 	rw_exit(&mirror_md_ops.md_link_rw.lock);
746 
747 	for (x = 0; x < md_nsets; x++)
748 		md_holdset_exit(x);
749 }
750 
751 /*
752  * poke_hotspares
753  *
754  * If there is not a pending poke_hotspares request pending, queue a requent
755  * to call check_4_hotspares(). This will scan all mirrors and attempt to
756  * allocate hotspares for all components in error.
757  */
758 int
759 poke_hotspares()
760 {
761 	mutex_enter(&hotspare_request.dr_mx);
762 	if (hotspare_request.dr_pending == 0) {
763 		hotspare_request.dr_pending = 1;
764 		daemon_request(&md_mhs_daemon,
765 		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
766 		    REQ_OLD);
767 	}
768 	mutex_exit(&hotspare_request.dr_mx);
769 	return (0);
770 }
771 
772 static void
773 free_all_ecomps(err_comp_t *ecomp)
774 {
775 	err_comp_t	*d;
776 
777 	while (ecomp != NULL) {
778 		d = ecomp;
779 		ecomp = ecomp->ec_next;
780 		kmem_free(d, sizeof (err_comp_t));
781 	}
782 }
783 
784 /*
785  * NAME: mirror_openfail_console_info
786  *
787  * DESCRIPTION: Prints a informative message to the console when mirror
788  *		cannot be opened.
789  *
790  * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
791  *	       int		smi - submirror index
792  *	       int		ci - component index
793  */
794 
795 void
796 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
797 {
798 	void (*get_dev)();
799 	ms_cd_info_t cd;
800 	md_dev64_t tmpdev;
801 
802 	tmpdev = un->un_sm[smi].sm_dev;
803 	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
804 	if (get_dev != NULL) {
805 		(void) (*get_dev)(tmpdev, smi, ci, &cd);
806 		cmn_err(CE_WARN, "md %s: open error on %s",
807 		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
808 		    cd.cd_dev, NULL, 0));
809 	} else {
810 		cmn_err(CE_WARN, "md %s: open error",
811 		    md_shortname(MD_SID(un)));
812 	}
813 }
814 
815 static int
816 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
817 {
818 	int i;
819 	md_dev64_t dev;
820 
821 	for (i = 0; i < NMIRROR; i++) {
822 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
823 			continue;
824 		dev = un->un_sm[i].sm_dev;
825 		md_layered_close(dev, md_cflags);
826 	}
827 	return (0);
828 }
829 
830 /*
831  * Keep track of drivers that don't support failfast.  We use this so that
832  * we only log one diagnostic message for each of these drivers, no matter
833  * how many times we run the mirror_check_failfast function.
834  * Return 1 if this is a new driver that does not support failfast,
835  * return 0 if we have already seen this non-failfast driver.
836  */
837 static int
838 new_non_ff_driver(const char *s)
839 {
840 	mutex_enter(&non_ff_drv_mutex);
841 	if (non_ff_drivers == NULL) {
842 		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
843 		    KM_NOSLEEP);
844 		if (non_ff_drivers == NULL) {
845 			mutex_exit(&non_ff_drv_mutex);
846 			return (1);
847 		}
848 
849 		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
850 		    KM_NOSLEEP);
851 		if (non_ff_drivers[0] == NULL) {
852 			kmem_free(non_ff_drivers, 2 * sizeof (char *));
853 			non_ff_drivers = NULL;
854 			mutex_exit(&non_ff_drv_mutex);
855 			return (1);
856 		}
857 
858 		(void) strcpy(non_ff_drivers[0], s);
859 		non_ff_drivers[1] = NULL;
860 
861 	} else {
862 		int i;
863 		char **tnames;
864 		char **tmp;
865 
866 		for (i = 0; non_ff_drivers[i] != NULL; i++) {
867 			if (strcmp(s, non_ff_drivers[i]) == 0) {
868 				mutex_exit(&non_ff_drv_mutex);
869 				return (0);
870 			}
871 		}
872 
873 		/* allow for new element and null */
874 		i += 2;
875 		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
876 		if (tnames == NULL) {
877 			mutex_exit(&non_ff_drv_mutex);
878 			return (1);
879 		}
880 
881 		for (i = 0; non_ff_drivers[i] != NULL; i++)
882 			tnames[i] = non_ff_drivers[i];
883 
884 		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
885 		if (tnames[i] == NULL) {
886 			/* adjust i so that it is the right count to free */
887 			kmem_free(tnames, (i + 2) * sizeof (char *));
888 			mutex_exit(&non_ff_drv_mutex);
889 			return (1);
890 		}
891 
892 		(void) strcpy(tnames[i++], s);
893 		tnames[i] = NULL;
894 
895 		tmp = non_ff_drivers;
896 		non_ff_drivers = tnames;
897 		/* i now represents the count we previously alloced */
898 		kmem_free(tmp, i * sizeof (char *));
899 	}
900 	mutex_exit(&non_ff_drv_mutex);
901 
902 	return (1);
903 }
904 
905 /*
906  * Check for the "ddi-failfast-supported" devtree property on each submirror
907  * component to indicate if we should do I/O to that submirror with the
908  * B_FAILFAST flag set or not.  This check is made at various state transitions
909  * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
910  * only need to check one drive (e.g. hotspare) but since the check is
911  * fast and infrequent and sometimes needs to be done on all components we
912  * just check all components on each call.
913  */
914 void
915 mirror_check_failfast(minor_t mnum)
916 {
917 	int		i;
918 	mm_unit_t	*un;
919 
920 	if (md_ff_disable)
921 		return;
922 
923 	un = MD_UNIT(mnum);
924 
925 	for (i = 0; i < NMIRROR; i++) {
926 		int			ci;
927 		int			cnt;
928 		int			ff = 1;
929 		mm_submirror_t		*sm;
930 		mm_submirror_ic_t	*smic;
931 		void			(*get_dev)();
932 
933 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
934 			continue;
935 
936 		sm = &un->un_sm[i];
937 		smic = &un->un_smic[i];
938 
939 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
940 		    "get device", 0);
941 
942 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
943 		for (ci = 0; ci < cnt; ci++) {
944 			int		found = 0;
945 			dev_t		ci_dev;
946 			major_t		major;
947 			dev_info_t	*devi;
948 			ms_cd_info_t	cd;
949 
950 			/*
951 			 * this already returns the hs
952 			 * dev if the device is spared
953 			 */
954 			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
955 
956 			ci_dev = md_dev64_to_dev(cd.cd_dev);
957 			major = getmajor(ci_dev);
958 
959 			if (major == md_major) {
960 				/*
961 				 * this component must be a soft
962 				 * partition; get the real dev
963 				 */
964 				minor_t	dev_mnum;
965 				mdi_unit_t	*ui;
966 				mp_unit_t	*un;
967 				set_t	setno;
968 				side_t	side;
969 				md_dev64_t	tmpdev;
970 
971 				ui = MDI_UNIT(getminor(ci_dev));
972 
973 				/* grab necessary lock */
974 				un = (mp_unit_t *)md_unit_readerlock(ui);
975 
976 				dev_mnum = MD_SID(un);
977 				setno = MD_MIN2SET(dev_mnum);
978 				side = mddb_getsidenum(setno);
979 
980 				tmpdev = un->un_dev;
981 
982 				/* Get dev by device id */
983 				if (md_devid_found(setno, side,
984 				    un->un_key) == 1) {
985 					tmpdev = md_resolve_bydevid(dev_mnum,
986 					    tmpdev, un->un_key);
987 				}
988 
989 				md_unit_readerexit(ui);
990 
991 				ci_dev = md_dev64_to_dev(tmpdev);
992 				major = getmajor(ci_dev);
993 			}
994 
995 			if (ci_dev != NODEV32 &&
996 			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
997 			    != NULL) {
998 				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
999 				int		propvalue = 0;
1000 				int		proplength = sizeof (int);
1001 				int		error;
1002 				struct cb_ops	*cb;
1003 
1004 				if ((cb = devopsp[major]->devo_cb_ops) !=
1005 				    NULL) {
1006 					error = (*cb->cb_prop_op)
1007 					    (DDI_DEV_T_ANY, devi, prop_op,
1008 					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1009 					    "ddi-failfast-supported",
1010 					    (caddr_t)&propvalue, &proplength);
1011 
1012 					if (error == DDI_PROP_SUCCESS)
1013 						found = 1;
1014 				}
1015 
1016 				if (!found && new_non_ff_driver(
1017 				    ddi_driver_name(devi))) {
1018 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O"
1019 					    "disabled on %s",
1020 					    ddi_driver_name(devi));
1021 				}
1022 
1023 				ddi_release_devi(devi);
1024 			}
1025 
1026 			/*
1027 			 * All components must support
1028 			 * failfast in the submirror.
1029 			 */
1030 			if (!found) {
1031 				ff = 0;
1032 				break;
1033 			}
1034 		}
1035 
1036 		if (ff) {
1037 			sm->sm_flags |= MD_SM_FAILFAST;
1038 		} else {
1039 			sm->sm_flags &= ~MD_SM_FAILFAST;
1040 		}
1041 	}
1042 }
1043 
1044 /*
1045  * Return true if the submirror is unavailable.
1046  * If any of the submirror components are opened then the submirror cannot
1047  * be unavailable (MD_INACCESSIBLE).
1048  * If any of the components are already in the errored state, then the submirror
1049  * cannot be unavailable (MD_INACCESSIBLE).
1050  */
1051 static bool_t
1052 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1053 {
1054 	mm_submirror_t		*sm;
1055 	mm_submirror_ic_t	*smic;
1056 	md_m_shared_t		*shared;
1057 	int			ci;
1058 	int			compcnt;
1059 
1060 	sm = &un->un_sm[smi];
1061 	smic = &un->un_smic[smi];
1062 
1063 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1064 	for (ci = 0; ci < compcnt; ci++) {
1065 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1066 		    (sm->sm_dev, sm, ci);
1067 		if (from_probe) {
1068 			if (shared->ms_flags & MDM_S_PROBEOPEN)
1069 				return (B_FALSE);
1070 		} else {
1071 			if (shared->ms_flags & MDM_S_ISOPEN)
1072 				return (B_FALSE);
1073 		}
1074 		if (shared->ms_state == CS_ERRED ||
1075 		    shared->ms_state == CS_LAST_ERRED)
1076 			return (B_FALSE);
1077 	}
1078 
1079 	return (B_TRUE);
1080 }
1081 
1082 static int
1083 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1084 {
1085 	int		i;
1086 	mm_unit_t	*un;
1087 	mdi_unit_t	*ui;
1088 	int		err;
1089 	int		smi;
1090 	int		ci;
1091 	err_comp_t	*c;
1092 	err_comp_t	*ecomps = NULL;
1093 	int		smmask = 0;
1094 	set_t		setno;
1095 	int		sm_cnt;
1096 	int		sm_unavail_cnt;
1097 
1098 	mirror_check_failfast(mnum);
1099 
1100 	un = MD_UNIT(mnum);
1101 	ui = MDI_UNIT(mnum);
1102 	setno = MD_UN2SET(un);
1103 
1104 	for (i = 0; i < NMIRROR; i++) {
1105 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1106 
1107 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1108 			continue;
1109 		if (md_layered_open(mnum, &tmpdev, md_oflags))
1110 			smmask |= SMI2BIT(i);
1111 		un->un_sm[i].sm_dev = tmpdev;
1112 	}
1113 
1114 	/*
1115 	 * If smmask is clear, all submirrors are accessible. Clear the
1116 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
1117 	 * mirror device.   If smmask is set, we have to determine which of the
1118 	 * submirrors are in error. If no submirror is accessible we mark the
1119 	 * whole mirror as MD_INACCESSIBLE.
1120 	 */
1121 	if (smmask == 0) {
1122 		if (lockp) {
1123 			md_ioctl_readerexit(lockp);
1124 			(void) md_ioctl_writerlock(lockp, ui);
1125 		} else {
1126 			md_unit_readerexit(ui);
1127 			(void) md_unit_writerlock(ui);
1128 		}
1129 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1130 		if (lockp) {
1131 			md_ioctl_writerexit(lockp);
1132 			(void) md_ioctl_readerlock(lockp, ui);
1133 		} else {
1134 			md_unit_writerexit(ui);
1135 			(void) md_unit_readerlock(ui);
1136 		}
1137 
1138 		for (i = 0; i < NMIRROR; i++) {
1139 			md_dev64_t	tmpdev;
1140 			mdi_unit_t	*sm_ui;
1141 
1142 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1143 				continue;
1144 
1145 			tmpdev = un->un_sm[i].sm_dev;
1146 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1147 			(void) md_unit_writerlock(sm_ui);
1148 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1149 			md_unit_writerexit(sm_ui);
1150 		}
1151 
1152 		return (0);
1153 	}
1154 
1155 	for (i = 0; i < NMIRROR; i++) {
1156 		md_dev64_t tmpdev;
1157 
1158 		if (!(smmask & SMI2BIT(i)))
1159 			continue;
1160 
1161 		tmpdev = un->un_sm[i].sm_dev;
1162 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1163 		un->un_sm[i].sm_dev = tmpdev;
1164 		ASSERT(err == 0);
1165 	}
1166 
1167 	if (lockp) {
1168 		md_ioctl_readerexit(lockp);
1169 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1170 	} else {
1171 		md_unit_readerexit(ui);
1172 		un = (mm_unit_t *)md_unit_writerlock(ui);
1173 	}
1174 
1175 	/*
1176 	 * We want to make sure the unavailable flag is not masking a real
1177 	 * error on the submirror.
1178 	 * For each submirror,
1179 	 *    if all of the submirror components couldn't be opened and there
1180 	 *    are no errors on the submirror, then set the unavailable flag
1181 	 *    otherwise, clear unavailable.
1182 	 */
1183 	sm_cnt = 0;
1184 	sm_unavail_cnt = 0;
1185 	for (i = 0; i < NMIRROR; i++) {
1186 		md_dev64_t	tmpdev;
1187 		mdi_unit_t	*sm_ui;
1188 
1189 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1190 			continue;
1191 
1192 		sm_cnt++;
1193 		tmpdev = un->un_sm[i].sm_dev;
1194 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1195 
1196 		(void) md_unit_writerlock(sm_ui);
1197 		if (submirror_unavailable(un, i, 0)) {
1198 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
1199 			sm_unavail_cnt++;
1200 		} else {
1201 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1202 		}
1203 		md_unit_writerexit(sm_ui);
1204 	}
1205 
1206 	/*
1207 	 * If all of the submirrors are unavailable, the mirror is also
1208 	 * unavailable.
1209 	 */
1210 	if (sm_cnt == sm_unavail_cnt) {
1211 		ui->ui_tstate |= MD_INACCESSIBLE;
1212 	} else {
1213 		ui->ui_tstate &= ~MD_INACCESSIBLE;
1214 	}
1215 
1216 	smi = 0;
1217 	ci = 0;
1218 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1219 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
1220 
1221 			free_all_ecomps(ecomps);
1222 			(void) mirror_close_all_devs(un, md_oflags);
1223 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1224 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1225 			mirror_openfail_console_info(un, smi, ci);
1226 			if (lockp) {
1227 				md_ioctl_writerexit(lockp);
1228 				(void) md_ioctl_readerlock(lockp, ui);
1229 			} else {
1230 				md_unit_writerexit(ui);
1231 				(void) md_unit_readerlock(ui);
1232 			}
1233 			return (ENXIO);
1234 		}
1235 
1236 		/* track all component states that need changing */
1237 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1238 		c->ec_next = ecomps;
1239 		c->ec_smi = smi;
1240 		c->ec_ci = ci;
1241 		ecomps = c;
1242 		ci++;
1243 	}
1244 
1245 	/* Make all state changes and commit them */
1246 	for (c = ecomps; c != NULL; c = c->ec_next) {
1247 		/*
1248 		 * If lockp is set, then entering kernel through ioctl.
1249 		 * For a MN set, the only ioctl path is via a commd message
1250 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1251 		 * being sent to each node.
1252 		 * In this case, set NO_XMIT so that set_sm_comp_state
1253 		 * won't attempt to send a message on a message.
1254 		 *
1255 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1256 		 * which flag is passed.
1257 		 */
1258 		if (lockp) {
1259 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1260 			    MD_STATE_NO_XMIT, lockp);
1261 		} else {
1262 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1263 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1264 		}
1265 		/*
1266 		 * For a MN set, the NOTIFY is done when the state change is
1267 		 * processed on each node
1268 		 */
1269 		if (!MD_MNSET_SETNO(setno)) {
1270 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1271 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
1272 		}
1273 	}
1274 
1275 	if (lockp) {
1276 		md_ioctl_writerexit(lockp);
1277 		(void) md_ioctl_readerlock(lockp, ui);
1278 	} else {
1279 		md_unit_writerexit(ui);
1280 		(void) md_unit_readerlock(ui);
1281 	}
1282 
1283 	free_all_ecomps(ecomps);
1284 
1285 	/* allocate hotspares for all errored components */
1286 	if (MD_MNSET_SETNO(setno)) {
1287 		/*
1288 		 * If we're called from an ioctl (lockp set) then we cannot
1289 		 * directly call send_poke_hotspares as this will block until
1290 		 * the message gets despatched to all nodes. If the cluster is
1291 		 * going through a reconfig cycle then the message will block
1292 		 * until the cycle is complete, and as we originate from a
1293 		 * service call from commd we will livelock.
1294 		 */
1295 		if (lockp == NULL) {
1296 			md_unit_readerexit(ui);
1297 			send_poke_hotspares(setno);
1298 			(void) md_unit_readerlock(ui);
1299 		}
1300 	} else {
1301 		(void) poke_hotspares();
1302 	}
1303 	return (0);
1304 }
1305 
1306 void
1307 mirror_overlap_tree_remove(md_mps_t *ps)
1308 {
1309 	mm_unit_t	*un;
1310 
1311 	if (panicstr)
1312 		return;
1313 
1314 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1315 	un = ps->ps_un;
1316 
1317 	mutex_enter(&un->un_overlap_tree_mx);
1318 	avl_remove(&un->un_overlap_root, ps);
1319 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1320 	if (un->un_overlap_tree_flag != 0) {
1321 		un->un_overlap_tree_flag = 0;
1322 		cv_broadcast(&un->un_overlap_tree_cv);
1323 	}
1324 	mutex_exit(&un->un_overlap_tree_mx);
1325 }
1326 
1327 
1328 /*
1329  * wait_for_overlaps:
1330  * -----------------
1331  * Check that given i/o request does not cause an overlap with already pending
1332  * i/o. If it does, block until the overlapped i/o completes.
1333  *
1334  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1335  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1336  * it must not already be in the tree.
1337  */
1338 static void
1339 wait_for_overlaps(md_mps_t *ps, int flags)
1340 {
1341 	mm_unit_t	*un;
1342 	avl_index_t	where;
1343 	md_mps_t	*ps1;
1344 
1345 	if (panicstr)
1346 		return;
1347 
1348 	un = ps->ps_un;
1349 	mutex_enter(&un->un_overlap_tree_mx);
1350 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1351 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1352 		mutex_exit(&un->un_overlap_tree_mx);
1353 		return;
1354 	}
1355 
1356 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1357 
1358 	do {
1359 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
1360 		if (ps1 == NULL) {
1361 			/*
1362 			 * The candidate range does not overlap with any
1363 			 * range in the tree.  Insert it and be done.
1364 			 */
1365 			avl_insert(&un->un_overlap_root, ps, where);
1366 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
1367 		} else {
1368 			/*
1369 			 * The candidate range would overlap.  Set the flag
1370 			 * indicating we need to be woken up, and sleep
1371 			 * until another thread removes a range.  If upon
1372 			 * waking up we find this mps was put on the tree
1373 			 * by another thread, the loop terminates.
1374 			 */
1375 			un->un_overlap_tree_flag = 1;
1376 			cv_wait(&un->un_overlap_tree_cv,
1377 			    &un->un_overlap_tree_mx);
1378 		}
1379 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1380 	mutex_exit(&un->un_overlap_tree_mx);
1381 }
1382 
1383 /*
1384  * This function is called from mirror_done to check whether any pages have
1385  * been modified while a mirrored write was in progress.  Returns 0 if
1386  * all pages associated with bp are clean, 1 otherwise.
1387  */
1388 static int
1389 any_pages_dirty(struct buf *bp)
1390 {
1391 	int	rval;
1392 
1393 	rval = biomodified(bp);
1394 	if (rval == -1)
1395 		rval = 0;
1396 
1397 	return (rval);
1398 }
1399 
1400 #define	MAX_EXTRAS 10
1401 
1402 void
1403 mirror_commit(
1404 	mm_unit_t	*un,
1405 	int		smmask,
1406 	mddb_recid_t	*extras
1407 )
1408 {
1409 	mm_submirror_t		*sm;
1410 	md_unit_t		*su;
1411 	int			i;
1412 
1413 	/* 2=mirror,null id */
1414 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
1415 
1416 	int			ri = 0;
1417 
1418 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1419 		return;
1420 
1421 	/* Add two, this includes the mirror unit and the null recid */
1422 	if (extras != NULL) {
1423 		int	nrecids = 0;
1424 		while (extras[nrecids] != 0) {
1425 			nrecids++;
1426 		}
1427 		ASSERT(nrecids <= MAX_EXTRAS);
1428 	}
1429 
1430 	if (un != NULL)
1431 		recids[ri++] = un->c.un_record_id;
1432 	for (i = 0;  i < NMIRROR; i++) {
1433 		if (!(smmask & SMI2BIT(i)))
1434 			continue;
1435 		sm = &un->un_sm[i];
1436 		if (!SMS_IS(sm, SMS_INUSE))
1437 			continue;
1438 		if (md_getmajor(sm->sm_dev) != md_major)
1439 			continue;
1440 		su =  MD_UNIT(md_getminor(sm->sm_dev));
1441 		recids[ri++] = su->c.un_record_id;
1442 	}
1443 
1444 	if (extras != NULL)
1445 		while (*extras != 0) {
1446 			recids[ri++] = *extras;
1447 			extras++;
1448 		}
1449 
1450 	if (ri == 0)
1451 		return;
1452 	recids[ri] = 0;
1453 
1454 	/*
1455 	 * Ok to hold ioctl lock across record commit to mddb as
1456 	 * long as the record(s) being committed aren't resync records.
1457 	 */
1458 	mddb_commitrecs_wrapper(recids);
1459 }
1460 
1461 
1462 /*
1463  * This routine is used to set a bit in the writable_bm bitmap
1464  * which represents each submirror in a metamirror which
1465  * is writable. The first writable submirror index is assigned
1466  * to the sm_index.  The number of writable submirrors are returned in nunits.
1467  *
1468  * This routine returns the submirror's unit number.
1469  */
1470 
1471 static void
1472 select_write_units(struct mm_unit *un, md_mps_t *ps)
1473 {
1474 
1475 	int		i;
1476 	unsigned	writable_bm = 0;
1477 	unsigned	nunits = 0;
1478 
1479 	for (i = 0; i < NMIRROR; i++) {
1480 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1481 			/* set bit of all writable units */
1482 			writable_bm |= SMI2BIT(i);
1483 			nunits++;
1484 		}
1485 	}
1486 	ps->ps_writable_sm = writable_bm;
1487 	ps->ps_active_cnt = nunits;
1488 	ps->ps_current_sm = 0;
1489 }
1490 
1491 static
1492 unsigned
1493 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1494 {
1495 
1496 	int		i;
1497 	unsigned	writable_bm = 0;
1498 	unsigned	nunits = 0;
1499 
1500 	for (i = 0; i < NMIRROR; i++) {
1501 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1502 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1503 			writable_bm |= SMI2BIT(i);
1504 			nunits++;
1505 		}
1506 	}
1507 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1508 		writable_bm &= ~ps->ps_allfrom_sm;
1509 		nunits--;
1510 	}
1511 	ps->ps_writable_sm = writable_bm;
1512 	ps->ps_active_cnt = nunits;
1513 	ps->ps_current_sm = 0;
1514 	return (nunits);
1515 }
1516 
1517 static md_dev64_t
1518 select_read_unit(
1519 	mm_unit_t	*un,
1520 	diskaddr_t	blkno,
1521 	u_longlong_t	reqcount,
1522 	u_longlong_t	*cando,
1523 	int		must_be_opened,
1524 	md_m_shared_t	**shared,
1525 	md_mcs_t	*cs)
1526 {
1527 	int			i;
1528 	md_m_shared_t		*s;
1529 	uint_t			lasterrcnt = 0;
1530 	md_dev64_t		dev = 0;
1531 	u_longlong_t		cnt;
1532 	u_longlong_t		mincnt;
1533 	mm_submirror_t		*sm;
1534 	mm_submirror_ic_t	*smic;
1535 	mdi_unit_t		*ui;
1536 
1537 	mincnt = reqcount;
1538 	for (i = 0; i < NMIRROR; i++) {
1539 		if (!SUBMIRROR_IS_READABLE(un, i))
1540 			continue;
1541 		sm = &un->un_sm[i];
1542 		smic = &un->un_smic[i];
1543 		cnt = reqcount;
1544 
1545 		/*
1546 		 * If the current submirror is marked as inaccessible, do not
1547 		 * try to access it.
1548 		 */
1549 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1550 		(void) md_unit_readerlock(ui);
1551 		if (ui->ui_tstate & MD_INACCESSIBLE) {
1552 			md_unit_readerexit(ui);
1553 			continue;
1554 		}
1555 		md_unit_readerexit(ui);
1556 
1557 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1558 		    (sm->sm_dev, sm, blkno, &cnt);
1559 
1560 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1561 			continue;
1562 		if (s->ms_state == CS_OKAY) {
1563 			*cando = cnt;
1564 			if (shared != NULL)
1565 				*shared = s;
1566 
1567 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1568 			    cs != NULL) {
1569 				cs->cs_buf.b_flags |= B_FAILFAST;
1570 			}
1571 
1572 			return (un->un_sm[i].sm_dev);
1573 		}
1574 		if (s->ms_state != CS_LAST_ERRED)
1575 			continue;
1576 
1577 		/* don't use B_FAILFAST since we're Last Erred */
1578 
1579 		if (mincnt > cnt)
1580 			mincnt = cnt;
1581 		if (s->ms_lasterrcnt > lasterrcnt) {
1582 			lasterrcnt = s->ms_lasterrcnt;
1583 			if (shared != NULL)
1584 				*shared = s;
1585 			dev = un->un_sm[i].sm_dev;
1586 		}
1587 	}
1588 	*cando = mincnt;
1589 	return (dev);
1590 }
1591 
1592 /*
1593  * Given a 32-bit bitmap, this routine will return the bit number
1594  * of the nth bit set.	The nth bit set is passed via the index integer.
1595  *
1596  * This routine is used to run through the writable submirror bitmap
1597  * and starting all of the writes.  See the value returned is the
1598  * index to appropriate submirror structure, in the md_sm
1599  * array for metamirrors.
1600  */
1601 static int
1602 md_find_nth_unit(uint_t mask, int index)
1603 {
1604 	int	bit, nfound;
1605 
1606 	for (bit = -1, nfound = -1; nfound != index; bit++) {
1607 		ASSERT(mask != 0);
1608 		nfound += (mask & 1);
1609 		mask >>= 1;
1610 	}
1611 	return (bit);
1612 }
1613 
1614 static int
1615 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1616 {
1617 	mm_unit_t	*un;
1618 	buf_t		*bp;
1619 	int		i;
1620 	unsigned	nunits = 0;
1621 	int		iunit;
1622 	uint_t		running_bm = 0;
1623 	uint_t		sm_index;
1624 
1625 	bp = &cs->cs_buf;
1626 	un = ps->ps_un;
1627 
1628 	for (i = 0; i < NMIRROR; i++) {
1629 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1630 			continue;
1631 		running_bm |= SMI2BIT(i);
1632 		nunits++;
1633 	}
1634 	if (nunits == 0)
1635 		return (1);
1636 
1637 	/*
1638 	 * For directed mirror read (DMR) we only use the specified side and
1639 	 * do not compute the source of the read.
1640 	 */
1641 	if (ps->ps_flags & MD_MPS_DMR) {
1642 		sm_index = un->un_dmr_last_read;
1643 	} else {
1644 		/* Normal (non-DMR) operation */
1645 		switch (un->un_read_option) {
1646 		case RD_GEOMETRY:
1647 			iunit = (int)(bp->b_lblkno /
1648 			    howmany(un->c.un_total_blocks, nunits));
1649 			sm_index = md_find_nth_unit(running_bm, iunit);
1650 			break;
1651 		case RD_FIRST:
1652 			sm_index = md_find_nth_unit(running_bm, 0);
1653 			break;
1654 		case RD_LOAD_BAL:
1655 			/* this is intentional to fall into the default */
1656 		default:
1657 			un->un_last_read = (un->un_last_read + 1) % nunits;
1658 			sm_index = md_find_nth_unit(running_bm,
1659 			    un->un_last_read);
1660 			break;
1661 		}
1662 	}
1663 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1664 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
1665 
1666 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1667 		bp->b_flags |= B_FAILFAST;
1668 	}
1669 
1670 	return (0);
1671 }
1672 
1673 static
1674 int
1675 mirror_are_submirrors_available(mm_unit_t *un)
1676 {
1677 	int i;
1678 	for (i = 0; i < NMIRROR; i++) {
1679 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1680 
1681 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1682 		    md_getmajor(tmpdev) != md_major)
1683 			continue;
1684 
1685 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1686 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1687 			return (0);
1688 
1689 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1690 			return (0);
1691 	}
1692 	return (1);
1693 }
1694 
1695 void
1696 build_submirror(mm_unit_t *un, int i, int snarfing)
1697 {
1698 	struct mm_submirror	*sm;
1699 	struct mm_submirror_ic	*smic;
1700 	md_unit_t		*su;
1701 	set_t			setno;
1702 
1703 	sm = &un->un_sm[i];
1704 	smic = &un->un_smic[i];
1705 
1706 	sm->sm_flags = 0; /* sometime we may need to do more here */
1707 
1708 	setno = MD_UN2SET(un);
1709 
1710 	if (!SMS_IS(sm, SMS_INUSE))
1711 		return;
1712 	if (snarfing) {
1713 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1714 		    sm->sm_key, MD_NOTRUST_DEVT);
1715 	} else {
1716 		if (md_getmajor(sm->sm_dev) == md_major) {
1717 			su = MD_UNIT(md_getminor(sm->sm_dev));
1718 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1719 			/* submirror can no longer be soft partitioned */
1720 			MD_CAPAB(su) &= (~MD_CAN_SP);
1721 		}
1722 	}
1723 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1724 	    0, "shared by blk", 0);
1725 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1726 	    0, "shared by indx", 0);
1727 	smic->sm_get_component_count = (int (*)())md_get_named_service(
1728 	    sm->sm_dev, 0, "get component count", 0);
1729 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1730 	    "get block count skip size", 0);
1731 	sm->sm_state &= ~SMS_IGNORE;
1732 	if (SMS_IS(sm, SMS_OFFLINE))
1733 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1734 	md_set_parent(sm->sm_dev, MD_SID(un));
1735 }
1736 
1737 static void
1738 mirror_cleanup(mm_unit_t *un)
1739 {
1740 	mddb_recid_t	recid;
1741 	int		smi;
1742 	sv_dev_t	sv[NMIRROR];
1743 	int		nsv = 0;
1744 
1745 	/*
1746 	 * If a MN diskset and this node is not the master, do
1747 	 * not delete any records on snarf of the mirror records.
1748 	 */
1749 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1750 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1751 		return;
1752 	}
1753 
1754 	for (smi = 0; smi < NMIRROR; smi++) {
1755 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1756 			continue;
1757 		sv[nsv].setno = MD_UN2SET(un);
1758 		sv[nsv++].key = un->un_sm[smi].sm_key;
1759 	}
1760 
1761 	recid = un->un_rr_dirty_recid;
1762 	mddb_deleterec_wrapper(un->c.un_record_id);
1763 	if (recid > 0)
1764 		mddb_deleterec_wrapper(recid);
1765 
1766 	md_rem_names(sv, nsv);
1767 }
1768 
1769 /*
1770  * Comparison function for the avl tree which tracks
1771  * outstanding writes on submirrors.
1772  *
1773  * Returns:
1774  *	-1: ps1 < ps2
1775  *	 0: ps1 and ps2 overlap
1776  *	 1: ps1 > ps2
1777  */
1778 static int
1779 mirror_overlap_compare(const void *p1, const void *p2)
1780 {
1781 	const md_mps_t *ps1 = (md_mps_t *)p1;
1782 	const md_mps_t *ps2 = (md_mps_t *)p2;
1783 
1784 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
1785 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
1786 			return (0);
1787 		return (-1);
1788 	}
1789 
1790 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
1791 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
1792 			return (0);
1793 		return (1);
1794 	}
1795 
1796 	return (0);
1797 }
1798 
1799 /* Return a -1 if optimized record unavailable and set should be released */
1800 int
1801 mirror_build_incore(mm_unit_t *un, int snarfing)
1802 {
1803 	int		i;
1804 
1805 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1806 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1807 		return (1);
1808 	}
1809 
1810 	if (mirror_are_submirrors_available(un) == 0)
1811 		return (1);
1812 
1813 	if (MD_UNIT(MD_SID(un)) != NULL)
1814 		return (0);
1815 
1816 	MD_STATUS(un) = 0;
1817 
1818 	/* pre-4.1 didn't define CAN_META_CHILD capability */
1819 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1820 
1821 	un->un_overlap_tree_flag = 0;
1822 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
1823 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1824 
1825 	for (i = 0; i < NMIRROR; i++)
1826 		build_submirror(un, i, snarfing);
1827 
1828 	if (unit_setup_resync(un, snarfing) != 0) {
1829 		if (snarfing) {
1830 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1831 			/*
1832 			 * If a MN set and set is not stale, then return -1
1833 			 * which will force the caller to unload the set.
1834 			 * The MN diskset nodes will return failure if
1835 			 * unit_setup_resync fails so that nodes won't
1836 			 * get out of sync.
1837 			 *
1838 			 * If set is STALE, the master node can't allocate
1839 			 * a resync record (if needed), but node needs to
1840 			 * join the set so that user can delete broken mddbs.
1841 			 * So, if set is STALE, just continue on.
1842 			 */
1843 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1844 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1845 				return (-1);
1846 			}
1847 		} else
1848 			return (1);
1849 	}
1850 
1851 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1852 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1853 
1854 	un->un_suspend_wr_flag = 0;
1855 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1856 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1857 
1858 	/*
1859 	 * Allocate mutexes for mirror-owner and resync-owner changes.
1860 	 * All references to the owner message state field must be guarded
1861 	 * by this mutex.
1862 	 */
1863 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1864 
1865 	/*
1866 	 * Allocate mutex and condvar for resync thread manipulation. These
1867 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
1868 	 */
1869 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
1870 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
1871 
1872 	/*
1873 	 * Allocate mutex and condvar for resync progress thread manipulation.
1874 	 * This allows resyncs to be continued across an intervening reboot.
1875 	 */
1876 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
1877 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
1878 
1879 	/*
1880 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
1881 	 * provides synchronization between a user-ioctl and the resulting
1882 	 * strategy() call that performs the read().
1883 	 */
1884 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
1885 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
1886 
1887 	MD_UNIT(MD_SID(un)) = un;
1888 	return (0);
1889 }
1890 
1891 
1892 void
1893 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
1894 {
1895 	mddb_recid_t	recid, vtoc_id;
1896 	size_t		bitcnt;
1897 	size_t		shortcnt;
1898 	int		smi;
1899 	sv_dev_t	sv[NMIRROR];
1900 	int		nsv = 0;
1901 	uint_t		bits = 0;
1902 	minor_t		selfid;
1903 	md_unit_t	*su;
1904 
1905 	md_destroy_unit_incore(mnum, &mirror_md_ops);
1906 
1907 	shortcnt = un->un_rrd_num * sizeof (short);
1908 	bitcnt = howmany(un->un_rrd_num, NBBY);
1909 
1910 	if (un->un_outstanding_writes)
1911 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
1912 	if (un->un_goingclean_bm)
1913 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
1914 	if (un->un_goingdirty_bm)
1915 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
1916 	if (un->un_resync_bm)
1917 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
1918 
1919 	MD_UNIT(mnum) = NULL;
1920 
1921 	/*
1922 	 * Attempt release of its minor node
1923 	 */
1924 	md_remove_minor_node(mnum);
1925 
1926 	if (!removing)
1927 		return;
1928 
1929 	for (smi = 0; smi < NMIRROR; smi++) {
1930 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1931 			continue;
1932 		/* reallow soft partitioning of submirror and reset parent */
1933 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
1934 		MD_CAPAB(su) |= MD_CAN_SP;
1935 		md_reset_parent(un->un_sm[smi].sm_dev);
1936 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
1937 
1938 		sv[nsv].setno = MD_MIN2SET(mnum);
1939 		sv[nsv++].key = un->un_sm[smi].sm_key;
1940 		bits |= SMI2BIT(smi);
1941 	}
1942 
1943 	MD_STATUS(un) |= MD_UN_BEING_RESET;
1944 	recid = un->un_rr_dirty_recid;
1945 	vtoc_id = un->c.un_vtoc_id;
1946 	selfid = MD_SID(un);
1947 
1948 	mirror_commit(un, bits, 0);
1949 
1950 	avl_destroy(&un->un_overlap_root);
1951 
1952 	/* Destroy all mutexes and condvars before returning. */
1953 	mutex_destroy(&un->un_suspend_wr_mx);
1954 	cv_destroy(&un->un_suspend_wr_cv);
1955 	mutex_destroy(&un->un_overlap_tree_mx);
1956 	cv_destroy(&un->un_overlap_tree_cv);
1957 	mutex_destroy(&un->un_owner_mx);
1958 	mutex_destroy(&un->un_rs_thread_mx);
1959 	cv_destroy(&un->un_rs_thread_cv);
1960 	mutex_destroy(&un->un_rs_progress_mx);
1961 	cv_destroy(&un->un_rs_progress_cv);
1962 	mutex_destroy(&un->un_dmr_mx);
1963 	cv_destroy(&un->un_dmr_cv);
1964 
1965 	/*
1966 	 * Remove self from the namespace
1967 	 */
1968 	if (un->c.un_revision & MD_FN_META_DEV) {
1969 		(void) md_rem_selfname(un->c.un_self_id);
1970 	}
1971 
1972 	mddb_deleterec_wrapper(un->c.un_record_id);
1973 	if (recid != 0)
1974 		mddb_deleterec_wrapper(recid);
1975 
1976 	/* Remove the vtoc, if present */
1977 	if (vtoc_id)
1978 		mddb_deleterec_wrapper(vtoc_id);
1979 
1980 	md_rem_names(sv, nsv);
1981 
1982 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
1983 	    MD_MIN2SET(selfid), selfid);
1984 }
1985 
1986 int
1987 mirror_internal_open(
1988 	minor_t		mnum,
1989 	int		flag,
1990 	int		otyp,
1991 	int		md_oflags,
1992 	IOLOCK		*lockp		/* can be NULL */
1993 )
1994 {
1995 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1996 	int		err = 0;
1997 
1998 tryagain:
1999 	/* single thread */
2000 	if (lockp) {
2001 		/*
2002 		 * If ioctl lock is held, use openclose_enter
2003 		 * routine that will set the ioctl flag when
2004 		 * grabbing the readerlock.
2005 		 */
2006 		(void) md_ioctl_openclose_enter(lockp, ui);
2007 	} else {
2008 		(void) md_unit_openclose_enter(ui);
2009 	}
2010 
2011 	/*
2012 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2013 	 * message in a MN diskset and this requires that the openclose
2014 	 * lock is dropped in order to send this message.  So, another
2015 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2016 	 * attempting an open while this thread has an open in progress.
2017 	 * Call the *_lh version of the lock exit routines since the ui_mx
2018 	 * mutex must be held from checking for OPENINPROGRESS until
2019 	 * after the cv_wait call.
2020 	 */
2021 	mutex_enter(&ui->ui_mx);
2022 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2023 		if (lockp) {
2024 			(void) md_ioctl_openclose_exit_lh(lockp);
2025 		} else {
2026 			md_unit_openclose_exit_lh(ui);
2027 		}
2028 		cv_wait(&ui->ui_cv, &ui->ui_mx);
2029 		mutex_exit(&ui->ui_mx);
2030 		goto tryagain;
2031 	}
2032 
2033 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
2034 	mutex_exit(&ui->ui_mx);
2035 
2036 	/* open devices, if necessary */
2037 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2038 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2039 			goto out;
2040 	}
2041 
2042 	/* count open */
2043 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2044 		goto out;
2045 
2046 	/* unlock, return success */
2047 out:
2048 	mutex_enter(&ui->ui_mx);
2049 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2050 	mutex_exit(&ui->ui_mx);
2051 
2052 	if (lockp) {
2053 		/*
2054 		 * If ioctl lock is held, use openclose_exit
2055 		 * routine that will clear the lockp reader flag.
2056 		 */
2057 		(void) md_ioctl_openclose_exit(lockp);
2058 	} else {
2059 		md_unit_openclose_exit(ui);
2060 	}
2061 	return (err);
2062 }
2063 
2064 int
2065 mirror_internal_close(
2066 	minor_t		mnum,
2067 	int		otyp,
2068 	int		md_cflags,
2069 	IOLOCK		*lockp		/* can be NULL */
2070 )
2071 {
2072 	mdi_unit_t	*ui = MDI_UNIT(mnum);
2073 	mm_unit_t	*un;
2074 	int		err = 0;
2075 
2076 	/* single thread */
2077 	if (lockp) {
2078 		/*
2079 		 * If ioctl lock is held, use openclose_enter
2080 		 * routine that will set the ioctl flag when
2081 		 * grabbing the readerlock.
2082 		 */
2083 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2084 	} else {
2085 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
2086 	}
2087 
2088 	/* count closed */
2089 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
2090 		goto out;
2091 
2092 	/* close devices, if necessary */
2093 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2094 		/*
2095 		 * Clean up dirty bitmap for this unit. Do this
2096 		 * before closing the underlying devices to avoid
2097 		 * race conditions with reset_mirror() as a
2098 		 * result of a 'metaset -r' command running in
2099 		 * parallel. This might cause deallocation of
2100 		 * dirty region bitmaps; with underlying metadevices
2101 		 * in place this can't happen.
2102 		 * Don't do this if a MN set and ABR not set
2103 		 */
2104 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2105 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2106 			    !(ui->ui_tstate & MD_ABR_CAP))
2107 				mirror_process_unit_resync(un);
2108 		}
2109 		(void) mirror_close_all_devs(un, md_cflags);
2110 
2111 		/*
2112 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
2113 		 * clear these capabilities on the last open in the cluster.
2114 		 * To do this we send a message to all nodes to see of the
2115 		 * device is open.
2116 		 */
2117 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2118 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2119 			if (lockp) {
2120 				(void) md_ioctl_openclose_exit(lockp);
2121 			} else {
2122 				md_unit_openclose_exit(ui);
2123 			}
2124 
2125 			/*
2126 			 * if we are in the context of an ioctl, drop the
2127 			 * ioctl lock.
2128 			 * Otherwise, no other locks should be held.
2129 			 */
2130 			if (lockp) {
2131 				IOLOCK_RETURN_RELEASE(0, lockp);
2132 			}
2133 
2134 			mdmn_clear_all_capabilities(mnum);
2135 
2136 			/* if dropped the lock previously, regain it */
2137 			if (lockp) {
2138 				IOLOCK_RETURN_REACQUIRE(lockp);
2139 			}
2140 			return (0);
2141 		}
2142 		/* unlock and return success */
2143 	}
2144 out:
2145 	/* Call whether lockp is NULL or not. */
2146 	if (lockp) {
2147 		md_ioctl_openclose_exit(lockp);
2148 	} else {
2149 		md_unit_openclose_exit(ui);
2150 	}
2151 	return (err);
2152 }
2153 
2154 /*
2155  * When a component has completed resyncing and is now ok, check if the
2156  * corresponding component in the other submirrors is in the Last Erred
2157  * state.  If it is, we want to change that to the Erred state so we stop
2158  * using that component and start using this good component instead.
2159  *
2160  * This is called from set_sm_comp_state and recursively calls
2161  * set_sm_comp_state if it needs to change the Last Erred state.
2162  */
2163 static void
2164 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2165 	IOLOCK *lockp)
2166 {
2167 	mm_submirror_t		*sm;
2168 	mm_submirror_ic_t	*smic;
2169 	int			ci;
2170 	int			i;
2171 	int			compcnt;
2172 	int			changed = 0;
2173 
2174 	for (i = 0; i < NMIRROR; i++) {
2175 		sm = &un->un_sm[i];
2176 		smic = &un->un_smic[i];
2177 
2178 		if (!SMS_IS(sm, SMS_INUSE))
2179 			continue;
2180 
2181 		/* ignore the submirror that we just made ok */
2182 		if (i == smi)
2183 			continue;
2184 
2185 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2186 		for (ci = 0; ci < compcnt; ci++) {
2187 			md_m_shared_t	*shared;
2188 
2189 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2190 			    (sm->sm_dev, sm, ci);
2191 
2192 			if ((shared->ms_state & CS_LAST_ERRED) &&
2193 			    !mirror_other_sources(un, i, ci, 1)) {
2194 
2195 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2196 				    flags, lockp);
2197 				changed = 1;
2198 			}
2199 		}
2200 	}
2201 
2202 	/* maybe there is a hotspare for this newly erred component */
2203 	if (changed) {
2204 		set_t	setno;
2205 
2206 		setno = MD_UN2SET(un);
2207 		if (MD_MNSET_SETNO(setno)) {
2208 			send_poke_hotspares(setno);
2209 		} else {
2210 			(void) poke_hotspares();
2211 		}
2212 	}
2213 }
2214 
2215 /*
2216  * set_sm_comp_state
2217  *
2218  * Set the state of a submirror component to the specified new state.
2219  * If the mirror is in a multi-node set, send messages to all nodes to
2220  * block all writes to the mirror and then update the state and release the
2221  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2222  * MD_STATE_XMIT will be unset in 2 cases:
2223  * 1. When the state is changed to CS_RESYNC as this state change
2224  * will already have been updated on each node by the processing of the
2225  * distributed metasync command, hence no need to xmit.
2226  * 2. When the state is change to CS_OKAY after a resync has completed. Again
2227  * the resync completion will already have been processed on each node by
2228  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2229  * resync, hence no need to xmit.
2230  *
2231  * In case we are called from the updates of a watermark,
2232  * (then MD_STATE_WMUPDATE will be set in the ps->flags) this is due to
2233  * a metainit or similar. In this case the message that we sent to propagate
2234  * the state change must not be a class1 message as that would deadlock with
2235  * the metainit command that is still being processed.
2236  * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2237  * instead. This also makes the submessage generator to create a class2
2238  * submessage rather than a class1 (which would also block)
2239  *
2240  * On entry, unit_writerlock is held
2241  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2242  * also held.
2243  */
2244 void
2245 set_sm_comp_state(
2246 	mm_unit_t	*un,
2247 	int		smi,
2248 	int		ci,
2249 	int		newstate,
2250 	mddb_recid_t	*extras,
2251 	uint_t		flags,
2252 	IOLOCK		*lockp
2253 )
2254 {
2255 	mm_submirror_t		*sm;
2256 	mm_submirror_ic_t	*smic;
2257 	md_m_shared_t		*shared;
2258 	int			origstate;
2259 	void			(*get_dev)();
2260 	ms_cd_info_t		cd;
2261 	char			devname[MD_MAX_CTDLEN];
2262 	int			err;
2263 	set_t			setno = MD_UN2SET(un);
2264 	md_mn_msg_stch_t	stchmsg;
2265 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
2266 	md_mn_kresult_t		*kresult;
2267 	int			rval;
2268 	uint_t			msgflags;
2269 	md_mn_msgtype_t		msgtype;
2270 	int			save_lock = 0;
2271 	mdi_unit_t		*ui_sm;
2272 
2273 	sm = &un->un_sm[smi];
2274 	smic = &un->un_smic[smi];
2275 
2276 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
2277 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2278 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2279 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
2280 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2281 	}
2282 
2283 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2284 	    (sm->sm_dev, sm, ci);
2285 	origstate = shared->ms_state;
2286 
2287 	/*
2288 	 * If the new state is an error and the old one wasn't, generate
2289 	 * a console message. We do this before we send the state to other
2290 	 * nodes in a MN set because the state change may change the component
2291 	 * name  if a hotspare is allocated.
2292 	 */
2293 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2294 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2295 
2296 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2297 		    "get device", 0);
2298 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2299 
2300 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2301 		    cd.cd_dev, devname, sizeof (devname));
2302 
2303 		if (err == ENOENT) {
2304 			(void) md_devname(setno, cd.cd_dev, devname,
2305 			    sizeof (devname));
2306 		}
2307 
2308 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2309 		    md_shortname(md_getminor(sm->sm_dev)), devname);
2310 
2311 		if (newstate & CS_LAST_ERRED) {
2312 			cmn_err(CE_WARN, "md: %s: %s last erred",
2313 			    md_shortname(md_getminor(sm->sm_dev)),
2314 			    devname);
2315 
2316 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
2317 			/*
2318 			 * Close the broken device and clear the open flag on
2319 			 * it.  Closing the device means the RCM framework will
2320 			 * be able to unconfigure the device if required.
2321 			 *
2322 			 * We have to check that the device is open, otherwise
2323 			 * the first open on it has resulted in the error that
2324 			 * is being processed and the actual cd.cd_dev will be
2325 			 * NODEV64.
2326 			 *
2327 			 * If this is a multi-node mirror, then the multinode
2328 			 * state checks following this code will cause the
2329 			 * slave nodes to close the mirror in the function
2330 			 * mirror_set_state().
2331 			 */
2332 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2333 			shared->ms_flags &= ~MDM_S_ISOPEN;
2334 		}
2335 
2336 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2337 	    (shared->ms_flags & MDM_S_ISOPEN)) {
2338 		/*
2339 		 * Similar to logic above except no log messages since we
2340 		 * are just transitioning from Last Erred to Erred.
2341 		 */
2342 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2343 		    "get device", 0);
2344 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2345 
2346 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2347 		shared->ms_flags &= ~MDM_S_ISOPEN;
2348 	}
2349 
2350 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2351 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2352 		/*
2353 		 * For a multi-node mirror, send the state change to the
2354 		 * master, which broadcasts to all nodes, including this
2355 		 * one. Once the message is received, the state is set
2356 		 * in-core and the master commits the change to disk.
2357 		 * There is a case, comp_replace,  where this function
2358 		 * can be called from within an ioctl and therefore in this
2359 		 * case, as the ioctl will already be called on each node,
2360 		 * there is no need to xmit the state change to the master for
2361 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
2362 		 * to indicate whether a xmit is required. The mirror's
2363 		 * transient state is set to MD_ERR_PENDING to avoid sending
2364 		 * multiple messages.
2365 		 */
2366 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
2367 			ui->ui_tstate |= MD_ERR_PENDING;
2368 
2369 		/*
2370 		 * Send a state update message to all nodes. This message
2371 		 * will generate 2 submessages, the first one to suspend
2372 		 * all writes to the mirror and the second to update the
2373 		 * state and resume writes.
2374 		 */
2375 		stchmsg.msg_stch_mnum = un->c.un_self_id;
2376 		stchmsg.msg_stch_sm = smi;
2377 		stchmsg.msg_stch_comp = ci;
2378 		stchmsg.msg_stch_new_state = newstate;
2379 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2380 #ifdef DEBUG
2381 		if (mirror_debug_flag)
2382 			printf("send set state, %x, %x, %x, %x, %x\n",
2383 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2384 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2385 			    stchmsg.msg_stch_hs_id);
2386 #endif
2387 		if (flags & MD_STATE_WMUPDATE) {
2388 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
2389 			/*
2390 			 * When coming from an update of watermarks, there
2391 			 * must already be a message logged that triggered
2392 			 * this action. So, no need to log this message, too.
2393 			 */
2394 			msgflags = MD_MSGF_NO_LOG;
2395 		} else {
2396 			msgtype  = MD_MN_MSG_STATE_UPDATE;
2397 			msgflags = MD_MSGF_DEFAULT_FLAGS;
2398 		}
2399 
2400 		/*
2401 		 * If we are in the context of an ioctl, drop the ioctl lock.
2402 		 * lockp holds the list of locks held.
2403 		 *
2404 		 * Otherwise, increment the appropriate reacquire counters.
2405 		 * If openclose lock is *held, then must reacquire reader
2406 		 * lock before releasing the openclose lock.
2407 		 * Do not drop the ARRAY_WRITER lock as we may not be able
2408 		 * to reacquire it.
2409 		 */
2410 		if (lockp) {
2411 			if (lockp->l_flags & MD_ARRAY_WRITER) {
2412 				save_lock = MD_ARRAY_WRITER;
2413 				lockp->l_flags &= ~MD_ARRAY_WRITER;
2414 			} else if (lockp->l_flags & MD_ARRAY_READER) {
2415 				save_lock = MD_ARRAY_READER;
2416 				lockp->l_flags &= ~MD_ARRAY_READER;
2417 			}
2418 			IOLOCK_RETURN_RELEASE(0, lockp);
2419 		} else {
2420 			if (flags & MD_STATE_OCHELD) {
2421 				md_unit_writerexit(ui);
2422 				(void) md_unit_readerlock(ui);
2423 				md_unit_openclose_exit(ui);
2424 			} else {
2425 				md_unit_writerexit(ui);
2426 			}
2427 		}
2428 
2429 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2430 		rval = mdmn_ksend_message(setno, msgtype, msgflags,
2431 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
2432 
2433 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2434 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2435 			cmn_err(CE_PANIC,
2436 			    "ksend_message failure: STATE_UPDATE");
2437 		}
2438 		kmem_free(kresult, sizeof (md_mn_kresult_t));
2439 
2440 		/* if dropped the lock previously, regain it */
2441 		if (lockp) {
2442 			IOLOCK_RETURN_REACQUIRE(lockp);
2443 			lockp->l_flags |= save_lock;
2444 		} else {
2445 			/*
2446 			 * Reacquire dropped locks and update acquirecnts
2447 			 * appropriately.
2448 			 */
2449 			if (flags & MD_STATE_OCHELD) {
2450 				/*
2451 				 * openclose also grabs readerlock.
2452 				 */
2453 				(void) md_unit_openclose_enter(ui);
2454 				md_unit_readerexit(ui);
2455 				(void) md_unit_writerlock(ui);
2456 			} else {
2457 				(void) md_unit_writerlock(ui);
2458 			}
2459 		}
2460 
2461 		ui->ui_tstate &= ~MD_ERR_PENDING;
2462 	} else {
2463 		shared->ms_state = newstate;
2464 		uniqtime32(&shared->ms_timestamp);
2465 
2466 		if (newstate == CS_ERRED)
2467 			shared->ms_flags |= MDM_S_NOWRITE;
2468 		else
2469 			shared->ms_flags &= ~MDM_S_NOWRITE;
2470 
2471 		shared->ms_flags &= ~MDM_S_IOERR;
2472 		un->un_changecnt++;
2473 		shared->ms_lasterrcnt = un->un_changecnt;
2474 
2475 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2476 		mirror_commit(un, SMI2BIT(smi), extras);
2477 	}
2478 
2479 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2480 		/*
2481 		 * Resetting the Last Erred state will recursively call back
2482 		 * into this function (set_sm_comp_state) to update the state.
2483 		 */
2484 		reset_lasterred(un, smi, extras, flags, lockp);
2485 	}
2486 }
2487 
2488 static int
2489 find_another_logical(
2490 	mm_unit_t		*un,
2491 	mm_submirror_t		*esm,
2492 	diskaddr_t		blk,
2493 	u_longlong_t		cnt,
2494 	int			must_be_open,
2495 	int			state,
2496 	int			err_cnt)
2497 {
2498 	u_longlong_t	cando;
2499 	md_dev64_t	dev;
2500 	md_m_shared_t	*s;
2501 
2502 	esm->sm_state |= SMS_IGNORE;
2503 	while (cnt != 0) {
2504 		u_longlong_t	 mcnt;
2505 
2506 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
2507 
2508 		dev = select_read_unit(un, blk, mcnt, &cando,
2509 		    must_be_open, &s, NULL);
2510 		if (dev == (md_dev64_t)0)
2511 			break;
2512 
2513 		if ((state == CS_LAST_ERRED) &&
2514 		    (s->ms_state == CS_LAST_ERRED) &&
2515 		    (err_cnt > s->ms_lasterrcnt))
2516 			break;
2517 
2518 		cnt -= cando;
2519 		blk += cando;
2520 	}
2521 	esm->sm_state &= ~SMS_IGNORE;
2522 	return (cnt != 0);
2523 }
2524 
2525 int
2526 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2527 {
2528 	mm_submirror_t		*sm;
2529 	mm_submirror_ic_t	*smic;
2530 	size_t			count;
2531 	diskaddr_t		block;
2532 	u_longlong_t		skip;
2533 	u_longlong_t		size;
2534 	md_dev64_t		dev;
2535 	int			cnt;
2536 	md_m_shared_t		*s;
2537 	int			not_found;
2538 
2539 	sm = &un->un_sm[smi];
2540 	smic = &un->un_smic[smi];
2541 	dev = sm->sm_dev;
2542 
2543 	/*
2544 	 * Make sure every component of the submirror
2545 	 * has other sources.
2546 	 */
2547 	if (ci < 0) {
2548 		/* Find the highest lasterrcnt */
2549 		cnt = (*(smic->sm_get_component_count))(dev, sm);
2550 		for (ci = 0; ci < cnt; ci++) {
2551 			not_found = mirror_other_sources(un, smi, ci,
2552 			    must_be_open);
2553 			if (not_found)
2554 				return (1);
2555 		}
2556 		return (0);
2557 	}
2558 
2559 	/*
2560 	 * Make sure this component has other sources
2561 	 */
2562 	(void) (*(smic->sm_get_bcss))
2563 	    (dev, sm, ci, &block, &count, &skip, &size);
2564 
2565 	if (count == 0)
2566 		return (1);
2567 
2568 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2569 
2570 	while (count--) {
2571 		if (block >= un->c.un_total_blocks)
2572 			return (0);
2573 
2574 		if ((block + size) > un->c.un_total_blocks)
2575 			size = un->c.un_total_blocks - block;
2576 
2577 		not_found = find_another_logical(un, sm, block, size,
2578 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
2579 		if (not_found)
2580 			return (1);
2581 
2582 		block += size + skip;
2583 	}
2584 	return (0);
2585 }
2586 
2587 static void
2588 finish_error(md_mps_t *ps)
2589 {
2590 	struct buf	*pb;
2591 	mm_unit_t	*un;
2592 	mdi_unit_t	*ui;
2593 	uint_t		new_str_flags;
2594 
2595 	pb = ps->ps_bp;
2596 	un = ps->ps_un;
2597 	ui = ps->ps_ui;
2598 
2599 	/*
2600 	 * Must flag any error to the resync originator if we're performing
2601 	 * a Write-after-Read. This corresponds to an i/o error on a resync
2602 	 * target device and in this case we ought to abort the resync as there
2603 	 * is nothing that can be done to recover from this without operator
2604 	 * intervention. If we don't set the B_ERROR flag we will continue
2605 	 * reading from the mirror but won't write to the target (as it will
2606 	 * have been placed into an errored state).
2607 	 * To handle the case of multiple components within a submirror we only
2608 	 * set the B_ERROR bit if explicitly requested to via MD_MPS_FLAG_ERROR.
2609 	 * The originator of the resync read will cause this bit to be set if
2610 	 * the underlying component count is one for a submirror resync. All
2611 	 * other resync types will have the flag set as there is no underlying
2612 	 * resync which can be performed on a contained metadevice for these
2613 	 * resync types (optimized or component).
2614 	 */
2615 
2616 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2617 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2618 			pb->b_flags |= B_ERROR;
2619 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2620 		MPS_FREE(mirror_parent_cache, ps);
2621 		md_unit_readerexit(ui);
2622 		md_biodone(pb);
2623 		return;
2624 	}
2625 	/*
2626 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2627 	 * operation therefore this I/O request has already been counted,
2628 	 * the I/O count variable will be decremented by mirror_done()'s
2629 	 * call to md_biodone().
2630 	 */
2631 	if (ps->ps_changecnt != un->un_changecnt) {
2632 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2633 		if (ps->ps_flags & MD_MPS_WOW)
2634 			new_str_flags |= MD_STR_WOW;
2635 		if (ps->ps_flags & MD_MPS_MAPPED)
2636 			new_str_flags |= MD_STR_MAPPED;
2637 		/*
2638 		 * If this I/O request was a read that was part of a resync,
2639 		 * set MD_STR_WAR for the retried read to ensure that the
2640 		 * resync write (i.e. write-after-read) will be performed
2641 		 */
2642 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
2643 			new_str_flags |= MD_STR_WAR;
2644 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2645 		MPS_FREE(mirror_parent_cache, ps);
2646 		md_unit_readerexit(ui);
2647 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
2648 		return;
2649 	}
2650 
2651 	pb->b_flags |= B_ERROR;
2652 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2653 	MPS_FREE(mirror_parent_cache, ps);
2654 	md_unit_readerexit(ui);
2655 	md_biodone(pb);
2656 }
2657 
2658 static void
2659 error_update_unit(md_mps_t *ps)
2660 {
2661 	mm_unit_t		*un;
2662 	mdi_unit_t		*ui;
2663 	int			smi;	/* sub mirror index */
2664 	int			ci;	/* errored component */
2665 	set_t			setno;
2666 	uint_t			flags;	/* for set_sm_comp_state() */
2667 	uint_t			hspflags; /* for check_comp_4_hotspares() */
2668 
2669 	ui = ps->ps_ui;
2670 	un = (mm_unit_t *)md_unit_writerlock(ui);
2671 	setno = MD_UN2SET(un);
2672 
2673 	/* All of these updates have to propagated in case of MN set */
2674 	flags = MD_STATE_XMIT;
2675 	hspflags = MD_HOTSPARE_XMIT;
2676 
2677 	/* special treatment if we are called during updating watermarks */
2678 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
2679 		flags |= MD_STATE_WMUPDATE;
2680 		hspflags |= MD_HOTSPARE_WMUPDATE;
2681 	}
2682 	smi = 0;
2683 	ci = 0;
2684 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2685 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
2686 
2687 			/* Never called from ioctl context, so (IOLOCK *)NULL */
2688 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2689 			    (IOLOCK *)NULL);
2690 			/*
2691 			 * For a MN set, the NOTIFY is done when the state
2692 			 * change is processed on each node
2693 			 */
2694 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2695 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2696 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
2697 			}
2698 			continue;
2699 		}
2700 		/* Never called from ioctl context, so (IOLOCK *)NULL */
2701 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2702 		    (IOLOCK *)NULL);
2703 		/*
2704 		 * For a MN set, the NOTIFY is done when the state
2705 		 * change is processed on each node
2706 		 */
2707 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2708 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2709 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
2710 		}
2711 		smi = 0;
2712 		ci = 0;
2713 	}
2714 
2715 	md_unit_writerexit(ui);
2716 	if (MD_MNSET_SETNO(setno)) {
2717 		send_poke_hotspares(setno);
2718 	} else {
2719 		(void) poke_hotspares();
2720 	}
2721 	(void) md_unit_readerlock(ui);
2722 
2723 	finish_error(ps);
2724 }
2725 
2726 /*
2727  * When we have a B_FAILFAST IO error on a Last Erred component we need to
2728  * retry the IO without B_FAILFAST set so that we try to ensure that the
2729  * component "sees" each IO.
2730  */
2731 static void
2732 last_err_retry(md_mcs_t *cs)
2733 {
2734 	struct buf	*cb;
2735 	md_mps_t	*ps;
2736 	uint_t		flags;
2737 
2738 	cb = &cs->cs_buf;
2739 	cb->b_flags &= ~B_FAILFAST;
2740 
2741 	/* if we're panicing just let this I/O error out */
2742 	if (panicstr) {
2743 		(void) mirror_done(cb);
2744 		return;
2745 	}
2746 
2747 	/* reissue the I/O */
2748 
2749 	ps = cs->cs_ps;
2750 
2751 	bioerror(cb, 0);
2752 
2753 	mutex_enter(&ps->ps_mx);
2754 
2755 	flags = MD_STR_NOTTOP;
2756 	if (ps->ps_flags & MD_MPS_MAPPED)
2757 		flags |= MD_STR_MAPPED;
2758 	if (ps->ps_flags & MD_MPS_NOBLOCK)
2759 		flags |= MD_NOBLOCK;
2760 
2761 	mutex_exit(&ps->ps_mx);
2762 
2763 	clear_retry_error(cb);
2764 
2765 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2766 	    md_shortname(getminor(cb->b_edev)));
2767 
2768 	md_call_strategy(cb, flags, NULL);
2769 }
2770 
2771 static void
2772 mirror_error(md_mps_t *ps)
2773 {
2774 	int		smi;	/* sub mirror index */
2775 	int		ci;	/* errored component */
2776 
2777 	if (panicstr) {
2778 		finish_error(ps);
2779 		return;
2780 	}
2781 
2782 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2783 		mirror_overlap_tree_remove(ps);
2784 
2785 	smi = 0;
2786 	ci = 0;
2787 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2788 		md_unit_readerexit(ps->ps_ui);
2789 		daemon_request(&md_mstr_daemon, error_update_unit,
2790 		    (daemon_queue_t *)ps, REQ_OLD);
2791 		return;
2792 	}
2793 
2794 	finish_error(ps);
2795 }
2796 
2797 static int
2798 copy_write_done(struct buf *cb)
2799 {
2800 	md_mps_t	*ps;
2801 	buf_t		*pb;
2802 	char		*wowbuf;
2803 	wowhdr_t	*wowhdr;
2804 	ssize_t		wow_resid;
2805 
2806 	/* get wowbuf ans save structure */
2807 	wowbuf = cb->b_un.b_addr;
2808 	wowhdr = WOWBUF_HDR(wowbuf);
2809 	ps = wowhdr->wow_ps;
2810 	pb = ps->ps_bp;
2811 
2812 	/* Save error information, then free cb */
2813 	if (cb->b_flags & B_ERROR)
2814 		pb->b_flags |= B_ERROR;
2815 
2816 	if (cb->b_flags & B_REMAPPED)
2817 		bp_mapout(cb);
2818 
2819 	freerbuf(cb);
2820 
2821 	/* update residual and continue if needed */
2822 	if ((pb->b_flags & B_ERROR) == 0) {
2823 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
2824 		pb->b_resid = wow_resid;
2825 		if (wow_resid > 0)  {
2826 			daemon_request(&md_mstr_daemon, copy_write_cont,
2827 			    (daemon_queue_t *)wowhdr, REQ_OLD);
2828 			return (1);
2829 		}
2830 	}
2831 
2832 	/* Write is complete, release resources. */
2833 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
2834 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
2835 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2836 	MPS_FREE(mirror_parent_cache, ps);
2837 	md_biodone(pb);
2838 	return (0);
2839 }
2840 
2841 static void
2842 copy_write_cont(wowhdr_t *wowhdr)
2843 {
2844 	buf_t		*pb;
2845 	buf_t		*cb;
2846 	char		*wowbuf;
2847 	int		wow_offset;
2848 	size_t		wow_resid;
2849 	diskaddr_t	wow_blkno;
2850 
2851 	wowbuf = WOWHDR_BUF(wowhdr);
2852 	pb = wowhdr->wow_ps->ps_bp;
2853 
2854 	/* get data on current location */
2855 	wow_offset = wowhdr->wow_offset;
2856 	wow_resid = pb->b_bcount - wow_offset;
2857 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
2858 
2859 	/* setup child buffer */
2860 	cb = getrbuf(KM_SLEEP);
2861 	cb->b_flags = B_WRITE;
2862 	cb->b_edev = pb->b_edev;
2863 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
2864 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
2865 	cb->b_iodone = copy_write_done;
2866 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
2867 	cb->b_lblkno = wow_blkno;
2868 
2869 	/* move offset to next section */
2870 	wowhdr->wow_offset += cb->b_bcount;
2871 
2872 	/* copy and setup write for current section */
2873 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
2874 
2875 	/* do it */
2876 	/*
2877 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
2878 	 * that handles the WOW condition. The resultant increment on the
2879 	 * I/O count variable is cleared by copy_write_done()'s call to
2880 	 * md_biodone().
2881 	 */
2882 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
2883 	    | MD_STR_MAPPED, NULL);
2884 }
2885 
2886 static void
2887 md_mirror_copy_write(md_mps_t *ps)
2888 {
2889 	wowhdr_t	*wowhdr;
2890 
2891 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
2892 	mirror_wowblk_init(wowhdr);
2893 	wowhdr->wow_ps = ps;
2894 	wowhdr->wow_offset = 0;
2895 	copy_write_cont(wowhdr);
2896 }
2897 
2898 static void
2899 handle_wow(md_mps_t *ps)
2900 {
2901 	buf_t		*pb;
2902 
2903 	pb = ps->ps_bp;
2904 
2905 	bp_mapin(pb);
2906 
2907 	md_mirror_wow_cnt++;
2908 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
2909 		cmn_err(CE_NOTE,
2910 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
2911 		    md_shortname(getminor(pb->b_edev)),
2912 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
2913 	}
2914 
2915 	/*
2916 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2917 	 * operation therefore this I/O request has already been counted,
2918 	 * the I/O count variable will be decremented by mirror_done()'s
2919 	 * call to md_biodone().
2920 	 */
2921 	if (md_mirror_wow_flg & WOW_NOCOPY)
2922 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
2923 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
2924 	else
2925 		md_mirror_copy_write(ps);
2926 }
2927 
2928 /*
2929  * Return true if the specified submirror is either in the Last Erred
2930  * state or is transitioning into the Last Erred state.
2931  */
2932 static bool_t
2933 submirror_is_lasterred(mm_unit_t *un, int smi)
2934 {
2935 	mm_submirror_t		*sm;
2936 	mm_submirror_ic_t	*smic;
2937 	md_m_shared_t		*shared;
2938 	int			ci;
2939 	int			compcnt;
2940 
2941 	sm = &un->un_sm[smi];
2942 	smic = &un->un_smic[smi];
2943 
2944 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2945 	for (ci = 0; ci < compcnt; ci++) {
2946 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2947 		    (sm->sm_dev, sm, ci);
2948 
2949 		if (shared->ms_state == CS_LAST_ERRED)
2950 			return (B_TRUE);
2951 
2952 		/*
2953 		 * It is not currently Last Erred, check if entering Last Erred.
2954 		 */
2955 		if ((shared->ms_flags & MDM_S_IOERR) &&
2956 		    ((shared->ms_state == CS_OKAY) ||
2957 		    (shared->ms_state == CS_RESYNC))) {
2958 			if (mirror_other_sources(un, smi, ci, 0) == 1)
2959 				return (B_TRUE);
2960 		}
2961 	}
2962 
2963 	return (B_FALSE);
2964 }
2965 
2966 
2967 static int
2968 mirror_done(struct buf *cb)
2969 {
2970 	md_mps_t	*ps;
2971 	md_mcs_t	*cs;
2972 
2973 	/*LINTED*/
2974 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
2975 	ps = cs->cs_ps;
2976 
2977 	mutex_enter(&ps->ps_mx);
2978 
2979 	/* check if we need to retry an errored failfast I/O */
2980 	if (cb->b_flags & B_ERROR) {
2981 		struct buf *pb = ps->ps_bp;
2982 
2983 		if (cb->b_flags & B_FAILFAST) {
2984 			int		i;
2985 			mm_unit_t	*un = ps->ps_un;
2986 
2987 			for (i = 0; i < NMIRROR; i++) {
2988 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
2989 					continue;
2990 
2991 				if (cb->b_edev ==
2992 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
2993 
2994 					/*
2995 					 * This is the submirror that had the
2996 					 * error.  Check if it is Last Erred.
2997 					 */
2998 					if (submirror_is_lasterred(un, i)) {
2999 						daemon_queue_t *dqp;
3000 
3001 						mutex_exit(&ps->ps_mx);
3002 						dqp = (daemon_queue_t *)cs;
3003 						dqp->dq_prev = NULL;
3004 						dqp->dq_next = NULL;
3005 						daemon_request(&md_done_daemon,
3006 						    last_err_retry, dqp,
3007 						    REQ_OLD);
3008 						return (1);
3009 					}
3010 					break;
3011 				}
3012 			}
3013 		}
3014 
3015 		/* continue to process the buf without doing a retry */
3016 		ps->ps_flags |= MD_MPS_ERROR;
3017 		pb->b_error = cb->b_error;
3018 	}
3019 
3020 	return (mirror_done_common(cb));
3021 }
3022 
3023 /*
3024  * Split from the original mirror_done function so we can handle bufs after a
3025  * retry.
3026  * ps->ps_mx is already held in the caller of this function and the cb error
3027  * has already been checked and handled in the caller.
3028  */
3029 static int
3030 mirror_done_common(struct buf *cb)
3031 {
3032 	struct buf	*pb;
3033 	mm_unit_t	*un;
3034 	mdi_unit_t	*ui;
3035 	md_mps_t	*ps;
3036 	md_mcs_t	*cs;
3037 	size_t		end_rr, start_rr, current_rr;
3038 
3039 	/*LINTED*/
3040 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3041 	ps = cs->cs_ps;
3042 	pb = ps->ps_bp;
3043 
3044 	if (cb->b_flags & B_REMAPPED)
3045 		bp_mapout(cb);
3046 
3047 	ps->ps_frags--;
3048 	if (ps->ps_frags != 0) {
3049 		mutex_exit(&ps->ps_mx);
3050 		kmem_cache_free(mirror_child_cache, cs);
3051 		return (1);
3052 	}
3053 	un = ps->ps_un;
3054 	ui = ps->ps_ui;
3055 
3056 	/*
3057 	 * Do not update outstanding_writes if we're running with ABR
3058 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
3059 	 * Also a resync initiated write() has no outstanding_writes update
3060 	 * either.
3061 	 */
3062 	if (((cb->b_flags & B_READ) == 0) &&
3063 	    (un->un_nsm >= 2) &&
3064 	    (ps->ps_call == NULL) &&
3065 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3066 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3067 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3068 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3069 		mutex_enter(&un->un_resync_mx);
3070 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3071 			un->un_outstanding_writes[current_rr]--;
3072 		mutex_exit(&un->un_resync_mx);
3073 	}
3074 	kmem_cache_free(mirror_child_cache, cs);
3075 	mutex_exit(&ps->ps_mx);
3076 
3077 	if (ps->ps_call != NULL) {
3078 		daemon_request(&md_done_daemon, ps->ps_call,
3079 		    (daemon_queue_t *)ps, REQ_OLD);
3080 		return (1);
3081 	}
3082 
3083 	if ((ps->ps_flags & MD_MPS_ERROR)) {
3084 		daemon_request(&md_done_daemon, mirror_error,
3085 		    (daemon_queue_t *)ps, REQ_OLD);
3086 		return (1);
3087 	}
3088 
3089 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3090 		mirror_overlap_tree_remove(ps);
3091 
3092 	/*
3093 	 * Handle Write-on-Write problem.
3094 	 * Skip In case of Raw and Direct I/O as they are
3095 	 * handled earlier.
3096 	 *
3097 	 */
3098 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3099 	    !(pb->b_flags & B_READ) &&
3100 	    !(ps->ps_flags & MD_MPS_WOW) &&
3101 	    !(pb->b_flags & B_PHYS) &&
3102 	    any_pages_dirty(pb)) {
3103 		md_unit_readerexit(ps->ps_ui);
3104 		daemon_request(&md_mstr_daemon, handle_wow,
3105 		    (daemon_queue_t *)ps, REQ_OLD);
3106 		return (1);
3107 	}
3108 
3109 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3110 	MPS_FREE(mirror_parent_cache, ps);
3111 	md_unit_readerexit(ui);
3112 	md_biodone(pb);
3113 	return (0);
3114 }
3115 
3116 /*
3117  * Clear error state in submirror component if the retry worked after
3118  * a failfast error.
3119  */
3120 static void
3121 clear_retry_error(struct buf *cb)
3122 {
3123 	int			smi;
3124 	md_mcs_t		*cs;
3125 	mm_unit_t		*un;
3126 	mdi_unit_t		*ui_sm;
3127 	mm_submirror_t		*sm;
3128 	mm_submirror_ic_t	*smic;
3129 	u_longlong_t		cnt;
3130 	md_m_shared_t		*shared;
3131 
3132 	/*LINTED*/
3133 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3134 	un = cs->cs_ps->ps_un;
3135 
3136 	for (smi = 0; smi < NMIRROR; smi++) {
3137 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3138 			continue;
3139 
3140 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3141 			break;
3142 	}
3143 
3144 	if (smi >= NMIRROR)
3145 		return;
3146 
3147 	sm = &un->un_sm[smi];
3148 	smic = &un->un_smic[smi];
3149 	cnt = cb->b_bcount;
3150 
3151 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
3152 	(void) md_unit_writerlock(ui_sm);
3153 
3154 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3155 	    cb->b_blkno, &cnt);
3156 
3157 	if (shared->ms_flags & MDM_S_IOERR) {
3158 		shared->ms_flags &= ~MDM_S_IOERR;
3159 
3160 	} else {
3161 		/* the buf spans components and the first one is not erred */
3162 		int	cnt;
3163 		int	i;
3164 
3165 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3166 		for (i = 0; i < cnt; i++) {
3167 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3168 			    (sm->sm_dev, sm, i);
3169 
3170 			if (shared->ms_flags & MDM_S_IOERR &&
3171 			    shared->ms_state == CS_OKAY) {
3172 
3173 				shared->ms_flags &= ~MDM_S_IOERR;
3174 				break;
3175 			}
3176 		}
3177 	}
3178 
3179 	md_unit_writerexit(ui_sm);
3180 }
3181 
3182 static size_t
3183 mirror_map_read(
3184 	md_mps_t *ps,
3185 	md_mcs_t *cs,
3186 	diskaddr_t blkno,
3187 	u_longlong_t	count
3188 )
3189 {
3190 	mm_unit_t	*un;
3191 	buf_t		*bp;
3192 	u_longlong_t	cando;
3193 
3194 	bp = &cs->cs_buf;
3195 	un = ps->ps_un;
3196 
3197 	bp->b_lblkno = blkno;
3198 	if (fast_select_read_unit(ps, cs) == 0) {
3199 		bp->b_bcount = ldbtob(count);
3200 		return (0);
3201 	}
3202 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3203 	    count, &cando, 0, NULL, cs));
3204 	bp->b_bcount = ldbtob(cando);
3205 	if (count != cando)
3206 		return (cando);
3207 	return (0);
3208 }
3209 
3210 static void
3211 write_after_read(md_mps_t *ps)
3212 {
3213 	struct buf	*pb;
3214 	int		flags;
3215 
3216 	if (ps->ps_flags & MD_MPS_ERROR) {
3217 		mirror_error(ps);
3218 		return;
3219 	}
3220 
3221 	pb = ps->ps_bp;
3222 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3223 	ps->ps_call = NULL;
3224 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3225 	flags = MD_STR_NOTTOP | MD_STR_WAR;
3226 	if (ps->ps_flags & MD_MPS_MAPPED)
3227 		flags |= MD_STR_MAPPED;
3228 	if (ps->ps_flags & MD_MPS_NOBLOCK)
3229 		flags |= MD_NOBLOCK;
3230 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
3231 		flags |= MD_STR_DIRTY_RD;
3232 	(void) mirror_write_strategy(pb, flags, ps);
3233 }
3234 
3235 static void
3236 continue_serial(md_mps_t *ps)
3237 {
3238 	md_mcs_t	*cs;
3239 	buf_t		*cb;
3240 	mm_unit_t	*un;
3241 	int		flags;
3242 
3243 	un = ps->ps_un;
3244 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3245 	mirror_child_init(cs);
3246 	cb = &cs->cs_buf;
3247 	ps->ps_call = NULL;
3248 	ps->ps_frags = 1;
3249 	(void) mirror_map_write(un, cs, ps, 0);
3250 	flags = MD_STR_NOTTOP;
3251 	if (ps->ps_flags & MD_MPS_MAPPED)
3252 		flags |= MD_STR_MAPPED;
3253 	md_call_strategy(cb, flags, NULL);
3254 }
3255 
3256 static int
3257 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3258 {
3259 	int i;
3260 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
3261 	buf_t		*cb;
3262 	buf_t		*pb;
3263 	diskaddr_t	blkno;
3264 	size_t		bcount;
3265 	off_t		offset;
3266 
3267 	pb = ps->ps_bp;
3268 	cb = &cs->cs_buf;
3269 	cs->cs_ps = ps;
3270 
3271 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3272 
3273 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3274 
3275 	blkno = pb->b_lblkno;
3276 	bcount = pb->b_bcount;
3277 	offset = 0;
3278 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3279 		blkno = DK_LABEL_LOC + 1;
3280 		/*
3281 		 * This handles the case where we're requesting
3282 		 * a write to block 0 on a label partition
3283 		 * and the request size was smaller than the
3284 		 * size of the label.  If this is the case
3285 		 * then we'll return -1.  Failure to do so will
3286 		 * either cause the calling thread to hang due to
3287 		 * an ssd bug, or worse if the bcount were allowed
3288 		 * to go negative (ie large).
3289 		 */
3290 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3291 			return (-1);
3292 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3293 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3294 	}
3295 
3296 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3297 	    cb, KM_NOSLEEP);
3298 	if (war)
3299 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3300 
3301 	/*
3302 	 * If the submirror is in the erred stated, check if any component is
3303 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
3304 	 * flag on the IO.
3305 	 *
3306 	 * Provide a fast path for the non-erred case (which should be the
3307 	 * normal case).
3308 	 */
3309 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3310 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3311 			mm_submirror_t		*sm;
3312 			mm_submirror_ic_t	*smic;
3313 			int			ci;
3314 			int			compcnt;
3315 
3316 			sm = &un->un_sm[i];
3317 			smic = &un->un_smic[i];
3318 
3319 			compcnt = (*(smic->sm_get_component_count))
3320 			    (sm->sm_dev, un);
3321 			for (ci = 0; ci < compcnt; ci++) {
3322 				md_m_shared_t	*shared;
3323 
3324 				shared = (md_m_shared_t *)
3325 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
3326 				    sm, ci);
3327 
3328 				if (shared->ms_state == CS_LAST_ERRED)
3329 					break;
3330 			}
3331 			if (ci >= compcnt)
3332 				cb->b_flags |= B_FAILFAST;
3333 
3334 		} else {
3335 			cb->b_flags |= B_FAILFAST;
3336 		}
3337 	}
3338 
3339 	ps->ps_current_sm++;
3340 	if (ps->ps_current_sm != ps->ps_active_cnt) {
3341 		if (un->un_write_option == WR_SERIAL) {
3342 			ps->ps_call = continue_serial;
3343 			return (0);
3344 		}
3345 		return (1);
3346 	}
3347 	return (0);
3348 }
3349 
3350 /*
3351  * directed_read_done:
3352  * ------------------
3353  * Completion routine called when a DMR request has been returned from the
3354  * underlying driver. Wake-up the original ioctl() and return the data to
3355  * the user.
3356  */
3357 static void
3358 directed_read_done(md_mps_t *ps)
3359 {
3360 	mm_unit_t	*un;
3361 	mdi_unit_t	*ui;
3362 
3363 	un = ps->ps_un;
3364 	ui = ps->ps_ui;
3365 
3366 	md_unit_readerexit(ui);
3367 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3368 	ps->ps_call = NULL;
3369 
3370 	mutex_enter(&un->un_dmr_mx);
3371 	cv_signal(&un->un_dmr_cv);
3372 	mutex_exit(&un->un_dmr_mx);
3373 
3374 	/* release the parent structure */
3375 	kmem_cache_free(mirror_parent_cache, ps);
3376 }
3377 
3378 /*
3379  * daemon_io:
3380  * ------------
3381  * Called to issue a mirror_write_strategy() or mirror_read_strategy
3382  * call from a blockable context. NOTE: no mutex can be held on entry to this
3383  * routine
3384  */
3385 static void
3386 daemon_io(daemon_queue_t *dq)
3387 {
3388 	md_mps_t	*ps = (md_mps_t *)dq;
3389 	int		flag = MD_STR_NOTTOP;
3390 	buf_t		*pb = ps->ps_bp;
3391 
3392 	if (ps->ps_flags & MD_MPS_MAPPED)
3393 		flag |= MD_STR_MAPPED;
3394 	if (ps->ps_flags & MD_MPS_WOW)
3395 		flag |= MD_STR_WOW;
3396 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3397 		flag |= MD_STR_WAR;
3398 	if (ps->ps_flags & MD_MPS_ABR)
3399 		flag |= MD_STR_ABR;
3400 
3401 	/*
3402 	 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3403 	 * MD_STR_WAR before calling mirror_read_strategy
3404 	 */
3405 	if (pb->b_flags & B_READ) {
3406 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3407 			flag |= MD_STR_WAR;
3408 		mirror_read_strategy(pb, flag, ps);
3409 	} else
3410 		mirror_write_strategy(pb, flag, ps);
3411 }
3412 
3413 /*
3414  * update_resync:
3415  * -------------
3416  * Called to update the in-core version of the resync record with the latest
3417  * version that was committed to disk when the previous mirror owner
3418  * relinquished ownership. This call is likely to block as we must hold-off
3419  * any current resync processing that may be occurring.
3420  * On completion of the resync record update we issue the mirror_write_strategy
3421  * call to complete the i/o that first started this sequence. To remove a race
3422  * condition between a new write() request which is submitted and the resync
3423  * record update we acquire the writerlock. This will hold off all i/o to the
3424  * mirror until the resync update has completed.
3425  * NOTE: no mutex can be held on entry to this routine
3426  */
3427 static void
3428 update_resync(daemon_queue_t *dq)
3429 {
3430 	md_mps_t	*ps = (md_mps_t *)dq;
3431 	buf_t		*pb = ps->ps_bp;
3432 	mdi_unit_t	*ui = ps->ps_ui;
3433 	mm_unit_t	*un;
3434 	set_t		setno;
3435 	int		restart_resync;
3436 
3437 	un = md_unit_writerlock(ui);
3438 	ps->ps_un = un;
3439 	setno = MD_MIN2SET(getminor(pb->b_edev));
3440 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3441 		/*
3442 		 * Synchronize our in-core view of what regions need to be
3443 		 * resync'd with the on-disk version.
3444 		 */
3445 		mutex_enter(&un->un_rrp_inflight_mx);
3446 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3447 		    un->un_dirty_bm);
3448 		mutex_exit(&un->un_rrp_inflight_mx);
3449 
3450 		/* Region dirty map is now up to date */
3451 	}
3452 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3453 	md_unit_writerexit(ui);
3454 
3455 	/* Restart the resync thread if it was previously blocked */
3456 	if (restart_resync) {
3457 		mutex_enter(&un->un_rs_thread_mx);
3458 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3459 		cv_signal(&un->un_rs_thread_cv);
3460 		mutex_exit(&un->un_rs_thread_mx);
3461 	}
3462 	/* Continue with original deferred i/o */
3463 	daemon_io(dq);
3464 }
3465 
3466 /*
3467  * owner_timeout:
3468  * -------------
3469  * Called if the original mdmn_ksend_message() failed and the request is to be
3470  * retried. Reattempt the original ownership change.
3471  *
3472  * NOTE: called at interrupt context (see timeout(9f)).
3473  */
3474 static void
3475 owner_timeout(void *arg)
3476 {
3477 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
3478 
3479 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3480 }
3481 
3482 /*
3483  * become_owner:
3484  * ------------
3485  * Called to issue RPC request to become the owner of the mirror
3486  * associated with this i/o request. We assume that the ownership request
3487  * is synchronous, so if it succeeds we will issue the request via
3488  * mirror_write_strategy().
3489  * If multiple i/o's are outstanding we will be called from the mirror_daemon
3490  * service thread.
3491  * NOTE: no mutex should be held on entry to this routine.
3492  */
3493 static void
3494 become_owner(daemon_queue_t *dq)
3495 {
3496 	md_mps_t	*ps = (md_mps_t *)dq;
3497 	mm_unit_t	*un = ps->ps_un;
3498 	buf_t		*pb = ps->ps_bp;
3499 	set_t		setno;
3500 	md_mn_kresult_t	*kres;
3501 	int		msg_flags = md_mirror_msg_flags;
3502 	md_mps_t	*ps1;
3503 
3504 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3505 
3506 	/*
3507 	 * If we're already the mirror owner we do not need to send a message
3508 	 * but can simply process the i/o request immediately.
3509 	 * If we've already sent the request to become owner we requeue the
3510 	 * request as we're waiting for the synchronous ownership message to
3511 	 * be processed.
3512 	 */
3513 	if (MD_MN_MIRROR_OWNER(un)) {
3514 		/*
3515 		 * As the strategy() call will potentially block we need to
3516 		 * punt this to a separate thread and complete this request
3517 		 * as quickly as possible. Note: if we're a read request
3518 		 * this must be a resync, we cannot afford to be queued
3519 		 * behind any intervening i/o requests. In this case we put the
3520 		 * request on the md_mirror_rs_daemon queue.
3521 		 */
3522 		if (pb->b_flags & B_READ) {
3523 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3524 			    REQ_OLD);
3525 		} else {
3526 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3527 			    REQ_OLD);
3528 		}
3529 	} else {
3530 		mutex_enter(&un->un_owner_mx);
3531 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3532 			md_mn_req_owner_t	*msg;
3533 			int			rval = 0;
3534 
3535 			/*
3536 			 * Check to see that we haven't exceeded the maximum
3537 			 * retry count. If we have we fail the i/o as the
3538 			 * comms mechanism has become wedged beyond recovery.
3539 			 */
3540 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
3541 				mutex_exit(&un->un_owner_mx);
3542 				cmn_err(CE_WARN,
3543 				    "md_mirror: Request exhausted ownership "
3544 				    "retry limit of %d attempts", dq->qlen);
3545 				pb->b_error = EIO;
3546 				pb->b_flags |= B_ERROR;
3547 				pb->b_resid = pb->b_bcount;
3548 				kmem_cache_free(mirror_parent_cache, ps);
3549 				md_biodone(pb);
3550 				return;
3551 			}
3552 
3553 			/*
3554 			 * Issue request to change ownership. The call is
3555 			 * synchronous so when it returns we can complete the
3556 			 * i/o (if successful), or enqueue it again so that
3557 			 * the operation will be retried.
3558 			 */
3559 			un->un_owner_state |= MM_MN_OWNER_SENT;
3560 			mutex_exit(&un->un_owner_mx);
3561 
3562 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3563 			setno = MD_MIN2SET(getminor(pb->b_edev));
3564 			msg->mnum = MD_SID(un);
3565 			msg->owner = md_mn_mynode_id;
3566 			msg_flags |= MD_MSGF_NO_LOG;
3567 			/*
3568 			 * If this IO is triggered by updating a watermark,
3569 			 * it might be issued by the creation of a softpartition
3570 			 * while the commd subsystem is suspended.
3571 			 * We don't want this message to block.
3572 			 */
3573 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
3574 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3575 			}
3576 
3577 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3578 			rval = mdmn_ksend_message(setno,
3579 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags,
3580 			    /* flags */ (char *)msg,
3581 			    sizeof (md_mn_req_owner_t), kres);
3582 
3583 			kmem_free(msg, sizeof (md_mn_req_owner_t));
3584 
3585 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
3586 				dq->qlen = 0;
3587 				/*
3588 				 * Successfully changed owner, reread the
3589 				 * resync record so that we have a valid idea of
3590 				 * any previously committed incomplete write()s.
3591 				 * NOTE: As we need to acquire the resync mutex
3592 				 * this may block, so we defer it to a separate
3593 				 * thread handler. This makes us (effectively)
3594 				 * non-blocking once the ownership message
3595 				 * handling has completed.
3596 				 */
3597 				mutex_enter(&un->un_owner_mx);
3598 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3599 					un->un_mirror_owner = md_mn_mynode_id;
3600 					/* Sets owner of un_rr_dirty record */
3601 					if (un->un_rr_dirty_recid)
3602 						(void) mddb_setowner(
3603 						    un->un_rr_dirty_recid,
3604 						    md_mn_mynode_id);
3605 					un->un_owner_state &=
3606 					    ~MM_MN_BECOME_OWNER;
3607 					/*
3608 					 * Release the block on the current
3609 					 * resync region if it is blocked
3610 					 */
3611 					ps1 = un->un_rs_prev_overlap;
3612 					if ((ps1 != NULL) &&
3613 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3614 						mirror_overlap_tree_remove(ps1);
3615 					mutex_exit(&un->un_owner_mx);
3616 
3617 					/*
3618 					 * If we're a read, this must be a
3619 					 * resync request, issue
3620 					 * the i/o request on the
3621 					 * md_mirror_rs_daemon queue. This is
3622 					 * to avoid a deadlock between the
3623 					 * resync_unit thread and
3624 					 * subsequent i/o requests that may
3625 					 * block on the resync region.
3626 					 */
3627 					if (pb->b_flags & B_READ) {
3628 						daemon_request(
3629 						    &md_mirror_rs_daemon,
3630 						    update_resync, dq, REQ_OLD);
3631 					} else {
3632 						daemon_request(
3633 						    &md_mirror_io_daemon,
3634 						    update_resync, dq, REQ_OLD);
3635 					}
3636 					kmem_free(kres,
3637 					    sizeof (md_mn_kresult_t));
3638 					return;
3639 				} else {
3640 					/*
3641 					 * Some other node has beaten us to
3642 					 * obtain ownership. We need to
3643 					 * reschedule our ownership request
3644 					 */
3645 					mutex_exit(&un->un_owner_mx);
3646 				}
3647 			} else {
3648 				mdmn_ksend_show_error(rval, kres,
3649 				    "MD_MN_MSG_REQUIRE_OWNER");
3650 				/*
3651 				 * Message transport failure is handled by the
3652 				 * comms layer. If the ownership change request
3653 				 * does not succeed we need to flag the error to
3654 				 * the initiator of the i/o. This is handled by
3655 				 * the retry logic above. As the request failed
3656 				 * we do not know _who_ the owner of the mirror
3657 				 * currently is. We reset our idea of the owner
3658 				 * to None so that any further write()s will
3659 				 * attempt to become the owner again. This stops
3660 				 * multiple nodes writing to the same mirror
3661 				 * simultaneously.
3662 				 */
3663 				mutex_enter(&un->un_owner_mx);
3664 				un->un_owner_state &=
3665 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3666 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3667 				mutex_exit(&un->un_owner_mx);
3668 			}
3669 			kmem_free(kres, sizeof (md_mn_kresult_t));
3670 		} else
3671 			mutex_exit(&un->un_owner_mx);
3672 
3673 		/*
3674 		 * Re-enqueue this request on the deferred i/o list. Delay the
3675 		 * request for md_mirror_owner_to usecs to stop thrashing.
3676 		 */
3677 		(void) timeout(owner_timeout, dq,
3678 		    drv_usectohz(md_mirror_owner_to));
3679 	}
3680 }
3681 
3682 static void
3683 mirror_write_strategy(buf_t *pb, int flag, void *private)
3684 {
3685 	md_mps_t	*ps;
3686 	md_mcs_t	*cs;
3687 	int		more;
3688 	mm_unit_t	*un;
3689 	mdi_unit_t	*ui;
3690 	buf_t		*cb;		/* child buf pointer */
3691 	set_t		setno;
3692 	int		rs_on_overlap = 0;
3693 
3694 	ui = MDI_UNIT(getminor(pb->b_edev));
3695 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3696 
3697 
3698 	md_kstat_waitq_enter(ui);
3699 
3700 	/*
3701 	 * If a state change is in progress for this mirror in a MN set,
3702 	 * suspend all non-resync writes until the state change is complete.
3703 	 * The objective of this suspend is to ensure that it is not
3704 	 * possible for one node to read data from a submirror that another node
3705 	 * has not written to because of the state change. Therefore we
3706 	 * suspend all writes until the state change has been made. As it is
3707 	 * not possible to read from the target of a resync, there is no need
3708 	 * to suspend resync writes.
3709 	 */
3710 
3711 	if (!(flag & MD_STR_WAR)) {
3712 		mutex_enter(&un->un_suspend_wr_mx);
3713 		while (un->un_suspend_wr_flag) {
3714 			cv_wait(&un->un_suspend_wr_cv, &un->un_suspend_wr_mx);
3715 		}
3716 		mutex_exit(&un->un_suspend_wr_mx);
3717 		(void) md_unit_readerlock(ui);
3718 	}
3719 
3720 	if (!(flag & MD_STR_NOTTOP)) {
3721 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3722 			md_kstat_waitq_exit(ui);
3723 			return;
3724 		}
3725 	}
3726 
3727 	setno = MD_MIN2SET(getminor(pb->b_edev));
3728 
3729 	/* If an ABR write has been requested, set MD_STR_ABR flag */
3730 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3731 		flag |= MD_STR_ABR;
3732 
3733 	if (private == NULL) {
3734 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3735 		mirror_parent_init(ps);
3736 	} else {
3737 		ps = private;
3738 		private = NULL;
3739 	}
3740 	if (flag & MD_STR_MAPPED)
3741 		ps->ps_flags |= MD_MPS_MAPPED;
3742 
3743 	if (flag & MD_STR_WOW)
3744 		ps->ps_flags |= MD_MPS_WOW;
3745 
3746 	if (flag & MD_STR_ABR)
3747 		ps->ps_flags |= MD_MPS_ABR;
3748 
3749 	if (flag & MD_STR_WMUPDATE)
3750 		ps->ps_flags |= MD_MPS_WMUPDATE;
3751 
3752 	/*
3753 	 * Save essential information from the original buffhdr
3754 	 * in the md_save structure.
3755 	 */
3756 	ps->ps_un = un;
3757 	ps->ps_ui = ui;
3758 	ps->ps_bp = pb;
3759 	ps->ps_addr = pb->b_un.b_addr;
3760 	ps->ps_firstblk = pb->b_lblkno;
3761 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3762 	ps->ps_changecnt = un->un_changecnt;
3763 
3764 	/*
3765 	 * If not MN owner and this is an ABR write, make sure the current
3766 	 * resync region is in the overlaps tree
3767 	 */
3768 	mutex_enter(&un->un_owner_mx);
3769 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3770 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3771 		md_mps_t	*ps1;
3772 		/* Block the current resync region, if not already blocked */
3773 		ps1 = un->un_rs_prev_overlap;
3774 
3775 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3776 		    (ps1->ps_lastblk != 0))) {
3777 			/* Drop locks to avoid deadlock */
3778 			mutex_exit(&un->un_owner_mx);
3779 			md_unit_readerexit(ui);
3780 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3781 			rs_on_overlap = 1;
3782 			(void) md_unit_readerlock(ui);
3783 			mutex_enter(&un->un_owner_mx);
3784 			/*
3785 			 * Check to see if we have obtained ownership
3786 			 * while waiting for overlaps. If we have, remove
3787 			 * the resync_region entry from the overlap tree
3788 			 */
3789 			if (MD_MN_MIRROR_OWNER(un) &&
3790 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3791 				mirror_overlap_tree_remove(ps1);
3792 				rs_on_overlap = 0;
3793 			}
3794 		}
3795 	}
3796 	mutex_exit(&un->un_owner_mx);
3797 
3798 
3799 	/*
3800 	 * following keep write after read from writing to the
3801 	 * source in the case where it all came from one place
3802 	 */
3803 	if (flag & MD_STR_WAR) {
3804 		int	abort_write = 0;
3805 		/*
3806 		 * We are perfoming a write-after-read. This is either as a
3807 		 * result of a resync read or as a result of a read in a
3808 		 * dirty resync region when the optimized resync is not
3809 		 * complete. If in a MN set and a resync generated i/o,
3810 		 * if the current block is not in the current
3811 		 * resync region terminate the write as another node must have
3812 		 * completed this resync region
3813 		 */
3814 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
3815 		    (!flag & MD_STR_DIRTY_RD)) {
3816 			if (!IN_RESYNC_REGION(un, ps))
3817 				abort_write = 1;
3818 		}
3819 		if ((select_write_after_read_units(un, ps) == 0) ||
3820 		    (abort_write)) {
3821 #ifdef DEBUG
3822 			if (mirror_debug_flag)
3823 				printf("Abort resync write on %x, block %lld\n",
3824 				    MD_SID(un), ps->ps_firstblk);
3825 #endif
3826 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3827 				mirror_overlap_tree_remove(ps);
3828 			kmem_cache_free(mirror_parent_cache, ps);
3829 			md_kstat_waitq_exit(ui);
3830 			md_unit_readerexit(ui);
3831 			md_biodone(pb);
3832 			return;
3833 		}
3834 	} else {
3835 		select_write_units(un, ps);
3836 
3837 		/* Drop readerlock to avoid deadlock */
3838 		md_unit_readerexit(ui);
3839 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
3840 		un = md_unit_readerlock(ui);
3841 		/*
3842 		 * For a MN set with an ABR write, if we are now the
3843 		 * owner and we have a resync region in the overlap
3844 		 * tree, remove the entry from overlaps and retry the write.
3845 		 */
3846 
3847 		if (MD_MNSET_SETNO(setno) &&
3848 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3849 			mutex_enter(&un->un_owner_mx);
3850 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
3851 				mirror_overlap_tree_remove(ps);
3852 				md_kstat_waitq_exit(ui);
3853 				mutex_exit(&un->un_owner_mx);
3854 				md_unit_readerexit(ui);
3855 				daemon_request(&md_mirror_daemon, daemon_io,
3856 				    (daemon_queue_t *)ps, REQ_OLD);
3857 				return;
3858 			}
3859 			mutex_exit(&un->un_owner_mx);
3860 		}
3861 	}
3862 
3863 	/*
3864 	 * For Multinode mirrors with a Resync Region (not ABR) we need to
3865 	 * become the mirror owner before continuing with the write(). For ABR
3866 	 * mirrors we check that we 'own' the resync if we're in
3867 	 * write-after-read mode. We do this _after_ ensuring that there are no
3868 	 * overlaps to ensure that the once we know that we are the owner, the
3869 	 * readerlock will not released until the write is complete. As a
3870 	 * change of ownership in a MN set requires the writerlock, this
3871 	 * ensures that ownership cannot be changed until the write is
3872 	 * complete
3873 	 */
3874 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
3875 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
3876 		if (!MD_MN_MIRROR_OWNER(un))  {
3877 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3878 				mirror_overlap_tree_remove(ps);
3879 			md_kstat_waitq_exit(ui);
3880 			ASSERT(!(flag & MD_STR_WAR));
3881 			md_unit_readerexit(ui);
3882 			daemon_request(&md_mirror_daemon, become_owner,
3883 			    (daemon_queue_t *)ps, REQ_OLD);
3884 			return;
3885 		}
3886 	}
3887 
3888 	/*
3889 	 * Mark resync region if mirror has a Resync Region _and_ we are not
3890 	 * a resync initiated write(). Don't mark region if we're flagged as
3891 	 * an ABR write.
3892 	 */
3893 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
3894 	    !(flag & MD_STR_WAR)) {
3895 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
3896 		    ps->ps_lastblk)) {
3897 			pb->b_flags |= B_ERROR;
3898 			pb->b_resid = pb->b_bcount;
3899 			ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3900 			kmem_cache_free(mirror_parent_cache, ps);
3901 			md_kstat_waitq_exit(ui);
3902 			md_unit_readerexit(ui);
3903 			md_biodone(pb);
3904 			return;
3905 		}
3906 	}
3907 
3908 	ps->ps_childbflags = pb->b_flags | B_WRITE;
3909 	ps->ps_childbflags &= ~B_READ;
3910 	if (flag & MD_STR_MAPPED)
3911 		ps->ps_childbflags &= ~B_PAGEIO;
3912 
3913 	if (!(flag & MD_STR_NOTTOP) && panicstr)
3914 		/* Disable WOW and don't free ps */
3915 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
3916 
3917 	md_kstat_waitq_to_runq(ui);
3918 
3919 	/*
3920 	 * Treat Raw and Direct I/O as Write-on-Write always
3921 	 */
3922 
3923 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3924 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
3925 	    (pb->b_flags & B_PHYS) &&
3926 	    !(ps->ps_flags & MD_MPS_WOW)) {
3927 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3928 			mirror_overlap_tree_remove(ps);
3929 		md_unit_readerexit(ui);
3930 		daemon_request(&md_mstr_daemon, handle_wow,
3931 		    (daemon_queue_t *)ps, REQ_OLD);
3932 		return;
3933 	}
3934 
3935 	ps->ps_frags = 1;
3936 	do {
3937 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3938 		mirror_child_init(cs);
3939 		cb = &cs->cs_buf;
3940 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
3941 
3942 		/*
3943 		 * This handles the case where we're requesting
3944 		 * a write to block 0 on a label partition.  (more < 0)
3945 		 * means that the request size was smaller than the
3946 		 * size of the label.  If so this request is done.
3947 		 */
3948 		if (more < 0) {
3949 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3950 				mirror_overlap_tree_remove(ps);
3951 			md_kstat_runq_exit(ui);
3952 			kmem_cache_free(mirror_child_cache, cs);
3953 			kmem_cache_free(mirror_parent_cache, ps);
3954 			md_unit_readerexit(ui);
3955 			md_biodone(pb);
3956 			return;
3957 		}
3958 		if (more) {
3959 			mutex_enter(&ps->ps_mx);
3960 			ps->ps_frags++;
3961 			mutex_exit(&ps->ps_mx);
3962 		}
3963 		md_call_strategy(cb, flag, private);
3964 	} while (more);
3965 
3966 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
3967 		while (!(ps->ps_flags & MD_MPS_DONE)) {
3968 			md_daemon(1, &md_done_daemon);
3969 			drv_usecwait(10);
3970 		}
3971 		kmem_cache_free(mirror_parent_cache, ps);
3972 	}
3973 }
3974 
3975 static void
3976 mirror_read_strategy(buf_t *pb, int flag, void *private)
3977 {
3978 	md_mps_t	*ps;
3979 	md_mcs_t	*cs;
3980 	size_t		more;
3981 	mm_unit_t	*un;
3982 	mdi_unit_t	*ui;
3983 	size_t		current_count;
3984 	diskaddr_t	current_blkno;
3985 	off_t		current_offset;
3986 	buf_t		*cb;		/* child buf pointer */
3987 	set_t		setno;
3988 
3989 	ui = MDI_UNIT(getminor(pb->b_edev));
3990 
3991 	md_kstat_waitq_enter(ui);
3992 
3993 	un = (mm_unit_t *)md_unit_readerlock(ui);
3994 
3995 	if (!(flag & MD_STR_NOTTOP)) {
3996 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3997 			md_kstat_waitq_exit(ui);
3998 			return;
3999 		}
4000 	}
4001 
4002 	if (private == NULL) {
4003 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4004 		mirror_parent_init(ps);
4005 	} else {
4006 		ps = private;
4007 		private = NULL;
4008 	}
4009 
4010 	if (flag & MD_STR_MAPPED)
4011 		ps->ps_flags |= MD_MPS_MAPPED;
4012 	if (flag & MD_NOBLOCK)
4013 		ps->ps_flags |= MD_MPS_NOBLOCK;
4014 	if (flag & MD_STR_WMUPDATE)
4015 		ps->ps_flags |= MD_MPS_WMUPDATE;
4016 
4017 	/*
4018 	 * Check to see if this is a DMR driven read. If so we need to use the
4019 	 * specified side (in un->un_dmr_last_read) for the source of the data.
4020 	 */
4021 	if (flag & MD_STR_DMR)
4022 		ps->ps_flags |= MD_MPS_DMR;
4023 
4024 	/*
4025 	 * Save essential information from the original buffhdr
4026 	 * in the md_save structure.
4027 	 */
4028 	ps->ps_un = un;
4029 	ps->ps_ui = ui;
4030 	ps->ps_bp = pb;
4031 	ps->ps_addr = pb->b_un.b_addr;
4032 	ps->ps_firstblk = pb->b_lblkno;
4033 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4034 	ps->ps_changecnt = un->un_changecnt;
4035 
4036 	current_count = btodb(pb->b_bcount);
4037 	current_blkno = pb->b_lblkno;
4038 	current_offset = 0;
4039 
4040 	/*
4041 	 * If flag has MD_STR_WAR set this means that the read is issued by a
4042 	 * resync thread which may or may not be an optimised resync.
4043 	 *
4044 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4045 	 * code has not completed; either a resync has not started since snarf,
4046 	 * or there is an optimized resync in progress.
4047 	 *
4048 	 * We need to generate a write after this read in the following two
4049 	 * cases,
4050 	 *
4051 	 * 1. Any Resync-Generated read
4052 	 *
4053 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
4054 	 *    pending or in progress.
4055 	 *
4056 	 * The write after read is done in these cases to ensure that all sides
4057 	 * of the mirror are in sync with the read data and that it is not
4058 	 * possible for an application to read the same block multiple times
4059 	 * and get different data.
4060 	 *
4061 	 * This would be possible if the block was in a dirty region.
4062 	 *
4063 	 * If we're performing a directed read we don't write the data out as
4064 	 * the application is responsible for restoring the mirror to a known
4065 	 * state.
4066 	 */
4067 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4068 	    !(flag & MD_STR_DMR)) {
4069 		size_t	start_rr, i, end_rr;
4070 		int	region_dirty = 1;
4071 
4072 		/*
4073 		 * We enter here under three circumstances,
4074 		 *
4075 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
4076 		 * 0			1
4077 		 * 1			0
4078 		 * 1			1
4079 		 *
4080 		 * To be optimal we only care to explicitly check for dirty
4081 		 * regions in the second case since if MD_STR_WAR is set we
4082 		 * always do the write after read.
4083 		 */
4084 		if (!(flag & MD_STR_WAR)) {
4085 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4086 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4087 
4088 			for (i = start_rr; i <= end_rr; i++)
4089 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4090 					break;
4091 		}
4092 
4093 		if ((region_dirty) &&
4094 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4095 			ps->ps_call = write_after_read;
4096 			/*
4097 			 * Mark this as a RESYNC_READ in ps_flags.
4098 			 * This is used if the read fails during a
4099 			 * resync of a 3-way mirror to ensure that
4100 			 * the retried read to the remaining
4101 			 * good submirror has MD_STR_WAR set. This
4102 			 * is needed to ensure that the resync write
4103 			 * (write-after-read) takes place.
4104 			 */
4105 			ps->ps_flags |= MD_MPS_RESYNC_READ;
4106 
4107 			/*
4108 			 * If MD_STR_FLAG_ERR is set in the flags we
4109 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
4110 			 * write (issued by write_after_read) will be flagged
4111 			 * to the biowait'ing resync thread. This allows us to
4112 			 * avoid issuing further resync requests to a device
4113 			 * that has had a write failure.
4114 			 */
4115 			if (flag & MD_STR_FLAG_ERR)
4116 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
4117 
4118 			setno = MD_UN2SET(un);
4119 			/*
4120 			 * Drop the readerlock to avoid
4121 			 * deadlock
4122 			 */
4123 			md_unit_readerexit(ui);
4124 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4125 			un = md_unit_readerlock(ui);
4126 			/*
4127 			 * Ensure that we are owner
4128 			 */
4129 			if (MD_MNSET_SETNO(setno)) {
4130 				/*
4131 				 * For a non-resync read that requires a
4132 				 * write-after-read to be done, set a flag
4133 				 * in the parent structure, so that the
4134 				 * write_strategy routine can omit the
4135 				 * test that the write is still within the
4136 				 * resync region
4137 				 */
4138 				if (!(flag & MD_STR_WAR))
4139 					ps->ps_flags |= MD_MPS_DIRTY_RD;
4140 
4141 				/*
4142 				 * Before reading the buffer, see if
4143 				 * we are the owner
4144 				 */
4145 				if (!MD_MN_MIRROR_OWNER(un))  {
4146 					ps->ps_call = NULL;
4147 					mirror_overlap_tree_remove(ps);
4148 					md_kstat_waitq_exit(ui);
4149 					md_unit_readerexit(ui);
4150 					daemon_request(
4151 					    &md_mirror_daemon,
4152 					    become_owner,
4153 					    (daemon_queue_t *)ps,
4154 					    REQ_OLD);
4155 					return;
4156 				}
4157 				/*
4158 				 * For a resync read, check to see if I/O is
4159 				 * outside of the current resync region, or
4160 				 * the resync has finished. If so
4161 				 * just terminate the I/O
4162 				 */
4163 				if ((flag & MD_STR_WAR) &&
4164 				    (!(un->c.un_status & MD_UN_WAR) ||
4165 				    (!IN_RESYNC_REGION(un, ps)))) {
4166 #ifdef DEBUG
4167 					if (mirror_debug_flag)
4168 						printf("Abort resync read "
4169 						    "%x: %lld\n",
4170 						    MD_SID(un),
4171 						    ps->ps_firstblk);
4172 #endif
4173 					mirror_overlap_tree_remove(ps);
4174 					kmem_cache_free(mirror_parent_cache,
4175 					    ps);
4176 					md_kstat_waitq_exit(ui);
4177 					md_unit_readerexit(ui);
4178 					md_biodone(pb);
4179 					return;
4180 				}
4181 			}
4182 		}
4183 	}
4184 
4185 	if (flag & MD_STR_DMR) {
4186 		ps->ps_call = directed_read_done;
4187 	}
4188 
4189 	if (!(flag & MD_STR_NOTTOP) && panicstr)
4190 		ps->ps_flags |= MD_MPS_DONTFREE;
4191 
4192 	md_kstat_waitq_to_runq(ui);
4193 
4194 	ps->ps_frags++;
4195 	do {
4196 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4197 		mirror_child_init(cs);
4198 		cb = &cs->cs_buf;
4199 		cs->cs_ps = ps;
4200 
4201 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
4202 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
4203 
4204 		more = mirror_map_read(ps, cs, current_blkno,
4205 		    (u_longlong_t)current_count);
4206 		if (more) {
4207 			mutex_enter(&ps->ps_mx);
4208 			ps->ps_frags++;
4209 			mutex_exit(&ps->ps_mx);
4210 		}
4211 
4212 		/*
4213 		 * Do these calculations now,
4214 		 *  so that we pickup a valid b_bcount from the chld_bp.
4215 		 */
4216 		current_count -= more;
4217 		current_offset += cb->b_bcount;
4218 		current_blkno +=  more;
4219 		md_call_strategy(cb, flag, private);
4220 	} while (more);
4221 
4222 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
4223 		while (!(ps->ps_flags & MD_MPS_DONE)) {
4224 			md_daemon(1, &md_done_daemon);
4225 			drv_usecwait(10);
4226 		}
4227 		kmem_cache_free(mirror_parent_cache, ps);
4228 	}
4229 }
4230 
4231 void
4232 md_mirror_strategy(buf_t *bp, int flag, void *private)
4233 {
4234 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
4235 
4236 	/*
4237 	 * When doing IO to a multi owner meta device, check if set is halted.
4238 	 * We do this check without the needed lock held, for performance
4239 	 * reasons.
4240 	 * If an IO just slips through while the set is locked via an
4241 	 * MD_MN_SUSPEND_SET, we don't care about it.
4242 	 * Only check for suspension if we are a top-level i/o request
4243 	 * (MD_STR_NOTTOP is cleared in 'flag').
4244 	 */
4245 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4246 	    (MD_SET_HALTED | MD_SET_MNSET)) {
4247 		if ((flag & MD_STR_NOTTOP) == 0) {
4248 			mutex_enter(&md_mx);
4249 			/* Here we loop until the set is no longer halted */
4250 			while (md_set[setno].s_status & MD_SET_HALTED) {
4251 				cv_wait(&md_cv, &md_mx);
4252 			}
4253 			mutex_exit(&md_mx);
4254 		}
4255 	}
4256 
4257 	if ((flag & MD_IO_COUNTED) == 0) {
4258 		if ((flag & MD_NOBLOCK) == 0) {
4259 			if (md_inc_iocount(setno) != 0) {
4260 				bp->b_flags |= B_ERROR;
4261 				bp->b_error = ENXIO;
4262 				bp->b_resid = bp->b_bcount;
4263 				biodone(bp);
4264 				return;
4265 			}
4266 		} else {
4267 			md_inc_iocount_noblock(setno);
4268 		}
4269 	}
4270 
4271 	if (bp->b_flags & B_READ)
4272 		mirror_read_strategy(bp, flag, private);
4273 	else
4274 		mirror_write_strategy(bp, flag, private);
4275 }
4276 
4277 /*
4278  * mirror_directed_read:
4279  * --------------------
4280  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4281  * so that the application can determine what (if any) resync needs to be
4282  * performed. The data is copied out to the user-supplied buffer.
4283  *
4284  * Parameters:
4285  *	mdev	- dev_t for the mirror device
4286  *	vdr	- directed read parameters specifying location and submirror
4287  *		  to perform the read from
4288  *	mode	- used to ddi_copyout() any resulting data from the read
4289  *
4290  * Returns:
4291  *	0	success
4292  *	!0	error code
4293  *		EINVAL - invalid request format
4294  */
4295 int
4296 mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4297 {
4298 	buf_t		*bp;
4299 	minor_t		mnum = getminor(mdev);
4300 	mdi_unit_t	*ui = MDI_UNIT(mnum);
4301 	mm_unit_t	*un;
4302 	mm_submirror_t	*sm;
4303 	char		*sm_nm;
4304 	uint_t		next_side;
4305 	void		*kbuffer;
4306 
4307 	if (ui == NULL)
4308 		return (ENXIO);
4309 
4310 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4311 		return (EINVAL);
4312 	}
4313 
4314 	/* Check for aligned block access. We disallow non-aligned requests. */
4315 	if (vdr->vdr_offset % DEV_BSIZE) {
4316 		return (EINVAL);
4317 	}
4318 
4319 	/*
4320 	 * Allocate kernel buffer for target of read(). If we had a reliable
4321 	 * (sorry functional) DDI this wouldn't be needed.
4322 	 */
4323 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4324 	if (kbuffer == NULL) {
4325 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4326 		    " bytes\n", vdr->vdr_nbytes);
4327 		return (ENOMEM);
4328 	}
4329 
4330 	bp = getrbuf(KM_SLEEP);
4331 
4332 	bp->b_un.b_addr = kbuffer;
4333 	bp->b_flags = B_READ;
4334 	bp->b_bcount = vdr->vdr_nbytes;
4335 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
4336 	bp->b_edev = mdev;
4337 
4338 	un = md_unit_readerlock(ui);
4339 
4340 	/*
4341 	 * If DKV_SIDE_INIT is set we need to determine the first available
4342 	 * side to start reading from. If it isn't set we increment to the
4343 	 * next readable submirror.
4344 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4345 	 * Note: we check for a readable submirror on completion of the i/o so
4346 	 * we should _always_ have one available. If this becomes unavailable
4347 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4348 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
4349 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4350 	 * The chance of this is small, but not non-existent.
4351 	 */
4352 	if (vdr->vdr_side == DKV_SIDE_INIT) {
4353 		next_side = 0;
4354 	} else {
4355 		next_side = vdr->vdr_side + 1;
4356 	}
4357 	while ((next_side < NMIRROR) &&
4358 	    !SUBMIRROR_IS_READABLE(un, next_side))
4359 		next_side++;
4360 	if (next_side >= NMIRROR) {
4361 		vdr->vdr_flags |= DKV_DMR_ERROR;
4362 		freerbuf(bp);
4363 		vdr->vdr_bytesread = 0;
4364 		md_unit_readerexit(ui);
4365 		return (0);
4366 	}
4367 
4368 	/* Set the side to read from */
4369 	un->un_dmr_last_read = next_side;
4370 
4371 	md_unit_readerexit(ui);
4372 
4373 	/*
4374 	 * Save timestamp for verification purposes. Can be read by debugger
4375 	 * to verify that this ioctl has been executed and to find the number
4376 	 * of DMR reads and the time of the last DMR read.
4377 	 */
4378 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
4379 	mirror_dmr_stats.dmr_count++;
4380 
4381 	/* Issue READ request and wait for completion */
4382 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4383 
4384 	mutex_enter(&un->un_dmr_mx);
4385 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4386 	mutex_exit(&un->un_dmr_mx);
4387 
4388 	/*
4389 	 * Check to see if we encountered an error during the read. If so we
4390 	 * can make no guarantee about any possibly returned data.
4391 	 */
4392 	if ((bp->b_flags & B_ERROR) == 0) {
4393 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
4394 		if (bp->b_resid) {
4395 			vdr->vdr_flags |= DKV_DMR_SHORT;
4396 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4397 		} else {
4398 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
4399 			vdr->vdr_bytesread = vdr->vdr_nbytes;
4400 		}
4401 		/* Copy the data read back out to the user supplied buffer */
4402 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4403 		    mode)) {
4404 			kmem_free(kbuffer, vdr->vdr_nbytes);
4405 			return (EFAULT);
4406 		}
4407 
4408 	} else {
4409 		/* Error out with DKV_DMR_ERROR */
4410 		vdr->vdr_flags |= DKV_DMR_ERROR;
4411 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4412 	}
4413 	/*
4414 	 * Update the DMR parameters with the side and name of submirror that
4415 	 * we have just read from (un->un_dmr_last_read)
4416 	 */
4417 	un = md_unit_readerlock(ui);
4418 
4419 	vdr->vdr_side = un->un_dmr_last_read;
4420 	sm = &un->un_sm[un->un_dmr_last_read];
4421 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
4422 
4423 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4424 
4425 	/*
4426 	 * Determine if we've completed the read cycle. This is true iff the
4427 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4428 	 * use un_nsm as we need to handle a sparse array of submirrors (which
4429 	 * can occur if a submirror is metadetached).
4430 	 */
4431 	next_side = un->un_dmr_last_read + 1;
4432 	while ((next_side < NMIRROR) &&
4433 	    !SUBMIRROR_IS_READABLE(un, next_side))
4434 		next_side++;
4435 	if (next_side >= NMIRROR) {
4436 		/* We've finished */
4437 		vdr->vdr_flags |= DKV_DMR_DONE;
4438 	}
4439 
4440 	md_unit_readerexit(ui);
4441 	freerbuf(bp);
4442 	kmem_free(kbuffer, vdr->vdr_nbytes);
4443 
4444 	return (0);
4445 }
4446 
4447 /*
4448  * mirror_resync_message:
4449  * ---------------------
4450  * Handle the multi-node resync messages that keep all nodes within a given
4451  * disk-set in sync with their view of a mirror's resync status.
4452  *
4453  * The message types dealt with are:
4454  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
4455  * MD_MN_MSG_RESYNC_NEXT	- specified next region to be resynced
4456  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
4457  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase, opt, submirror or comp
4458  *
4459  * Returns:
4460  *	0	Success
4461  *	>0	Failure error number
4462  */
4463 int
4464 mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4465 {
4466 	mdi_unit_t		*ui;
4467 	mm_unit_t		*un;
4468 	set_t			setno;
4469 	int			is_ABR;
4470 	int			smi;
4471 	int			ci;
4472 	sm_state_t		state;
4473 	int			broke_out;
4474 	mm_submirror_t		*sm;
4475 	mm_submirror_ic_t	*smic;
4476 	md_m_shared_t		*shared;
4477 	md_error_t		mde = mdnullerror;
4478 	md_mps_t		*ps;
4479 	int			rs_active;
4480 
4481 	/* Check that the given device is part of a multi-node set */
4482 	setno = MD_MIN2SET(p->mnum);
4483 	if (setno >= md_nsets) {
4484 		return (ENXIO);
4485 	}
4486 	if (!MD_MNSET_SETNO(setno)) {
4487 		return (EINVAL);
4488 	}
4489 
4490 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4491 		return (EINVAL);
4492 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
4493 		return (EINVAL);
4494 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4495 
4496 	/* Obtain the current resync status */
4497 	(void) md_ioctl_readerlock(lockp, ui);
4498 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4499 	md_ioctl_readerexit(lockp);
4500 
4501 	switch ((md_mn_msgtype_t)p->msg_type) {
4502 	case MD_MN_MSG_RESYNC_STARTING:
4503 		/* Start the resync thread for the mirror */
4504 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4505 		break;
4506 
4507 	case MD_MN_MSG_RESYNC_NEXT:
4508 		/*
4509 		 * We have to release any previously marked overlap regions
4510 		 * so that i/o can resume. Then we need to block the region
4511 		 * from [rs_start..rs_start+rs_size) * so that no i/o is issued.
4512 		 * Update un_rs_resync_done and un_rs_resync_2_do.
4513 		 */
4514 		(void) md_ioctl_readerlock(lockp, ui);
4515 		/*
4516 		 * Ignore the message if there is no active resync thread or
4517 		 * if it is for a resync type that we have already completed.
4518 		 * un_resync_completed is set to the last resync completed
4519 		 * when processing a PHASE_DONE message.
4520 		 */
4521 		if (!rs_active || (p->rs_type == un->un_resync_completed))
4522 			break;
4523 		/*
4524 		 * If this message is for the same resync and is for an earlier
4525 		 * resync region, just ignore it. This can only occur if this
4526 		 * node has progressed on to the next resync region before
4527 		 * we receive this message. This can occur if the class for
4528 		 * this message is busy and the originator has to retry thus
4529 		 * allowing this node to move onto the next resync_region.
4530 		 */
4531 		if ((p->rs_type == un->un_rs_type) &&
4532 		    (p->rs_start < un->un_resync_startbl))
4533 			break;
4534 		ps = un->un_rs_prev_overlap;
4535 
4536 		/* Allocate previous overlap reference if needed */
4537 		if (ps == NULL) {
4538 			ps = kmem_cache_alloc(mirror_parent_cache,
4539 			    MD_ALLOCFLAGS);
4540 			ps->ps_un = un;
4541 			ps->ps_ui = ui;
4542 			ps->ps_firstblk = 0;
4543 			ps->ps_lastblk = 0;
4544 			ps->ps_flags = 0;
4545 			md_ioctl_readerexit(lockp);
4546 			(void) md_ioctl_writerlock(lockp, ui);
4547 			un->un_rs_prev_overlap = ps;
4548 			md_ioctl_writerexit(lockp);
4549 		} else
4550 			md_ioctl_readerexit(lockp);
4551 
4552 		if (p->rs_originator != md_mn_mynode_id) {
4553 			/*
4554 			 * On all but the originating node, first update
4555 			 * the resync state, then unblock the previous
4556 			 * region and block the next one. No need
4557 			 * to do this if the region is already blocked.
4558 			 * Update the submirror state and flags from the
4559 			 * originator. This keeps the cluster in sync with
4560 			 * regards to the resync status.
4561 			 */
4562 
4563 			(void) md_ioctl_writerlock(lockp, ui);
4564 			un->un_rs_resync_done = p->rs_done;
4565 			un->un_rs_resync_2_do = p->rs_2_do;
4566 			un->un_rs_type = p->rs_type;
4567 			un->un_resync_startbl = p->rs_start;
4568 			md_ioctl_writerexit(lockp);
4569 			/*
4570 			 * Use un_owner_mx to ensure that an ownership change
4571 			 * cannot happen at the same time as this message
4572 			 */
4573 			mutex_enter(&un->un_owner_mx);
4574 			if (MD_MN_MIRROR_OWNER(un)) {
4575 				ps->ps_firstblk = p->rs_start;
4576 				ps->ps_lastblk = ps->ps_firstblk +
4577 				    p->rs_size - 1;
4578 			} else {
4579 				if ((ps->ps_firstblk != p->rs_start) ||
4580 				    (ps->ps_lastblk != p->rs_start +
4581 				    p->rs_size - 1)) {
4582 					/* Remove previous overlap range */
4583 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4584 						mirror_overlap_tree_remove(ps);
4585 
4586 					ps->ps_firstblk = p->rs_start;
4587 					ps->ps_lastblk = ps->ps_firstblk +
4588 					    p->rs_size - 1;
4589 
4590 					mutex_exit(&un->un_owner_mx);
4591 					/* Block this range from all i/o. */
4592 					if (ps->ps_firstblk != 0 ||
4593 					    ps->ps_lastblk != 0)
4594 						wait_for_overlaps(ps,
4595 						    MD_OVERLAP_ALLOW_REPEAT);
4596 					mutex_enter(&un->un_owner_mx);
4597 					/*
4598 					 * Check to see if we have obtained
4599 					 * ownership while waiting for
4600 					 * overlaps. If we have, remove
4601 					 * the resync_region entry from the
4602 					 * overlap tree
4603 					 */
4604 					if (MD_MN_MIRROR_OWNER(un) &&
4605 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
4606 						mirror_overlap_tree_remove(ps);
4607 				}
4608 			}
4609 			mutex_exit(&un->un_owner_mx);
4610 
4611 			/*
4612 			 * If this is the first RESYNC_NEXT message (i.e.
4613 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4614 			 * issue RESYNC_START NOTIFY event
4615 			 */
4616 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4617 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4618 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4619 				    MD_SID(un));
4620 			}
4621 
4622 			/* Ensure that our local resync thread is running */
4623 			if (un->un_rs_thread == NULL) {
4624 				(void) mirror_resync_unit(p->mnum, NULL,
4625 				    &p->mde, lockp);
4626 			}
4627 		}
4628 		break;
4629 	case MD_MN_MSG_RESYNC_FINISH:
4630 		/*
4631 		 * Complete the resync by stopping the resync thread.
4632 		 * Also release the previous overlap region field.
4633 		 * Update the resync_progress_thread by cv_signal'ing it so
4634 		 * that we mark the end of the resync as soon as possible. This
4635 		 * stops an unnecessary delay should be panic after resync
4636 		 * completion.
4637 		 */
4638 #ifdef DEBUG
4639 		if (!rs_active) {
4640 			if (mirror_debug_flag)
4641 				printf("RESYNC_FINISH (mnum = %x), "
4642 				    "Resync *NOT* active",
4643 				    p->mnum);
4644 		}
4645 #endif
4646 
4647 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4648 		    (p->rs_originator != md_mn_mynode_id)) {
4649 			mutex_enter(&un->un_rs_thread_mx);
4650 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4651 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4652 			un->un_rs_thread_flags &=
4653 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4654 			cv_signal(&un->un_rs_thread_cv);
4655 			mutex_exit(&un->un_rs_thread_mx);
4656 		}
4657 		if (is_ABR) {
4658 			/* Resync finished, if ABR set owner to NULL */
4659 			mutex_enter(&un->un_owner_mx);
4660 			un->un_mirror_owner = 0;
4661 			mutex_exit(&un->un_owner_mx);
4662 		}
4663 		(void) md_ioctl_writerlock(lockp, ui);
4664 		ps = un->un_rs_prev_overlap;
4665 		if (ps != NULL) {
4666 			/* Remove previous overlap range */
4667 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4668 				mirror_overlap_tree_remove(ps);
4669 			/*
4670 			 * Release the overlap range reference
4671 			 */
4672 			un->un_rs_prev_overlap = NULL;
4673 			kmem_cache_free(mirror_parent_cache,
4674 			    ps);
4675 		}
4676 		md_ioctl_writerexit(lockp);
4677 
4678 		/* Mark the resync as complete in the metadb */
4679 		un->un_rs_resync_done = p->rs_done;
4680 		un->un_rs_resync_2_do = p->rs_2_do;
4681 		un->un_rs_type = p->rs_type;
4682 		mutex_enter(&un->un_rs_progress_mx);
4683 		cv_signal(&un->un_rs_progress_cv);
4684 		mutex_exit(&un->un_rs_progress_mx);
4685 
4686 		un = md_ioctl_writerlock(lockp, ui);
4687 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4688 		/* Deal with any pending grow_unit */
4689 		if (un->c.un_status & MD_UN_GROW_PENDING) {
4690 			if ((mirror_grow_unit(un, &mde) != 0) ||
4691 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4692 				un->c.un_status &= ~MD_UN_GROW_PENDING;
4693 			}
4694 		}
4695 		md_ioctl_writerexit(lockp);
4696 		break;
4697 
4698 	case MD_MN_MSG_RESYNC_PHASE_DONE:
4699 		/*
4700 		 * A phase of the resync, optimized. component or
4701 		 * submirror is complete. Update mirror status.
4702 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
4703 		 * mirror owner is peforming a resync. If we have just snarfed
4704 		 * this set, then we must clear any of the flags set at snarf
4705 		 * time by unit_setup_resync().
4706 		 * Note that unit_setup_resync() sets up these flags to
4707 		 * indicate that an optimized resync is required. These flags
4708 		 * need to be reset because if we get here,  the mirror owner
4709 		 * will have handled the optimized resync.
4710 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4711 		 * MD_UN_WAR. In addition, for each submirror,
4712 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4713 		 * set to SMS_OFFLINE.
4714 		 */
4715 #ifdef DEBUG
4716 		if (mirror_debug_flag)
4717 			printf("phase done mess received from %d, mnum=%x,"
4718 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4719 			    p->rs_type, p->rs_flags);
4720 #endif
4721 		/*
4722 		 * Ignore the message if there is no active resync thread.
4723 		 */
4724 		if (!rs_active)
4725 			break;
4726 
4727 		broke_out = p->rs_flags & MD_MN_RS_ERR;
4728 		switch (RS_TYPE(p->rs_type)) {
4729 		case MD_RS_OPTIMIZED:
4730 			un = md_ioctl_writerlock(lockp, ui);
4731 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4732 				/* If we are originator, just clear rs_type */
4733 				if (p->rs_originator == md_mn_mynode_id) {
4734 					SET_RS_TYPE_NONE(un->un_rs_type);
4735 					md_ioctl_writerexit(lockp);
4736 					break;
4737 				}
4738 				/*
4739 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
4740 				 * flags if OPT_NOT_DONE is set *and* rs_type
4741 				 * is MD_RS_NONE.
4742 				 */
4743 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4744 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4745 					/* No resync in progress */
4746 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4747 					un->c.un_status &= ~MD_UN_WAR;
4748 				} else {
4749 					/*
4750 					 * We are in the middle of an
4751 					 * optimized resync and this message
4752 					 * should be ignored.
4753 					 */
4754 					md_ioctl_writerexit(lockp);
4755 					break;
4756 				}
4757 			} else {
4758 				/*
4759 				 * This is the end of an optimized resync,
4760 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4761 				 */
4762 
4763 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4764 				if (!broke_out)
4765 					un->c.un_status &= ~MD_UN_WAR;
4766 			}
4767 
4768 			/*
4769 			 * Set resync_completed to last resync type and then
4770 			 * clear resync_type to indicate no resync in progress
4771 			 */
4772 			un->un_resync_completed = un->un_rs_type;
4773 			SET_RS_TYPE_NONE(un->un_rs_type);
4774 
4775 			/*
4776 			 * If resync is as a result of a submirror ONLINE,
4777 			 * reset the submirror state to SMS_RUNNING if the
4778 			 * resync was ok else set back to SMS_OFFLINE.
4779 			 */
4780 			for (smi = 0; smi < NMIRROR; smi++) {
4781 				un->un_sm[smi].sm_flags &=
4782 				    ~MD_SM_RESYNC_TARGET;
4783 				if (SMS_BY_INDEX_IS(un, smi,
4784 				    SMS_OFFLINE_RESYNC)) {
4785 					if (p->rs_flags &
4786 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4787 						state = SMS_OFFLINE;
4788 					} else {
4789 						state = (broke_out ?
4790 						    SMS_OFFLINE : SMS_RUNNING);
4791 					}
4792 					mirror_set_sm_state(
4793 					    &un->un_sm[smi],
4794 					    &un->un_smic[smi], state,
4795 					    broke_out);
4796 					mirror_commit(un, NO_SUBMIRRORS,
4797 					    0);
4798 				}
4799 				/*
4800 				 * If we still have an offline submirror, reset
4801 				 * the OFFLINE_SM flag in the mirror status
4802 				 */
4803 				if (SMS_BY_INDEX_IS(un, smi,
4804 				    SMS_OFFLINE))
4805 					un->c.un_status |=
4806 					    MD_UN_OFFLINE_SM;
4807 			}
4808 			md_ioctl_writerexit(lockp);
4809 			break;
4810 		case MD_RS_SUBMIRROR:
4811 			un = md_ioctl_writerlock(lockp, ui);
4812 			smi = RS_SMI(p->rs_type);
4813 			sm = &un->un_sm[smi];
4814 			smic = &un->un_smic[smi];
4815 			/* Clear RESYNC target */
4816 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4817 			/*
4818 			 * Set resync_completed to last resync type and then
4819 			 * clear resync_type to indicate no resync in progress
4820 			 */
4821 			un->un_resync_completed = un->un_rs_type;
4822 			SET_RS_TYPE_NONE(un->un_rs_type);
4823 			/*
4824 			 * If the resync completed ok reset the submirror
4825 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
4826 			 */
4827 			state = (broke_out ?
4828 			    SMS_ATTACHED : SMS_RUNNING);
4829 			mirror_set_sm_state(sm, smic, state, broke_out);
4830 			un->c.un_status &= ~MD_UN_WAR;
4831 			mirror_commit(un, SMI2BIT(smi), 0);
4832 			md_ioctl_writerexit(lockp);
4833 			break;
4834 		case MD_RS_COMPONENT:
4835 			un = md_ioctl_writerlock(lockp, ui);
4836 			smi = RS_SMI(p->rs_type);
4837 			ci = RS_CI(p->rs_type);
4838 			sm = &un->un_sm[smi];
4839 			smic = &un->un_smic[smi];
4840 			shared = (md_m_shared_t *)
4841 			    (*(smic->sm_shared_by_indx))
4842 			    (sm->sm_dev, sm, ci);
4843 			un->c.un_status &= ~MD_UN_WAR;
4844 			/* Clear RESYNC target */
4845 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
4846 			/*
4847 			 * Set resync_completed to last resync type and then
4848 			 * clear resync_type to indicate no resync in progress
4849 			 */
4850 			un->un_resync_completed = un->un_rs_type;
4851 			SET_RS_TYPE_NONE(un->un_rs_type);
4852 
4853 			/*
4854 			 * If the resync completed ok, set the component state
4855 			 * to CS_OKAY.
4856 			 */
4857 			if (broke_out)
4858 				shared->ms_flags |= MDM_S_RS_TRIED;
4859 			else {
4860 				/*
4861 				 * As we don't transmit the changes,
4862 				 * no need to drop the lock.
4863 				 */
4864 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
4865 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
4866 			}
4867 			md_ioctl_writerexit(lockp);
4868 		default:
4869 			break;
4870 		}
4871 		/*
4872 		 * If the purpose of this PHASE_DONE message is just to
4873 		 * indicate to all other nodes that the optimized resync
4874 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
4875 		 * no need to generate a notify event as there has not
4876 		 * actually been a resync.
4877 		 */
4878 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
4879 			if (broke_out) {
4880 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
4881 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4882 				    MD_SID(un));
4883 			} else {
4884 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
4885 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
4886 				    MD_SID(un));
4887 			}
4888 		}
4889 		break;
4890 
4891 	default:
4892 #ifdef DEBUG
4893 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
4894 		    " %x\n", p->msg_type);
4895 #endif
4896 		return (EINVAL);
4897 	}
4898 	return (0);
4899 }
4900 
4901 /* Return a -1 if snarf of optimized record failed and set should be released */
4902 static int
4903 mirror_snarf(md_snarfcmd_t cmd, set_t setno)
4904 {
4905 	mddb_recid_t	recid;
4906 	int		gotsomething;
4907 	int		all_mirrors_gotten;
4908 	mm_unit_t	*un;
4909 	mddb_type_t	typ1;
4910 	mddb_de_ic_t    *dep;
4911 	mddb_rb32_t	*rbp;
4912 	size_t		newreqsize;
4913 	mm_unit_t	*big_un;
4914 	mm_unit32_od_t	*small_un;
4915 	int		retval;
4916 	mdi_unit_t	*ui;
4917 
4918 	if (cmd == MD_SNARF_CLEANUP) {
4919 		if (md_get_setstatus(setno) & MD_SET_STALE)
4920 			return (0);
4921 
4922 		recid = mddb_makerecid(setno, 0);
4923 		typ1 = (mddb_type_t)md_getshared_key(setno,
4924 		    mirror_md_ops.md_driver.md_drivername);
4925 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4926 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
4927 				un = (mm_unit_t *)mddb_getrecaddr(recid);
4928 				mirror_cleanup(un);
4929 				recid = mddb_makerecid(setno, 0);
4930 			}
4931 		}
4932 		return (0);
4933 	}
4934 
4935 	all_mirrors_gotten = 1;
4936 	gotsomething = 0;
4937 
4938 	recid = mddb_makerecid(setno, 0);
4939 	typ1 = (mddb_type_t)md_getshared_key(setno,
4940 	    mirror_md_ops.md_driver.md_drivername);
4941 
4942 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
4943 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
4944 			continue;
4945 
4946 		dep = mddb_getrecdep(recid);
4947 		dep->de_flags = MDDB_F_MIRROR;
4948 		rbp = dep->de_rb;
4949 
4950 		switch (rbp->rb_revision) {
4951 		case MDDB_REV_RB:
4952 		case MDDB_REV_RBFN:
4953 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
4954 				/*
4955 				 * This means, we have an old and small
4956 				 * record and this record hasn't already
4957 				 * been converted.  Before we create an
4958 				 * incore metadevice from this we have to
4959 				 * convert it to a big record.
4960 				 */
4961 				small_un =
4962 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
4963 				newreqsize = sizeof (mm_unit_t);
4964 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
4965 				    KM_SLEEP);
4966 				mirror_convert((caddr_t)small_un,
4967 				    (caddr_t)big_un, SMALL_2_BIG);
4968 				kmem_free(small_un, dep->de_reqsize);
4969 
4970 				/*
4971 				 * Update userdata and incore userdata
4972 				 * incores are at the end of un
4973 				 */
4974 				dep->de_rb_userdata_ic = big_un;
4975 				dep->de_rb_userdata = big_un;
4976 				dep->de_icreqsize = newreqsize;
4977 				un = big_un;
4978 				rbp->rb_private |= MD_PRV_CONVD;
4979 			} else {
4980 				/*
4981 				 * Unit already converted, just get the
4982 				 * record address.
4983 				 */
4984 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
4985 				    sizeof (*un), 0);
4986 			}
4987 			un->c.un_revision &= ~MD_64BIT_META_DEV;
4988 			break;
4989 		case MDDB_REV_RB64:
4990 		case MDDB_REV_RB64FN:
4991 			/* Big device */
4992 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
4993 			    sizeof (*un), 0);
4994 			un->c.un_revision |= MD_64BIT_META_DEV;
4995 			un->c.un_flag |= MD_EFILABEL;
4996 			break;
4997 		}
4998 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
4999 
5000 		/*
5001 		 * Create minor device node for snarfed entry.
5002 		 */
5003 		(void) md_create_minor_node(setno, MD_SID(un));
5004 
5005 		if (MD_UNIT(MD_SID(un)) != NULL) {
5006 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5007 			continue;
5008 		}
5009 		all_mirrors_gotten = 0;
5010 		retval = mirror_build_incore(un, 1);
5011 		if (retval == 0) {
5012 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
5013 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5014 			resync_start_timeout(setno);
5015 			gotsomething = 1;
5016 		} else {
5017 			return (retval);
5018 		}
5019 		/*
5020 		 * Set flag to indicate that the mirror has not yet
5021 		 * been through a reconfig. This flag is used for MN sets
5022 		 * when determining whether to update the mirror state from
5023 		 * the Master node.
5024 		 */
5025 		if (MD_MNSET_SETNO(setno)) {
5026 			ui = MDI_UNIT(MD_SID(un));
5027 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5028 		}
5029 	}
5030 
5031 	if (!all_mirrors_gotten)
5032 		return (gotsomething);
5033 
5034 	recid = mddb_makerecid(setno, 0);
5035 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5036 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5037 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5038 
5039 	return (0);
5040 }
5041 
5042 static int
5043 mirror_halt(md_haltcmd_t cmd, set_t setno)
5044 {
5045 	unit_t		i;
5046 	mdi_unit_t	*ui;
5047 	minor_t		mnum;
5048 	int		reset_mirror_flag = 0;
5049 
5050 	if (cmd == MD_HALT_CLOSE)
5051 		return (0);
5052 
5053 	if (cmd == MD_HALT_OPEN)
5054 		return (0);
5055 
5056 	if (cmd == MD_HALT_UNLOAD)
5057 		return (0);
5058 
5059 	if (cmd == MD_HALT_CHECK) {
5060 		for (i = 0; i < md_nunits; i++) {
5061 			mnum = MD_MKMIN(setno, i);
5062 			if ((ui = MDI_UNIT(mnum)) == NULL)
5063 				continue;
5064 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5065 				continue;
5066 			if (md_unit_isopen(ui))
5067 				return (1);
5068 		}
5069 		return (0);
5070 	}
5071 
5072 	if (cmd != MD_HALT_DOIT)
5073 		return (1);
5074 
5075 	for (i = 0; i < md_nunits; i++) {
5076 		mnum = MD_MKMIN(setno, i);
5077 		if ((ui = MDI_UNIT(mnum)) == NULL)
5078 			continue;
5079 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5080 			continue;
5081 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5082 
5083 		/* Set a flag if there is at least one mirror metadevice. */
5084 		reset_mirror_flag = 1;
5085 	}
5086 
5087 	/*
5088 	 * Only wait for the global dr_timeout to finish
5089 	 *  - if there are mirror metadevices in this diskset or
5090 	 *  - if this is the local set since an unload of the md_mirror
5091 	 *    driver could follow a successful mirror halt in the local set.
5092 	 */
5093 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5094 		while ((mirror_md_ops.md_head == NULL) &&
5095 		    (mirror_timeout.dr_timeout_id != 0))
5096 			delay(md_hz);
5097 	}
5098 
5099 	return (0);
5100 }
5101 
5102 /*ARGSUSED3*/
5103 static int
5104 mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5105 {
5106 	IOLOCK	lock;
5107 	minor_t		mnum = getminor(*dev);
5108 	set_t		setno;
5109 
5110 	/*
5111 	 * When doing an open of a multi owner metadevice, check to see if this
5112 	 * node is a starting node and if a reconfig cycle is underway.
5113 	 * If so, the system isn't sufficiently set up enough to handle the
5114 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
5115 	 */
5116 	setno = MD_MIN2SET(mnum);
5117 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5118 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5119 			return (ENXIO);
5120 	}
5121 
5122 	if (md_oflags & MD_OFLG_FROMIOCTL) {
5123 		/*
5124 		 * This indicates that the caller is an ioctl service routine.
5125 		 * In this case we initialise our stack-based IOLOCK and pass
5126 		 * this into the internal open routine. This allows multi-owner
5127 		 * metadevices to avoid deadlocking if an error is encountered
5128 		 * during the open() attempt. The failure case is:
5129 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5130 		 * this configuration would deadlock as the mirror code has to
5131 		 * send a state-update to the other nodes when it detects the
5132 		 * failure of the underlying submirror with an errored soft-part
5133 		 * on it. As there is a class1 message in progress (metaclear)
5134 		 * set_sm_comp_state() cannot send another class1 message;
5135 		 * instead we do not send a state_update message as the
5136 		 * metaclear is distributed and the failed submirror will be
5137 		 * cleared from the configuration by the metaclear.
5138 		 */
5139 		IOLOCK_INIT(&lock);
5140 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5141 		    md_oflags, &lock));
5142 	} else {
5143 		return (mirror_internal_open(getminor(*dev), flag, otyp,
5144 		    md_oflags, (IOLOCK *)NULL));
5145 	}
5146 }
5147 
5148 
5149 /*ARGSUSED1*/
5150 static int
5151 mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5152 {
5153 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5154 	    (IOLOCK *)NULL));
5155 }
5156 
5157 
5158 /*
5159  * This routine dumps memory to the disk.  It assumes that the memory has
5160  * already been mapped into mainbus space.  It is called at disk interrupt
5161  * priority when the system is in trouble.
5162  *
5163  */
5164 static int
5165 mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5166 {
5167 	mm_unit_t	*un;
5168 	dev_t		mapdev;
5169 	int		result;
5170 	int		smi;
5171 	int		any_succeed = 0;
5172 	int		save_result = 0;
5173 
5174 	/*
5175 	 * Don't need to grab the unit lock.
5176 	 * Cause nothing else is suppose to be happenning.
5177 	 * Also dump is not suppose to sleep.
5178 	 */
5179 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
5180 
5181 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5182 		return (EINVAL);
5183 
5184 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5185 		return (EINVAL);
5186 
5187 	for (smi = 0; smi < NMIRROR; smi++) {
5188 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5189 			continue;
5190 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5191 		result = bdev_dump(mapdev, addr, blkno, nblk);
5192 		if (result)
5193 			save_result = result;
5194 
5195 		if (result == 0)
5196 			any_succeed++;
5197 	}
5198 
5199 	if (any_succeed)
5200 		return (0);
5201 
5202 	return (save_result);
5203 }
5204 
5205 /*
5206  * NAME: mirror_probe_dev
5207  *
5208  * DESCRITPION: force opens every component of a mirror.
5209  *
5210  * On entry the unit writerlock is held
5211  */
5212 static int
5213 mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5214 {
5215 	int		i;
5216 	int		smi;
5217 	int		ci;
5218 	mm_unit_t	*un;
5219 	int		md_devopen = 0;
5220 	set_t		setno;
5221 	int		sm_cnt;
5222 	int		sm_unavail_cnt;
5223 
5224 	if (md_unit_isopen(ui))
5225 		md_devopen++;
5226 
5227 	un = MD_UNIT(mnum);
5228 	setno = MD_UN2SET(un);
5229 
5230 	sm_cnt = 0;
5231 	sm_unavail_cnt = 0;
5232 	for (i = 0; i < NMIRROR; i++) {
5233 		md_dev64_t tmpdev;
5234 		mdi_unit_t	*sm_ui;
5235 
5236 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5237 			continue;
5238 		}
5239 
5240 		sm_cnt++;
5241 		tmpdev = un->un_sm[i].sm_dev;
5242 		(void) md_layered_open(mnum, &tmpdev,
5243 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5244 		un->un_sm[i].sm_dev = tmpdev;
5245 
5246 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5247 
5248 		/*
5249 		 * Logic similar to that in mirror_open_all_devs.  We set or
5250 		 * clear the submirror Unavailable bit.
5251 		 */
5252 		(void) md_unit_writerlock(sm_ui);
5253 		if (submirror_unavailable(un, i, 1)) {
5254 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
5255 			sm_unavail_cnt++;
5256 		} else {
5257 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5258 		}
5259 		md_unit_writerexit(sm_ui);
5260 	}
5261 
5262 	/*
5263 	 * If all of the submirrors are unavailable, the mirror is also
5264 	 * unavailable.
5265 	 */
5266 	if (sm_cnt == sm_unavail_cnt) {
5267 		ui->ui_tstate |= MD_INACCESSIBLE;
5268 	} else {
5269 		ui->ui_tstate &= ~MD_INACCESSIBLE;
5270 	}
5271 
5272 	/*
5273 	 * Start checking from probe failures. If failures occur we
5274 	 * set the appropriate erred state only if the metadevice is in
5275 	 * use. This is specifically to prevent unnecessary resyncs.
5276 	 * For instance if the disks were accidentally disconnected when
5277 	 * the system booted up then until the metadevice is accessed
5278 	 * (like file system mount) the user can shutdown, recable and
5279 	 * reboot w/o incurring a potentially huge resync.
5280 	 */
5281 
5282 	smi = 0;
5283 	ci = 0;
5284 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5285 
5286 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
5287 			/*
5288 			 * Note that for a MN set, there is no need to call
5289 			 * SE_NOTIFY as that is done when processing the
5290 			 * state change
5291 			 */
5292 			if (md_devopen) {
5293 				/*
5294 				 * Never called from ioctl context,
5295 				 * so (IOLOCK *)NULL
5296 				 */
5297 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5298 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
5299 				if (!MD_MNSET_SETNO(setno)) {
5300 					SE_NOTIFY(EC_SVM_STATE,
5301 					    ESC_SVM_LASTERRED,
5302 					    SVM_TAG_METADEVICE, setno,
5303 					    MD_SID(un));
5304 				}
5305 				continue;
5306 			} else {
5307 				(void) mirror_close_all_devs(un,
5308 				    MD_OFLG_PROBEDEV);
5309 				if (!MD_MNSET_SETNO(setno)) {
5310 					SE_NOTIFY(EC_SVM_STATE,
5311 					    ESC_SVM_OPEN_FAIL,
5312 					    SVM_TAG_METADEVICE, setno,
5313 					    MD_SID(un));
5314 				}
5315 				mirror_openfail_console_info(un, smi, ci);
5316 				return (ENXIO);
5317 			}
5318 		}
5319 
5320 		/*
5321 		 * Note that for a MN set, there is no need to call
5322 		 * SE_NOTIFY as that is done when processing the
5323 		 * state change
5324 		 */
5325 		if (md_devopen) {
5326 			/* Never called from ioctl context, so (IOLOCK *)NULL */
5327 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5328 			    MD_STATE_XMIT, (IOLOCK *)NULL);
5329 			if (!MD_MNSET_SETNO(setno)) {
5330 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5331 				    SVM_TAG_METADEVICE, setno,
5332 				    MD_SID(un));
5333 			}
5334 		}
5335 		mirror_openfail_console_info(un, smi, ci);
5336 		ci++;
5337 	}
5338 
5339 	if (MD_MNSET_SETNO(setno)) {
5340 		send_poke_hotspares(setno);
5341 	} else {
5342 		(void) poke_hotspares();
5343 	}
5344 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5345 
5346 	return (0);
5347 }
5348 
5349 
5350 static int
5351 mirror_imp_set(
5352 	set_t	setno
5353 )
5354 {
5355 
5356 	mddb_recid_t	recid;
5357 	int		gotsomething, i;
5358 	mddb_type_t	typ1;
5359 	mddb_de_ic_t	*dep;
5360 	mddb_rb32_t	*rbp;
5361 	mm_unit32_od_t	*un32;
5362 	mm_unit_t	*un64;
5363 	md_dev64_t	self_devt;
5364 	minor_t		*self_id;	/* minor needs to be updated */
5365 	md_parent_t	*parent_id;	/* parent needs to be updated */
5366 	mddb_recid_t	*record_id;	/* record id needs to be updated */
5367 	mddb_recid_t	*optrec_id;
5368 	md_dev64_t	tmpdev;
5369 
5370 
5371 	gotsomething = 0;
5372 
5373 	typ1 = (mddb_type_t)md_getshared_key(setno,
5374 	    mirror_md_ops.md_driver.md_drivername);
5375 	recid = mddb_makerecid(setno, 0);
5376 
5377 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5378 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5379 			continue;
5380 
5381 		dep = mddb_getrecdep(recid);
5382 		rbp = dep->de_rb;
5383 
5384 		switch (rbp->rb_revision) {
5385 		case MDDB_REV_RB:
5386 		case MDDB_REV_RBFN:
5387 			/*
5388 			 * Small device
5389 			 */
5390 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5391 			self_id = &(un32->c.un_self_id);
5392 			parent_id = &(un32->c.un_parent);
5393 			record_id = &(un32->c.un_record_id);
5394 			optrec_id = &(un32->un_rr_dirty_recid);
5395 
5396 			for (i = 0; i < un32->un_nsm; i++) {
5397 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5398 				un32->un_sm[i].sm_dev = md_cmpldev
5399 				    (md_makedevice(md_major, MD_MKMIN(setno,
5400 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
5401 
5402 				if (!md_update_minor(setno, mddb_getsidenum
5403 				    (setno), un32->un_sm[i].sm_key))
5404 				goto out;
5405 			}
5406 			break;
5407 		case MDDB_REV_RB64:
5408 		case MDDB_REV_RB64FN:
5409 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5410 			self_id = &(un64->c.un_self_id);
5411 			parent_id = &(un64->c.un_parent);
5412 			record_id = &(un64->c.un_record_id);
5413 			optrec_id = &(un64->un_rr_dirty_recid);
5414 
5415 			for (i = 0; i < un64->un_nsm; i++) {
5416 				tmpdev = un64->un_sm[i].sm_dev;
5417 				un64->un_sm[i].sm_dev = md_makedevice
5418 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5419 				    (md_getminor(tmpdev))));
5420 
5421 				if (!md_update_minor(setno, mddb_getsidenum
5422 				    (setno), un64->un_sm[i].sm_key))
5423 				goto out;
5424 			}
5425 			break;
5426 		}
5427 
5428 		/*
5429 		 * If this is a top level and a friendly name metadevice,
5430 		 * update its minor in the namespace.
5431 		 */
5432 		if ((*parent_id == MD_NO_PARENT) &&
5433 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
5434 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
5435 
5436 			self_devt = md_makedevice(md_major, *self_id);
5437 			if (!md_update_top_device_minor(setno,
5438 			    mddb_getsidenum(setno), self_devt))
5439 				goto out;
5440 		}
5441 
5442 		/*
5443 		 * Update unit with the imported setno
5444 		 *
5445 		 */
5446 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
5447 
5448 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5449 		if (*parent_id != MD_NO_PARENT)
5450 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5451 		*record_id = MAKERECID(setno, DBID(*record_id));
5452 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
5453 
5454 		gotsomething = 1;
5455 	}
5456 
5457 out:
5458 	return (gotsomething);
5459 }
5460 
5461 /*
5462  * NAME: mirror_check_offline
5463  *
5464  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5465  *
5466  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5467  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5468  * ioctl.
5469  */
5470 int
5471 mirror_check_offline(md_dev64_t dev, int *offline_status)
5472 {
5473 	mm_unit_t		*un;
5474 	md_error_t		mde = mdnullerror;
5475 
5476 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5477 		return (EINVAL);
5478 	*offline_status = 0;
5479 	if (un->c.un_status & MD_UN_OFFLINE_SM)
5480 		*offline_status = 1;
5481 	return (0);
5482 }
5483 
5484 /*
5485  * NAME: mirror_inc_abr_count
5486  *
5487  * DESCRIPTION: increment the count of layered soft parts with ABR set
5488  *
5489  * Called from ioctl, so access to un_abr_count is protected by the global
5490  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5491  */
5492 int
5493 mirror_inc_abr_count(md_dev64_t dev)
5494 {
5495 	mm_unit_t		*un;
5496 	md_error_t		mde = mdnullerror;
5497 
5498 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5499 		return (EINVAL);
5500 	un->un_abr_count++;
5501 	return (0);
5502 }
5503 
5504 /*
5505  * NAME: mirror_dec_abr_count
5506  *
5507  * DESCRIPTION: decrement the count of layered soft parts with ABR set
5508  *
5509  * Called from ioctl, so access to un_abr_count is protected by the global
5510  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
5511  */
5512 int
5513 mirror_dec_abr_count(md_dev64_t dev)
5514 {
5515 	mm_unit_t		*un;
5516 	md_error_t		mde = mdnullerror;
5517 
5518 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5519 		return (EINVAL);
5520 	un->un_abr_count--;
5521 	return (0);
5522 }
5523 
5524 static md_named_services_t mirror_named_services[] = {
5525 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
5526 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
5527 	{mirror_rename_check,				MDRNM_CHECK	    },
5528 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
5529 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
5530 			MDRNM_PARENT_UPDATE_TO},
5531 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
5532 			MDRNM_SELF_UPDATE_FROM_DOWN },
5533 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
5534 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
5535 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
5536 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
5537 	{ NULL,						0		    }
5538 };
5539 
5540 md_ops_t mirror_md_ops = {
5541 	mirror_open,		/* open */
5542 	mirror_close,		/* close */
5543 	md_mirror_strategy,	/* strategy */
5544 	NULL,			/* print */
5545 	mirror_dump,		/* dump */
5546 	NULL,			/* read */
5547 	NULL,			/* write */
5548 	md_mirror_ioctl,	/* mirror_ioctl, */
5549 	mirror_snarf,		/* mirror_snarf */
5550 	mirror_halt,		/* mirror_halt */
5551 	NULL,			/* aread */
5552 	NULL,			/* awrite */
5553 	mirror_imp_set,		/* import set */
5554 	mirror_named_services
5555 };
5556 
5557 /* module specific initilization */
5558 static void
5559 init_init()
5560 {
5561 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5562 
5563 	/* Initialize the parent and child save memory pools */
5564 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5565 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
5566 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5567 	    0);
5568 
5569 	mirror_child_cache = kmem_cache_create("md_mirror_child",
5570 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5571 	    mirror_child_constructor, mirror_child_destructor,
5572 	    mirror_run_queue, NULL, NULL, 0);
5573 
5574 	/*
5575 	 * Insure wowbuf_size is a multiple of DEV_BSIZE,
5576 	 * then initialize wowbuf memory pool.
5577 	 */
5578 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5579 	if (md_wowbuf_size <= 0)
5580 		md_wowbuf_size = 2 * DEV_BSIZE;
5581 	if (md_wowbuf_size > (32 * DEV_BSIZE))
5582 		md_wowbuf_size = (32 * DEV_BSIZE);
5583 
5584 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5585 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5586 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5587 
5588 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5589 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5590 
5591 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5592 }
5593 
5594 /* module specific uninitilization (undo init_init()) */
5595 static void
5596 fini_uninit()
5597 {
5598 	kmem_cache_destroy(mirror_parent_cache);
5599 	kmem_cache_destroy(mirror_child_cache);
5600 	kmem_cache_destroy(mirror_wowblk_cache);
5601 	mirror_parent_cache = mirror_child_cache =
5602 	    mirror_wowblk_cache = NULL;
5603 
5604 	mutex_destroy(&mirror_timeout.dr_mx);
5605 	mutex_destroy(&hotspare_request.dr_mx);
5606 	mutex_destroy(&non_ff_drv_mutex);
5607 }
5608 
5609 /* define the module linkage */
5610 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5611