xref: /onnv-gate/usr/src/uts/common/io/lvm/mirror/mirror.c (revision 12629:8a89ca2bbe3a)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/dklabel.h>
#include <vm/hat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_mddb.h>
#include <sys/esunddi.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/avl.h>

md_ops_t		mirror_md_ops;
#ifndef	lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &mirror_md_ops;
#endif

extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern mdq_anchor_t	md_mirror_daemon;
extern mdq_anchor_t	md_mirror_io_daemon;
extern mdq_anchor_t	md_mirror_rs_daemon;
extern mdq_anchor_t	md_mhs_daemon;

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern clock_t		md_hz;

extern md_krwlock_t	md_unit_array_rw;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern int		md_mtioctl_cnt;

daemon_request_t	mirror_timeout;
static daemon_request_t	hotspare_request;
static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int	md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int	md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int	mirror_debug_flag = 0;
#endif

/*
 * Struct used to hold the count of DMR reads and the timestamp of the last
 * DMR read. It is used to verify, using a debugger, that the DMR read ioctl
 * has been executed.
 */
dmr_stats_t	mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t	non_ff_drv_mutex;
extern char	**non_ff_drivers;

extern major_t	md_major;

/*
 * Write-On-Write memory pool.
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * This is a flag that allows:
 *	- disabling the write-on-write mechanism.
 *	- logging occurrences of write-on-write
 *	- switching wow handling procedure processing
 * Counter for occurrences of WOW.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;
kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */

static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size	= 200;	/* 2000 blocks, or 100k */
int	md_def_num_rr	= 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is in microseconds, default 0.5 seconds
 */
clock_t	md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}
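
/*
 * Note: the caches declared above (mirror_parent_cache, mirror_child_cache,
 * mirror_wowblk_cache) are created outside this section.  As a sketch of
 * how the constructor/destructor pairs above are presumably wired up during
 * module init (assumption -- the creation call is not in this section):
 *
 *	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
 *	    sizeof (md_mps_t), 0, mirror_parent_constructor,
 *	    mirror_parent_destructor, mirror_run_queue, NULL, NULL, 0);
 *
 * The constructor/destructor run once per cached object (mutex and buf
 * setup/teardown), while the cheaper mirror_parent_init()/mirror_child_init()
 * routines reset an object on every allocation.
 */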

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	int			nretries = 0;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

retry_sphmsg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here. */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd has become reachable again, so retry once.
			 * If this fails we'll panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto retry_sphmsg;
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * we cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}
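
/*
 * Note on the mechanism above: daemon_request_t has no dedicated field for
 * the set number, so send_poke_hotspares() overloads the queue length field
 * (drq->dq.qlen) to pass setno to send_poke_hotspares_msg(), which reads it
 * back as (set_t)drq->dq.qlen.  The dr_pending flag is what throttles
 * requests; qlen is never used as a real queue length here.
 */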

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
							int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp; /* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * if called from any routine but probe, we check for
			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
			 * open, it sets the MDM_S_PROBEOPEN flag and we test
			 * for that flag instead. The two tests are mutually
			 * exclusive.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) ||
			    (!open_comp &&
			    (shared->ms_state == CS_LAST_ERRED))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}

/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
 *
 * On entry, the following locks are held
 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 *	md_unit_writerlock
 *
 * Returns	0 if ok
 *		1 if the unit containing the component has been cleared while
 *		  the mdmn_ksend_message() was being executed
 */
extern int
check_comp_4_hotspares(
	mm_unit_t	*un,
	int		smi,
	int		ci,
	uint_t		flags,
	mddb_recid_t	hs_id,	/* Only used by MN disksets */
	IOLOCK		*lockp	/* can be NULL */
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	mddb_recid_t		recids[6];
	minor_t			mnum;
	intptr_t		(*hs_dev)();
	void			(*hs_done)();
	void			*hs_data;
	md_error_t		mde = mdnullerror;
	set_t			setno;
	md_mn_msg_allochsp_t	allochspmsg;
	md_mn_kresult_t		*kresult;
	mm_unit_t		*new_un;
	int			rval;
	int			nretries = 0;

	mnum = MD_SID(un);
	setno = MD_UN2SET(un);
	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);

	if (shared->ms_state != CS_ERRED)
		return (0);

	/* Don't start a new component resync if a resync is already running. */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
		uint_t		msgflags;
		md_mn_msgtype_t	msgtype;

		/* Send allocate hotspare message to all nodes */

		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
		allochspmsg.msg_allochsp_sm = smi;
		allochspmsg.msg_allochsp_comp = ci;
		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;

		/*
		 * Before calling mdmn_ksend_message(), release locks
		 * Can never be in the context of an ioctl.
		 */
		md_unit_writerexit(MDI_UNIT(mnum));
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_exit(&mirror_md_ops.md_link_rw.lock);
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send alloc hotspare, flags="
			    "0x%x %x, %x, %x, %x\n", flags,
			    allochspmsg.msg_allochsp_mnum,
			    allochspmsg.msg_allochsp_sm,
			    allochspmsg.msg_allochsp_comp,
			    allochspmsg.msg_allochsp_hs_id);
#endif
		if (flags & MD_HOTSPARE_WMUPDATE) {
			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype  = MD_MN_MSG_ALLOCATE_HOTSPARE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

cc4hs_msg:
		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
		    (char *)&allochspmsg, sizeof (allochspmsg),
		    kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				mdmn_ksend_show_error(rval, kresult,
				    "ALLOCATE HOTSPARE");
#endif
			/*
			 * If message is sent ok but exitval indicates an error
			 * it must be because the mirror has been cleared. In
			 * this case re-obtain lock and return an error
			 */
			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
				if (flags & MD_HOTSPARE_LINKHELD) {
					rw_enter(&mirror_md_ops.md_link_rw.lock,
					    RW_READER);
				}
				kmem_free(kresult, sizeof (md_mn_kresult_t));
				return (1);
			}
			/* If we're shutting down already, pause things here. */
			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
				while (!md_mn_is_commd_present()) {
					delay(md_hz);
				}
				/*
				 * commd has become reachable again, so retry
				 * once. If this fails we'll panic as the
				 * system is in an unexpected state.
				 */
				if (nretries++ == 0)
					goto cc4hs_msg;
			}
			cmn_err(CE_PANIC,
			    "ksend_message failure: ALLOCATE_HOTSPARE");
		}
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/*
		 * re-obtain the locks
		 */
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
		new_un = md_unit_writerlock(MDI_UNIT(mnum));

		/*
		 * As we had to release the locks in order to send the
		 * message to all nodes, we need to check to see if the
		 * unit has changed. If it has we release the writerlock
		 * and return fail.
		 */
		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
			md_unit_writerexit(MDI_UNIT(mnum));
			return (1);
		}
	} else {
		if (MD_MNSET_SETNO(setno)) {
			/*
			 * If 2 or more nodes simultaneously see a
			 * component failure, these nodes will each
			 * send an ALLOCATE_HOTSPARE[2] message.
			 * The first message will allocate the hotspare
			 * and the subsequent messages should do nothing.
			 *
			 * If a slave node doesn't have a hotspare allocated
			 * at the time the message is initiated, then the
			 * passed in hs_id will be 0.  If the node
			 * executing this routine has a component shared
			 * ms_hs_id of non-zero, but the message shows a
			 * hs_id of 0, then just return since a hotspare
			 * has already been allocated for this failing
			 * component.  When the slave node returns from
			 * the ksend_message the hotspare will have
			 * already been allocated.
			 *
			 * If the slave node does send an hs_id of non-zero,
			 * and the slave node's hs_id matches this node's
			 * ms_hs_id, then the hotspare has error'd and
			 * should be replaced.
			 *
			 * If the slave node sends an hs_id of non-zero and
			 * this node has a different shared ms_hs_id, then
			 * just return since this hotspare has already
			 * been hotspared.
			 */
			if (shared->ms_hs_id != 0) {
				if (hs_id == 0) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
				if (hs_id != shared->ms_hs_id) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit2 "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
			}
		}

		sm = &un->un_sm[smi];
		hs_dev = md_get_named_service(sm->sm_dev, 0,
		    "hotspare device", 0);
		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
		    &hs_data) != 0)
			return (0);

		/*
		 * set_sm_comp_state() commits the modified records.
		 * As we don't transmit the changes, no need to drop the lock.
		 */
		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);

		(*hs_done)(sm->sm_dev, hs_data);

		mirror_check_failfast(mnum);

		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));

		/*
		 * For a multi-node set we need to reset the un_rs_type,
		 * un_rs_resync_done and un_rs_resync_2_do fields as the
		 * hot-spare resync must copy all applicable data.
		 */
		if (MD_MNSET_SETNO(setno)) {
			un->un_rs_type = MD_RS_NONE;
			un->un_rs_resync_done = 0;
			un->un_rs_resync_2_do = 0;
		}

		/*
		 * Must drop writer lock since mirror_resync_unit will
		 * open devices and must be able to grab readerlock.
		 * Don't need to drop IOLOCK since any descendent routines
		 * calling ksend_messages will drop the IOLOCK as needed.
		 */
		if (lockp) {
			md_ioctl_writerexit(lockp);
		} else {
			md_unit_writerexit(MDI_UNIT(mnum));
		}

		/* start resync */
		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);

		if (lockp) {
			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
		} else {
			new_un = md_unit_writerlock(MDI_UNIT(mnum));
		}
	}
	return (0);
}
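
/*
 * Both send_poke_hotspares_msg() and check_comp_4_hotspares() use the same
 * recovery pattern around mdmn_ksend_message(): if the commd RPC fails
 * (MDMNE_RPC_FAIL), spin in delay(md_hz) until md_mn_is_commd_present()
 * reports the daemon is back, retry the message exactly once, and panic
 * if that retry also fails, as the system is then in an unexpected state.
 */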

/*
 * check_unit_4_hotspares
 *
 * For a given mirror, allocate hotspares, if available, for any components
 * that are in error.
 *
 * Returns	0 if ok
 *		1 if check_comp_4_hotspares returns non-zero. This will only
 *		  happen for a MN unit where the unit has been cleared while
 *		  the allocate hotspare message is sent to all nodes.
 */
static int
check_unit_4_hotspares(mm_unit_t *un, int flags)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			ci;
	int			i;
	int			compcnt;

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	for (i = 0; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
		for (ci = 0; ci < compcnt; ci++) {
			md_m_shared_t		*shared;

			shared = (md_m_shared_t *)
			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
			/*
			 * Never called from ioctl context, so pass in
			 * (IOLOCK *)NULL.  Pass through flags from calling
			 * routine, also setting XMIT flag.
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}

static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t	*ui;
	mm_unit_t	*un;
	md_link_t	*next;
	int		x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do	*/

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will return 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If the unit no
		 * longer exists, there is no need to release the writerlock.
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not already a poke_hotspares request pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}
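
/*
 * Callers do not wait for the scan; they simply fire-and-forget with
 * (void) poke_hotspares() and the md_mhs_daemon context performs the
 * actual work in check_4_hotspares() above.
 */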

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t	*d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME: mirror_openfail_console_info
 *
 * DESCRIPTION: Prints an informative message to the console when mirror
 *		cannot be opened.
 *
 * PARAMETERS: mm_unit_t	un - pointer to mirror unit structure
 *	       int		smi - submirror index
 *	       int		ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void (*get_dev)();
	ms_cd_info_t cd;
	md_dev64_t tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
		    cd.cd_dev, NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int i;
	md_dev64_t dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast.  We use this so that
 * we only log one diagnostic message for each of these drivers, no matter
 * how many times we run the mirror_check_failfast function.
 * Return 1 if this is a new driver that does not support failfast,
 * return 0 if we have already seen this non-failfast driver.
 */
static int
new_non_ff_driver(const char *s)
{
	mutex_enter(&non_ff_drv_mutex);
	if (non_ff_drivers == NULL) {
		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
		    KM_NOSLEEP);
		if (non_ff_drivers == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
		    KM_NOSLEEP);
		if (non_ff_drivers[0] == NULL) {
			kmem_free(non_ff_drivers, 2 * sizeof (char *));
			non_ff_drivers = NULL;
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(non_ff_drivers[0], s);
		non_ff_drivers[1] = NULL;

	} else {
		int i;
		char **tnames;
		char **tmp;

		for (i = 0; non_ff_drivers[i] != NULL; i++) {
			if (strcmp(s, non_ff_drivers[i]) == 0) {
				mutex_exit(&non_ff_drv_mutex);
				return (0);
			}
		}

		/* allow for new element and null */
		i += 2;
		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
		if (tnames == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			tnames[i] = non_ff_drivers[i];

		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
		if (tnames[i] == NULL) {
			/* adjust i so that it is the right count to free */
			kmem_free(tnames, (i + 2) * sizeof (char *));
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(tnames[i++], s);
		tnames[i] = NULL;

		tmp = non_ff_drivers;
		non_ff_drivers = tnames;
		/* i now represents the count we previously alloced */
		kmem_free(tmp, i * sizeof (char *));
	}
	mutex_exit(&non_ff_drv_mutex);

	return (1);
}
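
/*
 * Note: new_non_ff_driver() also returns 1 when a KM_NOSLEEP allocation
 * fails, without recording the driver name.  Under memory pressure the
 * "B_FAILFAST I/O disabled" message below may therefore be logged more
 * than once for the same driver.
 */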

/*
 * Check for the "ddi-failfast-supported" devtree property on each submirror
 * component to indicate if we should do I/O to that submirror with the
 * B_FAILFAST flag set or not.  This check is made at various state transitions
 * in the mirror code (e.g. open, enable, hotspare, etc.).  Sometimes we
 * only need to check one drive (e.g. hotspare) but since the check is
 * fast and infrequent and sometimes needs to be done on all components we
 * just check all components on each call.
 */
void
mirror_check_failfast(minor_t mnum)
{
	int		i;
	mm_unit_t	*un;

	if (md_ff_disable)
		return;

	un = MD_UNIT(mnum);

	for (i = 0; i < NMIRROR; i++) {
		int			ci;
		int			cnt;
		int			ff = 1;
		mm_submirror_t		*sm;
		mm_submirror_ic_t	*smic;
		void			(*get_dev)();

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);

		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
		for (ci = 0; ci < cnt; ci++) {
			int		found = 0;
			dev_t		ci_dev;
			major_t		major;
			dev_info_t	*devi;
			ms_cd_info_t	cd;

			/*
			 * this already returns the hs
			 * dev if the device is spared
			 */
			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

			ci_dev = md_dev64_to_dev(cd.cd_dev);
			major = getmajor(ci_dev);

			if (major == md_major) {
				/*
				 * this component must be a soft
				 * partition; get the real dev
				 */
				minor_t	dev_mnum;
				mdi_unit_t	*ui;
				mp_unit_t	*un;
				set_t	setno;
				side_t	side;
				md_dev64_t	tmpdev;

				ui = MDI_UNIT(getminor(ci_dev));

				/* grab necessary lock */
				un = (mp_unit_t *)md_unit_readerlock(ui);

				dev_mnum = MD_SID(un);
				setno = MD_MIN2SET(dev_mnum);
				side = mddb_getsidenum(setno);

				tmpdev = un->un_dev;

				/* Get dev by device id */
				if (md_devid_found(setno, side,
				    un->un_key) == 1) {
					tmpdev = md_resolve_bydevid(dev_mnum,
					    tmpdev, un->un_key);
				}

				md_unit_readerexit(ui);

				ci_dev = md_dev64_to_dev(tmpdev);
				major = getmajor(ci_dev);
			}

			if (ci_dev != NODEV32 &&
			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
			    != NULL) {
				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
				int		propvalue = 0;
				int		proplength = sizeof (int);
				int		error;
				struct cb_ops	*cb;

				if ((cb = devopsp[major]->devo_cb_ops) !=
				    NULL) {
					error = (*cb->cb_prop_op)
					    (DDI_DEV_T_ANY, devi, prop_op,
					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
					    "ddi-failfast-supported",
					    (caddr_t)&propvalue, &proplength);

					if (error == DDI_PROP_SUCCESS)
						found = 1;
				}

				if (!found && new_non_ff_driver(
				    ddi_driver_name(devi))) {
					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
					    "disabled on %s",
					    ddi_driver_name(devi));
				}

				ddi_release_devi(devi);
			}

			/*
			 * All components must support
			 * failfast in the submirror.
			 */
			if (!found) {
				ff = 0;
				break;
			}
		}

		if (ff) {
			sm->sm_flags |= MD_SM_FAILFAST;
		} else {
			sm->sm_flags &= ~MD_SM_FAILFAST;
		}
	}
}

/*
 * Return true if the submirror is unavailable.
 * If any of the submirror components are opened then the submirror cannot
 * be unavailable (MD_INACCESSIBLE).
 * If any of the components are already in the errored state, then the
 * submirror cannot be unavailable (MD_INACCESSIBLE).
 */
static bool_t
submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			compcnt;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
	for (ci = 0; ci < compcnt; ci++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, ci);
		if (from_probe) {
			if (shared->ms_flags & MDM_S_PROBEOPEN)
				return (B_FALSE);
		} else {
			if (shared->ms_flags & MDM_S_ISOPEN)
				return (B_FALSE);
		}
		if (shared->ms_state == CS_ERRED ||
		    shared->ms_state == CS_LAST_ERRED)
			return (B_FALSE);
	}

	return (B_TRUE);
}
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate static int
mirror_open_all_devs(minor_t mnum,int md_oflags,IOLOCK * lockp)11160Sstevel@tonic-gate mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
11170Sstevel@tonic-gate {
11180Sstevel@tonic-gate 	int		i;
11190Sstevel@tonic-gate 	mm_unit_t	*un;
11200Sstevel@tonic-gate 	mdi_unit_t	*ui;
11210Sstevel@tonic-gate 	int		err;
11220Sstevel@tonic-gate 	int		smi;
11230Sstevel@tonic-gate 	int		ci;
11240Sstevel@tonic-gate 	err_comp_t	*c;
11250Sstevel@tonic-gate 	err_comp_t	*ecomps = NULL;
11260Sstevel@tonic-gate 	int		smmask = 0;
11270Sstevel@tonic-gate 	set_t		setno;
11280Sstevel@tonic-gate 	int		sm_cnt;
11290Sstevel@tonic-gate 	int		sm_unavail_cnt;
11300Sstevel@tonic-gate 
11310Sstevel@tonic-gate 	mirror_check_failfast(mnum);
11320Sstevel@tonic-gate 
11330Sstevel@tonic-gate 	un = MD_UNIT(mnum);
11340Sstevel@tonic-gate 	ui = MDI_UNIT(mnum);
11350Sstevel@tonic-gate 	setno = MD_UN2SET(un);
11360Sstevel@tonic-gate 
11370Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
11380Sstevel@tonic-gate 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
11390Sstevel@tonic-gate 
11400Sstevel@tonic-gate 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
11410Sstevel@tonic-gate 			continue;
11420Sstevel@tonic-gate 		if (md_layered_open(mnum, &tmpdev, md_oflags))
11430Sstevel@tonic-gate 			smmask |= SMI2BIT(i);
11440Sstevel@tonic-gate 		un->un_sm[i].sm_dev = tmpdev;
11450Sstevel@tonic-gate 	}
11460Sstevel@tonic-gate 
11470Sstevel@tonic-gate 	/*
11480Sstevel@tonic-gate 	 * If smmask is clear, all submirrors are accessible. Clear the
11490Sstevel@tonic-gate 	 * MD_INACCESSIBLE bit in this case.  This bit is also cleared for the
11500Sstevel@tonic-gate 	 * mirror device.   If smmask is set, we have to determine which of the
11510Sstevel@tonic-gate 	 * submirrors are in error. If no submirror is accessible we mark the
11520Sstevel@tonic-gate 	 * whole mirror as MD_INACCESSIBLE.
11530Sstevel@tonic-gate 	 */
11540Sstevel@tonic-gate 	if (smmask == 0) {
11550Sstevel@tonic-gate 		if (lockp) {
11560Sstevel@tonic-gate 			md_ioctl_readerexit(lockp);
11570Sstevel@tonic-gate 			(void) md_ioctl_writerlock(lockp, ui);
11580Sstevel@tonic-gate 		} else {
11590Sstevel@tonic-gate 			md_unit_readerexit(ui);
11600Sstevel@tonic-gate 			(void) md_unit_writerlock(ui);
11610Sstevel@tonic-gate 		}
11620Sstevel@tonic-gate 		ui->ui_tstate &= ~MD_INACCESSIBLE;
11630Sstevel@tonic-gate 		if (lockp) {
11640Sstevel@tonic-gate 			md_ioctl_writerexit(lockp);
11650Sstevel@tonic-gate 			(void) md_ioctl_readerlock(lockp, ui);
11660Sstevel@tonic-gate 		} else {
11670Sstevel@tonic-gate 			md_unit_writerexit(ui);
11680Sstevel@tonic-gate 			(void) md_unit_readerlock(ui);
11690Sstevel@tonic-gate 		}
11700Sstevel@tonic-gate 
11710Sstevel@tonic-gate 		for (i = 0; i < NMIRROR; i++) {
11720Sstevel@tonic-gate 			md_dev64_t	tmpdev;
11730Sstevel@tonic-gate 			mdi_unit_t	*sm_ui;
11740Sstevel@tonic-gate 
11750Sstevel@tonic-gate 			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
11760Sstevel@tonic-gate 				continue;
11770Sstevel@tonic-gate 
11780Sstevel@tonic-gate 			tmpdev = un->un_sm[i].sm_dev;
11790Sstevel@tonic-gate 			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
11800Sstevel@tonic-gate 			(void) md_unit_writerlock(sm_ui);
11810Sstevel@tonic-gate 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
11820Sstevel@tonic-gate 			md_unit_writerexit(sm_ui);
11830Sstevel@tonic-gate 		}
11840Sstevel@tonic-gate 
11850Sstevel@tonic-gate 		return (0);
11860Sstevel@tonic-gate 	}
11870Sstevel@tonic-gate 
11880Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
11890Sstevel@tonic-gate 		md_dev64_t tmpdev;
11900Sstevel@tonic-gate 
11910Sstevel@tonic-gate 		if (!(smmask & SMI2BIT(i)))
11920Sstevel@tonic-gate 			continue;
11930Sstevel@tonic-gate 
11940Sstevel@tonic-gate 		tmpdev = un->un_sm[i].sm_dev;
11950Sstevel@tonic-gate 		err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
11960Sstevel@tonic-gate 		un->un_sm[i].sm_dev = tmpdev;
11970Sstevel@tonic-gate 		ASSERT(err == 0);
11980Sstevel@tonic-gate 	}
11990Sstevel@tonic-gate 
12000Sstevel@tonic-gate 	if (lockp) {
12010Sstevel@tonic-gate 		md_ioctl_readerexit(lockp);
12020Sstevel@tonic-gate 		un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
12030Sstevel@tonic-gate 	} else {
12040Sstevel@tonic-gate 		md_unit_readerexit(ui);
12050Sstevel@tonic-gate 		un = (mm_unit_t *)md_unit_writerlock(ui);
12060Sstevel@tonic-gate 	}
12070Sstevel@tonic-gate 
12080Sstevel@tonic-gate 	/*
12090Sstevel@tonic-gate 	 * We want to make sure the unavailable flag is not masking a real
12100Sstevel@tonic-gate 	 * error on the submirror.
12110Sstevel@tonic-gate 	 * For each submirror:
12120Sstevel@tonic-gate 	 *    if none of the submirror's components could be opened and
12130Sstevel@tonic-gate 	 *    there are no errors on the submirror, then set the unavailable
12140Sstevel@tonic-gate 	 *    flag; otherwise, clear it.
12150Sstevel@tonic-gate 	 */
12160Sstevel@tonic-gate 	sm_cnt = 0;
12170Sstevel@tonic-gate 	sm_unavail_cnt = 0;
12180Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
12190Sstevel@tonic-gate 		md_dev64_t	tmpdev;
12200Sstevel@tonic-gate 		mdi_unit_t	*sm_ui;
12210Sstevel@tonic-gate 
12220Sstevel@tonic-gate 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
12230Sstevel@tonic-gate 			continue;
12240Sstevel@tonic-gate 
12250Sstevel@tonic-gate 		sm_cnt++;
12260Sstevel@tonic-gate 		tmpdev = un->un_sm[i].sm_dev;
12270Sstevel@tonic-gate 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
12280Sstevel@tonic-gate 
12290Sstevel@tonic-gate 		(void) md_unit_writerlock(sm_ui);
12300Sstevel@tonic-gate 		if (submirror_unavailable(un, i, 0)) {
12310Sstevel@tonic-gate 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
12320Sstevel@tonic-gate 			sm_unavail_cnt++;
12330Sstevel@tonic-gate 		} else {
12340Sstevel@tonic-gate 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
12350Sstevel@tonic-gate 		}
12360Sstevel@tonic-gate 		md_unit_writerexit(sm_ui);
12370Sstevel@tonic-gate 	}
12380Sstevel@tonic-gate 
12390Sstevel@tonic-gate 	/*
12400Sstevel@tonic-gate 	 * If all of the submirrors are unavailable, the mirror is also
12410Sstevel@tonic-gate 	 * unavailable.
12420Sstevel@tonic-gate 	 */
12430Sstevel@tonic-gate 	if (sm_cnt == sm_unavail_cnt) {
12440Sstevel@tonic-gate 		ui->ui_tstate |= MD_INACCESSIBLE;
12450Sstevel@tonic-gate 	} else {
12460Sstevel@tonic-gate 		ui->ui_tstate &= ~MD_INACCESSIBLE;
12470Sstevel@tonic-gate 	}
12480Sstevel@tonic-gate 
12490Sstevel@tonic-gate 	smi = 0;
12500Sstevel@tonic-gate 	ci = 0;
12510Sstevel@tonic-gate 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
12520Sstevel@tonic-gate 		if (mirror_other_sources(un, smi, ci, 1) == 1) {
12530Sstevel@tonic-gate 
12540Sstevel@tonic-gate 			free_all_ecomps(ecomps);
12550Sstevel@tonic-gate 			(void) mirror_close_all_devs(un, md_oflags);
12560Sstevel@tonic-gate 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
12570Sstevel@tonic-gate 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
12580Sstevel@tonic-gate 			mirror_openfail_console_info(un, smi, ci);
12590Sstevel@tonic-gate 			if (lockp) {
12600Sstevel@tonic-gate 				md_ioctl_writerexit(lockp);
12610Sstevel@tonic-gate 				(void) md_ioctl_readerlock(lockp, ui);
12620Sstevel@tonic-gate 			} else {
12630Sstevel@tonic-gate 				md_unit_writerexit(ui);
12640Sstevel@tonic-gate 				(void) md_unit_readerlock(ui);
12650Sstevel@tonic-gate 			}
12660Sstevel@tonic-gate 			return (ENXIO);
12670Sstevel@tonic-gate 		}
12680Sstevel@tonic-gate 
12690Sstevel@tonic-gate 		/* track all component states that need changing */
12700Sstevel@tonic-gate 		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
12710Sstevel@tonic-gate 		c->ec_next = ecomps;
12720Sstevel@tonic-gate 		c->ec_smi = smi;
12730Sstevel@tonic-gate 		c->ec_ci = ci;
12740Sstevel@tonic-gate 		ecomps = c;
12750Sstevel@tonic-gate 		ci++;
12760Sstevel@tonic-gate 	}
12770Sstevel@tonic-gate 
12780Sstevel@tonic-gate 	/* Make all state changes and commit them */
12790Sstevel@tonic-gate 	for (c = ecomps; c != NULL; c = c->ec_next) {
12800Sstevel@tonic-gate 		/*
12810Sstevel@tonic-gate 		 * If lockp is set, then entering kernel through ioctl.
12820Sstevel@tonic-gate 		 * For a MN set, the only ioctl path is via a commd message
12830Sstevel@tonic-gate 		 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
12840Sstevel@tonic-gate 		 * being sent to each node.
12850Sstevel@tonic-gate 		 * In this case, set NO_XMIT so that set_sm_comp_state
12860Sstevel@tonic-gate 		 * won't attempt to send a message while already servicing one.
12870Sstevel@tonic-gate 		 *
12880Sstevel@tonic-gate 		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
12890Sstevel@tonic-gate 		 * which flag is passed.
12900Sstevel@tonic-gate 		 */
12910Sstevel@tonic-gate 		if (lockp) {
12920Sstevel@tonic-gate 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
12930Sstevel@tonic-gate 			    MD_STATE_NO_XMIT, lockp);
12940Sstevel@tonic-gate 		} else {
12950Sstevel@tonic-gate 			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
12960Sstevel@tonic-gate 			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
12970Sstevel@tonic-gate 		}
12980Sstevel@tonic-gate 		/*
12990Sstevel@tonic-gate 		 * For a MN set, the NOTIFY is done when the state change is
13000Sstevel@tonic-gate 		 * processed on each node
13010Sstevel@tonic-gate 		 */
13020Sstevel@tonic-gate 		if (!MD_MNSET_SETNO(setno)) {
13030Sstevel@tonic-gate 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
13040Sstevel@tonic-gate 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
13050Sstevel@tonic-gate 		}
13060Sstevel@tonic-gate 	}
13070Sstevel@tonic-gate 
13080Sstevel@tonic-gate 	if (lockp) {
13090Sstevel@tonic-gate 		md_ioctl_writerexit(lockp);
13100Sstevel@tonic-gate 		(void) md_ioctl_readerlock(lockp, ui);
13110Sstevel@tonic-gate 	} else {
13120Sstevel@tonic-gate 		md_unit_writerexit(ui);
13130Sstevel@tonic-gate 		(void) md_unit_readerlock(ui);
13140Sstevel@tonic-gate 	}
13150Sstevel@tonic-gate 
13160Sstevel@tonic-gate 	free_all_ecomps(ecomps);
13170Sstevel@tonic-gate 
13180Sstevel@tonic-gate 	/* allocate hotspares for all errored components */
13190Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno)) {
13200Sstevel@tonic-gate 		/*
13210Sstevel@tonic-gate 		 * If we're called from an ioctl (lockp set) then we cannot
13220Sstevel@tonic-gate 		 * directly call send_poke_hotspares as this will block until
13230Sstevel@tonic-gate 		 * the message gets despatched to all nodes. If the cluster is
13240Sstevel@tonic-gate 		 * going through a reconfig cycle then the message will block
13250Sstevel@tonic-gate 		 * until the cycle is complete, and as we originate from a
13260Sstevel@tonic-gate 		 * service call from commd we will livelock.
13270Sstevel@tonic-gate 		 */
13280Sstevel@tonic-gate 		if (lockp == NULL) {
13290Sstevel@tonic-gate 			md_unit_readerexit(ui);
13300Sstevel@tonic-gate 			send_poke_hotspares(setno);
13310Sstevel@tonic-gate 			(void) md_unit_readerlock(ui);
13320Sstevel@tonic-gate 		}
13330Sstevel@tonic-gate 	} else {
13340Sstevel@tonic-gate 		(void) poke_hotspares();
13350Sstevel@tonic-gate 	}
13360Sstevel@tonic-gate 	return (0);
13370Sstevel@tonic-gate }
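/*
 * Illustrative sketch (not additional driver code): mirror_open_all_devs()
 * is entered with the unit readerlock held, but must hold the writerlock
 * to modify ui_tstate.  The promote/demote sequence used above preserves
 * the caller's locking state on return; the ioctl path uses the
 * md_ioctl_* variants of the same calls:
 *
 *	md_unit_readerexit(ui);
 *	(void) md_unit_writerlock(ui);
 *	ui->ui_tstate &= ~MD_INACCESSIBLE;
 *	md_unit_writerexit(ui);
 *	(void) md_unit_readerlock(ui);
 */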
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate void
13406901Sjkennedy mirror_overlap_tree_remove(md_mps_t *ps)
13410Sstevel@tonic-gate {
13420Sstevel@tonic-gate 	mm_unit_t	*un;
13430Sstevel@tonic-gate 
13440Sstevel@tonic-gate 	if (panicstr)
13450Sstevel@tonic-gate 		return;
13460Sstevel@tonic-gate 
13476901Sjkennedy 	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
13480Sstevel@tonic-gate 	un = ps->ps_un;
13490Sstevel@tonic-gate 
13506901Sjkennedy 	mutex_enter(&un->un_overlap_tree_mx);
13516901Sjkennedy 	avl_remove(&un->un_overlap_root, ps);
13526901Sjkennedy 	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
13536901Sjkennedy 	if (un->un_overlap_tree_flag != 0) {
13546901Sjkennedy 		un->un_overlap_tree_flag = 0;
13556901Sjkennedy 		cv_broadcast(&un->un_overlap_tree_cv);
13560Sstevel@tonic-gate 	}
13576901Sjkennedy 	mutex_exit(&un->un_overlap_tree_mx);
13580Sstevel@tonic-gate }
13590Sstevel@tonic-gate 
13600Sstevel@tonic-gate 
13610Sstevel@tonic-gate /*
13620Sstevel@tonic-gate  * wait_for_overlaps:
13630Sstevel@tonic-gate  * -----------------
13640Sstevel@tonic-gate  * Check that the given i/o request does not overlap with already pending
13650Sstevel@tonic-gate  * i/o. If it does, block until the overlapping i/o completes.
13660Sstevel@tonic-gate  *
13670Sstevel@tonic-gate  * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
13686901Sjkennedy  * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
13696901Sjkennedy  * it must not already be in the tree.
13700Sstevel@tonic-gate  */
13710Sstevel@tonic-gate static void
13720Sstevel@tonic-gate wait_for_overlaps(md_mps_t *ps, int flags)
13730Sstevel@tonic-gate {
13740Sstevel@tonic-gate 	mm_unit_t	*un;
13756901Sjkennedy 	avl_index_t	where;
13766901Sjkennedy 	md_mps_t	*ps1;
13770Sstevel@tonic-gate 
13780Sstevel@tonic-gate 	if (panicstr)
13790Sstevel@tonic-gate 		return;
13800Sstevel@tonic-gate 
13810Sstevel@tonic-gate 	un = ps->ps_un;
13826901Sjkennedy 	mutex_enter(&un->un_overlap_tree_mx);
13830Sstevel@tonic-gate 	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
13840Sstevel@tonic-gate 	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
13856901Sjkennedy 		mutex_exit(&un->un_overlap_tree_mx);
13860Sstevel@tonic-gate 		return;
13870Sstevel@tonic-gate 	}
13886901Sjkennedy 
13896901Sjkennedy 	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
13906901Sjkennedy 
13916901Sjkennedy 	do {
13926901Sjkennedy 		ps1 = avl_find(&un->un_overlap_root, ps, &where);
13936901Sjkennedy 		if (ps1 == NULL) {
13946901Sjkennedy 			/*
13956901Sjkennedy 			 * The candidate range does not overlap with any
13966901Sjkennedy 			 * range in the tree.  Insert it and be done.
13976901Sjkennedy 			 */
13986901Sjkennedy 			avl_insert(&un->un_overlap_root, ps, where);
13996901Sjkennedy 			ps->ps_flags |= MD_MPS_ON_OVERLAP;
14006901Sjkennedy 		} else {
14016901Sjkennedy 			/*
14026901Sjkennedy 			 * The candidate range would overlap.  Set the flag
14036901Sjkennedy 			 * indicating we need to be woken up, and sleep
14046901Sjkennedy 			 * until another thread removes a range.  If upon
14056901Sjkennedy 			 * waking up we find this mps was put on the tree
14066901Sjkennedy 			 * by another thread, the loop terminates.
14076901Sjkennedy 			 */
14086901Sjkennedy 			un->un_overlap_tree_flag = 1;
14096901Sjkennedy 			cv_wait(&un->un_overlap_tree_cv,
14106901Sjkennedy 			    &un->un_overlap_tree_mx);
14110Sstevel@tonic-gate 		}
14126901Sjkennedy 	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
14136901Sjkennedy 	mutex_exit(&un->un_overlap_tree_mx);
14140Sstevel@tonic-gate }
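/*
 * Illustrative usage sketch (assumed caller, not actual driver code;
 * nblks is a hypothetical name for the request's block count).  A
 * writer brackets its i/o with the overlap calls so that writes to
 * intersecting block ranges are serialized:
 *
 *	ps->ps_firstblk = bp->b_lblkno;
 *	ps->ps_lastblk = bp->b_lblkno + nblks - 1;
 *	wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
 *	... issue the writes to the submirrors ...
 *	mirror_overlap_tree_remove(ps);		(from the done routine)
 */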
14150Sstevel@tonic-gate 
14160Sstevel@tonic-gate /*
14170Sstevel@tonic-gate  * This function is called from mirror_done to check whether any pages have
14180Sstevel@tonic-gate  * been modified while a mirrored write was in progress.  Returns 0 if
14190Sstevel@tonic-gate  * all pages associated with bp are clean, 1 otherwise.
14200Sstevel@tonic-gate  */
14210Sstevel@tonic-gate static int
14220Sstevel@tonic-gate any_pages_dirty(struct buf *bp)
14230Sstevel@tonic-gate {
14240Sstevel@tonic-gate 	int	rval;
14250Sstevel@tonic-gate 
14260Sstevel@tonic-gate 	rval = biomodified(bp);
14270Sstevel@tonic-gate 	if (rval == -1)
14280Sstevel@tonic-gate 		rval = 0;
14290Sstevel@tonic-gate 
14300Sstevel@tonic-gate 	return (rval);
14310Sstevel@tonic-gate }
14320Sstevel@tonic-gate 
14330Sstevel@tonic-gate #define	MAX_EXTRAS 10
14340Sstevel@tonic-gate 
14350Sstevel@tonic-gate void
14360Sstevel@tonic-gate mirror_commit(
14370Sstevel@tonic-gate 	mm_unit_t	*un,
14380Sstevel@tonic-gate 	int		smmask,
14390Sstevel@tonic-gate 	mddb_recid_t	*extras
14400Sstevel@tonic-gate )
14410Sstevel@tonic-gate {
14420Sstevel@tonic-gate 	mm_submirror_t		*sm;
14430Sstevel@tonic-gate 	md_unit_t		*su;
14440Sstevel@tonic-gate 	int			i;
14450Sstevel@tonic-gate 
14460Sstevel@tonic-gate 	/* 2=mirror,null id */
14470Sstevel@tonic-gate 	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];
14480Sstevel@tonic-gate 
14490Sstevel@tonic-gate 	int			ri = 0;
14500Sstevel@tonic-gate 
14510Sstevel@tonic-gate 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
14520Sstevel@tonic-gate 		return;
14530Sstevel@tonic-gate 
14540Sstevel@tonic-gate 	/* Add two, this includes the mirror unit and the null recid */
14550Sstevel@tonic-gate 	if (extras != NULL) {
14560Sstevel@tonic-gate 		int	nrecids = 0;
14570Sstevel@tonic-gate 		while (extras[nrecids] != 0) {
14580Sstevel@tonic-gate 			nrecids++;
14590Sstevel@tonic-gate 		}
14600Sstevel@tonic-gate 		ASSERT(nrecids <= MAX_EXTRAS);
14610Sstevel@tonic-gate 	}
14620Sstevel@tonic-gate 
14630Sstevel@tonic-gate 	if (un != NULL)
14640Sstevel@tonic-gate 		recids[ri++] = un->c.un_record_id;
14650Sstevel@tonic-gate 	for (i = 0;  i < NMIRROR; i++) {
14660Sstevel@tonic-gate 		if (!(smmask & SMI2BIT(i)))
14670Sstevel@tonic-gate 			continue;
14680Sstevel@tonic-gate 		sm = &un->un_sm[i];
14690Sstevel@tonic-gate 		if (!SMS_IS(sm, SMS_INUSE))
14700Sstevel@tonic-gate 			continue;
14710Sstevel@tonic-gate 		if (md_getmajor(sm->sm_dev) != md_major)
14720Sstevel@tonic-gate 			continue;
14730Sstevel@tonic-gate 		su =  MD_UNIT(md_getminor(sm->sm_dev));
14740Sstevel@tonic-gate 		recids[ri++] = su->c.un_record_id;
14750Sstevel@tonic-gate 	}
14760Sstevel@tonic-gate 
14770Sstevel@tonic-gate 	if (extras != NULL)
14780Sstevel@tonic-gate 		while (*extras != 0) {
14790Sstevel@tonic-gate 			recids[ri++] = *extras;
14800Sstevel@tonic-gate 			extras++;
14810Sstevel@tonic-gate 		}
14820Sstevel@tonic-gate 
14830Sstevel@tonic-gate 	if (ri == 0)
14840Sstevel@tonic-gate 		return;
14850Sstevel@tonic-gate 	recids[ri] = 0;
14860Sstevel@tonic-gate 
14870Sstevel@tonic-gate 	/*
14880Sstevel@tonic-gate 	 * Ok to hold ioctl lock across record commit to mddb as
14890Sstevel@tonic-gate 	 * long as the record(s) being committed aren't resync records.
14900Sstevel@tonic-gate 	 */
14910Sstevel@tonic-gate 	mddb_commitrecs_wrapper(recids);
14920Sstevel@tonic-gate }
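/*
 * Example (illustrative figures): for a two-way mirror whose metadevice
 * submirrors are both selected in smmask, with a single extra record,
 * the recids[] array built above ends up as
 *
 *	recids[0] = mirror unit record id
 *	recids[1] = submirror 0 unit record id
 *	recids[2] = submirror 1 unit record id
 *	recids[3] = extras[0]
 *	recids[4] = 0			(null terminator)
 *
 * which is why the array is sized NMIRROR+2+MAX_EXTRAS: up to NMIRROR
 * submirror records, plus the mirror unit and the terminating null,
 * plus up to MAX_EXTRAS extra records.
 */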
14930Sstevel@tonic-gate 
14940Sstevel@tonic-gate 
14950Sstevel@tonic-gate /*
14960Sstevel@tonic-gate  * This routine sets a bit in the writable_bm bitmap for each writable
14970Sstevel@tonic-gate  * submirror of the metamirror and counts the writable submirrors.
14980Sstevel@tonic-gate  * The bitmap is recorded in ps_writable_sm, the count in ps_active_cnt,
14990Sstevel@tonic-gate  * and ps_current_sm is reset to zero.
15020Sstevel@tonic-gate  */
15030Sstevel@tonic-gate 
15040Sstevel@tonic-gate static void
15050Sstevel@tonic-gate select_write_units(struct mm_unit *un, md_mps_t *ps)
15060Sstevel@tonic-gate {
15070Sstevel@tonic-gate 
15080Sstevel@tonic-gate 	int		i;
15090Sstevel@tonic-gate 	unsigned	writable_bm = 0;
15100Sstevel@tonic-gate 	unsigned	nunits = 0;
15110Sstevel@tonic-gate 
15120Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
15130Sstevel@tonic-gate 		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
15140Sstevel@tonic-gate 			/* set bit of all writable units */
15150Sstevel@tonic-gate 			writable_bm |= SMI2BIT(i);
15160Sstevel@tonic-gate 			nunits++;
15170Sstevel@tonic-gate 		}
15180Sstevel@tonic-gate 	}
15190Sstevel@tonic-gate 	ps->ps_writable_sm = writable_bm;
15200Sstevel@tonic-gate 	ps->ps_active_cnt = nunits;
15210Sstevel@tonic-gate 	ps->ps_current_sm = 0;
15220Sstevel@tonic-gate }
15230Sstevel@tonic-gate 
15240Sstevel@tonic-gate static
15250Sstevel@tonic-gate unsigned
15260Sstevel@tonic-gate select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
15270Sstevel@tonic-gate {
15280Sstevel@tonic-gate 
15290Sstevel@tonic-gate 	int		i;
15300Sstevel@tonic-gate 	unsigned	writable_bm = 0;
15310Sstevel@tonic-gate 	unsigned	nunits = 0;
15320Sstevel@tonic-gate 
15330Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
15340Sstevel@tonic-gate 		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
15350Sstevel@tonic-gate 		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
15360Sstevel@tonic-gate 			writable_bm |= SMI2BIT(i);
15370Sstevel@tonic-gate 			nunits++;
15380Sstevel@tonic-gate 		}
15390Sstevel@tonic-gate 	}
15400Sstevel@tonic-gate 	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
15410Sstevel@tonic-gate 		writable_bm &= ~ps->ps_allfrom_sm;
15420Sstevel@tonic-gate 		nunits--;
15430Sstevel@tonic-gate 	}
15440Sstevel@tonic-gate 	ps->ps_writable_sm = writable_bm;
15450Sstevel@tonic-gate 	ps->ps_active_cnt = nunits;
15460Sstevel@tonic-gate 	ps->ps_current_sm = 0;
15470Sstevel@tonic-gate 	return (nunits);
15480Sstevel@tonic-gate }
15490Sstevel@tonic-gate 
15500Sstevel@tonic-gate static md_dev64_t
15510Sstevel@tonic-gate select_read_unit(
15520Sstevel@tonic-gate 	mm_unit_t	*un,
15530Sstevel@tonic-gate 	diskaddr_t	blkno,
15540Sstevel@tonic-gate 	u_longlong_t	reqcount,
15550Sstevel@tonic-gate 	u_longlong_t	*cando,
15560Sstevel@tonic-gate 	int		must_be_opened,
15570Sstevel@tonic-gate 	md_m_shared_t	**shared,
15580Sstevel@tonic-gate 	md_mcs_t	*cs)
15590Sstevel@tonic-gate {
15600Sstevel@tonic-gate 	int			i;
15610Sstevel@tonic-gate 	md_m_shared_t		*s;
15620Sstevel@tonic-gate 	uint_t			lasterrcnt = 0;
15630Sstevel@tonic-gate 	md_dev64_t		dev = 0;
15640Sstevel@tonic-gate 	u_longlong_t		cnt;
15650Sstevel@tonic-gate 	u_longlong_t		mincnt;
15660Sstevel@tonic-gate 	mm_submirror_t		*sm;
15670Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
15680Sstevel@tonic-gate 	mdi_unit_t		*ui;
15690Sstevel@tonic-gate 
15700Sstevel@tonic-gate 	mincnt = reqcount;
15710Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
15720Sstevel@tonic-gate 		if (!SUBMIRROR_IS_READABLE(un, i))
15730Sstevel@tonic-gate 			continue;
15740Sstevel@tonic-gate 		sm = &un->un_sm[i];
15750Sstevel@tonic-gate 		smic = &un->un_smic[i];
15760Sstevel@tonic-gate 		cnt = reqcount;
15770Sstevel@tonic-gate 
15780Sstevel@tonic-gate 		/*
15790Sstevel@tonic-gate 		 * If the current submirror is marked as inaccessible, do not
15800Sstevel@tonic-gate 		 * try to access it.
15810Sstevel@tonic-gate 		 */
15820Sstevel@tonic-gate 		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
15830Sstevel@tonic-gate 		(void) md_unit_readerlock(ui);
15840Sstevel@tonic-gate 		if (ui->ui_tstate & MD_INACCESSIBLE) {
15850Sstevel@tonic-gate 			md_unit_readerexit(ui);
15860Sstevel@tonic-gate 			continue;
15870Sstevel@tonic-gate 		}
15880Sstevel@tonic-gate 		md_unit_readerexit(ui);
15890Sstevel@tonic-gate 
15900Sstevel@tonic-gate 		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
15910Sstevel@tonic-gate 		    (sm->sm_dev, sm, blkno, &cnt);
15920Sstevel@tonic-gate 
15930Sstevel@tonic-gate 		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
15940Sstevel@tonic-gate 			continue;
15950Sstevel@tonic-gate 		if (s->ms_state == CS_OKAY) {
15960Sstevel@tonic-gate 			*cando = cnt;
15970Sstevel@tonic-gate 			if (shared != NULL)
15980Sstevel@tonic-gate 				*shared = s;
15990Sstevel@tonic-gate 
16000Sstevel@tonic-gate 			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
16010Sstevel@tonic-gate 			    cs != NULL) {
16020Sstevel@tonic-gate 				cs->cs_buf.b_flags |= B_FAILFAST;
16030Sstevel@tonic-gate 			}
16040Sstevel@tonic-gate 
16050Sstevel@tonic-gate 			return (un->un_sm[i].sm_dev);
16060Sstevel@tonic-gate 		}
16070Sstevel@tonic-gate 		if (s->ms_state != CS_LAST_ERRED)
16080Sstevel@tonic-gate 			continue;
16090Sstevel@tonic-gate 
16100Sstevel@tonic-gate 		/* don't use B_FAILFAST since we're Last Erred */
16110Sstevel@tonic-gate 
16120Sstevel@tonic-gate 		if (mincnt > cnt)
16130Sstevel@tonic-gate 			mincnt = cnt;
16140Sstevel@tonic-gate 		if (s->ms_lasterrcnt > lasterrcnt) {
16150Sstevel@tonic-gate 			lasterrcnt = s->ms_lasterrcnt;
16160Sstevel@tonic-gate 			if (shared != NULL)
16170Sstevel@tonic-gate 				*shared = s;
16180Sstevel@tonic-gate 			dev = un->un_sm[i].sm_dev;
16190Sstevel@tonic-gate 		}
16200Sstevel@tonic-gate 	}
16210Sstevel@tonic-gate 	*cando = mincnt;
16220Sstevel@tonic-gate 	return (dev);
16230Sstevel@tonic-gate }
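/*
 * Illustrative example of the selection policy above: if the component
 * states at blkno across three readable submirrors are CS_LAST_ERRED
 * (ms_lasterrcnt 5), CS_OKAY and CS_LAST_ERRED (ms_lasterrcnt 9), the
 * CS_OKAY submirror wins as soon as it is found.  Only when no
 * component is CS_OKAY does the routine fall back to the Last Erred
 * component with the highest ms_lasterrcnt, with *cando clamped to the
 * smallest contiguous block count seen along the way.
 */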
16240Sstevel@tonic-gate 
16250Sstevel@tonic-gate /*
16260Sstevel@tonic-gate  * Given a 32-bit bitmap, this routine will return the bit number
16270Sstevel@tonic-gate  * of the nth set bit.	The value of n is passed via the index argument.
16280Sstevel@tonic-gate  *
16290Sstevel@tonic-gate  * This routine is used to run through the writable submirror bitmap
16300Sstevel@tonic-gate  * when starting all of the writes.  The value returned is the index
16310Sstevel@tonic-gate  * of the appropriate submirror structure in the un_sm
16320Sstevel@tonic-gate  * array for metamirrors.
16330Sstevel@tonic-gate  */
16340Sstevel@tonic-gate static int
16350Sstevel@tonic-gate md_find_nth_unit(uint_t mask, int index)
16360Sstevel@tonic-gate {
16370Sstevel@tonic-gate 	int	bit, nfound;
16380Sstevel@tonic-gate 
16390Sstevel@tonic-gate 	for (bit = -1, nfound = -1; nfound != index; bit++) {
16400Sstevel@tonic-gate 		ASSERT(mask != 0);
16410Sstevel@tonic-gate 		nfound += (mask & 1);
16420Sstevel@tonic-gate 		mask >>= 1;
16430Sstevel@tonic-gate 	}
16440Sstevel@tonic-gate 	return (bit);
16450Sstevel@tonic-gate }
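/*
 * Worked example: with mask = 0x16 (binary 10110, bits 1, 2 and 4 set)
 * and index = 1 (the second set bit), the loop skips clear bit 0,
 * counts bit 1 (nfound becomes 0) and bit 2 (nfound becomes 1), then
 * stops and returns 2.  The ASSERT fires if the mask contains fewer
 * than index+1 set bits.
 */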
16460Sstevel@tonic-gate 
16470Sstevel@tonic-gate static int
16480Sstevel@tonic-gate fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
16490Sstevel@tonic-gate {
16500Sstevel@tonic-gate 	mm_unit_t	*un;
16510Sstevel@tonic-gate 	buf_t		*bp;
16520Sstevel@tonic-gate 	int		i;
16530Sstevel@tonic-gate 	unsigned	nunits = 0;
16540Sstevel@tonic-gate 	int		iunit;
16550Sstevel@tonic-gate 	uint_t		running_bm = 0;
16560Sstevel@tonic-gate 	uint_t		sm_index;
16570Sstevel@tonic-gate 
16580Sstevel@tonic-gate 	bp = &cs->cs_buf;
16590Sstevel@tonic-gate 	un = ps->ps_un;
16600Sstevel@tonic-gate 
16610Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
16620Sstevel@tonic-gate 		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
16630Sstevel@tonic-gate 			continue;
16640Sstevel@tonic-gate 		running_bm |= SMI2BIT(i);
16650Sstevel@tonic-gate 		nunits++;
16660Sstevel@tonic-gate 	}
16670Sstevel@tonic-gate 	if (nunits == 0)
16680Sstevel@tonic-gate 		return (1);
16690Sstevel@tonic-gate 
16700Sstevel@tonic-gate 	/*
16710Sstevel@tonic-gate 	 * For directed mirror read (DMR) we only use the specified side and
16720Sstevel@tonic-gate 	 * do not compute the source of the read.
16738452SJohn.Wren.Kennedy@Sun.COM 	 * If we're running with MD_MPS_DIRTY_RD set we always return the
16748452SJohn.Wren.Kennedy@Sun.COM 	 * first mirror side (this prevents unnecessary ownership switching).
16758452SJohn.Wren.Kennedy@Sun.COM 	 * Otherwise we return the submirror according to the mirror read option.
16760Sstevel@tonic-gate 	 */
16770Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_DMR) {
16780Sstevel@tonic-gate 		sm_index = un->un_dmr_last_read;
16798452SJohn.Wren.Kennedy@Sun.COM 	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
16808452SJohn.Wren.Kennedy@Sun.COM 		sm_index = md_find_nth_unit(running_bm, 0);
16810Sstevel@tonic-gate 	} else {
16820Sstevel@tonic-gate 		/* Normal (non-DMR) operation */
16830Sstevel@tonic-gate 		switch (un->un_read_option) {
16840Sstevel@tonic-gate 		case RD_GEOMETRY:
16850Sstevel@tonic-gate 			iunit = (int)(bp->b_lblkno /
16860Sstevel@tonic-gate 			    howmany(un->c.un_total_blocks, nunits));
16870Sstevel@tonic-gate 			sm_index = md_find_nth_unit(running_bm, iunit);
16880Sstevel@tonic-gate 			break;
16890Sstevel@tonic-gate 		case RD_FIRST:
16900Sstevel@tonic-gate 			sm_index = md_find_nth_unit(running_bm, 0);
16910Sstevel@tonic-gate 			break;
16920Sstevel@tonic-gate 		case RD_LOAD_BAL:
16930Sstevel@tonic-gate 			/* intentionally falls through to the default case */
16940Sstevel@tonic-gate 		default:
16950Sstevel@tonic-gate 			un->un_last_read = (un->un_last_read + 1) % nunits;
16960Sstevel@tonic-gate 			sm_index = md_find_nth_unit(running_bm,
16970Sstevel@tonic-gate 			    un->un_last_read);
16980Sstevel@tonic-gate 			break;
16990Sstevel@tonic-gate 		}
17000Sstevel@tonic-gate 	}
17010Sstevel@tonic-gate 	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
17020Sstevel@tonic-gate 	ps->ps_allfrom_sm = SMI2BIT(sm_index);
17030Sstevel@tonic-gate 
17040Sstevel@tonic-gate 	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
17056901Sjkennedy 		bp->b_flags |= B_FAILFAST;
17060Sstevel@tonic-gate 	}
17070Sstevel@tonic-gate 
17080Sstevel@tonic-gate 	return (0);
17090Sstevel@tonic-gate }
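/*
 * Worked example for the RD_GEOMETRY case (illustrative figures): with
 * un->c.un_total_blocks = 1000 and nunits = 2, howmany(1000, 2) = 500,
 * so a read at b_lblkno = 700 computes iunit = 700 / 500 = 1 and the
 * second running submirror is chosen; reads are thus spread across the
 * submirrors by block range rather than round-robin.
 */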
17100Sstevel@tonic-gate 
17110Sstevel@tonic-gate static
17120Sstevel@tonic-gate int
17130Sstevel@tonic-gate mirror_are_submirrors_available(mm_unit_t *un)
17140Sstevel@tonic-gate {
17150Sstevel@tonic-gate 	int i;
17160Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
17170Sstevel@tonic-gate 		md_dev64_t tmpdev = un->un_sm[i].sm_dev;
17180Sstevel@tonic-gate 
17190Sstevel@tonic-gate 		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
17200Sstevel@tonic-gate 		    md_getmajor(tmpdev) != md_major)
17210Sstevel@tonic-gate 			continue;
17220Sstevel@tonic-gate 
17230Sstevel@tonic-gate 		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
17240Sstevel@tonic-gate 		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
17250Sstevel@tonic-gate 			return (0);
17260Sstevel@tonic-gate 
17270Sstevel@tonic-gate 		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
17280Sstevel@tonic-gate 			return (0);
17290Sstevel@tonic-gate 	}
17300Sstevel@tonic-gate 	return (1);
17310Sstevel@tonic-gate }
17320Sstevel@tonic-gate 
17330Sstevel@tonic-gate void
17340Sstevel@tonic-gate build_submirror(mm_unit_t *un, int i, int snarfing)
17350Sstevel@tonic-gate {
17360Sstevel@tonic-gate 	struct mm_submirror	*sm;
17370Sstevel@tonic-gate 	struct mm_submirror_ic	*smic;
17380Sstevel@tonic-gate 	md_unit_t		*su;
17390Sstevel@tonic-gate 	set_t			setno;
17400Sstevel@tonic-gate 
17410Sstevel@tonic-gate 	sm = &un->un_sm[i];
17420Sstevel@tonic-gate 	smic = &un->un_smic[i];
17430Sstevel@tonic-gate 
17440Sstevel@tonic-gate 	sm->sm_flags = 0; /* sometime we may need to do more here */
17450Sstevel@tonic-gate 
17460Sstevel@tonic-gate 	setno = MD_UN2SET(un);
17470Sstevel@tonic-gate 
17480Sstevel@tonic-gate 	if (!SMS_IS(sm, SMS_INUSE))
17490Sstevel@tonic-gate 		return;
17500Sstevel@tonic-gate 	if (snarfing) {
17510Sstevel@tonic-gate 		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
17526901Sjkennedy 		    sm->sm_key, MD_NOTRUST_DEVT);
17530Sstevel@tonic-gate 	} else {
17540Sstevel@tonic-gate 		if (md_getmajor(sm->sm_dev) == md_major) {
17550Sstevel@tonic-gate 			su = MD_UNIT(md_getminor(sm->sm_dev));
17560Sstevel@tonic-gate 			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
17570Sstevel@tonic-gate 			/* submirror can no longer be soft partitioned */
17580Sstevel@tonic-gate 			MD_CAPAB(su) &= (~MD_CAN_SP);
17590Sstevel@tonic-gate 		}
17600Sstevel@tonic-gate 	}
17610Sstevel@tonic-gate 	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
17620Sstevel@tonic-gate 	    0, "shared by blk", 0);
17630Sstevel@tonic-gate 	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
17640Sstevel@tonic-gate 	    0, "shared by indx", 0);
17656901Sjkennedy 	smic->sm_get_component_count = (int (*)())md_get_named_service(
17666901Sjkennedy 	    sm->sm_dev, 0, "get component count", 0);
17676901Sjkennedy 	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
17686901Sjkennedy 	    "get block count skip size", 0);
17690Sstevel@tonic-gate 	sm->sm_state &= ~SMS_IGNORE;
17700Sstevel@tonic-gate 	if (SMS_IS(sm, SMS_OFFLINE))
17710Sstevel@tonic-gate 		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
17720Sstevel@tonic-gate 	md_set_parent(sm->sm_dev, MD_SID(un));
17730Sstevel@tonic-gate }
17740Sstevel@tonic-gate 
17750Sstevel@tonic-gate static void
17760Sstevel@tonic-gate mirror_cleanup(mm_unit_t *un)
17770Sstevel@tonic-gate {
17780Sstevel@tonic-gate 	mddb_recid_t	recid;
17790Sstevel@tonic-gate 	int		smi;
17800Sstevel@tonic-gate 	sv_dev_t	sv[NMIRROR];
17810Sstevel@tonic-gate 	int		nsv = 0;
17820Sstevel@tonic-gate 
17830Sstevel@tonic-gate 	/*
17840Sstevel@tonic-gate 	 * If a MN diskset and this node is not the master, do
17850Sstevel@tonic-gate 	 * not delete any records on snarf of the mirror records.
17860Sstevel@tonic-gate 	 */
17870Sstevel@tonic-gate 	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
17880Sstevel@tonic-gate 	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
17890Sstevel@tonic-gate 		return;
17900Sstevel@tonic-gate 	}
17910Sstevel@tonic-gate 
17920Sstevel@tonic-gate 	for (smi = 0; smi < NMIRROR; smi++) {
17930Sstevel@tonic-gate 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
17940Sstevel@tonic-gate 			continue;
17950Sstevel@tonic-gate 		sv[nsv].setno = MD_UN2SET(un);
17960Sstevel@tonic-gate 		sv[nsv++].key = un->un_sm[smi].sm_key;
17970Sstevel@tonic-gate 	}
17980Sstevel@tonic-gate 
17990Sstevel@tonic-gate 	recid = un->un_rr_dirty_recid;
18000Sstevel@tonic-gate 	mddb_deleterec_wrapper(un->c.un_record_id);
18010Sstevel@tonic-gate 	if (recid > 0)
18020Sstevel@tonic-gate 		mddb_deleterec_wrapper(recid);
18030Sstevel@tonic-gate 
18040Sstevel@tonic-gate 	md_rem_names(sv, nsv);
18050Sstevel@tonic-gate }
18060Sstevel@tonic-gate 
18076901Sjkennedy /*
18086901Sjkennedy  * Comparison function for the avl tree which tracks
18096901Sjkennedy  * outstanding writes on submirrors.
18106901Sjkennedy  *
18116901Sjkennedy  * Returns:
18126901Sjkennedy  *	-1: ps1 < ps2
18136901Sjkennedy  *	 0: ps1 and ps2 overlap
18146901Sjkennedy  *	 1: ps1 > ps2
18156901Sjkennedy  */
18166901Sjkennedy static int
18176901Sjkennedy mirror_overlap_compare(const void *p1, const void *p2)
18186901Sjkennedy {
18196901Sjkennedy 	const md_mps_t *ps1 = (md_mps_t *)p1;
18206901Sjkennedy 	const md_mps_t *ps2 = (md_mps_t *)p2;
18216901Sjkennedy 
18226901Sjkennedy 	if (ps1->ps_firstblk < ps2->ps_firstblk) {
18236901Sjkennedy 		if (ps1->ps_lastblk >= ps2->ps_firstblk)
18246901Sjkennedy 			return (0);
18256901Sjkennedy 		return (-1);
18266901Sjkennedy 	}
18276901Sjkennedy 
18286901Sjkennedy 	if (ps1->ps_firstblk > ps2->ps_firstblk) {
18296901Sjkennedy 		if (ps1->ps_firstblk <= ps2->ps_lastblk)
18306901Sjkennedy 			return (0);
18316901Sjkennedy 		return (1);
18326901Sjkennedy 	}
18336901Sjkennedy 
18346901Sjkennedy 	return (0);
18356901Sjkennedy }
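/*
 * Example (illustrative block ranges): parent structures spanning
 * [100, 199] and [150, 250] compare as 0 because the ranges intersect,
 * so avl_find() in wait_for_overlaps() reports the pending request as
 * a match; disjoint ranges [100, 199] and [200, 250] compare as -1/1
 * and may coexist in the tree.  Treating "overlapping" as "equal" is
 * what lets a standard AVL tree serve as an interval-conflict detector.
 */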
18366901Sjkennedy 
183710948SJames.Hall@Sun.COM /*
183810948SJames.Hall@Sun.COM  * Collapse any sparse submirror entries snarfed from the on-disk replica.
183910948SJames.Hall@Sun.COM  * Only the in-core entries are updated. The replica will be updated on-disk
184010948SJames.Hall@Sun.COM  * when the in-core replica is committed on shutdown of the SVM subsystem.
184110948SJames.Hall@Sun.COM  */
184210948SJames.Hall@Sun.COM static void
184310948SJames.Hall@Sun.COM collapse_submirrors(mm_unit_t *un)
184410948SJames.Hall@Sun.COM {
184510948SJames.Hall@Sun.COM 	int			smi, nremovals, smiremove;
184610948SJames.Hall@Sun.COM 	mm_submirror_t		*sm, *new_sm, *old_sm;
184710948SJames.Hall@Sun.COM 	mm_submirror_ic_t	*smic;
184810948SJames.Hall@Sun.COM 	int			nsmidx = un->un_nsm - 1;
184910948SJames.Hall@Sun.COM 
185010948SJames.Hall@Sun.COM rescan:
185110948SJames.Hall@Sun.COM 	nremovals = 0;
185210948SJames.Hall@Sun.COM 	smiremove = -1;
185310948SJames.Hall@Sun.COM 
185410948SJames.Hall@Sun.COM 	for (smi = 0; smi <= nsmidx; smi++) {
185510948SJames.Hall@Sun.COM 		sm = &un->un_sm[smi];
185610948SJames.Hall@Sun.COM 
185710948SJames.Hall@Sun.COM 		/*
185810948SJames.Hall@Sun.COM 		 * Check to see if this submirror is marked as in-use.
185910948SJames.Hall@Sun.COM 		 * If it isn't then it is a potential sparse entry and
186010948SJames.Hall@Sun.COM 		 * may need to be cleared from the configuration.
186110948SJames.Hall@Sun.COM 		 * The records should _already_ have been cleared by the
186210948SJames.Hall@Sun.COM 		 * original mirror_detach() code, but we need to shuffle
186310948SJames.Hall@Sun.COM 		 * any NULL entries in un_sm[] to the end of the array.
186410948SJames.Hall@Sun.COM 		 * Any NULL un_smic[] entries need to be reset to the underlying
186510948SJames.Hall@Sun.COM 		 * submirror/slice accessor functions.
186610948SJames.Hall@Sun.COM 		 */
186710948SJames.Hall@Sun.COM 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
186810948SJames.Hall@Sun.COM 			nremovals++;
186910948SJames.Hall@Sun.COM 			smiremove = smi;
187010948SJames.Hall@Sun.COM 			break;
187110948SJames.Hall@Sun.COM 		}
187210948SJames.Hall@Sun.COM 	}
187310948SJames.Hall@Sun.COM 
187410948SJames.Hall@Sun.COM 	if (nremovals == 0) {
187510948SJames.Hall@Sun.COM 		/*
187610948SJames.Hall@Sun.COM 		 * Ensure that we have a matching contiguous set of un_smic[]
187710948SJames.Hall@Sun.COM 		 * entries for the corresponding un_sm[] entries
187810948SJames.Hall@Sun.COM 		 */
187910948SJames.Hall@Sun.COM 		for (smi = 0; smi <= nsmidx; smi++) {
188010948SJames.Hall@Sun.COM 			smic = &un->un_smic[smi];
188110948SJames.Hall@Sun.COM 			sm = &un->un_sm[smi];
188210948SJames.Hall@Sun.COM 
188310948SJames.Hall@Sun.COM 			smic->sm_shared_by_blk =
188410948SJames.Hall@Sun.COM 			    md_get_named_service(sm->sm_dev, 0,
188510948SJames.Hall@Sun.COM 			    "shared by blk", 0);
188610948SJames.Hall@Sun.COM 			smic->sm_shared_by_indx =
188710948SJames.Hall@Sun.COM 			    md_get_named_service(sm->sm_dev, 0,
188810948SJames.Hall@Sun.COM 			    "shared by indx", 0);
188910948SJames.Hall@Sun.COM 			smic->sm_get_component_count =
189010948SJames.Hall@Sun.COM 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
189110948SJames.Hall@Sun.COM 			    "get component count", 0);
189210948SJames.Hall@Sun.COM 			smic->sm_get_bcss =
189310948SJames.Hall@Sun.COM 			    (int (*)())md_get_named_service(sm->sm_dev, 0,
189410948SJames.Hall@Sun.COM 			    "get block count skip size", 0);
189510948SJames.Hall@Sun.COM 		}
189610948SJames.Hall@Sun.COM 		return;
189710948SJames.Hall@Sun.COM 	}
189810948SJames.Hall@Sun.COM 
189910948SJames.Hall@Sun.COM 	/*
190010948SJames.Hall@Sun.COM 	 * Reshuffle the submirror devices so that we do not have a dead record
190110948SJames.Hall@Sun.COM 	 * in the middle of the array. Once we've done this we need to rescan
190210948SJames.Hall@Sun.COM 	 * the mirror to check for any other holes.
190310948SJames.Hall@Sun.COM 	 */
190410948SJames.Hall@Sun.COM 	for (smi = 0; smi < NMIRROR; smi++) {
190510948SJames.Hall@Sun.COM 		if (smi < smiremove)
190610948SJames.Hall@Sun.COM 			continue;
190710948SJames.Hall@Sun.COM 		if (smi > smiremove) {
190810948SJames.Hall@Sun.COM 			old_sm = &un->un_sm[smi];
190910948SJames.Hall@Sun.COM 			new_sm = &un->un_sm[smi - 1];
191010948SJames.Hall@Sun.COM 			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
191110948SJames.Hall@Sun.COM 			bzero(old_sm, sizeof (mm_submirror_t));
191210948SJames.Hall@Sun.COM 		}
191310948SJames.Hall@Sun.COM 	}
191410948SJames.Hall@Sun.COM 
191510948SJames.Hall@Sun.COM 	/*
191610948SJames.Hall@Sun.COM 	 * Now we need to rescan the array to find the next potential dead
191710948SJames.Hall@Sun.COM 	 * entry.
191810948SJames.Hall@Sun.COM 	 */
191910948SJames.Hall@Sun.COM 	goto rescan;
192010948SJames.Hall@Sun.COM }
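/*
 * Example (illustrative): if the middle submirror of a three-way
 * mirror was detached while the on-disk replica was stale, un_sm[]
 * may snarf as { sm_A, <not INUSE>, sm_C }.  The shuffle above copies
 * sm_C down one slot and zeroes its old entry, yielding
 * { sm_A, sm_C, 0 }, and the rescan then finds no further holes, at
 * which point the un_smic[] service pointers are re-fetched for the
 * now-contiguous entries.
 */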
192110948SJames.Hall@Sun.COM 
19220Sstevel@tonic-gate /* Return a -1 if optimized record unavailable and set should be released */
19230Sstevel@tonic-gate int
19240Sstevel@tonic-gate mirror_build_incore(mm_unit_t *un, int snarfing)
19250Sstevel@tonic-gate {
19260Sstevel@tonic-gate 	int		i;
19270Sstevel@tonic-gate 
19280Sstevel@tonic-gate 	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
19290Sstevel@tonic-gate 		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
19300Sstevel@tonic-gate 		return (1);
19310Sstevel@tonic-gate 	}
19320Sstevel@tonic-gate 
19330Sstevel@tonic-gate 	if (mirror_are_submirrors_available(un) == 0)
19340Sstevel@tonic-gate 		return (1);
19350Sstevel@tonic-gate 
19360Sstevel@tonic-gate 	if (MD_UNIT(MD_SID(un)) != NULL)
19370Sstevel@tonic-gate 		return (0);
19380Sstevel@tonic-gate 
19390Sstevel@tonic-gate 	MD_STATUS(un) = 0;
19400Sstevel@tonic-gate 
19410Sstevel@tonic-gate 	/* pre-4.1 didn't define CAN_META_CHILD capability */
19420Sstevel@tonic-gate 	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
19430Sstevel@tonic-gate 
19446901Sjkennedy 	un->un_overlap_tree_flag = 0;
19456901Sjkennedy 	avl_create(&un->un_overlap_root, mirror_overlap_compare,
19466901Sjkennedy 	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
19470Sstevel@tonic-gate 
194810948SJames.Hall@Sun.COM 	/*
194910948SJames.Hall@Sun.COM 	 * We need to collapse any sparse submirror entries into a non-sparse
195010948SJames.Hall@Sun.COM 	 * array. This is to cover the case where we have an old replica image
195110948SJames.Hall@Sun.COM 	 * which has not been updated (i.e. snarfed) since being modified.
195210948SJames.Hall@Sun.COM 	 * The new code expects all submirror access to be sequential (i.e.
195310948SJames.Hall@Sun.COM 	 * both the un_sm[] and un_smic[] entries correspond to non-empty
195410948SJames.Hall@Sun.COM 	 * submirrors).
195510948SJames.Hall@Sun.COM 	 */
195610948SJames.Hall@Sun.COM 
195710948SJames.Hall@Sun.COM 	collapse_submirrors(un);
195810948SJames.Hall@Sun.COM 
19590Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++)
19600Sstevel@tonic-gate 		build_submirror(un, i, snarfing);
19610Sstevel@tonic-gate 
19620Sstevel@tonic-gate 	if (unit_setup_resync(un, snarfing) != 0) {
19630Sstevel@tonic-gate 		if (snarfing) {
19640Sstevel@tonic-gate 			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
19650Sstevel@tonic-gate 			/*
19660Sstevel@tonic-gate 			 * If a MN set and set is not stale, then return -1
19670Sstevel@tonic-gate 			 * which will force the caller to unload the set.
19680Sstevel@tonic-gate 			 * The MN diskset nodes will return failure if
19690Sstevel@tonic-gate 			 * unit_setup_resync fails so that nodes won't
19700Sstevel@tonic-gate 			 * get out of sync.
19710Sstevel@tonic-gate 			 *
19720Sstevel@tonic-gate 			 * If set is STALE, the master node can't allocate
19730Sstevel@tonic-gate 			 * a resync record (if needed), but node needs to
19740Sstevel@tonic-gate 			 * join the set so that user can delete broken mddbs.
19750Sstevel@tonic-gate 			 * So, if set is STALE, just continue on.
19760Sstevel@tonic-gate 			 */
19770Sstevel@tonic-gate 			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
19780Sstevel@tonic-gate 			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
19790Sstevel@tonic-gate 				return (-1);
19800Sstevel@tonic-gate 			}
19810Sstevel@tonic-gate 		} else
19820Sstevel@tonic-gate 			return (1);
19830Sstevel@tonic-gate 	}
19840Sstevel@tonic-gate 
19856901Sjkennedy 	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
19866901Sjkennedy 	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
19870Sstevel@tonic-gate 
19880Sstevel@tonic-gate 	un->un_suspend_wr_flag = 0;
19890Sstevel@tonic-gate 	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
19900Sstevel@tonic-gate 	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
19910Sstevel@tonic-gate 
19920Sstevel@tonic-gate 	/*
19930Sstevel@tonic-gate 	 * Allocate mutexes for mirror-owner and resync-owner changes.
19940Sstevel@tonic-gate 	 * All references to the owner message state field must be guarded
19950Sstevel@tonic-gate 	 * by this mutex.
19960Sstevel@tonic-gate 	 */
19970Sstevel@tonic-gate 	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
19980Sstevel@tonic-gate 
19990Sstevel@tonic-gate 	/*
20000Sstevel@tonic-gate 	 * Allocate mutex and condvar for resync thread manipulation. These
20010Sstevel@tonic-gate 	 * will be used by mirror_resync_unit/mirror_ioctl_resync
20020Sstevel@tonic-gate 	 */
20030Sstevel@tonic-gate 	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
20040Sstevel@tonic-gate 	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
20050Sstevel@tonic-gate 
20060Sstevel@tonic-gate 	/*
20070Sstevel@tonic-gate 	 * Allocate mutex and condvar for resync progress thread manipulation.
20080Sstevel@tonic-gate 	 * This allows resyncs to be continued across an intervening reboot.
20090Sstevel@tonic-gate 	 */
20100Sstevel@tonic-gate 	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
20110Sstevel@tonic-gate 	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
20120Sstevel@tonic-gate 
20130Sstevel@tonic-gate 	/*
20140Sstevel@tonic-gate 	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
20150Sstevel@tonic-gate 	 * provides synchronization between a user-ioctl and the resulting
20160Sstevel@tonic-gate 	 * strategy() call that performs the read().
20170Sstevel@tonic-gate 	 */
20180Sstevel@tonic-gate 	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
20190Sstevel@tonic-gate 	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
20200Sstevel@tonic-gate 
20218452SJohn.Wren.Kennedy@Sun.COM 	/*
20228452SJohn.Wren.Kennedy@Sun.COM 	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
20238452SJohn.Wren.Kennedy@Sun.COM 	 */
20248452SJohn.Wren.Kennedy@Sun.COM 	for (i = 0; i < MD_MNMAXSIDES; i++) {
20258452SJohn.Wren.Kennedy@Sun.COM 		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
20268452SJohn.Wren.Kennedy@Sun.COM 	}
20278452SJohn.Wren.Kennedy@Sun.COM 
20287627SChris.Horne@Sun.COM 	/* place various information in the in-core data structures */
20297627SChris.Horne@Sun.COM 	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
20300Sstevel@tonic-gate 	MD_UNIT(MD_SID(un)) = un;
20317627SChris.Horne@Sun.COM 
20320Sstevel@tonic-gate 	return (0);
20330Sstevel@tonic-gate }
20340Sstevel@tonic-gate 
20350Sstevel@tonic-gate 
20360Sstevel@tonic-gate void
20370Sstevel@tonic-gate reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
20380Sstevel@tonic-gate {
20390Sstevel@tonic-gate 	mddb_recid_t	recid, vtoc_id;
20400Sstevel@tonic-gate 	size_t		bitcnt;
20410Sstevel@tonic-gate 	size_t		shortcnt;
20420Sstevel@tonic-gate 	int		smi;
20430Sstevel@tonic-gate 	sv_dev_t	sv[NMIRROR];
20440Sstevel@tonic-gate 	int		nsv = 0;
20450Sstevel@tonic-gate 	uint_t		bits = 0;
20460Sstevel@tonic-gate 	minor_t		selfid;
20470Sstevel@tonic-gate 	md_unit_t	*su;
20488452SJohn.Wren.Kennedy@Sun.COM 	int		i;
20490Sstevel@tonic-gate 
20500Sstevel@tonic-gate 	md_destroy_unit_incore(mnum, &mirror_md_ops);
20510Sstevel@tonic-gate 
20520Sstevel@tonic-gate 	shortcnt = un->un_rrd_num * sizeof (short);
20530Sstevel@tonic-gate 	bitcnt = howmany(un->un_rrd_num, NBBY);
20540Sstevel@tonic-gate 
20550Sstevel@tonic-gate 	if (un->un_outstanding_writes)
20560Sstevel@tonic-gate 		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
20570Sstevel@tonic-gate 	if (un->un_goingclean_bm)
20580Sstevel@tonic-gate 		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
20590Sstevel@tonic-gate 	if (un->un_goingdirty_bm)
20600Sstevel@tonic-gate 		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
20610Sstevel@tonic-gate 	if (un->un_resync_bm)
20620Sstevel@tonic-gate 		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
20638452SJohn.Wren.Kennedy@Sun.COM 	if (un->un_pernode_dirty_sum)
20648452SJohn.Wren.Kennedy@Sun.COM 		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
20658452SJohn.Wren.Kennedy@Sun.COM 
20668452SJohn.Wren.Kennedy@Sun.COM 	/*
20678452SJohn.Wren.Kennedy@Sun.COM 	 * Destroy the taskq for deferred processing of DRL clean requests.
20688452SJohn.Wren.Kennedy@Sun.COM 	 * This taskq will only be present for Multi Owner mirrors.
20698452SJohn.Wren.Kennedy@Sun.COM 	 */
20708452SJohn.Wren.Kennedy@Sun.COM 	if (un->un_drl_task != NULL)
20718452SJohn.Wren.Kennedy@Sun.COM 		ddi_taskq_destroy(un->un_drl_task);
20720Sstevel@tonic-gate 
20737627SChris.Horne@Sun.COM 	md_nblocks_set(mnum, -1ULL);
20740Sstevel@tonic-gate 	MD_UNIT(mnum) = NULL;
20750Sstevel@tonic-gate 
20761623Stw21770 	/*
20771623Stw21770 	 * Attempt release of its minor node
20781623Stw21770 	 */
20792077Stw21770 	md_remove_minor_node(mnum);
20801623Stw21770 
20810Sstevel@tonic-gate 	if (!removing)
20820Sstevel@tonic-gate 		return;
20830Sstevel@tonic-gate 
20840Sstevel@tonic-gate 	for (smi = 0; smi < NMIRROR; smi++) {
20850Sstevel@tonic-gate 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
20860Sstevel@tonic-gate 			continue;
20870Sstevel@tonic-gate 		/* reallow soft partitioning of submirror and reset parent */
20880Sstevel@tonic-gate 		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
20890Sstevel@tonic-gate 		MD_CAPAB(su) |= MD_CAN_SP;
20900Sstevel@tonic-gate 		md_reset_parent(un->un_sm[smi].sm_dev);
20910Sstevel@tonic-gate 		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
20920Sstevel@tonic-gate 
20930Sstevel@tonic-gate 		sv[nsv].setno = MD_MIN2SET(mnum);
20940Sstevel@tonic-gate 		sv[nsv++].key = un->un_sm[smi].sm_key;
20950Sstevel@tonic-gate 		bits |= SMI2BIT(smi);
20960Sstevel@tonic-gate 	}
20970Sstevel@tonic-gate 
20980Sstevel@tonic-gate 	MD_STATUS(un) |= MD_UN_BEING_RESET;
20990Sstevel@tonic-gate 	recid = un->un_rr_dirty_recid;
21000Sstevel@tonic-gate 	vtoc_id = un->c.un_vtoc_id;
21010Sstevel@tonic-gate 	selfid = MD_SID(un);
21020Sstevel@tonic-gate 
21030Sstevel@tonic-gate 	mirror_commit(un, bits, 0);
21040Sstevel@tonic-gate 
21056901Sjkennedy 	avl_destroy(&un->un_overlap_root);
21066901Sjkennedy 
21070Sstevel@tonic-gate 	/* Destroy all mutexes and condvars before returning. */
21080Sstevel@tonic-gate 	mutex_destroy(&un->un_suspend_wr_mx);
21090Sstevel@tonic-gate 	cv_destroy(&un->un_suspend_wr_cv);
21106901Sjkennedy 	mutex_destroy(&un->un_overlap_tree_mx);
21116901Sjkennedy 	cv_destroy(&un->un_overlap_tree_cv);
21120Sstevel@tonic-gate 	mutex_destroy(&un->un_owner_mx);
21130Sstevel@tonic-gate 	mutex_destroy(&un->un_rs_thread_mx);
21140Sstevel@tonic-gate 	cv_destroy(&un->un_rs_thread_cv);
21150Sstevel@tonic-gate 	mutex_destroy(&un->un_rs_progress_mx);
21160Sstevel@tonic-gate 	cv_destroy(&un->un_rs_progress_cv);
21170Sstevel@tonic-gate 	mutex_destroy(&un->un_dmr_mx);
21180Sstevel@tonic-gate 	cv_destroy(&un->un_dmr_cv);
21191623Stw21770 
21208452SJohn.Wren.Kennedy@Sun.COM 	for (i = 0; i < MD_MNMAXSIDES; i++) {
21218452SJohn.Wren.Kennedy@Sun.COM 		rw_destroy(&un->un_pernode_dirty_mx[i]);
21228452SJohn.Wren.Kennedy@Sun.COM 		if (un->un_pernode_dirty_bm[i])
21238452SJohn.Wren.Kennedy@Sun.COM 			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
21248452SJohn.Wren.Kennedy@Sun.COM 	}
21258452SJohn.Wren.Kennedy@Sun.COM 
21261623Stw21770 	/*
21271623Stw21770 	 * Remove self from the namespace
21281623Stw21770 	 */
21291623Stw21770 	if (un->c.un_revision & MD_FN_META_DEV) {
21301623Stw21770 		(void) md_rem_selfname(un->c.un_self_id);
21311623Stw21770 	}
21321623Stw21770 
21338452SJohn.Wren.Kennedy@Sun.COM 	/* This frees the unit structure. */
21340Sstevel@tonic-gate 	mddb_deleterec_wrapper(un->c.un_record_id);
21358452SJohn.Wren.Kennedy@Sun.COM 
21360Sstevel@tonic-gate 	if (recid != 0)
21370Sstevel@tonic-gate 		mddb_deleterec_wrapper(recid);
21380Sstevel@tonic-gate 
21390Sstevel@tonic-gate 	/* Remove the vtoc, if present */
21400Sstevel@tonic-gate 	if (vtoc_id)
21410Sstevel@tonic-gate 		mddb_deleterec_wrapper(vtoc_id);
21420Sstevel@tonic-gate 
21430Sstevel@tonic-gate 	md_rem_names(sv, nsv);
21440Sstevel@tonic-gate 
21450Sstevel@tonic-gate 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
21460Sstevel@tonic-gate 	    MD_MIN2SET(selfid), selfid);
21470Sstevel@tonic-gate }
21480Sstevel@tonic-gate 
21490Sstevel@tonic-gate int
21500Sstevel@tonic-gate mirror_internal_open(
21510Sstevel@tonic-gate 	minor_t		mnum,
21520Sstevel@tonic-gate 	int		flag,
21530Sstevel@tonic-gate 	int		otyp,
21540Sstevel@tonic-gate 	int		md_oflags,
21550Sstevel@tonic-gate 	IOLOCK		*lockp		/* can be NULL */
21560Sstevel@tonic-gate )
21570Sstevel@tonic-gate {
21580Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
21590Sstevel@tonic-gate 	int		err = 0;
21600Sstevel@tonic-gate 
21610Sstevel@tonic-gate tryagain:
21620Sstevel@tonic-gate 	/* single thread */
21630Sstevel@tonic-gate 	if (lockp) {
21640Sstevel@tonic-gate 		/*
21650Sstevel@tonic-gate 		 * If ioctl lock is held, use openclose_enter
21660Sstevel@tonic-gate 		 * routine that will set the ioctl flag when
21670Sstevel@tonic-gate 		 * grabbing the readerlock.
21680Sstevel@tonic-gate 		 */
21690Sstevel@tonic-gate 		(void) md_ioctl_openclose_enter(lockp, ui);
21700Sstevel@tonic-gate 	} else {
21710Sstevel@tonic-gate 		(void) md_unit_openclose_enter(ui);
21720Sstevel@tonic-gate 	}
21730Sstevel@tonic-gate 
21740Sstevel@tonic-gate 	/*
21750Sstevel@tonic-gate 	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
21760Sstevel@tonic-gate 	 * message in a MN diskset and this requires that the openclose
21770Sstevel@tonic-gate 	 * lock is dropped in order to send this message.  So, another
21780Sstevel@tonic-gate 	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
21790Sstevel@tonic-gate 	 * attempting an open while this thread has an open in progress.
21800Sstevel@tonic-gate 	 * Call the *_lh version of the lock exit routines since the ui_mx
21810Sstevel@tonic-gate 	 * mutex must be held from checking for OPENINPROGRESS until
21820Sstevel@tonic-gate 	 * after the cv_wait call.
21830Sstevel@tonic-gate 	 */
21840Sstevel@tonic-gate 	mutex_enter(&ui->ui_mx);
21850Sstevel@tonic-gate 	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
21860Sstevel@tonic-gate 		if (lockp) {
21870Sstevel@tonic-gate 			(void) md_ioctl_openclose_exit_lh(lockp);
21880Sstevel@tonic-gate 		} else {
21890Sstevel@tonic-gate 			md_unit_openclose_exit_lh(ui);
21900Sstevel@tonic-gate 		}
21910Sstevel@tonic-gate 		cv_wait(&ui->ui_cv, &ui->ui_mx);
21920Sstevel@tonic-gate 		mutex_exit(&ui->ui_mx);
21930Sstevel@tonic-gate 		goto tryagain;
21940Sstevel@tonic-gate 	}
21950Sstevel@tonic-gate 
21960Sstevel@tonic-gate 	ui->ui_lock |= MD_UL_OPENINPROGRESS;
21970Sstevel@tonic-gate 	mutex_exit(&ui->ui_mx);
21980Sstevel@tonic-gate 
21990Sstevel@tonic-gate 	/* open devices, if necessary */
22000Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
22010Sstevel@tonic-gate 		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
22020Sstevel@tonic-gate 			goto out;
22030Sstevel@tonic-gate 	}
22040Sstevel@tonic-gate 
22050Sstevel@tonic-gate 	/* count open */
22060Sstevel@tonic-gate 	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
22070Sstevel@tonic-gate 		goto out;
22080Sstevel@tonic-gate 
22090Sstevel@tonic-gate 	/* unlock, return success */
22100Sstevel@tonic-gate out:
22110Sstevel@tonic-gate 	mutex_enter(&ui->ui_mx);
22120Sstevel@tonic-gate 	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
22130Sstevel@tonic-gate 	mutex_exit(&ui->ui_mx);
22140Sstevel@tonic-gate 
22150Sstevel@tonic-gate 	if (lockp) {
22160Sstevel@tonic-gate 		/*
22170Sstevel@tonic-gate 		 * If ioctl lock is held, use openclose_exit
22180Sstevel@tonic-gate 		 * routine that will clear the lockp reader flag.
22190Sstevel@tonic-gate 		 */
22200Sstevel@tonic-gate 		(void) md_ioctl_openclose_exit(lockp);
22210Sstevel@tonic-gate 	} else {
22220Sstevel@tonic-gate 		md_unit_openclose_exit(ui);
22230Sstevel@tonic-gate 	}
22240Sstevel@tonic-gate 	return (err);
22250Sstevel@tonic-gate }
22260Sstevel@tonic-gate 
22270Sstevel@tonic-gate int
22280Sstevel@tonic-gate mirror_internal_close(
22290Sstevel@tonic-gate 	minor_t		mnum,
22300Sstevel@tonic-gate 	int		otyp,
22310Sstevel@tonic-gate 	int		md_cflags,
22320Sstevel@tonic-gate 	IOLOCK		*lockp		/* can be NULL */
22330Sstevel@tonic-gate )
22340Sstevel@tonic-gate {
22350Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
22360Sstevel@tonic-gate 	mm_unit_t	*un;
22370Sstevel@tonic-gate 	int		err = 0;
22380Sstevel@tonic-gate 
22390Sstevel@tonic-gate 	/* single thread */
22400Sstevel@tonic-gate 	if (lockp) {
22410Sstevel@tonic-gate 		/*
22420Sstevel@tonic-gate 		 * If ioctl lock is held, use openclose_enter
22430Sstevel@tonic-gate 		 * routine that will set the ioctl flag when
22440Sstevel@tonic-gate 		 * grabbing the readerlock.
22450Sstevel@tonic-gate 		 */
22460Sstevel@tonic-gate 		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
22470Sstevel@tonic-gate 	} else {
22480Sstevel@tonic-gate 		un = (mm_unit_t *)md_unit_openclose_enter(ui);
22490Sstevel@tonic-gate 	}
22500Sstevel@tonic-gate 
22510Sstevel@tonic-gate 	/* count closed */
22520Sstevel@tonic-gate 	if ((err = md_unit_decopen(mnum, otyp)) != 0)
22530Sstevel@tonic-gate 		goto out;
22540Sstevel@tonic-gate 
22550Sstevel@tonic-gate 	/* close devices, if necessary */
22560Sstevel@tonic-gate 	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
22570Sstevel@tonic-gate 		/*
22580Sstevel@tonic-gate 		 * Clean up dirty bitmap for this unit. Do this
22590Sstevel@tonic-gate 		 * before closing the underlying devices to avoid
22600Sstevel@tonic-gate 		 * race conditions with reset_mirror() as a
22610Sstevel@tonic-gate 		 * result of a 'metaset -r' command running in
22620Sstevel@tonic-gate 		 * parallel. This might cause deallocation of
22630Sstevel@tonic-gate 		 * dirty region bitmaps; with underlying metadevices
22640Sstevel@tonic-gate 		 * in place this can't happen.
22650Sstevel@tonic-gate 		 * Don't do this if a MN set and ABR not set
22660Sstevel@tonic-gate 		 */
22670Sstevel@tonic-gate 		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
22680Sstevel@tonic-gate 			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
22690Sstevel@tonic-gate 			    !(ui->ui_tstate & MD_ABR_CAP))
22700Sstevel@tonic-gate 				mirror_process_unit_resync(un);
22710Sstevel@tonic-gate 		}
22720Sstevel@tonic-gate 		(void) mirror_close_all_devs(un, md_cflags);
22730Sstevel@tonic-gate 
22740Sstevel@tonic-gate 		/*
22750Sstevel@tonic-gate 		 * For a MN set with transient capabilities (eg ABR/DMR) set,
22760Sstevel@tonic-gate 		 * clear these capabilities on the last open in the cluster.
22770Sstevel@tonic-gate 		 * To do this we send a message to all nodes to see of the
22780Sstevel@tonic-gate 		 * device is open.
22790Sstevel@tonic-gate 		 */
22800Sstevel@tonic-gate 		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
22810Sstevel@tonic-gate 		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
22820Sstevel@tonic-gate 			if (lockp) {
22830Sstevel@tonic-gate 				(void) md_ioctl_openclose_exit(lockp);
22840Sstevel@tonic-gate 			} else {
22850Sstevel@tonic-gate 				md_unit_openclose_exit(ui);
22860Sstevel@tonic-gate 			}
22870Sstevel@tonic-gate 
22880Sstevel@tonic-gate 			/*
22890Sstevel@tonic-gate 			 * if we are in the context of an ioctl, drop the
22900Sstevel@tonic-gate 			 * ioctl lock.
22910Sstevel@tonic-gate 			 * Otherwise, no other locks should be held.
22920Sstevel@tonic-gate 			 */
22930Sstevel@tonic-gate 			if (lockp) {
22940Sstevel@tonic-gate 				IOLOCK_RETURN_RELEASE(0, lockp);
22950Sstevel@tonic-gate 			}
22960Sstevel@tonic-gate 
22970Sstevel@tonic-gate 			mdmn_clear_all_capabilities(mnum);
22980Sstevel@tonic-gate 
22990Sstevel@tonic-gate 			/* if dropped the lock previously, regain it */
23000Sstevel@tonic-gate 			if (lockp) {
23010Sstevel@tonic-gate 				IOLOCK_RETURN_REACQUIRE(lockp);
23020Sstevel@tonic-gate 			}
23030Sstevel@tonic-gate 			return (0);
23040Sstevel@tonic-gate 		}
23050Sstevel@tonic-gate 		/* unlock and return success */
23060Sstevel@tonic-gate 	}
23070Sstevel@tonic-gate out:
23080Sstevel@tonic-gate 	/* Call whether lockp is NULL or not. */
23090Sstevel@tonic-gate 	if (lockp) {
23100Sstevel@tonic-gate 		md_ioctl_openclose_exit(lockp);
23110Sstevel@tonic-gate 	} else {
23120Sstevel@tonic-gate 		md_unit_openclose_exit(ui);
23130Sstevel@tonic-gate 	}
23140Sstevel@tonic-gate 	return (err);
23150Sstevel@tonic-gate }
23160Sstevel@tonic-gate 
23170Sstevel@tonic-gate /*
23180Sstevel@tonic-gate  * When a component has completed resyncing and is now ok, check if the
23190Sstevel@tonic-gate  * corresponding component in the other submirrors is in the Last Erred
23200Sstevel@tonic-gate  * state.  If it is, we want to change that to the Erred state so we stop
23210Sstevel@tonic-gate  * using that component and start using this good component instead.
23220Sstevel@tonic-gate  *
23230Sstevel@tonic-gate  * This is called from set_sm_comp_state and recursively calls
23240Sstevel@tonic-gate  * set_sm_comp_state if it needs to change the Last Erred state.
23250Sstevel@tonic-gate  */
23260Sstevel@tonic-gate static void
23270Sstevel@tonic-gate reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
23280Sstevel@tonic-gate 	IOLOCK *lockp)
23290Sstevel@tonic-gate {
23300Sstevel@tonic-gate 	mm_submirror_t		*sm;
23310Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
23320Sstevel@tonic-gate 	int			ci;
23330Sstevel@tonic-gate 	int			i;
23340Sstevel@tonic-gate 	int			compcnt;
23350Sstevel@tonic-gate 	int			changed = 0;
23360Sstevel@tonic-gate 
23370Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
23380Sstevel@tonic-gate 		sm = &un->un_sm[i];
23390Sstevel@tonic-gate 		smic = &un->un_smic[i];
23400Sstevel@tonic-gate 
23410Sstevel@tonic-gate 		if (!SMS_IS(sm, SMS_INUSE))
23420Sstevel@tonic-gate 			continue;
23430Sstevel@tonic-gate 
23440Sstevel@tonic-gate 		/* ignore the submirror that we just made ok */
23450Sstevel@tonic-gate 		if (i == smi)
23460Sstevel@tonic-gate 			continue;
23470Sstevel@tonic-gate 
23480Sstevel@tonic-gate 		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
23490Sstevel@tonic-gate 		for (ci = 0; ci < compcnt; ci++) {
23500Sstevel@tonic-gate 			md_m_shared_t	*shared;
23510Sstevel@tonic-gate 
23520Sstevel@tonic-gate 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
23530Sstevel@tonic-gate 			    (sm->sm_dev, sm, ci);
23540Sstevel@tonic-gate 
23550Sstevel@tonic-gate 			if ((shared->ms_state & CS_LAST_ERRED) &&
23560Sstevel@tonic-gate 			    !mirror_other_sources(un, i, ci, 1)) {
23570Sstevel@tonic-gate 
23580Sstevel@tonic-gate 				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
23590Sstevel@tonic-gate 				    flags, lockp);
23600Sstevel@tonic-gate 				changed = 1;
23610Sstevel@tonic-gate 			}
23620Sstevel@tonic-gate 		}
23630Sstevel@tonic-gate 	}
23640Sstevel@tonic-gate 
23650Sstevel@tonic-gate 	/* maybe there is a hotspare for this newly erred component */
23660Sstevel@tonic-gate 	if (changed) {
23670Sstevel@tonic-gate 		set_t	setno;
23680Sstevel@tonic-gate 
23690Sstevel@tonic-gate 		setno = MD_UN2SET(un);
23700Sstevel@tonic-gate 		if (MD_MNSET_SETNO(setno)) {
23710Sstevel@tonic-gate 			send_poke_hotspares(setno);
23720Sstevel@tonic-gate 		} else {
23730Sstevel@tonic-gate 			(void) poke_hotspares();
23740Sstevel@tonic-gate 		}
23750Sstevel@tonic-gate 	}
23760Sstevel@tonic-gate }
23770Sstevel@tonic-gate 
23780Sstevel@tonic-gate /*
23790Sstevel@tonic-gate  * set_sm_comp_state
23800Sstevel@tonic-gate  *
23810Sstevel@tonic-gate  * Set the state of a submirror component to the specified new state.
23820Sstevel@tonic-gate  * If the mirror is in a multi-node set, send messages to all nodes to
23830Sstevel@tonic-gate  * block all writes to the mirror and then update the state and release the
23840Sstevel@tonic-gate  * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
23850Sstevel@tonic-gate  * MD_STATE_XMIT will be unset in 2 cases:
23860Sstevel@tonic-gate  * 1. When the state is changed to CS_RESYNC as this state change
23870Sstevel@tonic-gate  * will already have been updated on each node by the processing of the
23880Sstevel@tonic-gate  * distributed metasync command, hence no need to xmit.
23890Sstevel@tonic-gate  * 2. When the state is changed to CS_OKAY after a resync has completed. Again
23900Sstevel@tonic-gate  * the resync completion will already have been processed on each node by
23910Sstevel@tonic-gate  * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
23920Sstevel@tonic-gate  * resync, hence no need to xmit.
23930Sstevel@tonic-gate  *
23940Sstevel@tonic-gate  * If we are called from the update of a watermark (in which case
23950Sstevel@tonic-gate  * MD_STATE_WMUPDATE will be set in ps->flags), this is due to
23960Sstevel@tonic-gate  * a metainit or similar. In this case the message that we send to propagate
23970Sstevel@tonic-gate  * the state change must not be a class1 message as that would deadlock with
23980Sstevel@tonic-gate  * the metainit command that is still being processed.
23990Sstevel@tonic-gate  * We achieve this by creating a class2 message, MD_MN_MSG_STATE_UPDATE2,
24000Sstevel@tonic-gate  * instead. This also causes the submessage generator to create a class2
24010Sstevel@tonic-gate  * submessage rather than a class1 (which would also block).
24020Sstevel@tonic-gate  *
24030Sstevel@tonic-gate  * On entry, unit_writerlock is held
24040Sstevel@tonic-gate  * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
24050Sstevel@tonic-gate  * also held.
24060Sstevel@tonic-gate  */
24070Sstevel@tonic-gate void
24080Sstevel@tonic-gate set_sm_comp_state(
24090Sstevel@tonic-gate 	mm_unit_t	*un,
24100Sstevel@tonic-gate 	int		smi,
24110Sstevel@tonic-gate 	int		ci,
24120Sstevel@tonic-gate 	int		newstate,
24130Sstevel@tonic-gate 	mddb_recid_t	*extras,
24140Sstevel@tonic-gate 	uint_t		flags,
24150Sstevel@tonic-gate 	IOLOCK		*lockp
24160Sstevel@tonic-gate )
24170Sstevel@tonic-gate {
24180Sstevel@tonic-gate 	mm_submirror_t		*sm;
24190Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
24200Sstevel@tonic-gate 	md_m_shared_t		*shared;
24210Sstevel@tonic-gate 	int			origstate;
24220Sstevel@tonic-gate 	void			(*get_dev)();
24230Sstevel@tonic-gate 	ms_cd_info_t		cd;
24240Sstevel@tonic-gate 	char			devname[MD_MAX_CTDLEN];
24250Sstevel@tonic-gate 	int			err;
24260Sstevel@tonic-gate 	set_t			setno = MD_UN2SET(un);
24270Sstevel@tonic-gate 	md_mn_msg_stch_t	stchmsg;
24280Sstevel@tonic-gate 	mdi_unit_t		*ui = MDI_UNIT(MD_SID(un));
24290Sstevel@tonic-gate 	md_mn_kresult_t		*kresult;
24300Sstevel@tonic-gate 	int			rval;
24310Sstevel@tonic-gate 	uint_t			msgflags;
24320Sstevel@tonic-gate 	md_mn_msgtype_t		msgtype;
24330Sstevel@tonic-gate 	int			save_lock = 0;
24340Sstevel@tonic-gate 	mdi_unit_t		*ui_sm;
243511130SJames.Hall@Sun.COM 	int			nretries = 0;
24360Sstevel@tonic-gate 
24370Sstevel@tonic-gate 	sm = &un->un_sm[smi];
24380Sstevel@tonic-gate 	smic = &un->un_smic[smi];
24390Sstevel@tonic-gate 
24400Sstevel@tonic-gate 	/* If we have a real error status then turn off MD_INACCESSIBLE. */
24410Sstevel@tonic-gate 	ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
24420Sstevel@tonic-gate 	if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
24430Sstevel@tonic-gate 	    ui_sm->ui_tstate & MD_INACCESSIBLE) {
24446901Sjkennedy 		ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
24450Sstevel@tonic-gate 	}
24460Sstevel@tonic-gate 
24476901Sjkennedy 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
24486901Sjkennedy 	    (sm->sm_dev, sm, ci);
24490Sstevel@tonic-gate 	origstate = shared->ms_state;
24500Sstevel@tonic-gate 
24510Sstevel@tonic-gate 	/*
24520Sstevel@tonic-gate 	 * If the new state is an error and the old one wasn't, generate
24530Sstevel@tonic-gate 	 * a console message. We do this before we send the state to other
24540Sstevel@tonic-gate 	 * nodes in a MN set because the state change may change the component
24550Sstevel@tonic-gate 	 * name if a hotspare is allocated.
24560Sstevel@tonic-gate 	 */
24570Sstevel@tonic-gate 	if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
24580Sstevel@tonic-gate 	    (newstate & (CS_ERRED|CS_LAST_ERRED))) {
24590Sstevel@tonic-gate 
24606901Sjkennedy 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
24616901Sjkennedy 		    "get device", 0);
24620Sstevel@tonic-gate 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
24630Sstevel@tonic-gate 
24640Sstevel@tonic-gate 		err = md_getdevname(setno, mddb_getsidenum(setno), 0,
24650Sstevel@tonic-gate 		    cd.cd_dev, devname, sizeof (devname));
24660Sstevel@tonic-gate 
24670Sstevel@tonic-gate 		if (err == ENOENT) {
24680Sstevel@tonic-gate 			(void) md_devname(setno, cd.cd_dev, devname,
24696901Sjkennedy 			    sizeof (devname));
24700Sstevel@tonic-gate 		}
24710Sstevel@tonic-gate 
24720Sstevel@tonic-gate 		cmn_err(CE_WARN, "md: %s: %s needs maintenance",
24730Sstevel@tonic-gate 		    md_shortname(md_getminor(sm->sm_dev)), devname);
24740Sstevel@tonic-gate 
24750Sstevel@tonic-gate 		if (newstate & CS_LAST_ERRED) {
24760Sstevel@tonic-gate 			cmn_err(CE_WARN, "md: %s: %s last erred",
24770Sstevel@tonic-gate 			    md_shortname(md_getminor(sm->sm_dev)),
24780Sstevel@tonic-gate 			    devname);
24790Sstevel@tonic-gate 
24800Sstevel@tonic-gate 		} else if (shared->ms_flags & MDM_S_ISOPEN) {
24810Sstevel@tonic-gate 			/*
24820Sstevel@tonic-gate 			 * Close the broken device and clear the open flag on
24830Sstevel@tonic-gate 			 * it.  Closing the device means the RCM framework will
24840Sstevel@tonic-gate 			 * be able to unconfigure the device if required.
24850Sstevel@tonic-gate 			 *
24860Sstevel@tonic-gate 			 * We have to check that the device is open, otherwise
24870Sstevel@tonic-gate 			 * the first open on it has resulted in the error that
24880Sstevel@tonic-gate 			 * is being processed and the actual cd.cd_dev will be
24890Sstevel@tonic-gate 			 * NODEV64.
24900Sstevel@tonic-gate 			 *
24910Sstevel@tonic-gate 			 * If this is a multi-node mirror, then the multinode
24920Sstevel@tonic-gate 			 * state checks following this code will cause the
24930Sstevel@tonic-gate 			 * slave nodes to close the mirror in the function
24940Sstevel@tonic-gate 			 * mirror_set_state().
24950Sstevel@tonic-gate 			 */
24960Sstevel@tonic-gate 			md_layered_close(cd.cd_dev, MD_OFLG_NULL);
24970Sstevel@tonic-gate 			shared->ms_flags &= ~MDM_S_ISOPEN;
24980Sstevel@tonic-gate 		}
24990Sstevel@tonic-gate 
25000Sstevel@tonic-gate 	} else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
25010Sstevel@tonic-gate 	    (shared->ms_flags & MDM_S_ISOPEN)) {
25020Sstevel@tonic-gate 		/*
25030Sstevel@tonic-gate 		 * Similar to logic above except no log messages since we
25040Sstevel@tonic-gate 		 * are just transitioning from Last Erred to Erred.
25050Sstevel@tonic-gate 		 */
25060Sstevel@tonic-gate 		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
25070Sstevel@tonic-gate 		    "get device", 0);
25080Sstevel@tonic-gate 		(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
25090Sstevel@tonic-gate 
25100Sstevel@tonic-gate 		md_layered_close(cd.cd_dev, MD_OFLG_NULL);
25110Sstevel@tonic-gate 		shared->ms_flags &= ~MDM_S_ISOPEN;
25120Sstevel@tonic-gate 	}
25130Sstevel@tonic-gate 
25140Sstevel@tonic-gate 	if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
25150Sstevel@tonic-gate 	    (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
25160Sstevel@tonic-gate 		/*
25170Sstevel@tonic-gate 		 * For a multi-node mirror, send the state change to the
25180Sstevel@tonic-gate 		 * master, which broadcasts to all nodes, including this
25190Sstevel@tonic-gate 		 * one. Once the message is received, the state is set
25200Sstevel@tonic-gate 		 * in-core and the master commits the change to disk.
25210Sstevel@tonic-gate 		 * There is a case, comp_replace, where this function
25220Sstevel@tonic-gate 		 * can be called from within an ioctl and therefore in this
25230Sstevel@tonic-gate 		 * case, as the ioctl will already be called on each node,
25240Sstevel@tonic-gate 		 * there is no need to xmit the state change to the master for
25250Sstevel@tonic-gate 		 * distribution to the other nodes. MD_STATE_XMIT flag is used
25260Sstevel@tonic-gate 		 * to indicate whether a xmit is required. The mirror's
25270Sstevel@tonic-gate 		 * transient state is set to MD_ERR_PENDING to avoid sending
25280Sstevel@tonic-gate 		 * multiple messages.
25290Sstevel@tonic-gate 		 */
25300Sstevel@tonic-gate 		if (newstate & (CS_ERRED|CS_LAST_ERRED))
25310Sstevel@tonic-gate 			ui->ui_tstate |= MD_ERR_PENDING;
25320Sstevel@tonic-gate 
25330Sstevel@tonic-gate 		/*
25340Sstevel@tonic-gate 		 * Send a state update message to all nodes. This message
25350Sstevel@tonic-gate 		 * will generate 2 submessages, the first one to suspend
25360Sstevel@tonic-gate 		 * all writes to the mirror and the second to update the
25370Sstevel@tonic-gate 		 * state and resume writes.
25380Sstevel@tonic-gate 		 */
25390Sstevel@tonic-gate 		stchmsg.msg_stch_mnum = un->c.un_self_id;
25400Sstevel@tonic-gate 		stchmsg.msg_stch_sm = smi;
25410Sstevel@tonic-gate 		stchmsg.msg_stch_comp = ci;
25420Sstevel@tonic-gate 		stchmsg.msg_stch_new_state = newstate;
25430Sstevel@tonic-gate 		stchmsg.msg_stch_hs_id = shared->ms_hs_id;
25440Sstevel@tonic-gate #ifdef DEBUG
25450Sstevel@tonic-gate 		if (mirror_debug_flag)
25460Sstevel@tonic-gate 			printf("send set state, %x, %x, %x, %x, %x\n",
25470Sstevel@tonic-gate 			    stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
25480Sstevel@tonic-gate 			    stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
25490Sstevel@tonic-gate 			    stchmsg.msg_stch_hs_id);
25500Sstevel@tonic-gate #endif
25510Sstevel@tonic-gate 		if (flags & MD_STATE_WMUPDATE) {
25520Sstevel@tonic-gate 			msgtype  = MD_MN_MSG_STATE_UPDATE2;
25530Sstevel@tonic-gate 			/*
25540Sstevel@tonic-gate 			 * When coming from an update of watermarks, there
25550Sstevel@tonic-gate 			 * must already be a message logged that triggered
25560Sstevel@tonic-gate 			 * this action. So, no need to log this message, too.
25570Sstevel@tonic-gate 			 */
25580Sstevel@tonic-gate 			msgflags = MD_MSGF_NO_LOG;
25590Sstevel@tonic-gate 		} else {
25600Sstevel@tonic-gate 			msgtype  = MD_MN_MSG_STATE_UPDATE;
25610Sstevel@tonic-gate 			msgflags = MD_MSGF_DEFAULT_FLAGS;
25620Sstevel@tonic-gate 		}
25630Sstevel@tonic-gate 
25640Sstevel@tonic-gate 		/*
25650Sstevel@tonic-gate 		 * If we are in the context of an ioctl, drop the ioctl lock.
25660Sstevel@tonic-gate 		 * lockp holds the list of locks held.
25670Sstevel@tonic-gate 		 *
25680Sstevel@tonic-gate 		 * Otherwise, increment the appropriate reacquire counters.
25690Sstevel@tonic-gate 		 * If the openclose lock is held, we must reacquire the
25700Sstevel@tonic-gate 		 * reader lock before releasing the openclose lock.
25710Sstevel@tonic-gate 		 * Do not drop the ARRAY_WRITER lock as we may not be able
25720Sstevel@tonic-gate 		 * to reacquire it.
25730Sstevel@tonic-gate 		 */
25740Sstevel@tonic-gate 		if (lockp) {
25750Sstevel@tonic-gate 			if (lockp->l_flags & MD_ARRAY_WRITER) {
25760Sstevel@tonic-gate 				save_lock = MD_ARRAY_WRITER;
25770Sstevel@tonic-gate 				lockp->l_flags &= ~MD_ARRAY_WRITER;
25780Sstevel@tonic-gate 			} else if (lockp->l_flags & MD_ARRAY_READER) {
25790Sstevel@tonic-gate 				save_lock = MD_ARRAY_READER;
25800Sstevel@tonic-gate 				lockp->l_flags &= ~MD_ARRAY_READER;
25810Sstevel@tonic-gate 			}
25820Sstevel@tonic-gate 			IOLOCK_RETURN_RELEASE(0, lockp);
25830Sstevel@tonic-gate 		} else {
25840Sstevel@tonic-gate 			if (flags & MD_STATE_OCHELD) {
25850Sstevel@tonic-gate 				md_unit_writerexit(ui);
25860Sstevel@tonic-gate 				(void) md_unit_readerlock(ui);
25870Sstevel@tonic-gate 				md_unit_openclose_exit(ui);
25880Sstevel@tonic-gate 			} else {
25890Sstevel@tonic-gate 				md_unit_writerexit(ui);
25900Sstevel@tonic-gate 			}
25910Sstevel@tonic-gate 		}
25920Sstevel@tonic-gate 
25930Sstevel@tonic-gate 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
259411130SJames.Hall@Sun.COM sscs_msg:
25958452SJohn.Wren.Kennedy@Sun.COM 		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
25966901Sjkennedy 		    (char *)&stchmsg, sizeof (stchmsg), kresult);
25970Sstevel@tonic-gate 
25980Sstevel@tonic-gate 		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
25990Sstevel@tonic-gate 			mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
26008452SJohn.Wren.Kennedy@Sun.COM 			/* If we're shutting down already, pause things here. */
26018452SJohn.Wren.Kennedy@Sun.COM 			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
26028452SJohn.Wren.Kennedy@Sun.COM 				while (!md_mn_is_commd_present()) {
26038452SJohn.Wren.Kennedy@Sun.COM 					delay(md_hz);
26048452SJohn.Wren.Kennedy@Sun.COM 				}
260511130SJames.Hall@Sun.COM 				/*
260611130SJames.Hall@Sun.COM 				 * commd is now available; retry the message
260711130SJames.Hall@Sun.COM 				 * one time. If that fails we fall through and
260911130SJames.Hall@Sun.COM 				 * panic as the system is in an unexpected state.
260911130SJames.Hall@Sun.COM 				 */
261011130SJames.Hall@Sun.COM 				if (nretries++ == 0)
261111130SJames.Hall@Sun.COM 					goto sscs_msg;
26128452SJohn.Wren.Kennedy@Sun.COM 			}
26130Sstevel@tonic-gate 			cmn_err(CE_PANIC,
26140Sstevel@tonic-gate 			    "ksend_message failure: STATE_UPDATE");
26150Sstevel@tonic-gate 		}
26160Sstevel@tonic-gate 		kmem_free(kresult, sizeof (md_mn_kresult_t));
26170Sstevel@tonic-gate 
26180Sstevel@tonic-gate 		/* if dropped the lock previously, regain it */
26190Sstevel@tonic-gate 		if (lockp) {
26200Sstevel@tonic-gate 			IOLOCK_RETURN_REACQUIRE(lockp);
26210Sstevel@tonic-gate 			lockp->l_flags |= save_lock;
26220Sstevel@tonic-gate 		} else {
26230Sstevel@tonic-gate 			/*
26240Sstevel@tonic-gate 			 * Reacquire dropped locks and update acquirecnts
26250Sstevel@tonic-gate 			 * appropriately.
26260Sstevel@tonic-gate 			 */
26270Sstevel@tonic-gate 			if (flags & MD_STATE_OCHELD) {
26280Sstevel@tonic-gate 				/*
26290Sstevel@tonic-gate 				 * openclose also grabs readerlock.
26300Sstevel@tonic-gate 				 */
26310Sstevel@tonic-gate 				(void) md_unit_openclose_enter(ui);
26320Sstevel@tonic-gate 				md_unit_readerexit(ui);
26330Sstevel@tonic-gate 				(void) md_unit_writerlock(ui);
26340Sstevel@tonic-gate 			} else {
26350Sstevel@tonic-gate 				(void) md_unit_writerlock(ui);
26360Sstevel@tonic-gate 			}
26370Sstevel@tonic-gate 		}
26380Sstevel@tonic-gate 
26390Sstevel@tonic-gate 		ui->ui_tstate &= ~MD_ERR_PENDING;
26400Sstevel@tonic-gate 	} else {
26410Sstevel@tonic-gate 		shared->ms_state = newstate;
26420Sstevel@tonic-gate 		uniqtime32(&shared->ms_timestamp);
26430Sstevel@tonic-gate 
26440Sstevel@tonic-gate 		if (newstate == CS_ERRED)
26450Sstevel@tonic-gate 			shared->ms_flags |= MDM_S_NOWRITE;
26460Sstevel@tonic-gate 		else
26470Sstevel@tonic-gate 			shared->ms_flags &= ~MDM_S_NOWRITE;
26480Sstevel@tonic-gate 
26490Sstevel@tonic-gate 		shared->ms_flags &= ~MDM_S_IOERR;
26500Sstevel@tonic-gate 		un->un_changecnt++;
26510Sstevel@tonic-gate 		shared->ms_lasterrcnt = un->un_changecnt;
26520Sstevel@tonic-gate 
26530Sstevel@tonic-gate 		mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
26540Sstevel@tonic-gate 		mirror_commit(un, SMI2BIT(smi), extras);
26550Sstevel@tonic-gate 	}
26560Sstevel@tonic-gate 
26570Sstevel@tonic-gate 	if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
26580Sstevel@tonic-gate 		/*
26590Sstevel@tonic-gate 		 * Resetting the Last Erred state will recursively call back
26600Sstevel@tonic-gate 		 * into this function (set_sm_comp_state) to update the state.
26610Sstevel@tonic-gate 		 */
26620Sstevel@tonic-gate 		reset_lasterred(un, smi, extras, flags, lockp);
26630Sstevel@tonic-gate 	}
26640Sstevel@tonic-gate }
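
/*
 * A minimal illustration of a typical call (cf. error_update_unit()
 * below, which issues exactly this): with the unit writerlock held and
 * no ioctl lock in hand, an errored component is marked Erred and the
 * change is broadcast to the other cluster nodes with
 *
 *	set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
 *	    (IOLOCK *)NULL);
 */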
26650Sstevel@tonic-gate 
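/*
 * Check that every block in [blk, blk + cnt) can be read from a
 * submirror other than esm. The errored submirror is temporarily
 * flagged SMS_IGNORE so that select_read_unit() skips it. Returns
 * nonzero if some part of the range has no alternate source.
 */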
26660Sstevel@tonic-gate static int
26670Sstevel@tonic-gate find_another_logical(
26680Sstevel@tonic-gate 	mm_unit_t		*un,
26690Sstevel@tonic-gate 	mm_submirror_t		*esm,
26700Sstevel@tonic-gate 	diskaddr_t		blk,
26710Sstevel@tonic-gate 	u_longlong_t		cnt,
26720Sstevel@tonic-gate 	int			must_be_open,
26730Sstevel@tonic-gate 	int			state,
26740Sstevel@tonic-gate 	int			err_cnt)
26750Sstevel@tonic-gate {
26760Sstevel@tonic-gate 	u_longlong_t	cando;
26770Sstevel@tonic-gate 	md_dev64_t	dev;
26780Sstevel@tonic-gate 	md_m_shared_t	*s;
26790Sstevel@tonic-gate 
26800Sstevel@tonic-gate 	esm->sm_state |= SMS_IGNORE;
26810Sstevel@tonic-gate 	while (cnt != 0) {
26820Sstevel@tonic-gate 		u_longlong_t	 mcnt;
26830Sstevel@tonic-gate 
26840Sstevel@tonic-gate 		mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));	/* 1 Gig Blks */
26850Sstevel@tonic-gate 
26866901Sjkennedy 		dev = select_read_unit(un, blk, mcnt, &cando,
26876901Sjkennedy 		    must_be_open, &s, NULL);
26880Sstevel@tonic-gate 		if (dev == (md_dev64_t)0)
26890Sstevel@tonic-gate 			break;
26900Sstevel@tonic-gate 
26910Sstevel@tonic-gate 		if ((state == CS_LAST_ERRED) &&
26920Sstevel@tonic-gate 		    (s->ms_state == CS_LAST_ERRED) &&
26930Sstevel@tonic-gate 		    (err_cnt > s->ms_lasterrcnt))
26940Sstevel@tonic-gate 			break;
26950Sstevel@tonic-gate 
26960Sstevel@tonic-gate 		cnt -= cando;
26970Sstevel@tonic-gate 		blk += cando;
26980Sstevel@tonic-gate 	}
26990Sstevel@tonic-gate 	esm->sm_state &= ~SMS_IGNORE;
27000Sstevel@tonic-gate 	return (cnt != 0);
27010Sstevel@tonic-gate }
27020Sstevel@tonic-gate 
27030Sstevel@tonic-gate int
27040Sstevel@tonic-gate mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
27050Sstevel@tonic-gate {
27060Sstevel@tonic-gate 	mm_submirror_t		*sm;
27070Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
27080Sstevel@tonic-gate 	size_t			count;
27090Sstevel@tonic-gate 	diskaddr_t		block;
27100Sstevel@tonic-gate 	u_longlong_t		skip;
27110Sstevel@tonic-gate 	u_longlong_t		size;
27120Sstevel@tonic-gate 	md_dev64_t		dev;
27130Sstevel@tonic-gate 	int			cnt;
27140Sstevel@tonic-gate 	md_m_shared_t		*s;
27150Sstevel@tonic-gate 	int			not_found;
27160Sstevel@tonic-gate 
27170Sstevel@tonic-gate 	sm = &un->un_sm[smi];
27180Sstevel@tonic-gate 	smic = &un->un_smic[smi];
27190Sstevel@tonic-gate 	dev = sm->sm_dev;
27200Sstevel@tonic-gate 
27210Sstevel@tonic-gate 	/*
27220Sstevel@tonic-gate 	 * Make sure every component of the submirror
27230Sstevel@tonic-gate 	 * has other sources.
27240Sstevel@tonic-gate 	 */
27250Sstevel@tonic-gate 	if (ci < 0) {
27260Sstevel@tonic-gate 		/* Find the highest lasterrcnt */
27270Sstevel@tonic-gate 		cnt = (*(smic->sm_get_component_count))(dev, sm);
27280Sstevel@tonic-gate 		for (ci = 0; ci < cnt; ci++) {
27290Sstevel@tonic-gate 			not_found = mirror_other_sources(un, smi, ci,
27300Sstevel@tonic-gate 			    must_be_open);
27310Sstevel@tonic-gate 			if (not_found)
27320Sstevel@tonic-gate 				return (1);
27330Sstevel@tonic-gate 		}
27340Sstevel@tonic-gate 		return (0);
27350Sstevel@tonic-gate 	}
27360Sstevel@tonic-gate 
27370Sstevel@tonic-gate 	/*
27380Sstevel@tonic-gate 	 * Make sure this component has other sources
27390Sstevel@tonic-gate 	 */
27400Sstevel@tonic-gate 	(void) (*(smic->sm_get_bcss))
27416901Sjkennedy 	    (dev, sm, ci, &block, &count, &skip, &size);
27420Sstevel@tonic-gate 
27430Sstevel@tonic-gate 	if (count == 0)
27440Sstevel@tonic-gate 		return (1);
27450Sstevel@tonic-gate 
27460Sstevel@tonic-gate 	s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
27470Sstevel@tonic-gate 
27480Sstevel@tonic-gate 	while (count--) {
27490Sstevel@tonic-gate 		if (block >= un->c.un_total_blocks)
27500Sstevel@tonic-gate 			return (0);
27510Sstevel@tonic-gate 
27520Sstevel@tonic-gate 		if ((block + size) > un->c.un_total_blocks)
27530Sstevel@tonic-gate 			size = un->c.un_total_blocks - block;
27540Sstevel@tonic-gate 
27550Sstevel@tonic-gate 		not_found = find_another_logical(un, sm, block, size,
27560Sstevel@tonic-gate 		    must_be_open, s->ms_state, s->ms_lasterrcnt);
27570Sstevel@tonic-gate 		if (not_found)
27580Sstevel@tonic-gate 			return (1);
27590Sstevel@tonic-gate 
27600Sstevel@tonic-gate 		block += size + skip;
27610Sstevel@tonic-gate 	}
27620Sstevel@tonic-gate 	return (0);
27630Sstevel@tonic-gate }
27640Sstevel@tonic-gate 
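/*
 * Final disposition of a parent I/O after error processing: flag the
 * resync originator's buf with B_ERROR if requested (write-after-read),
 * retry the I/O when the unit's change count shows the configuration
 * changed underneath us, or fail the buf back to the caller.
 */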
27650Sstevel@tonic-gate static void
27660Sstevel@tonic-gate finish_error(md_mps_t *ps)
27670Sstevel@tonic-gate {
27680Sstevel@tonic-gate 	struct buf	*pb;
27690Sstevel@tonic-gate 	mm_unit_t	*un;
27700Sstevel@tonic-gate 	mdi_unit_t	*ui;
27710Sstevel@tonic-gate 	uint_t		new_str_flags;
27720Sstevel@tonic-gate 
27730Sstevel@tonic-gate 	pb = ps->ps_bp;
27740Sstevel@tonic-gate 	un = ps->ps_un;
27750Sstevel@tonic-gate 	ui = ps->ps_ui;
27760Sstevel@tonic-gate 
27770Sstevel@tonic-gate 	/*
27780Sstevel@tonic-gate 	 * Must flag any error to the resync originator if we're performing
27790Sstevel@tonic-gate 	 * a Write-after-Read. This corresponds to an i/o error on a resync
27800Sstevel@tonic-gate 	 * target device and in this case we ought to abort the resync as there
27810Sstevel@tonic-gate 	 * is nothing that can be done to recover from this without operator
27820Sstevel@tonic-gate 	 * intervention. If we don't set the B_ERROR flag we will continue
27830Sstevel@tonic-gate 	 * reading from the mirror but won't write to the target (as it will
27840Sstevel@tonic-gate 	 * have been placed into an errored state).
27850Sstevel@tonic-gate 	 * To handle the case of multiple components within a submirror we only
27860Sstevel@tonic-gate 	 * set the B_ERROR bit if explicitly requested via MD_MPS_FLAG_ERROR.
27870Sstevel@tonic-gate 	 * The originator of the resync read will cause this bit to be set if
27880Sstevel@tonic-gate 	 * the underlying component count is one for a submirror resync. All
27890Sstevel@tonic-gate 	 * other resync types will have the flag set as there is no underlying
27900Sstevel@tonic-gate 	 * resync which can be performed on a contained metadevice for these
27910Sstevel@tonic-gate 	 * resync types (optimized or component).
27920Sstevel@tonic-gate 	 */
27930Sstevel@tonic-gate 
27940Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
27950Sstevel@tonic-gate 		if (ps->ps_flags & MD_MPS_FLAG_ERROR)
27960Sstevel@tonic-gate 			pb->b_flags |= B_ERROR;
27970Sstevel@tonic-gate 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
27980Sstevel@tonic-gate 		MPS_FREE(mirror_parent_cache, ps);
27990Sstevel@tonic-gate 		md_unit_readerexit(ui);
28000Sstevel@tonic-gate 		md_biodone(pb);
28010Sstevel@tonic-gate 		return;
28020Sstevel@tonic-gate 	}
28030Sstevel@tonic-gate 	/*
28040Sstevel@tonic-gate 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
28050Sstevel@tonic-gate 	 * operation therefore this I/O request has already been counted,
28060Sstevel@tonic-gate 	 * the I/O count variable will be decremented by mirror_done()'s
28070Sstevel@tonic-gate 	 * call to md_biodone().
28080Sstevel@tonic-gate 	 */
28090Sstevel@tonic-gate 	if (ps->ps_changecnt != un->un_changecnt) {
28100Sstevel@tonic-gate 		new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
28110Sstevel@tonic-gate 		if (ps->ps_flags & MD_MPS_WOW)
28120Sstevel@tonic-gate 			new_str_flags |= MD_STR_WOW;
28130Sstevel@tonic-gate 		if (ps->ps_flags & MD_MPS_MAPPED)
28140Sstevel@tonic-gate 			new_str_flags |= MD_STR_MAPPED;
28150Sstevel@tonic-gate 		/*
28160Sstevel@tonic-gate 		 * If this I/O request was a read that was part of a resync,
28170Sstevel@tonic-gate 		 * set MD_STR_WAR for the retried read to ensure that the
28180Sstevel@tonic-gate 		 * resync write (i.e. write-after-read) will be performed
28190Sstevel@tonic-gate 		 */
28200Sstevel@tonic-gate 		if (ps->ps_flags & MD_MPS_RESYNC_READ)
28210Sstevel@tonic-gate 			new_str_flags |= MD_STR_WAR;
28220Sstevel@tonic-gate 		md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
28230Sstevel@tonic-gate 		MPS_FREE(mirror_parent_cache, ps);
28240Sstevel@tonic-gate 		md_unit_readerexit(ui);
28250Sstevel@tonic-gate 		(void) md_mirror_strategy(pb, new_str_flags, NULL);
28260Sstevel@tonic-gate 		return;
28270Sstevel@tonic-gate 	}
28280Sstevel@tonic-gate 
28290Sstevel@tonic-gate 	pb->b_flags |= B_ERROR;
28300Sstevel@tonic-gate 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
28310Sstevel@tonic-gate 	MPS_FREE(mirror_parent_cache, ps);
28320Sstevel@tonic-gate 	md_unit_readerexit(ui);
28330Sstevel@tonic-gate 	md_biodone(pb);
28340Sstevel@tonic-gate }
28350Sstevel@tonic-gate 
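/*
 * Daemon-context handler for component errors: mark each errored
 * component Erred, or Last Erred when no other copy of the data
 * exists, poke the hotspare logic and then complete the original
 * request via finish_error().
 */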
28360Sstevel@tonic-gate static void
28370Sstevel@tonic-gate error_update_unit(md_mps_t *ps)
28380Sstevel@tonic-gate {
28390Sstevel@tonic-gate 	mm_unit_t		*un;
28400Sstevel@tonic-gate 	mdi_unit_t		*ui;
28410Sstevel@tonic-gate 	int			smi;	/* sub mirror index */
28420Sstevel@tonic-gate 	int			ci;	/* errored component */
28430Sstevel@tonic-gate 	set_t			setno;
28440Sstevel@tonic-gate 	uint_t			flags;	/* for set_sm_comp_state() */
28450Sstevel@tonic-gate 	uint_t			hspflags; /* for check_comp_4_hotspares() */
28460Sstevel@tonic-gate 
28470Sstevel@tonic-gate 	ui = ps->ps_ui;
28480Sstevel@tonic-gate 	un = (mm_unit_t *)md_unit_writerlock(ui);
28490Sstevel@tonic-gate 	setno = MD_UN2SET(un);
28500Sstevel@tonic-gate 
28510Sstevel@tonic-gate 	/* All of these updates have to be propagated in the case of a MN set */
28520Sstevel@tonic-gate 	flags = MD_STATE_XMIT;
28530Sstevel@tonic-gate 	hspflags = MD_HOTSPARE_XMIT;
28540Sstevel@tonic-gate 
28550Sstevel@tonic-gate 	/* special treatment if we are called during updating watermarks */
28560Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_WMUPDATE) {
28570Sstevel@tonic-gate 		flags |= MD_STATE_WMUPDATE;
28580Sstevel@tonic-gate 		hspflags |= MD_HOTSPARE_WMUPDATE;
28590Sstevel@tonic-gate 	}
28600Sstevel@tonic-gate 	smi = 0;
28610Sstevel@tonic-gate 	ci = 0;
28620Sstevel@tonic-gate 	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
28630Sstevel@tonic-gate 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
28640Sstevel@tonic-gate 
28650Sstevel@tonic-gate 			/* Never called from ioctl context, so (IOLOCK *)NULL */
28660Sstevel@tonic-gate 			set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
28676901Sjkennedy 			    (IOLOCK *)NULL);
28680Sstevel@tonic-gate 			/*
28690Sstevel@tonic-gate 			 * For a MN set, the NOTIFY is done when the state
28700Sstevel@tonic-gate 			 * change is processed on each node
28710Sstevel@tonic-gate 			 */
28720Sstevel@tonic-gate 			if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
28730Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
28740Sstevel@tonic-gate 				    SVM_TAG_METADEVICE, setno, MD_SID(un));
28750Sstevel@tonic-gate 			}
28760Sstevel@tonic-gate 			continue;
28770Sstevel@tonic-gate 		}
28780Sstevel@tonic-gate 		/* Never called from ioctl context, so (IOLOCK *)NULL */
28790Sstevel@tonic-gate 		set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
28806901Sjkennedy 		    (IOLOCK *)NULL);
28810Sstevel@tonic-gate 		/*
28820Sstevel@tonic-gate 		 * For a MN set, the NOTIFY is done when the state
28830Sstevel@tonic-gate 		 * change is processed on each node
28840Sstevel@tonic-gate 		 */
28850Sstevel@tonic-gate 		if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
28860Sstevel@tonic-gate 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
28870Sstevel@tonic-gate 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
28880Sstevel@tonic-gate 		}
28890Sstevel@tonic-gate 		smi = 0;
28900Sstevel@tonic-gate 		ci = 0;
28910Sstevel@tonic-gate 	}
28920Sstevel@tonic-gate 
28930Sstevel@tonic-gate 	md_unit_writerexit(ui);
28940Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno)) {
28950Sstevel@tonic-gate 		send_poke_hotspares(setno);
28960Sstevel@tonic-gate 	} else {
28970Sstevel@tonic-gate 		(void) poke_hotspares();
28980Sstevel@tonic-gate 	}
28990Sstevel@tonic-gate 	(void) md_unit_readerlock(ui);
29000Sstevel@tonic-gate 
29010Sstevel@tonic-gate 	finish_error(ps);
29020Sstevel@tonic-gate }
29030Sstevel@tonic-gate 
29040Sstevel@tonic-gate /*
29050Sstevel@tonic-gate  * When we have a B_FAILFAST IO error on a Last Erred component we need to
29060Sstevel@tonic-gate  * retry the IO without B_FAILFAST set so that we try to ensure that the
29070Sstevel@tonic-gate  * component "sees" each IO.
29080Sstevel@tonic-gate  */
29090Sstevel@tonic-gate static void
29100Sstevel@tonic-gate last_err_retry(md_mcs_t *cs)
29110Sstevel@tonic-gate {
29120Sstevel@tonic-gate 	struct buf	*cb;
29130Sstevel@tonic-gate 	md_mps_t	*ps;
29140Sstevel@tonic-gate 	uint_t		flags;
29150Sstevel@tonic-gate 
29160Sstevel@tonic-gate 	cb = &cs->cs_buf;
29170Sstevel@tonic-gate 	cb->b_flags &= ~B_FAILFAST;
29180Sstevel@tonic-gate 
29190Sstevel@tonic-gate 	/* if we're panicking just let this I/O error out */
29200Sstevel@tonic-gate 	if (panicstr) {
29216901Sjkennedy 		(void) mirror_done(cb);
29226901Sjkennedy 		return;
29230Sstevel@tonic-gate 	}
29240Sstevel@tonic-gate 
29250Sstevel@tonic-gate 	/* reissue the I/O */
29260Sstevel@tonic-gate 
29270Sstevel@tonic-gate 	ps = cs->cs_ps;
29280Sstevel@tonic-gate 
29290Sstevel@tonic-gate 	bioerror(cb, 0);
29300Sstevel@tonic-gate 
29310Sstevel@tonic-gate 	mutex_enter(&ps->ps_mx);
29320Sstevel@tonic-gate 
29330Sstevel@tonic-gate 	flags = MD_STR_NOTTOP;
29340Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_MAPPED)
29350Sstevel@tonic-gate 		flags |= MD_STR_MAPPED;
29360Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_NOBLOCK)
29370Sstevel@tonic-gate 		flags |= MD_NOBLOCK;
29380Sstevel@tonic-gate 
29390Sstevel@tonic-gate 	mutex_exit(&ps->ps_mx);
29400Sstevel@tonic-gate 
29410Sstevel@tonic-gate 	clear_retry_error(cb);
29420Sstevel@tonic-gate 
29430Sstevel@tonic-gate 	cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
29446901Sjkennedy 	    md_shortname(getminor(cb->b_edev)));
29450Sstevel@tonic-gate 
29460Sstevel@tonic-gate 	md_call_strategy(cb, flags, NULL);
29470Sstevel@tonic-gate }
29480Sstevel@tonic-gate 
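/*
 * First-stage error handling for a parent I/O. If the unit has an
 * errored component, hand off to error_update_unit() in the md_mstr
 * daemon since the state update may block; otherwise finish the
 * request directly via finish_error().
 */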
29490Sstevel@tonic-gate static void
29500Sstevel@tonic-gate mirror_error(md_mps_t *ps)
29510Sstevel@tonic-gate {
29520Sstevel@tonic-gate 	int		smi;	/* sub mirror index */
29530Sstevel@tonic-gate 	int		ci;	/* errored component */
29540Sstevel@tonic-gate 
29550Sstevel@tonic-gate 	if (panicstr) {
29560Sstevel@tonic-gate 		finish_error(ps);
29570Sstevel@tonic-gate 		return;
29580Sstevel@tonic-gate 	}
29590Sstevel@tonic-gate 
29600Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
29616901Sjkennedy 		mirror_overlap_tree_remove(ps);
29620Sstevel@tonic-gate 
29630Sstevel@tonic-gate 	smi = 0;
29640Sstevel@tonic-gate 	ci = 0;
29650Sstevel@tonic-gate 	if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
29660Sstevel@tonic-gate 		md_unit_readerexit(ps->ps_ui);
29670Sstevel@tonic-gate 		daemon_request(&md_mstr_daemon, error_update_unit,
29680Sstevel@tonic-gate 		    (daemon_queue_t *)ps, REQ_OLD);
29690Sstevel@tonic-gate 		return;
29700Sstevel@tonic-gate 	}
29710Sstevel@tonic-gate 
29720Sstevel@tonic-gate 	finish_error(ps);
29730Sstevel@tonic-gate }
29740Sstevel@tonic-gate 
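/*
 * iodone routine for one section of a write-on-write copy. Propagates
 * any error to the parent buf and either queues copy_write_cont() for
 * the next section or completes the parent request.
 */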
29750Sstevel@tonic-gate static int
29760Sstevel@tonic-gate copy_write_done(struct buf *cb)
29770Sstevel@tonic-gate {
29780Sstevel@tonic-gate 	md_mps_t	*ps;
29790Sstevel@tonic-gate 	buf_t		*pb;
29800Sstevel@tonic-gate 	char		*wowbuf;
29810Sstevel@tonic-gate 	wowhdr_t	*wowhdr;
29820Sstevel@tonic-gate 	ssize_t		wow_resid;
29830Sstevel@tonic-gate 
29840Sstevel@tonic-gate 	/* get wowbuf and save structure */
29850Sstevel@tonic-gate 	wowbuf = cb->b_un.b_addr;
29860Sstevel@tonic-gate 	wowhdr = WOWBUF_HDR(wowbuf);
29870Sstevel@tonic-gate 	ps = wowhdr->wow_ps;
29880Sstevel@tonic-gate 	pb = ps->ps_bp;
29890Sstevel@tonic-gate 
29900Sstevel@tonic-gate 	/* Save error information, then free cb */
29910Sstevel@tonic-gate 	if (cb->b_flags & B_ERROR)
29920Sstevel@tonic-gate 		pb->b_flags |= B_ERROR;
29930Sstevel@tonic-gate 
29940Sstevel@tonic-gate 	if (cb->b_flags & B_REMAPPED)
29950Sstevel@tonic-gate 		bp_mapout(cb);
29960Sstevel@tonic-gate 
29970Sstevel@tonic-gate 	freerbuf(cb);
29980Sstevel@tonic-gate 
29990Sstevel@tonic-gate 	/* update residual and continue if needed */
30000Sstevel@tonic-gate 	if ((pb->b_flags & B_ERROR) == 0) {
30010Sstevel@tonic-gate 		wow_resid = pb->b_bcount - wowhdr->wow_offset;
30020Sstevel@tonic-gate 		pb->b_resid = wow_resid;
30030Sstevel@tonic-gate 		if (wow_resid > 0)  {
30040Sstevel@tonic-gate 			daemon_request(&md_mstr_daemon, copy_write_cont,
30050Sstevel@tonic-gate 			    (daemon_queue_t *)wowhdr, REQ_OLD);
30060Sstevel@tonic-gate 			return (1);
30070Sstevel@tonic-gate 		}
30080Sstevel@tonic-gate 	}
30090Sstevel@tonic-gate 
30100Sstevel@tonic-gate 	/* Write is complete, release resources. */
30110Sstevel@tonic-gate 	kmem_cache_free(mirror_wowblk_cache, wowhdr);
30120Sstevel@tonic-gate 	ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
30130Sstevel@tonic-gate 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
30140Sstevel@tonic-gate 	MPS_FREE(mirror_parent_cache, ps);
30150Sstevel@tonic-gate 	md_biodone(pb);
30160Sstevel@tonic-gate 	return (0);
30170Sstevel@tonic-gate }
30180Sstevel@tonic-gate 
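/*
 * Copy the next section of the parent write into the private wowbuf
 * and issue it as a child write of at most md_wowbuf_size bytes.
 */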
30190Sstevel@tonic-gate static void
30200Sstevel@tonic-gate copy_write_cont(wowhdr_t *wowhdr)
30210Sstevel@tonic-gate {
30220Sstevel@tonic-gate 	buf_t		*pb;
30230Sstevel@tonic-gate 	buf_t		*cb;
30240Sstevel@tonic-gate 	char		*wowbuf;
30250Sstevel@tonic-gate 	int		wow_offset;
30260Sstevel@tonic-gate 	size_t		wow_resid;
30270Sstevel@tonic-gate 	diskaddr_t	wow_blkno;
30280Sstevel@tonic-gate 
30290Sstevel@tonic-gate 	wowbuf = WOWHDR_BUF(wowhdr);
30300Sstevel@tonic-gate 	pb = wowhdr->wow_ps->ps_bp;
30310Sstevel@tonic-gate 
30320Sstevel@tonic-gate 	/* get data on current location */
30330Sstevel@tonic-gate 	wow_offset = wowhdr->wow_offset;
30340Sstevel@tonic-gate 	wow_resid = pb->b_bcount - wow_offset;
30350Sstevel@tonic-gate 	wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
30360Sstevel@tonic-gate 
30370Sstevel@tonic-gate 	/* setup child buffer */
30380Sstevel@tonic-gate 	cb = getrbuf(KM_SLEEP);
30390Sstevel@tonic-gate 	cb->b_flags = B_WRITE;
30400Sstevel@tonic-gate 	cb->b_edev = pb->b_edev;
30410Sstevel@tonic-gate 	cb->b_un.b_addr = wowbuf;	/* change to point at WOWBUF */
30420Sstevel@tonic-gate 	cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
30430Sstevel@tonic-gate 	cb->b_iodone = copy_write_done;
30440Sstevel@tonic-gate 	cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
30450Sstevel@tonic-gate 	cb->b_lblkno = wow_blkno;
30460Sstevel@tonic-gate 
30470Sstevel@tonic-gate 	/* move offset to next section */
30480Sstevel@tonic-gate 	wowhdr->wow_offset += cb->b_bcount;
30490Sstevel@tonic-gate 
30500Sstevel@tonic-gate 	/* copy and setup write for current section */
30510Sstevel@tonic-gate 	bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
30520Sstevel@tonic-gate 
30530Sstevel@tonic-gate 	/* do it */
30540Sstevel@tonic-gate 	/*
30550Sstevel@tonic-gate 	 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
30560Sstevel@tonic-gate 	 * that handles the WOW condition. The resultant increment on the
30570Sstevel@tonic-gate 	 * I/O count variable is cleared by copy_write_done()'s call to
30580Sstevel@tonic-gate 	 * md_biodone().
30590Sstevel@tonic-gate 	 */
30600Sstevel@tonic-gate 	(void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
30616901Sjkennedy 	    | MD_STR_MAPPED, NULL);
30620Sstevel@tonic-gate }
30630Sstevel@tonic-gate 
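/*
 * Start a write-on-write copy: allocate and initialize a wowhdr and
 * kick off the first section via copy_write_cont(). Writing from a
 * private copy keeps the submirrors consistent even if the original
 * buffer changes while the write is in flight.
 */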
30640Sstevel@tonic-gate static void
30650Sstevel@tonic-gate md_mirror_copy_write(md_mps_t *ps)
30660Sstevel@tonic-gate {
30670Sstevel@tonic-gate 	wowhdr_t	*wowhdr;
30680Sstevel@tonic-gate 
30690Sstevel@tonic-gate 	wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
30700Sstevel@tonic-gate 	mirror_wowblk_init(wowhdr);
30710Sstevel@tonic-gate 	wowhdr->wow_ps = ps;
30720Sstevel@tonic-gate 	wowhdr->wow_offset = 0;
30730Sstevel@tonic-gate 	copy_write_cont(wowhdr);
30740Sstevel@tonic-gate }
30750Sstevel@tonic-gate 
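/*
 * Daemon-context handler for a write-on-write condition: map the
 * parent buf in and either re-drive it directly (WOW_NOCOPY) or
 * write from a private copy via md_mirror_copy_write().
 */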
30760Sstevel@tonic-gate static void
30770Sstevel@tonic-gate handle_wow(md_mps_t *ps)
30780Sstevel@tonic-gate {
30790Sstevel@tonic-gate 	buf_t		*pb;
30800Sstevel@tonic-gate 
30810Sstevel@tonic-gate 	pb = ps->ps_bp;
30820Sstevel@tonic-gate 
30830Sstevel@tonic-gate 	bp_mapin(pb);
30840Sstevel@tonic-gate 
30850Sstevel@tonic-gate 	md_mirror_wow_cnt++;
30860Sstevel@tonic-gate 	if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
30870Sstevel@tonic-gate 		cmn_err(CE_NOTE,
30880Sstevel@tonic-gate 		    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
30890Sstevel@tonic-gate 		    md_shortname(getminor(pb->b_edev)),
30900Sstevel@tonic-gate 		    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
30910Sstevel@tonic-gate 	}
30920Sstevel@tonic-gate 
30930Sstevel@tonic-gate 	/*
30940Sstevel@tonic-gate 	 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
30950Sstevel@tonic-gate 	 * operation therefore this I/O request has already been counted,
30960Sstevel@tonic-gate 	 * the I/O count variable will be decremented by mirror_done()'s
30970Sstevel@tonic-gate 	 * call to md_biodone().
30980Sstevel@tonic-gate 	 */
30990Sstevel@tonic-gate 	if (md_mirror_wow_flg & WOW_NOCOPY)
31000Sstevel@tonic-gate 		(void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
31016901Sjkennedy 		    MD_STR_MAPPED | MD_IO_COUNTED, ps);
31020Sstevel@tonic-gate 	else
31030Sstevel@tonic-gate 		md_mirror_copy_write(ps);
31040Sstevel@tonic-gate }
31050Sstevel@tonic-gate 
31060Sstevel@tonic-gate /*
31070Sstevel@tonic-gate  * Return true if the specified submirror is either in the Last Erred
31080Sstevel@tonic-gate  * state or is transitioning into the Last Erred state.
31090Sstevel@tonic-gate  */
31100Sstevel@tonic-gate static bool_t
31110Sstevel@tonic-gate submirror_is_lasterred(mm_unit_t *un, int smi)
31120Sstevel@tonic-gate {
31130Sstevel@tonic-gate 	mm_submirror_t		*sm;
31140Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
31150Sstevel@tonic-gate 	md_m_shared_t		*shared;
31160Sstevel@tonic-gate 	int			ci;
31170Sstevel@tonic-gate 	int			compcnt;
31180Sstevel@tonic-gate 
31190Sstevel@tonic-gate 	sm = &un->un_sm[smi];
31200Sstevel@tonic-gate 	smic = &un->un_smic[smi];
31210Sstevel@tonic-gate 
31220Sstevel@tonic-gate 	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
31230Sstevel@tonic-gate 	for (ci = 0; ci < compcnt; ci++) {
31240Sstevel@tonic-gate 		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
31250Sstevel@tonic-gate 		    (sm->sm_dev, sm, ci);
31260Sstevel@tonic-gate 
31270Sstevel@tonic-gate 		if (shared->ms_state == CS_LAST_ERRED)
31280Sstevel@tonic-gate 			return (B_TRUE);
31290Sstevel@tonic-gate 
31300Sstevel@tonic-gate 		/*
31310Sstevel@tonic-gate 		 * It is not currently Last Erred; check if it is entering Last Erred.
31320Sstevel@tonic-gate 		 */
31330Sstevel@tonic-gate 		if ((shared->ms_flags & MDM_S_IOERR) &&
31340Sstevel@tonic-gate 		    ((shared->ms_state == CS_OKAY) ||
31350Sstevel@tonic-gate 		    (shared->ms_state == CS_RESYNC))) {
31360Sstevel@tonic-gate 			if (mirror_other_sources(un, smi, ci, 0) == 1)
31370Sstevel@tonic-gate 				return (B_TRUE);
31380Sstevel@tonic-gate 		}
31390Sstevel@tonic-gate 	}
31400Sstevel@tonic-gate 
31410Sstevel@tonic-gate 	return (B_FALSE);
31420Sstevel@tonic-gate }
31430Sstevel@tonic-gate 
31440Sstevel@tonic-gate 
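/*
 * iodone routine for a child buf. For a B_FAILFAST error on a
 * submirror that is (or is becoming) Last Erred, requeue the I/O via
 * last_err_retry() without B_FAILFAST; otherwise record the error on
 * the parent and fall through to mirror_done_common().
 */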
31450Sstevel@tonic-gate static int
31460Sstevel@tonic-gate mirror_done(struct buf *cb)
31470Sstevel@tonic-gate {
31480Sstevel@tonic-gate 	md_mps_t	*ps;
31490Sstevel@tonic-gate 	md_mcs_t	*cs;
31500Sstevel@tonic-gate 
31510Sstevel@tonic-gate 	/*LINTED*/
31520Sstevel@tonic-gate 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
31530Sstevel@tonic-gate 	ps = cs->cs_ps;
31540Sstevel@tonic-gate 
31550Sstevel@tonic-gate 	mutex_enter(&ps->ps_mx);
31560Sstevel@tonic-gate 
31570Sstevel@tonic-gate 	/* check if we need to retry an errored failfast I/O */
31580Sstevel@tonic-gate 	if (cb->b_flags & B_ERROR) {
31590Sstevel@tonic-gate 		struct buf *pb = ps->ps_bp;
31600Sstevel@tonic-gate 
31610Sstevel@tonic-gate 		if (cb->b_flags & B_FAILFAST) {
31620Sstevel@tonic-gate 			int		i;
31630Sstevel@tonic-gate 			mm_unit_t	*un = ps->ps_un;
31640Sstevel@tonic-gate 
31650Sstevel@tonic-gate 			for (i = 0; i < NMIRROR; i++) {
31660Sstevel@tonic-gate 				if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
31670Sstevel@tonic-gate 					continue;
31680Sstevel@tonic-gate 
31690Sstevel@tonic-gate 				if (cb->b_edev ==
31700Sstevel@tonic-gate 				    md_dev64_to_dev(un->un_sm[i].sm_dev)) {
31710Sstevel@tonic-gate 
31720Sstevel@tonic-gate 					/*
31730Sstevel@tonic-gate 					 * This is the submirror that had the
31740Sstevel@tonic-gate 					 * error.  Check if it is Last Erred.
31750Sstevel@tonic-gate 					 */
31760Sstevel@tonic-gate 					if (submirror_is_lasterred(un, i)) {
31770Sstevel@tonic-gate 						daemon_queue_t *dqp;
31780Sstevel@tonic-gate 
31790Sstevel@tonic-gate 						mutex_exit(&ps->ps_mx);
31800Sstevel@tonic-gate 						dqp = (daemon_queue_t *)cs;
31810Sstevel@tonic-gate 						dqp->dq_prev = NULL;
31820Sstevel@tonic-gate 						dqp->dq_next = NULL;
31830Sstevel@tonic-gate 						daemon_request(&md_done_daemon,
31840Sstevel@tonic-gate 						    last_err_retry, dqp,
31850Sstevel@tonic-gate 						    REQ_OLD);
31860Sstevel@tonic-gate 						return (1);
31870Sstevel@tonic-gate 					}
31880Sstevel@tonic-gate 					break;
31890Sstevel@tonic-gate 				}
31900Sstevel@tonic-gate 			}
31910Sstevel@tonic-gate 		}
31920Sstevel@tonic-gate 
31930Sstevel@tonic-gate 		/* continue to process the buf without doing a retry */
31940Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_ERROR;
31950Sstevel@tonic-gate 		pb->b_error = cb->b_error;
31960Sstevel@tonic-gate 	}
31970Sstevel@tonic-gate 
31980Sstevel@tonic-gate 	return (mirror_done_common(cb));
31990Sstevel@tonic-gate }
32000Sstevel@tonic-gate 
32010Sstevel@tonic-gate /*
32020Sstevel@tonic-gate  * Split from the original mirror_done function so we can handle bufs after a
32030Sstevel@tonic-gate  * retry.
32040Sstevel@tonic-gate  * ps->ps_mx is already held in the caller of this function and the cb error
32050Sstevel@tonic-gate  * has already been checked and handled in the caller.
32060Sstevel@tonic-gate  */
32070Sstevel@tonic-gate static int
32080Sstevel@tonic-gate mirror_done_common(struct buf *cb)
32090Sstevel@tonic-gate {
32100Sstevel@tonic-gate 	struct buf	*pb;
32110Sstevel@tonic-gate 	mm_unit_t	*un;
32120Sstevel@tonic-gate 	mdi_unit_t	*ui;
32130Sstevel@tonic-gate 	md_mps_t	*ps;
32140Sstevel@tonic-gate 	md_mcs_t	*cs;
32150Sstevel@tonic-gate 	size_t		end_rr, start_rr, current_rr;
32160Sstevel@tonic-gate 
32170Sstevel@tonic-gate 	/*LINTED*/
32180Sstevel@tonic-gate 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
32190Sstevel@tonic-gate 	ps = cs->cs_ps;
32200Sstevel@tonic-gate 	pb = ps->ps_bp;
32210Sstevel@tonic-gate 
32220Sstevel@tonic-gate 	if (cb->b_flags & B_REMAPPED)
32230Sstevel@tonic-gate 		bp_mapout(cb);
32240Sstevel@tonic-gate 
32250Sstevel@tonic-gate 	ps->ps_frags--;
32260Sstevel@tonic-gate 	if (ps->ps_frags != 0) {
32270Sstevel@tonic-gate 		mutex_exit(&ps->ps_mx);
32280Sstevel@tonic-gate 		kmem_cache_free(mirror_child_cache, cs);
32290Sstevel@tonic-gate 		return (1);
32300Sstevel@tonic-gate 	}
32310Sstevel@tonic-gate 	un = ps->ps_un;
32320Sstevel@tonic-gate 	ui = ps->ps_ui;
32330Sstevel@tonic-gate 
32340Sstevel@tonic-gate 	/*
32350Sstevel@tonic-gate 	 * Do not update outstanding_writes if we're running with ABR
32360Sstevel@tonic-gate 	 * set for this mirror or the write() was issued with MD_STR_ABR set.
32370Sstevel@tonic-gate 	 * A resync-initiated write() does not update outstanding_writes
32380Sstevel@tonic-gate 	 * either.
32390Sstevel@tonic-gate 	 */
32400Sstevel@tonic-gate 	if (((cb->b_flags & B_READ) == 0) &&
32410Sstevel@tonic-gate 	    (un->un_nsm >= 2) &&
32420Sstevel@tonic-gate 	    (ps->ps_call == NULL) &&
32430Sstevel@tonic-gate 	    !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
32440Sstevel@tonic-gate 	    !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
32450Sstevel@tonic-gate 		BLK_TO_RR(end_rr, ps->ps_lastblk, un);
32460Sstevel@tonic-gate 		BLK_TO_RR(start_rr, ps->ps_firstblk, un);
32470Sstevel@tonic-gate 		mutex_enter(&un->un_resync_mx);
32480Sstevel@tonic-gate 		for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
32490Sstevel@tonic-gate 			un->un_outstanding_writes[current_rr]--;
32500Sstevel@tonic-gate 		mutex_exit(&un->un_resync_mx);
32510Sstevel@tonic-gate 	}
32520Sstevel@tonic-gate 	kmem_cache_free(mirror_child_cache, cs);
32530Sstevel@tonic-gate 	mutex_exit(&ps->ps_mx);
32540Sstevel@tonic-gate 
32550Sstevel@tonic-gate 	if (ps->ps_call != NULL) {
32560Sstevel@tonic-gate 		daemon_request(&md_done_daemon, ps->ps_call,
32570Sstevel@tonic-gate 		    (daemon_queue_t *)ps, REQ_OLD);
32580Sstevel@tonic-gate 		return (1);
32590Sstevel@tonic-gate 	}
32600Sstevel@tonic-gate 
32610Sstevel@tonic-gate 	if ((ps->ps_flags & MD_MPS_ERROR)) {
32620Sstevel@tonic-gate 		daemon_request(&md_done_daemon, mirror_error,
32630Sstevel@tonic-gate 		    (daemon_queue_t *)ps, REQ_OLD);
32640Sstevel@tonic-gate 		return (1);
32650Sstevel@tonic-gate 	}
32660Sstevel@tonic-gate 
32670Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_ON_OVERLAP)
32686901Sjkennedy 		mirror_overlap_tree_remove(ps);
32690Sstevel@tonic-gate 
32700Sstevel@tonic-gate 	/*
32710Sstevel@tonic-gate 	 * Handle Write-on-Write problem.
32720Sstevel@tonic-gate 	 * Skip in the case of raw and direct I/O as they are
32730Sstevel@tonic-gate 	 * handled earlier.
32740Sstevel@tonic-gate 	 *
32750Sstevel@tonic-gate 	 */
32760Sstevel@tonic-gate 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
32770Sstevel@tonic-gate 	    !(pb->b_flags & B_READ) &&
32780Sstevel@tonic-gate 	    !(ps->ps_flags & MD_MPS_WOW) &&
32790Sstevel@tonic-gate 	    !(pb->b_flags & B_PHYS) &&
32800Sstevel@tonic-gate 	    any_pages_dirty(pb)) {
32810Sstevel@tonic-gate 		md_unit_readerexit(ps->ps_ui);
32820Sstevel@tonic-gate 		daemon_request(&md_mstr_daemon, handle_wow,
32830Sstevel@tonic-gate 		    (daemon_queue_t *)ps, REQ_OLD);
32840Sstevel@tonic-gate 		return (1);
32850Sstevel@tonic-gate 	}
32860Sstevel@tonic-gate 
32870Sstevel@tonic-gate 	md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
32880Sstevel@tonic-gate 	MPS_FREE(mirror_parent_cache, ps);
32890Sstevel@tonic-gate 	md_unit_readerexit(ui);
32900Sstevel@tonic-gate 	md_biodone(pb);
32910Sstevel@tonic-gate 	return (0);
32920Sstevel@tonic-gate }
32930Sstevel@tonic-gate 
32940Sstevel@tonic-gate /*
32950Sstevel@tonic-gate  * Clear error state in submirror component if the retry worked after
32960Sstevel@tonic-gate  * a failfast error.
32970Sstevel@tonic-gate  */
32980Sstevel@tonic-gate static void
32990Sstevel@tonic-gate clear_retry_error(struct buf *cb)
33000Sstevel@tonic-gate {
33010Sstevel@tonic-gate 	int			smi;
33020Sstevel@tonic-gate 	md_mcs_t		*cs;
33030Sstevel@tonic-gate 	mm_unit_t		*un;
33040Sstevel@tonic-gate 	mdi_unit_t		*ui_sm;
33050Sstevel@tonic-gate 	mm_submirror_t		*sm;
33060Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
33070Sstevel@tonic-gate 	u_longlong_t		cnt;
33080Sstevel@tonic-gate 	md_m_shared_t		*shared;
33090Sstevel@tonic-gate 
33100Sstevel@tonic-gate 	/*LINTED*/
33110Sstevel@tonic-gate 	cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
33120Sstevel@tonic-gate 	un = cs->cs_ps->ps_un;
33130Sstevel@tonic-gate 
33140Sstevel@tonic-gate 	for (smi = 0; smi < NMIRROR; smi++) {
33156901Sjkennedy 		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
33166901Sjkennedy 			continue;
33176901Sjkennedy 
33186901Sjkennedy 		if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
33196901Sjkennedy 			break;
33200Sstevel@tonic-gate 	}
33210Sstevel@tonic-gate 
33220Sstevel@tonic-gate 	if (smi >= NMIRROR)
33236901Sjkennedy 		return;
33240Sstevel@tonic-gate 
33250Sstevel@tonic-gate 	sm = &un->un_sm[smi];
33260Sstevel@tonic-gate 	smic = &un->un_smic[smi];
33270Sstevel@tonic-gate 	cnt = cb->b_bcount;
33280Sstevel@tonic-gate 
33290Sstevel@tonic-gate 	ui_sm = MDI_UNIT(getminor(cb->b_edev));
33300Sstevel@tonic-gate 	(void) md_unit_writerlock(ui_sm);
33310Sstevel@tonic-gate 
33320Sstevel@tonic-gate 	shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
33330Sstevel@tonic-gate 	    cb->b_blkno, &cnt);
33340Sstevel@tonic-gate 
33350Sstevel@tonic-gate 	if (shared->ms_flags & MDM_S_IOERR) {
33366901Sjkennedy 		shared->ms_flags &= ~MDM_S_IOERR;
33370Sstevel@tonic-gate 
33380Sstevel@tonic-gate 	} else {
33396901Sjkennedy 		/* the buf spans components and the first one is not erred */
33406901Sjkennedy 		int	cnt;
33416901Sjkennedy 		int	i;
33426901Sjkennedy 
33436901Sjkennedy 		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
33446901Sjkennedy 		for (i = 0; i < cnt; i++) {
33456901Sjkennedy 			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
33466901Sjkennedy 			    (sm->sm_dev, sm, i);
33476901Sjkennedy 
33486901Sjkennedy 			if (shared->ms_flags & MDM_S_IOERR &&
33496901Sjkennedy 			    shared->ms_state == CS_OKAY) {
33506901Sjkennedy 
33516901Sjkennedy 				shared->ms_flags &= ~MDM_S_IOERR;
33526901Sjkennedy 				break;
33536901Sjkennedy 			}
33540Sstevel@tonic-gate 		}
33550Sstevel@tonic-gate 	}
33560Sstevel@tonic-gate 
33570Sstevel@tonic-gate 	md_unit_writerexit(ui_sm);
33580Sstevel@tonic-gate }
33590Sstevel@tonic-gate 
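/*
 * Map a read onto a submirror: pick a readable unit for the requested
 * range and set up the child buf. Returns 0 when the entire range was
 * mapped, otherwise the number of blocks actually mapped so that the
 * caller can issue the remainder as further fragments.
 */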
33600Sstevel@tonic-gate static size_t
33610Sstevel@tonic-gate mirror_map_read(
33620Sstevel@tonic-gate 	md_mps_t *ps,
33630Sstevel@tonic-gate 	md_mcs_t *cs,
33640Sstevel@tonic-gate 	diskaddr_t blkno,
33650Sstevel@tonic-gate 	u_longlong_t	count
33660Sstevel@tonic-gate )
33670Sstevel@tonic-gate {
33680Sstevel@tonic-gate 	mm_unit_t	*un;
33690Sstevel@tonic-gate 	buf_t		*bp;
33700Sstevel@tonic-gate 	u_longlong_t	cando;
33710Sstevel@tonic-gate 
33720Sstevel@tonic-gate 	bp = &cs->cs_buf;
33730Sstevel@tonic-gate 	un = ps->ps_un;
33740Sstevel@tonic-gate 
33750Sstevel@tonic-gate 	bp->b_lblkno = blkno;
33760Sstevel@tonic-gate 	if (fast_select_read_unit(ps, cs) == 0) {
33770Sstevel@tonic-gate 		bp->b_bcount = ldbtob(count);
33780Sstevel@tonic-gate 		return (0);
33790Sstevel@tonic-gate 	}
33806901Sjkennedy 	bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
33816901Sjkennedy 	    count, &cando, 0, NULL, cs));
33820Sstevel@tonic-gate 	bp->b_bcount = ldbtob(cando);
33830Sstevel@tonic-gate 	if (count != cando)
33840Sstevel@tonic-gate 		return (cando);
33850Sstevel@tonic-gate 	return (0);
33860Sstevel@tonic-gate }
33870Sstevel@tonic-gate 
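/*
 * Completion stage of a resync read: re-drive the parent buf as the
 * write half of the write-after-read, routing any read error through
 * mirror_error() first.
 */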
33880Sstevel@tonic-gate static void
33890Sstevel@tonic-gate write_after_read(md_mps_t *ps)
33900Sstevel@tonic-gate {
33910Sstevel@tonic-gate 	struct buf	*pb;
33920Sstevel@tonic-gate 	int		flags;
33930Sstevel@tonic-gate 
33940Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_ERROR) {
33950Sstevel@tonic-gate 		mirror_error(ps);
33960Sstevel@tonic-gate 		return;
33970Sstevel@tonic-gate 	}
33980Sstevel@tonic-gate 
33990Sstevel@tonic-gate 	pb = ps->ps_bp;
34000Sstevel@tonic-gate 	md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
34010Sstevel@tonic-gate 	ps->ps_call = NULL;
34020Sstevel@tonic-gate 	ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
34030Sstevel@tonic-gate 	flags = MD_STR_NOTTOP | MD_STR_WAR;
34040Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_MAPPED)
34050Sstevel@tonic-gate 		flags |= MD_STR_MAPPED;
34060Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_NOBLOCK)
34070Sstevel@tonic-gate 		flags |= MD_NOBLOCK;
34080Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_DIRTY_RD)
34090Sstevel@tonic-gate 		flags |= MD_STR_DIRTY_RD;
34100Sstevel@tonic-gate 	(void) mirror_write_strategy(pb, flags, ps);
34110Sstevel@tonic-gate }
34120Sstevel@tonic-gate 
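/*
 * Continue a serialized mirror write: build and issue the child write
 * for the next submirror in turn.
 */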
34130Sstevel@tonic-gate static void
34140Sstevel@tonic-gate continue_serial(md_mps_t *ps)
34150Sstevel@tonic-gate {
34160Sstevel@tonic-gate 	md_mcs_t	*cs;
34170Sstevel@tonic-gate 	buf_t		*cb;
34180Sstevel@tonic-gate 	mm_unit_t	*un;
34190Sstevel@tonic-gate 	int		flags;
34200Sstevel@tonic-gate 
34210Sstevel@tonic-gate 	un = ps->ps_un;
34220Sstevel@tonic-gate 	cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
34230Sstevel@tonic-gate 	mirror_child_init(cs);
34240Sstevel@tonic-gate 	cb = &cs->cs_buf;
34250Sstevel@tonic-gate 	ps->ps_call = NULL;
34260Sstevel@tonic-gate 	ps->ps_frags = 1;
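	/*
	 * Map the write onto the next submirror in the serial sequence;
	 * mirror_map_write() re-arms ps_call with continue_serial if
	 * further submirrors remain to be written.
	 */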
34270Sstevel@tonic-gate 	(void) mirror_map_write(un, cs, ps, 0);
34280Sstevel@tonic-gate 	flags = MD_STR_NOTTOP;
34290Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_MAPPED)
34300Sstevel@tonic-gate 		flags |= MD_STR_MAPPED;
34310Sstevel@tonic-gate 	md_call_strategy(cb, flags, NULL);
34320Sstevel@tonic-gate }
34330Sstevel@tonic-gate 
34340Sstevel@tonic-gate static int
34350Sstevel@tonic-gate mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
34360Sstevel@tonic-gate {
34370Sstevel@tonic-gate 	int i;
34380Sstevel@tonic-gate 	dev_t		dev;	/* needed for bioclone, so not md_dev64_t */
34390Sstevel@tonic-gate 	buf_t		*cb;
34400Sstevel@tonic-gate 	buf_t		*pb;
34410Sstevel@tonic-gate 	diskaddr_t	blkno;
34420Sstevel@tonic-gate 	size_t		bcount;
34430Sstevel@tonic-gate 	off_t		offset;
34440Sstevel@tonic-gate 
34450Sstevel@tonic-gate 	pb = ps->ps_bp;
34460Sstevel@tonic-gate 	cb = &cs->cs_buf;
34470Sstevel@tonic-gate 	cs->cs_ps = ps;
34480Sstevel@tonic-gate 
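	/*
	 * Select the next writable submirror for this child i/o:
	 * ps_writable_sm holds the candidate submirrors and
	 * ps_current_sm indexes into them.
	 */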
34490Sstevel@tonic-gate 	i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
34500Sstevel@tonic-gate 
34510Sstevel@tonic-gate 	dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
34520Sstevel@tonic-gate 
34530Sstevel@tonic-gate 	blkno = pb->b_lblkno;
34540Sstevel@tonic-gate 	bcount = pb->b_bcount;
34550Sstevel@tonic-gate 	offset = 0;
34560Sstevel@tonic-gate 	if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
34570Sstevel@tonic-gate 		blkno = DK_LABEL_LOC + 1;
34580Sstevel@tonic-gate 		/*
34590Sstevel@tonic-gate 		 * This handles the case where we're requesting
34600Sstevel@tonic-gate 		 * a write to block 0 on a label partition
34610Sstevel@tonic-gate 		 * and the request size was smaller than the
34620Sstevel@tonic-gate 		 * size of the label.  If this is the case
34630Sstevel@tonic-gate 		 * then we'll return -1.  Failure to do so will
34640Sstevel@tonic-gate 		 * either cause the calling thread to hang due to
34650Sstevel@tonic-gate 		 * an ssd bug, or worse, allow the bcount to go
34660Sstevel@tonic-gate 		 * negative (i.e. wrap to a very large value).
34670Sstevel@tonic-gate 		 */
34680Sstevel@tonic-gate 		if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
34690Sstevel@tonic-gate 			return (-1);
34700Sstevel@tonic-gate 		bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
34710Sstevel@tonic-gate 		offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
34720Sstevel@tonic-gate 	}
34730Sstevel@tonic-gate 
34740Sstevel@tonic-gate 	cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
34750Sstevel@tonic-gate 	    cb, KM_NOSLEEP);
34760Sstevel@tonic-gate 	if (war)
34770Sstevel@tonic-gate 		cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
34780Sstevel@tonic-gate 
34790Sstevel@tonic-gate 	/*
34800Sstevel@tonic-gate 	 * If the submirror is in the erred state, check if any component is
34810Sstevel@tonic-gate 	 * in the Last Erred state.  If so, we don't want to use the B_FAILFAST
34820Sstevel@tonic-gate 	 * flag on the IO.
34830Sstevel@tonic-gate 	 *
34840Sstevel@tonic-gate 	 * Provide a fast path for the non-erred case (which should be the
34850Sstevel@tonic-gate 	 * normal case).
34860Sstevel@tonic-gate 	 */
34870Sstevel@tonic-gate 	if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
34880Sstevel@tonic-gate 		if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
34890Sstevel@tonic-gate 			mm_submirror_t		*sm;
34900Sstevel@tonic-gate 			mm_submirror_ic_t	*smic;
34910Sstevel@tonic-gate 			int			ci;
34920Sstevel@tonic-gate 			int			compcnt;
34930Sstevel@tonic-gate 
34940Sstevel@tonic-gate 			sm = &un->un_sm[i];
34950Sstevel@tonic-gate 			smic = &un->un_smic[i];
34960Sstevel@tonic-gate 
34970Sstevel@tonic-gate 			compcnt = (*(smic->sm_get_component_count))
34980Sstevel@tonic-gate 			    (sm->sm_dev, un);
34990Sstevel@tonic-gate 			for (ci = 0; ci < compcnt; ci++) {
35000Sstevel@tonic-gate 				md_m_shared_t	*shared;
35010Sstevel@tonic-gate 
35020Sstevel@tonic-gate 				shared = (md_m_shared_t *)
35030Sstevel@tonic-gate 				    (*(smic->sm_shared_by_indx))(sm->sm_dev,
35040Sstevel@tonic-gate 				    sm, ci);
35050Sstevel@tonic-gate 
35060Sstevel@tonic-gate 				if (shared->ms_state == CS_LAST_ERRED)
35070Sstevel@tonic-gate 					break;
35080Sstevel@tonic-gate 			}
35090Sstevel@tonic-gate 			if (ci >= compcnt)
35100Sstevel@tonic-gate 				cb->b_flags |= B_FAILFAST;
35110Sstevel@tonic-gate 
35120Sstevel@tonic-gate 		} else {
35130Sstevel@tonic-gate 			cb->b_flags |= B_FAILFAST;
35140Sstevel@tonic-gate 		}
35150Sstevel@tonic-gate 	}
35160Sstevel@tonic-gate 
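	/*
	 * Advance to the next submirror. If submirrors remain, either
	 * defer the next child i/o until this one completes (serial
	 * writes, via continue_serial) or return 1 so the caller issues
	 * another child immediately (parallel writes).
	 */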
35170Sstevel@tonic-gate 	ps->ps_current_sm++;
35180Sstevel@tonic-gate 	if (ps->ps_current_sm != ps->ps_active_cnt) {
35190Sstevel@tonic-gate 		if (un->un_write_option == WR_SERIAL) {
35200Sstevel@tonic-gate 			ps->ps_call = continue_serial;
35210Sstevel@tonic-gate 			return (0);
35220Sstevel@tonic-gate 		}
35230Sstevel@tonic-gate 		return (1);
35240Sstevel@tonic-gate 	}
35250Sstevel@tonic-gate 	return (0);
35260Sstevel@tonic-gate }
35270Sstevel@tonic-gate 
35280Sstevel@tonic-gate /*
35290Sstevel@tonic-gate  * directed_read_done:
35300Sstevel@tonic-gate  * ------------------
35310Sstevel@tonic-gate  * Completion routine called when a DMR request has been returned from the
35320Sstevel@tonic-gate  * underlying driver. Wake-up the original ioctl() and return the data to
35330Sstevel@tonic-gate  * underlying driver. Wake up the original ioctl() and return the data to
35340Sstevel@tonic-gate  */
35350Sstevel@tonic-gate static void
35360Sstevel@tonic-gate directed_read_done(md_mps_t *ps)
35370Sstevel@tonic-gate {
35380Sstevel@tonic-gate 	mm_unit_t	*un;
35390Sstevel@tonic-gate 	mdi_unit_t	*ui;
35400Sstevel@tonic-gate 
35410Sstevel@tonic-gate 	un = ps->ps_un;
35420Sstevel@tonic-gate 	ui = ps->ps_ui;
35430Sstevel@tonic-gate 
35440Sstevel@tonic-gate 	md_unit_readerexit(ui);
35450Sstevel@tonic-gate 	md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
35460Sstevel@tonic-gate 	ps->ps_call = NULL;
35470Sstevel@tonic-gate 
35480Sstevel@tonic-gate 	mutex_enter(&un->un_dmr_mx);
35490Sstevel@tonic-gate 	cv_signal(&un->un_dmr_cv);
35500Sstevel@tonic-gate 	mutex_exit(&un->un_dmr_mx);
35510Sstevel@tonic-gate 
35520Sstevel@tonic-gate 	/* release the parent structure */
35530Sstevel@tonic-gate 	kmem_cache_free(mirror_parent_cache, ps);
35540Sstevel@tonic-gate }
35550Sstevel@tonic-gate 
35560Sstevel@tonic-gate /*
35570Sstevel@tonic-gate  * daemon_io:
35580Sstevel@tonic-gate  * ------------
35590Sstevel@tonic-gate  * Called to issue a mirror_write_strategy() or mirror_read_strategy()
35600Sstevel@tonic-gate  * call from a blockable context. NOTE: no mutex can be held on entry to this
35610Sstevel@tonic-gate  * routine.
35620Sstevel@tonic-gate  */
35630Sstevel@tonic-gate static void
35640Sstevel@tonic-gate daemon_io(daemon_queue_t *dq)
35650Sstevel@tonic-gate {
35660Sstevel@tonic-gate 	md_mps_t	*ps = (md_mps_t *)dq;
35670Sstevel@tonic-gate 	int		flag = MD_STR_NOTTOP;
35680Sstevel@tonic-gate 	buf_t		*pb = ps->ps_bp;
35690Sstevel@tonic-gate 
35700Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_MAPPED)
35710Sstevel@tonic-gate 		flag |= MD_STR_MAPPED;
35720Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_WOW)
35730Sstevel@tonic-gate 		flag |= MD_STR_WOW;
35740Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
35750Sstevel@tonic-gate 		flag |= MD_STR_WAR;
35760Sstevel@tonic-gate 	if (ps->ps_flags & MD_MPS_ABR)
35770Sstevel@tonic-gate 		flag |= MD_STR_ABR;
35787975SAchim.Maurer@Sun.COM 	if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
35797975SAchim.Maurer@Sun.COM 		flag |= MD_STR_BLOCK_OK;
35800Sstevel@tonic-gate 
35810Sstevel@tonic-gate 	/*
35820Sstevel@tonic-gate 	 * If this is a resync read, i.e. MD_STR_DIRTY_RD not set, set
35830Sstevel@tonic-gate 	 * MD_STR_WAR before calling mirror_read_strategy().
35840Sstevel@tonic-gate 	 */
35850Sstevel@tonic-gate 	if (pb->b_flags & B_READ) {
35860Sstevel@tonic-gate 		if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
35870Sstevel@tonic-gate 			flag |= MD_STR_WAR;
35880Sstevel@tonic-gate 		mirror_read_strategy(pb, flag, ps);
35890Sstevel@tonic-gate 	} else
35900Sstevel@tonic-gate 		mirror_write_strategy(pb, flag, ps);
35910Sstevel@tonic-gate }
35920Sstevel@tonic-gate 
35930Sstevel@tonic-gate /*
35940Sstevel@tonic-gate  * update_resync:
35950Sstevel@tonic-gate  * -------------
35960Sstevel@tonic-gate  * Called to update the in-core version of the resync record with the latest
35970Sstevel@tonic-gate  * version that was committed to disk when the previous mirror owner
35980Sstevel@tonic-gate  * relinquished ownership. This call is likely to block as we must hold off
35990Sstevel@tonic-gate  * any current resync processing that may be occurring.
36000Sstevel@tonic-gate  * On completion of the resync record update we issue the mirror_write_strategy
36010Sstevel@tonic-gate  * call to complete the i/o that first started this sequence. To remove a race
36020Sstevel@tonic-gate  * condition between a newly submitted write() request and the resync
36030Sstevel@tonic-gate  * record update, we acquire the writerlock. This will hold off all i/o to the
36040Sstevel@tonic-gate  * mirror until the resync update has completed.
36050Sstevel@tonic-gate  * NOTE: no mutex can be held on entry to this routine
36060Sstevel@tonic-gate  */
36070Sstevel@tonic-gate static void
36080Sstevel@tonic-gate update_resync(daemon_queue_t *dq)
36090Sstevel@tonic-gate {
36100Sstevel@tonic-gate 	md_mps_t	*ps = (md_mps_t *)dq;
36110Sstevel@tonic-gate 	buf_t		*pb = ps->ps_bp;
36120Sstevel@tonic-gate 	mdi_unit_t	*ui = ps->ps_ui;
36138452SJohn.Wren.Kennedy@Sun.COM 	mm_unit_t	*un = MD_UNIT(ui->ui_link.ln_id);
36140Sstevel@tonic-gate 	set_t		setno;
36150Sstevel@tonic-gate 	int		restart_resync;
36160Sstevel@tonic-gate 
36178452SJohn.Wren.Kennedy@Sun.COM 	mutex_enter(&un->un_rrp_inflight_mx);
36188452SJohn.Wren.Kennedy@Sun.COM 	(void) md_unit_writerlock(ui);
36190Sstevel@tonic-gate 	ps->ps_un = un;
36200Sstevel@tonic-gate 	setno = MD_MIN2SET(getminor(pb->b_edev));
36210Sstevel@tonic-gate 	if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
36220Sstevel@tonic-gate 		/*
36230Sstevel@tonic-gate 		 * Synchronize our in-core view of what regions need to be
36240Sstevel@tonic-gate 		 * resync'd with the on-disk version.
36250Sstevel@tonic-gate 		 */
36260Sstevel@tonic-gate 		mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
36270Sstevel@tonic-gate 		    un->un_dirty_bm);
36280Sstevel@tonic-gate 
36290Sstevel@tonic-gate 		/* Region dirty map is now up to date */
36300Sstevel@tonic-gate 	}
36310Sstevel@tonic-gate 	restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
36320Sstevel@tonic-gate 	md_unit_writerexit(ui);
36338452SJohn.Wren.Kennedy@Sun.COM 	mutex_exit(&un->un_rrp_inflight_mx);
36340Sstevel@tonic-gate 
36350Sstevel@tonic-gate 	/* Restart the resync thread if it was previously blocked */
36360Sstevel@tonic-gate 	if (restart_resync) {
36370Sstevel@tonic-gate 		mutex_enter(&un->un_rs_thread_mx);
36380Sstevel@tonic-gate 		un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
36390Sstevel@tonic-gate 		cv_signal(&un->un_rs_thread_cv);
36400Sstevel@tonic-gate 		mutex_exit(&un->un_rs_thread_mx);
36410Sstevel@tonic-gate 	}
36420Sstevel@tonic-gate 	/* Continue with original deferred i/o */
36430Sstevel@tonic-gate 	daemon_io(dq);
36440Sstevel@tonic-gate }
36450Sstevel@tonic-gate 
36460Sstevel@tonic-gate /*
36470Sstevel@tonic-gate  * owner_timeout:
36480Sstevel@tonic-gate  * -------------
36490Sstevel@tonic-gate  * Called if the original mdmn_ksend_message() failed and the request is to be
36500Sstevel@tonic-gate  * retried. Reattempt the original ownership change.
36510Sstevel@tonic-gate  *
36520Sstevel@tonic-gate  * NOTE: called at interrupt context (see timeout(9f)).
36530Sstevel@tonic-gate  */
36540Sstevel@tonic-gate static void
36550Sstevel@tonic-gate owner_timeout(void *arg)
36560Sstevel@tonic-gate {
36570Sstevel@tonic-gate 	daemon_queue_t	*dq = (daemon_queue_t *)arg;
36580Sstevel@tonic-gate 
36590Sstevel@tonic-gate 	daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
36600Sstevel@tonic-gate }
36610Sstevel@tonic-gate 
36620Sstevel@tonic-gate /*
36630Sstevel@tonic-gate  * become_owner:
36640Sstevel@tonic-gate  * ------------
36650Sstevel@tonic-gate  * Called to issue RPC request to become the owner of the mirror
36660Sstevel@tonic-gate  * associated with this i/o request. We assume that the ownership request
36670Sstevel@tonic-gate  * is synchronous, so if it succeeds we will issue the request via
36680Sstevel@tonic-gate  * mirror_write_strategy().
36690Sstevel@tonic-gate  * If multiple i/o's are outstanding we will be called from the mirror_daemon
36700Sstevel@tonic-gate  * service thread.
36710Sstevel@tonic-gate  * NOTE: no mutex should be held on entry to this routine.
36720Sstevel@tonic-gate  */
36730Sstevel@tonic-gate static void
36740Sstevel@tonic-gate become_owner(daemon_queue_t *dq)
36750Sstevel@tonic-gate {
36760Sstevel@tonic-gate 	md_mps_t	*ps = (md_mps_t *)dq;
36770Sstevel@tonic-gate 	mm_unit_t	*un = ps->ps_un;
36780Sstevel@tonic-gate 	buf_t		*pb = ps->ps_bp;
36790Sstevel@tonic-gate 	set_t		setno;
36800Sstevel@tonic-gate 	md_mn_kresult_t	*kres;
36810Sstevel@tonic-gate 	int		msg_flags = md_mirror_msg_flags;
36820Sstevel@tonic-gate 	md_mps_t	*ps1;
36830Sstevel@tonic-gate 
36840Sstevel@tonic-gate 	ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
36850Sstevel@tonic-gate 
36860Sstevel@tonic-gate 	/*
36870Sstevel@tonic-gate 	 * If we're already the mirror owner we do not need to send a message
36880Sstevel@tonic-gate 	 * but can simply process the i/o request immediately.
36890Sstevel@tonic-gate 	 * If we've already sent the request to become owner we requeue the
36900Sstevel@tonic-gate 	 * request as we're waiting for the synchronous ownership message to
36910Sstevel@tonic-gate 	 * be processed.
36920Sstevel@tonic-gate 	 */
36930Sstevel@tonic-gate 	if (MD_MN_MIRROR_OWNER(un)) {
36940Sstevel@tonic-gate 		/*
36950Sstevel@tonic-gate 		 * As the strategy() call will potentially block we need to
36960Sstevel@tonic-gate 		 * punt this to a separate thread and complete this request
36970Sstevel@tonic-gate 		 * as quickly as possible. Note: if we're a read request
36980Sstevel@tonic-gate 		 * this must be a resync; we cannot afford to be queued
36990Sstevel@tonic-gate 		 * behind any intervening i/o requests. In this case we put the
37000Sstevel@tonic-gate 		 * request on the md_mirror_rs_daemon queue.
37010Sstevel@tonic-gate 		 */
37020Sstevel@tonic-gate 		if (pb->b_flags & B_READ) {
37030Sstevel@tonic-gate 			daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
37040Sstevel@tonic-gate 			    REQ_OLD);
37050Sstevel@tonic-gate 		} else {
37060Sstevel@tonic-gate 			daemon_request(&md_mirror_io_daemon, daemon_io, dq,
37070Sstevel@tonic-gate 			    REQ_OLD);
37080Sstevel@tonic-gate 		}
37090Sstevel@tonic-gate 	} else {
37100Sstevel@tonic-gate 		mutex_enter(&un->un_owner_mx);
37110Sstevel@tonic-gate 		if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
37120Sstevel@tonic-gate 			md_mn_req_owner_t	*msg;
37130Sstevel@tonic-gate 			int			rval = 0;
37140Sstevel@tonic-gate 
37150Sstevel@tonic-gate 			/*
37160Sstevel@tonic-gate 			 * Check to see that we haven't exceeded the maximum
37170Sstevel@tonic-gate 			 * retry count. If we have, we fail the i/o as the
37180Sstevel@tonic-gate 			 * comms mechanism has become wedged beyond recovery.
37190Sstevel@tonic-gate 			 */
37200Sstevel@tonic-gate 			if (dq->qlen++ >= MD_OWNER_RETRIES) {
37210Sstevel@tonic-gate 				mutex_exit(&un->un_owner_mx);
37220Sstevel@tonic-gate 				cmn_err(CE_WARN,
37230Sstevel@tonic-gate 				    "md_mirror: Request exhausted ownership "
37240Sstevel@tonic-gate 				    "retry limit of %d attempts", dq->qlen);
37250Sstevel@tonic-gate 				pb->b_error = EIO;
37260Sstevel@tonic-gate 				pb->b_flags |= B_ERROR;
37270Sstevel@tonic-gate 				pb->b_resid = pb->b_bcount;
37280Sstevel@tonic-gate 				kmem_cache_free(mirror_parent_cache, ps);
37290Sstevel@tonic-gate 				md_biodone(pb);
37300Sstevel@tonic-gate 				return;
37310Sstevel@tonic-gate 			}
37320Sstevel@tonic-gate 
37330Sstevel@tonic-gate 			/*
37340Sstevel@tonic-gate 			 * Issue request to change ownership. The call is
37350Sstevel@tonic-gate 			 * synchronous so when it returns we can complete the
37360Sstevel@tonic-gate 			 * i/o (if successful), or enqueue it again so that
37370Sstevel@tonic-gate 			 * the operation will be retried.
37380Sstevel@tonic-gate 			 */
37390Sstevel@tonic-gate 			un->un_owner_state |= MM_MN_OWNER_SENT;
37400Sstevel@tonic-gate 			mutex_exit(&un->un_owner_mx);
37410Sstevel@tonic-gate 
37420Sstevel@tonic-gate 			msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
37430Sstevel@tonic-gate 			setno = MD_MIN2SET(getminor(pb->b_edev));
37440Sstevel@tonic-gate 			msg->mnum = MD_SID(un);
37450Sstevel@tonic-gate 			msg->owner = md_mn_mynode_id;
37460Sstevel@tonic-gate 			msg_flags |= MD_MSGF_NO_LOG;
37470Sstevel@tonic-gate 			/*
37480Sstevel@tonic-gate 			 * If this IO is triggered by updating a watermark,
37490Sstevel@tonic-gate 			 * it might be issued by the creation of a softpartition
37500Sstevel@tonic-gate 			 * while the commd subsystem is suspended.
37510Sstevel@tonic-gate 			 * We don't want this message to block.
37520Sstevel@tonic-gate 			 */
37530Sstevel@tonic-gate 			if (ps->ps_flags & MD_MPS_WMUPDATE) {
37540Sstevel@tonic-gate 				msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
37550Sstevel@tonic-gate 			}
37560Sstevel@tonic-gate 
37570Sstevel@tonic-gate 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
37580Sstevel@tonic-gate 			rval = mdmn_ksend_message(setno,
37598452SJohn.Wren.Kennedy@Sun.COM 			    MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
37608452SJohn.Wren.Kennedy@Sun.COM 			    (char *)msg, sizeof (md_mn_req_owner_t), kres);
37610Sstevel@tonic-gate 
37620Sstevel@tonic-gate 			kmem_free(msg, sizeof (md_mn_req_owner_t));
37630Sstevel@tonic-gate 
37640Sstevel@tonic-gate 			if (MDMN_KSEND_MSG_OK(rval, kres)) {
37650Sstevel@tonic-gate 				dq->qlen = 0;
37660Sstevel@tonic-gate 				/*
37670Sstevel@tonic-gate 				 * Successfully changed owner, reread the
37680Sstevel@tonic-gate 				 * resync record so that we have a valid idea of
37690Sstevel@tonic-gate 				 * any previously committed incomplete write()s.
37700Sstevel@tonic-gate 				 * NOTE: As we need to acquire the resync mutex
37710Sstevel@tonic-gate 				 * this may block, so we defer it to a separate
37720Sstevel@tonic-gate 				 * thread handler. This makes us (effectively)
37730Sstevel@tonic-gate 				 * non-blocking once the ownership message
37740Sstevel@tonic-gate 				 * handling has completed.
37750Sstevel@tonic-gate 				 */
37760Sstevel@tonic-gate 				mutex_enter(&un->un_owner_mx);
37770Sstevel@tonic-gate 				if (un->un_owner_state & MM_MN_BECOME_OWNER) {
37780Sstevel@tonic-gate 					un->un_mirror_owner = md_mn_mynode_id;
37790Sstevel@tonic-gate 					/* Sets owner of un_rr_dirty record */
37800Sstevel@tonic-gate 					if (un->un_rr_dirty_recid)
37810Sstevel@tonic-gate 						(void) mddb_setowner(
37820Sstevel@tonic-gate 						    un->un_rr_dirty_recid,
37830Sstevel@tonic-gate 						    md_mn_mynode_id);
37840Sstevel@tonic-gate 					un->un_owner_state &=
37850Sstevel@tonic-gate 					    ~MM_MN_BECOME_OWNER;
37860Sstevel@tonic-gate 					/*
37870Sstevel@tonic-gate 					 * Release the block on the current
37880Sstevel@tonic-gate 					 * resync region if it is blocked
37890Sstevel@tonic-gate 					 */
37906901Sjkennedy 					ps1 = un->un_rs_prev_overlap;
37910Sstevel@tonic-gate 					if ((ps1 != NULL) &&
37920Sstevel@tonic-gate 					    (ps1->ps_flags & MD_MPS_ON_OVERLAP))
37936901Sjkennedy 						mirror_overlap_tree_remove(ps1);
37940Sstevel@tonic-gate 					mutex_exit(&un->un_owner_mx);
37950Sstevel@tonic-gate 
37960Sstevel@tonic-gate 					/*
37970Sstevel@tonic-gate 					 * If we're a read, this must be a
37980Sstevel@tonic-gate 					 * resync request, issue
37990Sstevel@tonic-gate 					 * the i/o request on the
38000Sstevel@tonic-gate 					 * md_mirror_rs_daemon queue. This is
38010Sstevel@tonic-gate 					 * to avoid a deadlock between the
38020Sstevel@tonic-gate 					 * resync_unit thread and
38030Sstevel@tonic-gate 					 * subsequent i/o requests that may
38040Sstevel@tonic-gate 					 * block on the resync region.
38050Sstevel@tonic-gate 					 */
38060Sstevel@tonic-gate 					if (pb->b_flags & B_READ) {
38070Sstevel@tonic-gate 						daemon_request(
38080Sstevel@tonic-gate 						    &md_mirror_rs_daemon,
38090Sstevel@tonic-gate 						    update_resync, dq, REQ_OLD);
38100Sstevel@tonic-gate 					} else {
38110Sstevel@tonic-gate 						daemon_request(
38120Sstevel@tonic-gate 						    &md_mirror_io_daemon,
38130Sstevel@tonic-gate 						    update_resync, dq, REQ_OLD);
38140Sstevel@tonic-gate 					}
38150Sstevel@tonic-gate 					kmem_free(kres,
38160Sstevel@tonic-gate 					    sizeof (md_mn_kresult_t));
38170Sstevel@tonic-gate 					return;
38180Sstevel@tonic-gate 				} else {
38190Sstevel@tonic-gate 					/*
38200Sstevel@tonic-gate 					 * Some other node has beaten us to
38210Sstevel@tonic-gate 					 * obtain ownership. We need to
38220Sstevel@tonic-gate 					 * reschedule our ownership request
38230Sstevel@tonic-gate 					 */
38240Sstevel@tonic-gate 					mutex_exit(&un->un_owner_mx);
38250Sstevel@tonic-gate 				}
38260Sstevel@tonic-gate 			} else {
38270Sstevel@tonic-gate 				mdmn_ksend_show_error(rval, kres,
38280Sstevel@tonic-gate 				    "MD_MN_MSG_REQUIRE_OWNER");
38290Sstevel@tonic-gate 				/*
38300Sstevel@tonic-gate 				 * Message transport failure is handled by the
38310Sstevel@tonic-gate 				 * comms layer. If the ownership change request
38320Sstevel@tonic-gate 				 * does not succeed we need to flag the error to
38330Sstevel@tonic-gate 				 * the initiator of the i/o. This is handled by
38340Sstevel@tonic-gate 				 * the retry logic above. As the request failed
38350Sstevel@tonic-gate 				 * we do not know _who_ the owner of the mirror
38360Sstevel@tonic-gate 				 * currently is. We reset our idea of the owner
38370Sstevel@tonic-gate 				 * to None so that any further write()s will
38380Sstevel@tonic-gate 				 * attempt to become the owner again. This stops
38390Sstevel@tonic-gate 				 * multiple nodes writing to the same mirror
38400Sstevel@tonic-gate 				 * simultaneously.
38410Sstevel@tonic-gate 				 */
38420Sstevel@tonic-gate 				mutex_enter(&un->un_owner_mx);
38430Sstevel@tonic-gate 				un->un_owner_state &=
38440Sstevel@tonic-gate 				    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
38450Sstevel@tonic-gate 				un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
38460Sstevel@tonic-gate 				mutex_exit(&un->un_owner_mx);
38470Sstevel@tonic-gate 			}
38480Sstevel@tonic-gate 			kmem_free(kres, sizeof (md_mn_kresult_t));
38490Sstevel@tonic-gate 		} else
38500Sstevel@tonic-gate 			mutex_exit(&un->un_owner_mx);
38510Sstevel@tonic-gate 
38520Sstevel@tonic-gate 		/*
38530Sstevel@tonic-gate 		 * Re-enqueue this request on the deferred i/o list. Delay the
38540Sstevel@tonic-gate 		 * request for md_mirror_owner_to usecs to stop thrashing.
38550Sstevel@tonic-gate 		 */
38560Sstevel@tonic-gate 		(void) timeout(owner_timeout, dq,
38570Sstevel@tonic-gate 		    drv_usectohz(md_mirror_owner_to));
38580Sstevel@tonic-gate 	}
38590Sstevel@tonic-gate }
38600Sstevel@tonic-gate 
38610Sstevel@tonic-gate static void
38620Sstevel@tonic-gate mirror_write_strategy(buf_t *pb, int flag, void *private)
38630Sstevel@tonic-gate {
38640Sstevel@tonic-gate 	md_mps_t	*ps;
38650Sstevel@tonic-gate 	md_mcs_t	*cs;
38660Sstevel@tonic-gate 	int		more;
38670Sstevel@tonic-gate 	mm_unit_t	*un;
38680Sstevel@tonic-gate 	mdi_unit_t	*ui;
38690Sstevel@tonic-gate 	buf_t		*cb;		/* child buf pointer */
38700Sstevel@tonic-gate 	set_t		setno;
38710Sstevel@tonic-gate 	int		rs_on_overlap = 0;
38720Sstevel@tonic-gate 
38730Sstevel@tonic-gate 	ui = MDI_UNIT(getminor(pb->b_edev));
38740Sstevel@tonic-gate 	un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
38750Sstevel@tonic-gate 
38760Sstevel@tonic-gate 
38770Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
38780Sstevel@tonic-gate 
38790Sstevel@tonic-gate 	/*
38800Sstevel@tonic-gate 	 * If a state change is in progress for this mirror in a MN set,
38810Sstevel@tonic-gate 	 * suspend all non-resync writes until the state change is complete.
38820Sstevel@tonic-gate 	 * The objective of this suspend is to ensure that it is not
38830Sstevel@tonic-gate 	 * possible for one node to read data from a submirror that another node
38840Sstevel@tonic-gate 	 * has not written to because of the state change. Therefore we
38850Sstevel@tonic-gate 	 * suspend all writes until the state change has been made. As it is
38860Sstevel@tonic-gate 	 * not possible to read from the target of a resync, there is no need
38870Sstevel@tonic-gate 	 * to suspend resync writes.
38887975SAchim.Maurer@Sun.COM 	 * Note that we only block here if the caller can handle a busy-wait.
38897975SAchim.Maurer@Sun.COM 	 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
38900Sstevel@tonic-gate 	 */
38910Sstevel@tonic-gate 
38920Sstevel@tonic-gate 	if (!(flag & MD_STR_WAR)) {
38937975SAchim.Maurer@Sun.COM 		if (flag & MD_STR_BLOCK_OK) {
38947975SAchim.Maurer@Sun.COM 			mutex_enter(&un->un_suspend_wr_mx);
38957975SAchim.Maurer@Sun.COM 			while (un->un_suspend_wr_flag) {
38967975SAchim.Maurer@Sun.COM 				cv_wait(&un->un_suspend_wr_cv,
38977975SAchim.Maurer@Sun.COM 				    &un->un_suspend_wr_mx);
38987975SAchim.Maurer@Sun.COM 			}
38997975SAchim.Maurer@Sun.COM 			mutex_exit(&un->un_suspend_wr_mx);
39000Sstevel@tonic-gate 		}
39010Sstevel@tonic-gate 		(void) md_unit_readerlock(ui);
39020Sstevel@tonic-gate 	}
39030Sstevel@tonic-gate 
39040Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP)) {
39050Sstevel@tonic-gate 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
39060Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
39070Sstevel@tonic-gate 			return;
39080Sstevel@tonic-gate 		}
39090Sstevel@tonic-gate 	}
39100Sstevel@tonic-gate 
39110Sstevel@tonic-gate 	setno = MD_MIN2SET(getminor(pb->b_edev));
39120Sstevel@tonic-gate 
39130Sstevel@tonic-gate 	/* If an ABR write has been requested, set MD_STR_ABR flag */
39140Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
39150Sstevel@tonic-gate 		flag |= MD_STR_ABR;
39160Sstevel@tonic-gate 
39170Sstevel@tonic-gate 	if (private == NULL) {
39180Sstevel@tonic-gate 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
39190Sstevel@tonic-gate 		mirror_parent_init(ps);
39200Sstevel@tonic-gate 	} else {
39210Sstevel@tonic-gate 		ps = private;
39220Sstevel@tonic-gate 		private = NULL;
39230Sstevel@tonic-gate 	}
39240Sstevel@tonic-gate 	if (flag & MD_STR_MAPPED)
39250Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_MAPPED;
39260Sstevel@tonic-gate 
39270Sstevel@tonic-gate 	if (flag & MD_STR_WOW)
39280Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_WOW;
39290Sstevel@tonic-gate 
39300Sstevel@tonic-gate 	if (flag & MD_STR_ABR)
39310Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_ABR;
39320Sstevel@tonic-gate 
39330Sstevel@tonic-gate 	if (flag & MD_STR_WMUPDATE)
39340Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_WMUPDATE;
39350Sstevel@tonic-gate 
39360Sstevel@tonic-gate 	/*
39370Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
39380Sstevel@tonic-gate 	 * in the md_save structure.
39390Sstevel@tonic-gate 	 */
39400Sstevel@tonic-gate 	ps->ps_un = un;
39410Sstevel@tonic-gate 	ps->ps_ui = ui;
39420Sstevel@tonic-gate 	ps->ps_bp = pb;
39430Sstevel@tonic-gate 	ps->ps_addr = pb->b_un.b_addr;
39440Sstevel@tonic-gate 	ps->ps_firstblk = pb->b_lblkno;
39450Sstevel@tonic-gate 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
39460Sstevel@tonic-gate 	ps->ps_changecnt = un->un_changecnt;
39470Sstevel@tonic-gate 
39480Sstevel@tonic-gate 	/*
39497975SAchim.Maurer@Sun.COM 	 * Check for suspended writes here. This is where we can defer the
39507975SAchim.Maurer@Sun.COM 	 * write request to the daemon_io queue which will then call us with
39517975SAchim.Maurer@Sun.COM 	 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
39527975SAchim.Maurer@Sun.COM 	 * the top of this routine.
39537975SAchim.Maurer@Sun.COM 	 */
39547975SAchim.Maurer@Sun.COM 	if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
39557975SAchim.Maurer@Sun.COM 		mutex_enter(&un->un_suspend_wr_mx);
39567975SAchim.Maurer@Sun.COM 		if (un->un_suspend_wr_flag) {
39577975SAchim.Maurer@Sun.COM 			ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
39587975SAchim.Maurer@Sun.COM 			mutex_exit(&un->un_suspend_wr_mx);
39597975SAchim.Maurer@Sun.COM 			md_unit_readerexit(ui);
39607975SAchim.Maurer@Sun.COM 			daemon_request(&md_mirror_daemon, daemon_io,
39617975SAchim.Maurer@Sun.COM 			    (daemon_queue_t *)ps, REQ_OLD);
39627975SAchim.Maurer@Sun.COM 			return;
39637975SAchim.Maurer@Sun.COM 		}
39647975SAchim.Maurer@Sun.COM 		mutex_exit(&un->un_suspend_wr_mx);
39657975SAchim.Maurer@Sun.COM 	}
39667975SAchim.Maurer@Sun.COM 
39677975SAchim.Maurer@Sun.COM 	/*
39680Sstevel@tonic-gate 	 * If not MN owner and this is an ABR write, make sure the current
39696901Sjkennedy 	 * resync region is in the overlaps tree
39700Sstevel@tonic-gate 	 */
39710Sstevel@tonic-gate 	mutex_enter(&un->un_owner_mx);
39720Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
39730Sstevel@tonic-gate 	    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
39740Sstevel@tonic-gate 		md_mps_t	*ps1;
39750Sstevel@tonic-gate 		/* Block the current resync region, if not already blocked */
39766901Sjkennedy 		ps1 = un->un_rs_prev_overlap;
39770Sstevel@tonic-gate 
39780Sstevel@tonic-gate 		if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
39790Sstevel@tonic-gate 		    (ps1->ps_lastblk != 0))) {
39800Sstevel@tonic-gate 			/* Drop locks to avoid deadlock */
39810Sstevel@tonic-gate 			mutex_exit(&un->un_owner_mx);
39820Sstevel@tonic-gate 			md_unit_readerexit(ui);
39830Sstevel@tonic-gate 			wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
39840Sstevel@tonic-gate 			rs_on_overlap = 1;
39850Sstevel@tonic-gate 			(void) md_unit_readerlock(ui);
39860Sstevel@tonic-gate 			mutex_enter(&un->un_owner_mx);
39870Sstevel@tonic-gate 			/*
39880Sstevel@tonic-gate 			 * Check to see if we have obtained ownership
39890Sstevel@tonic-gate 			 * while waiting for overlaps. If we have, remove
39906901Sjkennedy 			 * the resync_region entry from the overlap tree
39910Sstevel@tonic-gate 			 */
39920Sstevel@tonic-gate 			if (MD_MN_MIRROR_OWNER(un) &&
39930Sstevel@tonic-gate 			    (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
39946901Sjkennedy 				mirror_overlap_tree_remove(ps1);
39950Sstevel@tonic-gate 				rs_on_overlap = 0;
39960Sstevel@tonic-gate 			}
39970Sstevel@tonic-gate 		}
39980Sstevel@tonic-gate 	}
39990Sstevel@tonic-gate 	mutex_exit(&un->un_owner_mx);
40000Sstevel@tonic-gate 
40010Sstevel@tonic-gate 
40020Sstevel@tonic-gate 	/*
40030Sstevel@tonic-gate 	 * The following keeps a write-after-read from writing back to
40040Sstevel@tonic-gate 	 * the source in the case where all the data came from one place.
40050Sstevel@tonic-gate 	 */
40060Sstevel@tonic-gate 	if (flag & MD_STR_WAR) {
40070Sstevel@tonic-gate 		int	abort_write = 0;
40080Sstevel@tonic-gate 		/*
40090Sstevel@tonic-gate 		 * We are performing a write-after-read. This is either the
40100Sstevel@tonic-gate 		 * result of a resync read or of a read in a dirty resync
40110Sstevel@tonic-gate 		 * region when the optimized resync is not complete. If this
40120Sstevel@tonic-gate 		 * is a MN set and a resync-generated i/o, and the current
40130Sstevel@tonic-gate 		 * block is not in the current resync region, terminate the
40140Sstevel@tonic-gate 		 * write as another node must have completed this resync
40150Sstevel@tonic-gate 		 * region.
40160Sstevel@tonic-gate 		 */
40170Sstevel@tonic-gate 		if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
40180Sstevel@tonic-gate 		    !(flag & MD_STR_DIRTY_RD)) {
40190Sstevel@tonic-gate 			if (!IN_RESYNC_REGION(un, ps))
40200Sstevel@tonic-gate 				abort_write = 1;
40210Sstevel@tonic-gate 		}
40220Sstevel@tonic-gate 		if ((select_write_after_read_units(un, ps) == 0) ||
40230Sstevel@tonic-gate 		    (abort_write)) {
40240Sstevel@tonic-gate #ifdef DEBUG
40250Sstevel@tonic-gate 			if (mirror_debug_flag)
40260Sstevel@tonic-gate 				printf("Abort resync write on %x, block %lld\n",
40270Sstevel@tonic-gate 				    MD_SID(un), ps->ps_firstblk);
40280Sstevel@tonic-gate #endif
40290Sstevel@tonic-gate 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
40306901Sjkennedy 				mirror_overlap_tree_remove(ps);
40310Sstevel@tonic-gate 			kmem_cache_free(mirror_parent_cache, ps);
40320Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
40330Sstevel@tonic-gate 			md_unit_readerexit(ui);
40340Sstevel@tonic-gate 			md_biodone(pb);
40350Sstevel@tonic-gate 			return;
40360Sstevel@tonic-gate 		}
40370Sstevel@tonic-gate 	} else {
40380Sstevel@tonic-gate 		select_write_units(un, ps);
40390Sstevel@tonic-gate 
40400Sstevel@tonic-gate 		/* Drop readerlock to avoid deadlock */
40410Sstevel@tonic-gate 		md_unit_readerexit(ui);
40420Sstevel@tonic-gate 		wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
40430Sstevel@tonic-gate 		un = md_unit_readerlock(ui);
40440Sstevel@tonic-gate 		/*
40450Sstevel@tonic-gate 		 * For a MN set with an ABR write, if we are now the
40466901Sjkennedy 		 * owner and we have a resync region in the overlap
40476901Sjkennedy 		 * tree, remove the entry from overlaps and retry the write.
40480Sstevel@tonic-gate 		 */
40490Sstevel@tonic-gate 
40500Sstevel@tonic-gate 		if (MD_MNSET_SETNO(setno) &&
40510Sstevel@tonic-gate 		    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
40520Sstevel@tonic-gate 			mutex_enter(&un->un_owner_mx);
40530Sstevel@tonic-gate 			if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
40546901Sjkennedy 				mirror_overlap_tree_remove(ps);
40550Sstevel@tonic-gate 				md_kstat_waitq_exit(ui);
40560Sstevel@tonic-gate 				mutex_exit(&un->un_owner_mx);
40570Sstevel@tonic-gate 				md_unit_readerexit(ui);
40580Sstevel@tonic-gate 				daemon_request(&md_mirror_daemon, daemon_io,
40590Sstevel@tonic-gate 				    (daemon_queue_t *)ps, REQ_OLD);
40600Sstevel@tonic-gate 				return;
40610Sstevel@tonic-gate 			}
40620Sstevel@tonic-gate 			mutex_exit(&un->un_owner_mx);
40630Sstevel@tonic-gate 		}
40640Sstevel@tonic-gate 	}
40650Sstevel@tonic-gate 
40660Sstevel@tonic-gate 	/*
40678452SJohn.Wren.Kennedy@Sun.COM 	 * For Multinode mirrors with no owner and a Resync Region (not ABR)
40688452SJohn.Wren.Kennedy@Sun.COM 	 * we need to become the mirror owner before continuing with the
40698452SJohn.Wren.Kennedy@Sun.COM 	 * write(). For ABR mirrors we check that we 'own' the resync if
40708452SJohn.Wren.Kennedy@Sun.COM 	 * we're in write-after-read mode. We do this _after_ ensuring that
40718452SJohn.Wren.Kennedy@Sun.COM 	 * there are no overlaps to ensure that once we know that we are
40728452SJohn.Wren.Kennedy@Sun.COM 	 * the owner, the readerlock will not be released until the write is
40738452SJohn.Wren.Kennedy@Sun.COM 	 * complete. As a change of ownership in a MN set requires the
40748452SJohn.Wren.Kennedy@Sun.COM 	 * writerlock, this ensures that ownership cannot be changed until
40758452SJohn.Wren.Kennedy@Sun.COM 	 * the write is complete.
40760Sstevel@tonic-gate 	 */
40770Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
40780Sstevel@tonic-gate 	    (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
40798452SJohn.Wren.Kennedy@Sun.COM 		if (MD_MN_NO_MIRROR_OWNER(un))  {
40800Sstevel@tonic-gate 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
40816901Sjkennedy 				mirror_overlap_tree_remove(ps);
40820Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
40830Sstevel@tonic-gate 			ASSERT(!(flag & MD_STR_WAR));
40840Sstevel@tonic-gate 			md_unit_readerexit(ui);
40850Sstevel@tonic-gate 			daemon_request(&md_mirror_daemon, become_owner,
40860Sstevel@tonic-gate 			    (daemon_queue_t *)ps, REQ_OLD);
40870Sstevel@tonic-gate 			return;
40880Sstevel@tonic-gate 		}
40890Sstevel@tonic-gate 	}
40900Sstevel@tonic-gate 
40910Sstevel@tonic-gate 	/*
40920Sstevel@tonic-gate 	 * Mark resync region if mirror has a Resync Region _and_ we are not
40930Sstevel@tonic-gate 	 * a resync initiated write(). Don't mark region if we're flagged as
40940Sstevel@tonic-gate 	 * an ABR write.
40950Sstevel@tonic-gate 	 */
40960Sstevel@tonic-gate 	if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
40970Sstevel@tonic-gate 	    !(flag & MD_STR_WAR)) {
40980Sstevel@tonic-gate 		if (mirror_mark_resync_region(un, ps->ps_firstblk,
40998452SJohn.Wren.Kennedy@Sun.COM 		    ps->ps_lastblk, md_mn_mynode_id)) {
41000Sstevel@tonic-gate 			pb->b_flags |= B_ERROR;
41010Sstevel@tonic-gate 			pb->b_resid = pb->b_bcount;
41028452SJohn.Wren.Kennedy@Sun.COM 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
41038452SJohn.Wren.Kennedy@Sun.COM 				mirror_overlap_tree_remove(ps);
41040Sstevel@tonic-gate 			kmem_cache_free(mirror_parent_cache, ps);
41050Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
41060Sstevel@tonic-gate 			md_unit_readerexit(ui);
41070Sstevel@tonic-gate 			md_biodone(pb);
41080Sstevel@tonic-gate 			return;
41090Sstevel@tonic-gate 		}
41100Sstevel@tonic-gate 	}
41110Sstevel@tonic-gate 
41120Sstevel@tonic-gate 	ps->ps_childbflags = pb->b_flags | B_WRITE;
41130Sstevel@tonic-gate 	ps->ps_childbflags &= ~B_READ;
41140Sstevel@tonic-gate 	if (flag & MD_STR_MAPPED)
41150Sstevel@tonic-gate 		ps->ps_childbflags &= ~B_PAGEIO;
41160Sstevel@tonic-gate 
41170Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && panicstr)
41180Sstevel@tonic-gate 		/* Disable WOW and don't free ps */
41190Sstevel@tonic-gate 		ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
41200Sstevel@tonic-gate 
41210Sstevel@tonic-gate 	md_kstat_waitq_to_runq(ui);
41220Sstevel@tonic-gate 
41230Sstevel@tonic-gate 	/*
41240Sstevel@tonic-gate 	 * Treat Raw and Direct I/O as Write-on-Write always
41250Sstevel@tonic-gate 	 */
41260Sstevel@tonic-gate 
41270Sstevel@tonic-gate 	if (!(md_mirror_wow_flg & WOW_DISABLE) &&
41280Sstevel@tonic-gate 	    (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
41290Sstevel@tonic-gate 	    (pb->b_flags & B_PHYS) &&
41300Sstevel@tonic-gate 	    !(ps->ps_flags & MD_MPS_WOW)) {
41310Sstevel@tonic-gate 		if (ps->ps_flags & MD_MPS_ON_OVERLAP)
41326901Sjkennedy 			mirror_overlap_tree_remove(ps);
41330Sstevel@tonic-gate 		md_unit_readerexit(ui);
41340Sstevel@tonic-gate 		daemon_request(&md_mstr_daemon, handle_wow,
41356901Sjkennedy 		    (daemon_queue_t *)ps, REQ_OLD);
41360Sstevel@tonic-gate 		return;
41370Sstevel@tonic-gate 	}
41380Sstevel@tonic-gate 
41390Sstevel@tonic-gate 	ps->ps_frags = 1;
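	/*
	 * Fan the write out to the selected submirrors: clone the parent
	 * buf into one child per submirror. mirror_map_write() returns 1
	 * while further parallel child writes are needed; serial writes
	 * are instead chained through continue_serial.
	 */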
41400Sstevel@tonic-gate 	do {
41410Sstevel@tonic-gate 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
41420Sstevel@tonic-gate 		mirror_child_init(cs);
41430Sstevel@tonic-gate 		cb = &cs->cs_buf;
41440Sstevel@tonic-gate 		more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
41450Sstevel@tonic-gate 
41460Sstevel@tonic-gate 		/*
41470Sstevel@tonic-gate 		 * This handles the case where we're requesting
41480Sstevel@tonic-gate 		 * a write to block 0 on a label partition.  (more < 0)
41490Sstevel@tonic-gate 		 * means that the request size was smaller than the
41500Sstevel@tonic-gate 		 * size of the label.  If so this request is done.
41510Sstevel@tonic-gate 		 */
41520Sstevel@tonic-gate 		if (more < 0) {
41530Sstevel@tonic-gate 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
41546901Sjkennedy 				mirror_overlap_tree_remove(ps);
41550Sstevel@tonic-gate 			md_kstat_runq_exit(ui);
41560Sstevel@tonic-gate 			kmem_cache_free(mirror_child_cache, cs);
41570Sstevel@tonic-gate 			kmem_cache_free(mirror_parent_cache, ps);
41580Sstevel@tonic-gate 			md_unit_readerexit(ui);
41590Sstevel@tonic-gate 			md_biodone(pb);
41600Sstevel@tonic-gate 			return;
41610Sstevel@tonic-gate 		}
41620Sstevel@tonic-gate 		if (more) {
41630Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
41640Sstevel@tonic-gate 			ps->ps_frags++;
41650Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
41660Sstevel@tonic-gate 		}
41670Sstevel@tonic-gate 		md_call_strategy(cb, flag, private);
41680Sstevel@tonic-gate 	} while (more);
41690Sstevel@tonic-gate 
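	/*
	 * When panicking, interrupt-driven completion cannot be relied
	 * upon, so run the done daemon by hand until the parent i/o is
	 * marked complete.
	 */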
41700Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
41710Sstevel@tonic-gate 		while (!(ps->ps_flags & MD_MPS_DONE)) {
41720Sstevel@tonic-gate 			md_daemon(1, &md_done_daemon);
41730Sstevel@tonic-gate 			drv_usecwait(10);
41740Sstevel@tonic-gate 		}
41750Sstevel@tonic-gate 		kmem_cache_free(mirror_parent_cache, ps);
41760Sstevel@tonic-gate 	}
41770Sstevel@tonic-gate }
41780Sstevel@tonic-gate 
41790Sstevel@tonic-gate static void
41800Sstevel@tonic-gate mirror_read_strategy(buf_t *pb, int flag, void *private)
41810Sstevel@tonic-gate {
41820Sstevel@tonic-gate 	md_mps_t	*ps;
41830Sstevel@tonic-gate 	md_mcs_t	*cs;
41840Sstevel@tonic-gate 	size_t		more;
41850Sstevel@tonic-gate 	mm_unit_t	*un;
41860Sstevel@tonic-gate 	mdi_unit_t	*ui;
41870Sstevel@tonic-gate 	size_t		current_count;
41880Sstevel@tonic-gate 	diskaddr_t	current_blkno;
41890Sstevel@tonic-gate 	off_t		current_offset;
41900Sstevel@tonic-gate 	buf_t		*cb;		/* child buf pointer */
41910Sstevel@tonic-gate 	set_t		setno;
41920Sstevel@tonic-gate 
41930Sstevel@tonic-gate 	ui = MDI_UNIT(getminor(pb->b_edev));
41940Sstevel@tonic-gate 
41950Sstevel@tonic-gate 	md_kstat_waitq_enter(ui);
41960Sstevel@tonic-gate 
41970Sstevel@tonic-gate 	un = (mm_unit_t *)md_unit_readerlock(ui);
41980Sstevel@tonic-gate 
41990Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP)) {
42000Sstevel@tonic-gate 		if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
42010Sstevel@tonic-gate 			md_kstat_waitq_exit(ui);
42020Sstevel@tonic-gate 			return;
42030Sstevel@tonic-gate 		}
42040Sstevel@tonic-gate 	}
42050Sstevel@tonic-gate 
42060Sstevel@tonic-gate 	if (private == NULL) {
42070Sstevel@tonic-gate 		ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
42080Sstevel@tonic-gate 		mirror_parent_init(ps);
42090Sstevel@tonic-gate 	} else {
42100Sstevel@tonic-gate 		ps = private;
42110Sstevel@tonic-gate 		private = NULL;
42120Sstevel@tonic-gate 	}
42130Sstevel@tonic-gate 
42140Sstevel@tonic-gate 	if (flag & MD_STR_MAPPED)
42150Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_MAPPED;
42160Sstevel@tonic-gate 	if (flag & MD_NOBLOCK)
42170Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_NOBLOCK;
42180Sstevel@tonic-gate 	if (flag & MD_STR_WMUPDATE)
42190Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_WMUPDATE;
42200Sstevel@tonic-gate 
42210Sstevel@tonic-gate 	/*
42220Sstevel@tonic-gate 	 * Check to see if this is a DMR driven read. If so we need to use the
42230Sstevel@tonic-gate 	 * specified side (in un->un_dmr_last_read) for the source of the data.
42240Sstevel@tonic-gate 	 */
42250Sstevel@tonic-gate 	if (flag & MD_STR_DMR)
42260Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_DMR;
42270Sstevel@tonic-gate 
42280Sstevel@tonic-gate 	/*
42290Sstevel@tonic-gate 	 * Save essential information from the original buffhdr
42300Sstevel@tonic-gate 	 * in the md_save structure.
42310Sstevel@tonic-gate 	 */
42320Sstevel@tonic-gate 	ps->ps_un = un;
42330Sstevel@tonic-gate 	ps->ps_ui = ui;
42340Sstevel@tonic-gate 	ps->ps_bp = pb;
42350Sstevel@tonic-gate 	ps->ps_addr = pb->b_un.b_addr;
42360Sstevel@tonic-gate 	ps->ps_firstblk = pb->b_lblkno;
42370Sstevel@tonic-gate 	ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
42380Sstevel@tonic-gate 	ps->ps_changecnt = un->un_changecnt;
42390Sstevel@tonic-gate 
42400Sstevel@tonic-gate 	current_count = btodb(pb->b_bcount);
42410Sstevel@tonic-gate 	current_blkno = pb->b_lblkno;
42420Sstevel@tonic-gate 	current_offset = 0;
42430Sstevel@tonic-gate 
42440Sstevel@tonic-gate 	/*
42450Sstevel@tonic-gate 	 * If flag has MD_STR_WAR set this means that the read is issued by a
42460Sstevel@tonic-gate 	 * resync thread which may or may not be an optimised resync.
42470Sstevel@tonic-gate 	 *
42480Sstevel@tonic-gate 	 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
42490Sstevel@tonic-gate 	 * code has not completed; either a resync has not started since snarf,
42500Sstevel@tonic-gate 	 * or there is an optimized resync in progress.
42510Sstevel@tonic-gate 	 *
42520Sstevel@tonic-gate 	 * We need to generate a write after this read in the following two
42530Sstevel@tonic-gate 	 * cases,
42540Sstevel@tonic-gate 	 *
42550Sstevel@tonic-gate 	 * 1. Any Resync-Generated read
42560Sstevel@tonic-gate 	 *
42570Sstevel@tonic-gate 	 * 2. Any read to a DIRTY REGION if there is an optimized resync
42580Sstevel@tonic-gate 	 *    pending or in progress.
42590Sstevel@tonic-gate 	 *
42600Sstevel@tonic-gate 	 * The write after read is done in these cases to ensure that all sides
42610Sstevel@tonic-gate 	 * of the mirror are in sync with the read data and that it is not
42620Sstevel@tonic-gate 	 * possible for an application to read the same block multiple times
42630Sstevel@tonic-gate 	 * and get different data.
42640Sstevel@tonic-gate 	 *
42650Sstevel@tonic-gate 	 * This would be possible if the block was in a dirty region.
42660Sstevel@tonic-gate 	 *
42670Sstevel@tonic-gate 	 * If we're performing a directed read we don't write the data out as
42680Sstevel@tonic-gate 	 * the application is responsible for restoring the mirror to a known
42690Sstevel@tonic-gate 	 * state.
42700Sstevel@tonic-gate 	 */
42710Sstevel@tonic-gate 	if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
42720Sstevel@tonic-gate 	    !(flag & MD_STR_DMR)) {
42730Sstevel@tonic-gate 		size_t	start_rr, i, end_rr;
42740Sstevel@tonic-gate 		int	region_dirty = 1;
42750Sstevel@tonic-gate 
42760Sstevel@tonic-gate 		/*
42770Sstevel@tonic-gate 		 * We enter here under three circumstances,
42780Sstevel@tonic-gate 		 *
42790Sstevel@tonic-gate 		 * MD_UN_OPT_NOT_DONE	MD_STR_WAR
42800Sstevel@tonic-gate 		 * 0			1
42810Sstevel@tonic-gate 		 * 1			0
42820Sstevel@tonic-gate 		 * 1			1
42830Sstevel@tonic-gate 		 *
42840Sstevel@tonic-gate 		 * To be optimal we only care to explicitly check for dirty
42850Sstevel@tonic-gate 		 * regions in the second case since if MD_STR_WAR is set we
42860Sstevel@tonic-gate 		 * always do the write after read.
42870Sstevel@tonic-gate 		 */
42880Sstevel@tonic-gate 		if (!(flag & MD_STR_WAR)) {
42890Sstevel@tonic-gate 			BLK_TO_RR(end_rr, ps->ps_lastblk, un);
42900Sstevel@tonic-gate 			BLK_TO_RR(start_rr, ps->ps_firstblk, un);
42910Sstevel@tonic-gate 
42920Sstevel@tonic-gate 			for (i = start_rr; i <= end_rr; i++)
42930Sstevel@tonic-gate 				if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
42940Sstevel@tonic-gate 					break;
42950Sstevel@tonic-gate 		}
42960Sstevel@tonic-gate 
42970Sstevel@tonic-gate 		if ((region_dirty) &&
42980Sstevel@tonic-gate 		    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
42990Sstevel@tonic-gate 			ps->ps_call = write_after_read;
43000Sstevel@tonic-gate 			/*
43010Sstevel@tonic-gate 			 * Mark this as a RESYNC_READ in ps_flags.
43020Sstevel@tonic-gate 			 * This is used if the read fails during a
43030Sstevel@tonic-gate 			 * resync of a 3-way mirror to ensure that
43040Sstevel@tonic-gate 			 * the retried read to the remaining
43050Sstevel@tonic-gate 			 * good submirror has MD_STR_WAR set. This
43060Sstevel@tonic-gate 			 * is needed to ensure that the resync write
43070Sstevel@tonic-gate 			 * (write-after-read) takes place.
43080Sstevel@tonic-gate 			 */
43090Sstevel@tonic-gate 			ps->ps_flags |= MD_MPS_RESYNC_READ;
43100Sstevel@tonic-gate 
43110Sstevel@tonic-gate 			/*
43120Sstevel@tonic-gate 			 * If MD_STR_FLAG_ERR is set in the flags we
43130Sstevel@tonic-gate 			 * set MD_MPS_FLAG_ERROR so that an error on the resync
43140Sstevel@tonic-gate 			 * write (issued by write_after_read) will be flagged
43150Sstevel@tonic-gate 			 * to the biowait'ing resync thread. This allows us to
43160Sstevel@tonic-gate 			 * avoid issuing further resync requests to a device
43170Sstevel@tonic-gate 			 * that has had a write failure.
43180Sstevel@tonic-gate 			 */
43190Sstevel@tonic-gate 			if (flag & MD_STR_FLAG_ERR)
43200Sstevel@tonic-gate 				ps->ps_flags |= MD_MPS_FLAG_ERROR;
43210Sstevel@tonic-gate 
43220Sstevel@tonic-gate 			setno = MD_UN2SET(un);
43230Sstevel@tonic-gate 			/*
43240Sstevel@tonic-gate 			 * Drop the readerlock to avoid
43250Sstevel@tonic-gate 			 * deadlock
43260Sstevel@tonic-gate 			 */
43270Sstevel@tonic-gate 			md_unit_readerexit(ui);
43280Sstevel@tonic-gate 			wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
43290Sstevel@tonic-gate 			un = md_unit_readerlock(ui);
43300Sstevel@tonic-gate 			/*
43310Sstevel@tonic-gate 			 * Ensure that we are owner
43320Sstevel@tonic-gate 			 */
43330Sstevel@tonic-gate 			if (MD_MNSET_SETNO(setno)) {
43340Sstevel@tonic-gate 				/*
43350Sstevel@tonic-gate 				 * For a non-resync read that requires a
43360Sstevel@tonic-gate 				 * write-after-read to be done, set a flag
43370Sstevel@tonic-gate 				 * in the parent structure, so that the
43380Sstevel@tonic-gate 				 * write_strategy routine can omit the
43390Sstevel@tonic-gate 				 * test that the write is still within the
43400Sstevel@tonic-gate 				 * resync region
43410Sstevel@tonic-gate 				 */
43420Sstevel@tonic-gate 				if (!(flag & MD_STR_WAR))
43430Sstevel@tonic-gate 					ps->ps_flags |= MD_MPS_DIRTY_RD;
43440Sstevel@tonic-gate 
43450Sstevel@tonic-gate 				/*
43460Sstevel@tonic-gate 				 * Before reading the buffer, see if
43478452SJohn.Wren.Kennedy@Sun.COM 				 * there is an owner.
43480Sstevel@tonic-gate 				 */
43498452SJohn.Wren.Kennedy@Sun.COM 				if (MD_MN_NO_MIRROR_OWNER(un))  {
43500Sstevel@tonic-gate 					ps->ps_call = NULL;
43516901Sjkennedy 					mirror_overlap_tree_remove(ps);
43520Sstevel@tonic-gate 					md_kstat_waitq_exit(ui);
43530Sstevel@tonic-gate 					md_unit_readerexit(ui);
43540Sstevel@tonic-gate 					daemon_request(
43550Sstevel@tonic-gate 					    &md_mirror_daemon,
43560Sstevel@tonic-gate 					    become_owner,
43570Sstevel@tonic-gate 					    (daemon_queue_t *)ps,
43580Sstevel@tonic-gate 					    REQ_OLD);
43590Sstevel@tonic-gate 					return;
43600Sstevel@tonic-gate 				}
43610Sstevel@tonic-gate 				/*
43620Sstevel@tonic-gate 				 * For a resync read, check to see if I/O is
43630Sstevel@tonic-gate 				 * outside of the current resync region, or
43640Sstevel@tonic-gate 				 * the resync has finished. If so
43650Sstevel@tonic-gate 				 * just terminate the I/O
43660Sstevel@tonic-gate 				 */
43670Sstevel@tonic-gate 				if ((flag & MD_STR_WAR) &&
43680Sstevel@tonic-gate 				    (!(un->c.un_status & MD_UN_WAR) ||
43690Sstevel@tonic-gate 				    (!IN_RESYNC_REGION(un, ps)))) {
43700Sstevel@tonic-gate #ifdef DEBUG
43710Sstevel@tonic-gate 					if (mirror_debug_flag)
43720Sstevel@tonic-gate 						printf("Abort resync read "
43730Sstevel@tonic-gate 						    "%x: %lld\n",
43740Sstevel@tonic-gate 						    MD_SID(un),
43750Sstevel@tonic-gate 						    ps->ps_firstblk);
43760Sstevel@tonic-gate #endif
43776901Sjkennedy 					mirror_overlap_tree_remove(ps);
43780Sstevel@tonic-gate 					kmem_cache_free(mirror_parent_cache,
43790Sstevel@tonic-gate 					    ps);
43800Sstevel@tonic-gate 					md_kstat_waitq_exit(ui);
43810Sstevel@tonic-gate 					md_unit_readerexit(ui);
43820Sstevel@tonic-gate 					md_biodone(pb);
43830Sstevel@tonic-gate 					return;
43840Sstevel@tonic-gate 				}
43850Sstevel@tonic-gate 			}
43860Sstevel@tonic-gate 		}
43870Sstevel@tonic-gate 	}
43880Sstevel@tonic-gate 
43890Sstevel@tonic-gate 	if (flag & MD_STR_DMR) {
43900Sstevel@tonic-gate 		ps->ps_call = directed_read_done;
43910Sstevel@tonic-gate 	}
43920Sstevel@tonic-gate 
43930Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && panicstr)
43940Sstevel@tonic-gate 		ps->ps_flags |= MD_MPS_DONTFREE;
43950Sstevel@tonic-gate 
43960Sstevel@tonic-gate 	md_kstat_waitq_to_runq(ui);
43970Sstevel@tonic-gate 
43980Sstevel@tonic-gate 	ps->ps_frags++;
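	/*
	 * Split the read into fragments: each pass clones the parent buf
	 * for the portion one submirror can satisfy. mirror_map_read()
	 * returns the number of blocks mapped when only part of the
	 * request could be serviced, 0 when this fragment completes it.
	 */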
43990Sstevel@tonic-gate 	do {
44000Sstevel@tonic-gate 		cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
44010Sstevel@tonic-gate 		mirror_child_init(cs);
44020Sstevel@tonic-gate 		cb = &cs->cs_buf;
44030Sstevel@tonic-gate 		cs->cs_ps = ps;
44040Sstevel@tonic-gate 
44050Sstevel@tonic-gate 		cb = md_bioclone(pb, current_offset, current_count, NODEV,
44060Sstevel@tonic-gate 		    current_blkno, mirror_done, cb, KM_NOSLEEP);
44070Sstevel@tonic-gate 
44080Sstevel@tonic-gate 		more = mirror_map_read(ps, cs, current_blkno,
44096901Sjkennedy 		    (u_longlong_t)current_count);
44100Sstevel@tonic-gate 		if (more) {
44110Sstevel@tonic-gate 			mutex_enter(&ps->ps_mx);
44120Sstevel@tonic-gate 			ps->ps_frags++;
44130Sstevel@tonic-gate 			mutex_exit(&ps->ps_mx);
44140Sstevel@tonic-gate 		}
44150Sstevel@tonic-gate 
44160Sstevel@tonic-gate 		/*
44170Sstevel@tonic-gate 		 * Do these calculations now,
44180Sstevel@tonic-gate 		 *  so that we pickup a valid b_bcount from the chld_bp.
44190Sstevel@tonic-gate 		 */
44200Sstevel@tonic-gate 		current_count -= more;
44210Sstevel@tonic-gate 		current_offset += cb->b_bcount;
44220Sstevel@tonic-gate 		current_blkno +=  more;
44230Sstevel@tonic-gate 		md_call_strategy(cb, flag, private);
44240Sstevel@tonic-gate 	} while (more);
44250Sstevel@tonic-gate 
44260Sstevel@tonic-gate 	if (!(flag & MD_STR_NOTTOP) && panicstr) {
44270Sstevel@tonic-gate 		while (!(ps->ps_flags & MD_MPS_DONE)) {
44280Sstevel@tonic-gate 			md_daemon(1, &md_done_daemon);
44290Sstevel@tonic-gate 			drv_usecwait(10);
44300Sstevel@tonic-gate 		}
44310Sstevel@tonic-gate 		kmem_cache_free(mirror_parent_cache, ps);
44320Sstevel@tonic-gate 	}
44330Sstevel@tonic-gate }
44340Sstevel@tonic-gate 
44350Sstevel@tonic-gate void
44360Sstevel@tonic-gate md_mirror_strategy(buf_t *bp, int flag, void *private)
44370Sstevel@tonic-gate {
44380Sstevel@tonic-gate 	set_t	setno = MD_MIN2SET(getminor(bp->b_edev));
44390Sstevel@tonic-gate 
44400Sstevel@tonic-gate 	/*
44410Sstevel@tonic-gate 	 * When doing IO to a multi-owner metadevice, check if the set is halted.
44420Sstevel@tonic-gate 	 * We do this check without the needed lock held, for performance
44430Sstevel@tonic-gate 	 * reasons.
44440Sstevel@tonic-gate 	 * If an IO just slips through while the set is locked via an
44450Sstevel@tonic-gate 	 * MD_MN_SUSPEND_SET, we don't care about it.
44460Sstevel@tonic-gate 	 * Only check for suspension if we are a top-level i/o request
44470Sstevel@tonic-gate 	 * (MD_STR_NOTTOP is cleared in 'flag').
44480Sstevel@tonic-gate 	 */
44490Sstevel@tonic-gate 	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
44500Sstevel@tonic-gate 	    (MD_SET_HALTED | MD_SET_MNSET)) {
44510Sstevel@tonic-gate 		if ((flag & MD_STR_NOTTOP) == 0) {
44520Sstevel@tonic-gate 			mutex_enter(&md_mx);
44530Sstevel@tonic-gate 			/* Here we loop until the set is no longer halted */
44540Sstevel@tonic-gate 			while (md_set[setno].s_status & MD_SET_HALTED) {
44550Sstevel@tonic-gate 				cv_wait(&md_cv, &md_mx);
44560Sstevel@tonic-gate 			}
44570Sstevel@tonic-gate 			mutex_exit(&md_mx);
44580Sstevel@tonic-gate 		}
44590Sstevel@tonic-gate 	}
44600Sstevel@tonic-gate 
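	/*
	 * Account this i/o against the set unless the caller has already
	 * counted it (MD_IO_COUNTED). Blocking callers fail the buf with
	 * ENXIO if the count cannot be taken; MD_NOBLOCK callers use the
	 * non-blocking variant.
	 */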
44610Sstevel@tonic-gate 	if ((flag & MD_IO_COUNTED) == 0) {
44620Sstevel@tonic-gate 		if ((flag & MD_NOBLOCK) == 0) {
44630Sstevel@tonic-gate 			if (md_inc_iocount(setno) != 0) {
44640Sstevel@tonic-gate 				bp->b_flags |= B_ERROR;
44650Sstevel@tonic-gate 				bp->b_error = ENXIO;
44660Sstevel@tonic-gate 				bp->b_resid = bp->b_bcount;
44670Sstevel@tonic-gate 				biodone(bp);
44680Sstevel@tonic-gate 				return;
44690Sstevel@tonic-gate 			}
44700Sstevel@tonic-gate 		} else {
44710Sstevel@tonic-gate 			md_inc_iocount_noblock(setno);
44720Sstevel@tonic-gate 		}
44730Sstevel@tonic-gate 	}
44740Sstevel@tonic-gate 
44750Sstevel@tonic-gate 	if (bp->b_flags & B_READ)
44760Sstevel@tonic-gate 		mirror_read_strategy(bp, flag, private);
44770Sstevel@tonic-gate 	else
44780Sstevel@tonic-gate 		mirror_write_strategy(bp, flag, private);
44790Sstevel@tonic-gate }
44800Sstevel@tonic-gate 
44810Sstevel@tonic-gate /*
44820Sstevel@tonic-gate  * mirror_directed_read:
44830Sstevel@tonic-gate  * --------------------
44840Sstevel@tonic-gate  * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified submirror
44850Sstevel@tonic-gate  * so that the application can determine what (if any) resync needs to be
44860Sstevel@tonic-gate  * performed. The data is copied out to the user-supplied buffer.
44870Sstevel@tonic-gate  *
44880Sstevel@tonic-gate  * Parameters:
44890Sstevel@tonic-gate  *	mdev	- dev_t for the mirror device
44900Sstevel@tonic-gate  *	vdr	- directed read parameters specifying location and submirror
44910Sstevel@tonic-gate  *		  to perform the read from
44920Sstevel@tonic-gate  *	mode	- used to ddi_copyout() any resulting data from the read
44930Sstevel@tonic-gate  *
44940Sstevel@tonic-gate  * Returns:
44950Sstevel@tonic-gate  *	0	success
44960Sstevel@tonic-gate  *	!0	error code
44970Sstevel@tonic-gate  *		EINVAL - invalid request format
44980Sstevel@tonic-gate  */
44990Sstevel@tonic-gate int
45000Sstevel@tonic-gate mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
45010Sstevel@tonic-gate {
45020Sstevel@tonic-gate 	buf_t		*bp;
45030Sstevel@tonic-gate 	minor_t		mnum = getminor(mdev);
45040Sstevel@tonic-gate 	mdi_unit_t	*ui = MDI_UNIT(mnum);
45050Sstevel@tonic-gate 	mm_unit_t	*un;
45060Sstevel@tonic-gate 	mm_submirror_t	*sm;
45070Sstevel@tonic-gate 	char		*sm_nm;
45080Sstevel@tonic-gate 	uint_t		next_side;
45090Sstevel@tonic-gate 	void		*kbuffer;
45100Sstevel@tonic-gate 
45110Sstevel@tonic-gate 	if (ui == NULL)
45120Sstevel@tonic-gate 		return (ENXIO);
45130Sstevel@tonic-gate 
45140Sstevel@tonic-gate 	if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
45150Sstevel@tonic-gate 		return (EINVAL);
45160Sstevel@tonic-gate 	}
45170Sstevel@tonic-gate 
45180Sstevel@tonic-gate 	/* Check for aligned block access. We disallow non-aligned requests. */
45190Sstevel@tonic-gate 	if (vdr->vdr_offset % DEV_BSIZE) {
45200Sstevel@tonic-gate 		return (EINVAL);
45210Sstevel@tonic-gate 	}
45220Sstevel@tonic-gate 
45230Sstevel@tonic-gate 	/*
45240Sstevel@tonic-gate 	 * Allocate kernel buffer for target of read(). If we had a reliable
45250Sstevel@tonic-gate 	 * (sorry, functional) DDI this wouldn't be needed.
45260Sstevel@tonic-gate 	 */
45270Sstevel@tonic-gate 	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
45280Sstevel@tonic-gate 	if (kbuffer == NULL) {
45290Sstevel@tonic-gate 		cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
45300Sstevel@tonic-gate 		    " bytes\n", vdr->vdr_nbytes);
45310Sstevel@tonic-gate 		return (ENOMEM);
45320Sstevel@tonic-gate 	}
45330Sstevel@tonic-gate 
45340Sstevel@tonic-gate 	bp = getrbuf(KM_SLEEP);
45350Sstevel@tonic-gate 
45360Sstevel@tonic-gate 	bp->b_un.b_addr = kbuffer;
45370Sstevel@tonic-gate 	bp->b_flags = B_READ;
45380Sstevel@tonic-gate 	bp->b_bcount = vdr->vdr_nbytes;
45390Sstevel@tonic-gate 	bp->b_lblkno = lbtodb(vdr->vdr_offset);
45400Sstevel@tonic-gate 	bp->b_edev = mdev;
45410Sstevel@tonic-gate 
45420Sstevel@tonic-gate 	un = md_unit_readerlock(ui);
45430Sstevel@tonic-gate 
45440Sstevel@tonic-gate 	/*
45450Sstevel@tonic-gate 	 * If DKV_SIDE_INIT is set we need to determine the first available
45460Sstevel@tonic-gate 	 * side to start reading from. If it isn't set we increment to the
45470Sstevel@tonic-gate 	 * next readable submirror.
45480Sstevel@tonic-gate 	 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
45490Sstevel@tonic-gate 	 * Note: we check for a readable submirror on completion of the i/o so
45500Sstevel@tonic-gate 	 * we should _always_ have one available. If this becomes unavailable
45510Sstevel@tonic-gate 	 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
45520Sstevel@tonic-gate 	 * a metadetach is made between the completion of one DKIOCDMR ioctl
45530Sstevel@tonic-gate 	 * and the start of the next (i.e. a sys-admin 'accident' occurred).
45540Sstevel@tonic-gate 	 * The chance of this is small, but not non-existent.
45550Sstevel@tonic-gate 	 */
45560Sstevel@tonic-gate 	if (vdr->vdr_side == DKV_SIDE_INIT) {
45570Sstevel@tonic-gate 		next_side = 0;
45580Sstevel@tonic-gate 	} else {
45590Sstevel@tonic-gate 		next_side = vdr->vdr_side + 1;
45600Sstevel@tonic-gate 	}
45610Sstevel@tonic-gate 	while ((next_side < NMIRROR) &&
45620Sstevel@tonic-gate 	    !SUBMIRROR_IS_READABLE(un, next_side))
45630Sstevel@tonic-gate 		next_side++;
45640Sstevel@tonic-gate 	if (next_side >= NMIRROR) {
45650Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
45660Sstevel@tonic-gate 		freerbuf(bp);
45670Sstevel@tonic-gate 		vdr->vdr_bytesread = 0;
45680Sstevel@tonic-gate 		md_unit_readerexit(ui);
45690Sstevel@tonic-gate 		return (0);
45700Sstevel@tonic-gate 	}
45710Sstevel@tonic-gate 
45720Sstevel@tonic-gate 	/* Set the side to read from */
45730Sstevel@tonic-gate 	un->un_dmr_last_read = next_side;
45740Sstevel@tonic-gate 
45750Sstevel@tonic-gate 	md_unit_readerexit(ui);
45760Sstevel@tonic-gate 
45770Sstevel@tonic-gate 	/*
45780Sstevel@tonic-gate 	 * Save timestamp for verification purposes. Can be read by debugger
45790Sstevel@tonic-gate 	 * to verify that this ioctl has been executed and to find the number
45800Sstevel@tonic-gate 	 * of DMR reads and the time of the last DMR read.
45810Sstevel@tonic-gate 	 */
45820Sstevel@tonic-gate 	uniqtime(&mirror_dmr_stats.dmr_timestamp);
45830Sstevel@tonic-gate 	mirror_dmr_stats.dmr_count++;
45840Sstevel@tonic-gate 
45850Sstevel@tonic-gate 	/* Issue READ request and wait for completion */
45860Sstevel@tonic-gate 	mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
45870Sstevel@tonic-gate 
45880Sstevel@tonic-gate 	mutex_enter(&un->un_dmr_mx);
45890Sstevel@tonic-gate 	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
45900Sstevel@tonic-gate 	mutex_exit(&un->un_dmr_mx);
45910Sstevel@tonic-gate 
45920Sstevel@tonic-gate 	/*
45930Sstevel@tonic-gate 	 * Check to see if we encountered an error during the read. If so we
45940Sstevel@tonic-gate 	 * can make no guarantee about any possibly returned data.
45950Sstevel@tonic-gate 	 */
45960Sstevel@tonic-gate 	if ((bp->b_flags & B_ERROR) == 0) {
45970Sstevel@tonic-gate 		vdr->vdr_flags &= ~DKV_DMR_ERROR;
45980Sstevel@tonic-gate 		if (bp->b_resid) {
45990Sstevel@tonic-gate 			vdr->vdr_flags |= DKV_DMR_SHORT;
46000Sstevel@tonic-gate 			vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
46010Sstevel@tonic-gate 		} else {
46020Sstevel@tonic-gate 			vdr->vdr_flags |= DKV_DMR_SUCCESS;
46030Sstevel@tonic-gate 			vdr->vdr_bytesread = vdr->vdr_nbytes;
46040Sstevel@tonic-gate 		}
46050Sstevel@tonic-gate 		/* Copy the data read back out to the user supplied buffer */
46060Sstevel@tonic-gate 		if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
46070Sstevel@tonic-gate 		    mode)) {
46080Sstevel@tonic-gate 			kmem_free(kbuffer, vdr->vdr_nbytes);
			freerbuf(bp);	/* release the buf as the other exits do */
46090Sstevel@tonic-gate 			return (EFAULT);
46100Sstevel@tonic-gate 		}
46110Sstevel@tonic-gate 
46120Sstevel@tonic-gate 	} else {
46130Sstevel@tonic-gate 		/* Error out with DKV_DMR_ERROR */
46140Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_ERROR;
46150Sstevel@tonic-gate 		vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
46160Sstevel@tonic-gate 	}
46170Sstevel@tonic-gate 	/*
46180Sstevel@tonic-gate 	 * Update the DMR parameters with the side and name of submirror that
46190Sstevel@tonic-gate 	 * we have just read from (un->un_dmr_last_read)
46200Sstevel@tonic-gate 	 */
46210Sstevel@tonic-gate 	un = md_unit_readerlock(ui);
46220Sstevel@tonic-gate 
46230Sstevel@tonic-gate 	vdr->vdr_side = un->un_dmr_last_read;
46240Sstevel@tonic-gate 	sm = &un->un_sm[un->un_dmr_last_read];
46250Sstevel@tonic-gate 	sm_nm = md_shortname(md_getminor(sm->sm_dev));
46260Sstevel@tonic-gate 
46271623Stw21770 	(void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
46280Sstevel@tonic-gate 
46290Sstevel@tonic-gate 	/*
46300Sstevel@tonic-gate 	 * Determine if we've completed the read cycle. This is true iff the
46310Sstevel@tonic-gate 	 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
46320Sstevel@tonic-gate 	 * use un_nsm as we need to handle a sparse array of submirrors (which
46330Sstevel@tonic-gate 	 * can occur if a submirror is metadetached).
46340Sstevel@tonic-gate 	 */
46350Sstevel@tonic-gate 	next_side = un->un_dmr_last_read + 1;
46360Sstevel@tonic-gate 	while ((next_side < NMIRROR) &&
46370Sstevel@tonic-gate 	    !SUBMIRROR_IS_READABLE(un, next_side))
46380Sstevel@tonic-gate 		next_side++;
46390Sstevel@tonic-gate 	if (next_side >= NMIRROR) {
46400Sstevel@tonic-gate 		/* We've finished */
46410Sstevel@tonic-gate 		vdr->vdr_flags |= DKV_DMR_DONE;
46420Sstevel@tonic-gate 	}
46430Sstevel@tonic-gate 
46440Sstevel@tonic-gate 	md_unit_readerexit(ui);
46450Sstevel@tonic-gate 	freerbuf(bp);
46460Sstevel@tonic-gate 	kmem_free(kbuffer, vdr->vdr_nbytes);
46470Sstevel@tonic-gate 
46480Sstevel@tonic-gate 	return (0);
46490Sstevel@tonic-gate }
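
/*
 * A compiled-out, user-level sketch of driving the DKIOCDMR ioctl
 * serviced above: start with vdr_side = DKV_SIDE_INIT and re-issue the
 * ioctl until DKV_DMR_DONE is returned; each call reads the same range
 * from the next readable submirror and reports the side just read in
 * vdr_side/vdr_side_name. 'fd', 'buf' and the sizes are hypothetical,
 * and the declarations are assumed to come from <sys/dkio.h>.
 */
#if 0
static int
example_read_all_sides(int fd, off_t offset, char *buf, size_t nbytes)
{
	vol_directed_rd_t	vdr;

	(void) memset(&vdr, 0, sizeof (vdr));
	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
	vdr.vdr_side = DKV_SIDE_INIT;
	vdr.vdr_offset = offset;	/* must be DEV_BSIZE aligned */
	vdr.vdr_nbytes = nbytes;
	vdr.vdr_data = buf;

	do {
		/* Clear status bits left over from the previous call */
		vdr.vdr_flags &=
		    ~(DKV_DMR_SUCCESS | DKV_DMR_SHORT | DKV_DMR_ERROR);
		if (ioctl(fd, DKIOCDMR, &vdr) != 0 ||
		    (vdr.vdr_flags & DKV_DMR_ERROR))
			return (-1);
		/* compare buf against the previously read sides here */
	} while (!(vdr.vdr_flags & DKV_DMR_DONE));

	return (0);
}
#endif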
46500Sstevel@tonic-gate 
46510Sstevel@tonic-gate /*
46520Sstevel@tonic-gate  * mirror_resync_message:
46530Sstevel@tonic-gate  * ---------------------
46540Sstevel@tonic-gate  * Handle the multi-node resync messages that keep all nodes within a given
46550Sstevel@tonic-gate  * disk-set in sync with their view of a mirror's resync status.
46560Sstevel@tonic-gate  *
46570Sstevel@tonic-gate  * The message types dealt with are:
46580Sstevel@tonic-gate  * MD_MN_MSG_RESYNC_STARTING	- start a resync thread for a unit
46590Sstevel@tonic-gate  * MD_MN_MSG_RESYNC_NEXT	- specifies the next region to be resynced
46600Sstevel@tonic-gate  * MD_MN_MSG_RESYNC_FINISH	- stop the resync thread for a unit
46610Sstevel@tonic-gate  * MD_MN_MSG_RESYNC_PHASE_DONE	- end of a resync phase (optimized, submirror or component)
46620Sstevel@tonic-gate  *
46630Sstevel@tonic-gate  * Returns:
46640Sstevel@tonic-gate  *	0	Success
46650Sstevel@tonic-gate  *	>0	Failure error number
46660Sstevel@tonic-gate  */
46670Sstevel@tonic-gate int
46680Sstevel@tonic-gate mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
46690Sstevel@tonic-gate {
46700Sstevel@tonic-gate 	mdi_unit_t		*ui;
46710Sstevel@tonic-gate 	mm_unit_t		*un;
46720Sstevel@tonic-gate 	set_t			setno;
46730Sstevel@tonic-gate 	int			is_ABR;
46740Sstevel@tonic-gate 	int			smi;
46750Sstevel@tonic-gate 	int			ci;
46760Sstevel@tonic-gate 	sm_state_t		state;
46770Sstevel@tonic-gate 	int			broke_out;
46780Sstevel@tonic-gate 	mm_submirror_t		*sm;
46790Sstevel@tonic-gate 	mm_submirror_ic_t	*smic;
46800Sstevel@tonic-gate 	md_m_shared_t		*shared;
46810Sstevel@tonic-gate 	md_error_t		mde = mdnullerror;
46820Sstevel@tonic-gate 	md_mps_t		*ps;
46830Sstevel@tonic-gate 	int			rs_active;
46848452SJohn.Wren.Kennedy@Sun.COM 	int			rr, rr_start, rr_end;
46850Sstevel@tonic-gate 
46860Sstevel@tonic-gate 	/* Check that the given device is part of a multi-node set */
46870Sstevel@tonic-gate 	setno = MD_MIN2SET(p->mnum);
46880Sstevel@tonic-gate 	if (setno >= md_nsets) {
46890Sstevel@tonic-gate 		return (ENXIO);
46900Sstevel@tonic-gate 	}
46910Sstevel@tonic-gate 	if (!MD_MNSET_SETNO(setno)) {
46920Sstevel@tonic-gate 		return (EINVAL);
46930Sstevel@tonic-gate 	}
46940Sstevel@tonic-gate 
46950Sstevel@tonic-gate 	if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
46960Sstevel@tonic-gate 		return (EINVAL);
46970Sstevel@tonic-gate 	if ((ui = MDI_UNIT(p->mnum)) == NULL)
46980Sstevel@tonic-gate 		return (EINVAL);
46990Sstevel@tonic-gate 	is_ABR = (ui->ui_tstate & MD_ABR_CAP);
47000Sstevel@tonic-gate 
47010Sstevel@tonic-gate 	/* Obtain the current resync status */
47020Sstevel@tonic-gate 	(void) md_ioctl_readerlock(lockp, ui);
47030Sstevel@tonic-gate 	rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
47040Sstevel@tonic-gate 	md_ioctl_readerexit(lockp);
47050Sstevel@tonic-gate 
47060Sstevel@tonic-gate 	switch ((md_mn_msgtype_t)p->msg_type) {
47070Sstevel@tonic-gate 	case MD_MN_MSG_RESYNC_STARTING:
47080Sstevel@tonic-gate 		/* Start the resync thread for the mirror */
47090Sstevel@tonic-gate 		(void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
47100Sstevel@tonic-gate 		break;
47110Sstevel@tonic-gate 
47120Sstevel@tonic-gate 	case MD_MN_MSG_RESYNC_NEXT:
47130Sstevel@tonic-gate 		/*
47140Sstevel@tonic-gate 		 * We have to release any previously marked overlap regions
47150Sstevel@tonic-gate 		 * so that i/o can resume. Then we need to block the region
47160Sstevel@tonic-gate 		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
47170Sstevel@tonic-gate 		 * Update un_rs_resync_done and un_rs_resync_2_do.
47180Sstevel@tonic-gate 		 */
47190Sstevel@tonic-gate 		(void) md_ioctl_readerlock(lockp, ui);
47200Sstevel@tonic-gate 		/*
47210Sstevel@tonic-gate 		 * Ignore the message if there is no active resync thread or
47220Sstevel@tonic-gate 		 * if it is for a resync type that we have already completed.
47230Sstevel@tonic-gate 		 * un_resync_completed is set to the last resync completed
47240Sstevel@tonic-gate 		 * when processing a PHASE_DONE message.
47250Sstevel@tonic-gate 		 */
47260Sstevel@tonic-gate 		if (!rs_active || (p->rs_type == un->un_resync_completed))
47270Sstevel@tonic-gate 			break;
47280Sstevel@tonic-gate 		/*
47290Sstevel@tonic-gate 		 * If this message is for the same resync and is for an earlier
47300Sstevel@tonic-gate 		 * resync region, just ignore it. This can only occur if this
47310Sstevel@tonic-gate 		 * node has progressed on to the next resync region before
47320Sstevel@tonic-gate 		 * we receive this message. This can occur if the class for
47330Sstevel@tonic-gate 		 * this message is busy and the originator has to retry thus
47340Sstevel@tonic-gate 		 * allowing this node to move onto the next resync_region.
47350Sstevel@tonic-gate 		 */
47360Sstevel@tonic-gate 		if ((p->rs_type == un->un_rs_type) &&
47370Sstevel@tonic-gate 		    (p->rs_start < un->un_resync_startbl))
47380Sstevel@tonic-gate 			break;
47396901Sjkennedy 		ps = un->un_rs_prev_overlap;
47400Sstevel@tonic-gate 
47410Sstevel@tonic-gate 		/* Allocate previous overlap reference if needed */
47420Sstevel@tonic-gate 		if (ps == NULL) {
47430Sstevel@tonic-gate 			ps = kmem_cache_alloc(mirror_parent_cache,
47446901Sjkennedy 			    MD_ALLOCFLAGS);
47450Sstevel@tonic-gate 			ps->ps_un = un;
47460Sstevel@tonic-gate 			ps->ps_ui = ui;
47470Sstevel@tonic-gate 			ps->ps_firstblk = 0;
47480Sstevel@tonic-gate 			ps->ps_lastblk = 0;
47490Sstevel@tonic-gate 			ps->ps_flags = 0;
47500Sstevel@tonic-gate 			md_ioctl_readerexit(lockp);
47510Sstevel@tonic-gate 			(void) md_ioctl_writerlock(lockp, ui);
47526901Sjkennedy 			un->un_rs_prev_overlap = ps;
47530Sstevel@tonic-gate 			md_ioctl_writerexit(lockp);
47540Sstevel@tonic-gate 		} else
47550Sstevel@tonic-gate 			md_ioctl_readerexit(lockp);
47560Sstevel@tonic-gate 
47570Sstevel@tonic-gate 		if (p->rs_originator != md_mn_mynode_id) {
47580Sstevel@tonic-gate 			/*
47598452SJohn.Wren.Kennedy@Sun.COM 			 * Clear our un_resync_bm for the regions completed.
47608452SJohn.Wren.Kennedy@Sun.COM 			 * The owner (originator) will take care of itself.
47618452SJohn.Wren.Kennedy@Sun.COM 			 */
47628452SJohn.Wren.Kennedy@Sun.COM 			BLK_TO_RR(rr_end, ps->ps_lastblk, un);
47638452SJohn.Wren.Kennedy@Sun.COM 			BLK_TO_RR(rr_start, p->rs_start, un);
47648452SJohn.Wren.Kennedy@Sun.COM 			if (ps->ps_lastblk && rr_end < rr_start) {
47658452SJohn.Wren.Kennedy@Sun.COM 				BLK_TO_RR(rr_start, ps->ps_firstblk, un);
47668452SJohn.Wren.Kennedy@Sun.COM 				mutex_enter(&un->un_resync_mx);
47678452SJohn.Wren.Kennedy@Sun.COM 				/*
47688452SJohn.Wren.Kennedy@Sun.COM 				 * Update our resync bitmap to reflect that
47698452SJohn.Wren.Kennedy@Sun.COM 				 * another node has synchronized this range.
47708452SJohn.Wren.Kennedy@Sun.COM 				 */
47718452SJohn.Wren.Kennedy@Sun.COM 				for (rr = rr_start; rr <= rr_end; rr++) {
47728452SJohn.Wren.Kennedy@Sun.COM 					CLR_KEEPDIRTY(rr, un);
47738452SJohn.Wren.Kennedy@Sun.COM 				}
47748452SJohn.Wren.Kennedy@Sun.COM 				mutex_exit(&un->un_resync_mx);
47758452SJohn.Wren.Kennedy@Sun.COM 			}
47768452SJohn.Wren.Kennedy@Sun.COM 
47778452SJohn.Wren.Kennedy@Sun.COM 			/*
47780Sstevel@tonic-gate 			 * On all but the originating node, first update
47790Sstevel@tonic-gate 			 * the resync state, then unblock the previous
47800Sstevel@tonic-gate 			 * region and block the next one. No need
47810Sstevel@tonic-gate 			 * to do this if the region is already blocked.
47820Sstevel@tonic-gate 			 * Update the submirror state and flags from the
47830Sstevel@tonic-gate 			 * originator. This keeps the cluster in sync with
47840Sstevel@tonic-gate 			 * regards to the resync status.
47850Sstevel@tonic-gate 			 */
47860Sstevel@tonic-gate 
47870Sstevel@tonic-gate 			(void) md_ioctl_writerlock(lockp, ui);
47880Sstevel@tonic-gate 			un->un_rs_resync_done = p->rs_done;
47890Sstevel@tonic-gate 			un->un_rs_resync_2_do = p->rs_2_do;
47900Sstevel@tonic-gate 			un->un_rs_type = p->rs_type;
47910Sstevel@tonic-gate 			un->un_resync_startbl = p->rs_start;
47920Sstevel@tonic-gate 			md_ioctl_writerexit(lockp);
47930Sstevel@tonic-gate 			/*
47940Sstevel@tonic-gate 			 * Use un_owner_mx to ensure that an ownership change
47950Sstevel@tonic-gate 			 * cannot happen at the same time as this message
47960Sstevel@tonic-gate 			 */
47970Sstevel@tonic-gate 			mutex_enter(&un->un_owner_mx);
47980Sstevel@tonic-gate 			if (MD_MN_MIRROR_OWNER(un)) {
47990Sstevel@tonic-gate 				ps->ps_firstblk = p->rs_start;
48000Sstevel@tonic-gate 				ps->ps_lastblk = ps->ps_firstblk +
48010Sstevel@tonic-gate 				    p->rs_size - 1;
48020Sstevel@tonic-gate 			} else {
48030Sstevel@tonic-gate 				if ((ps->ps_firstblk != p->rs_start) ||
48040Sstevel@tonic-gate 				    (ps->ps_lastblk != p->rs_start +
48050Sstevel@tonic-gate 				    p->rs_size - 1)) {
48060Sstevel@tonic-gate 					/* Remove previous overlap range */
48070Sstevel@tonic-gate 					if (ps->ps_flags & MD_MPS_ON_OVERLAP)
48086901Sjkennedy 						mirror_overlap_tree_remove(ps);
48090Sstevel@tonic-gate 
48100Sstevel@tonic-gate 					ps->ps_firstblk = p->rs_start;
48110Sstevel@tonic-gate 					ps->ps_lastblk = ps->ps_firstblk +
48120Sstevel@tonic-gate 					    p->rs_size - 1;
48130Sstevel@tonic-gate 
48140Sstevel@tonic-gate 					mutex_exit(&un->un_owner_mx);
48150Sstevel@tonic-gate 					/* Block this range from all i/o. */
48160Sstevel@tonic-gate 					if (ps->ps_firstblk != 0 ||
48170Sstevel@tonic-gate 					    ps->ps_lastblk != 0)
48180Sstevel@tonic-gate 						wait_for_overlaps(ps,
48190Sstevel@tonic-gate 						    MD_OVERLAP_ALLOW_REPEAT);
48200Sstevel@tonic-gate 					mutex_enter(&un->un_owner_mx);
48210Sstevel@tonic-gate 					/*
48220Sstevel@tonic-gate 					 * Check to see if we have obtained
48230Sstevel@tonic-gate 					 * ownership while waiting for
48240Sstevel@tonic-gate 					 * overlaps. If we have, remove
48250Sstevel@tonic-gate 					 * the resync_region entry from the
48266901Sjkennedy 					 * overlap tree
48270Sstevel@tonic-gate 					 */
48280Sstevel@tonic-gate 					if (MD_MN_MIRROR_OWNER(un) &&
48290Sstevel@tonic-gate 					    (ps->ps_flags & MD_MPS_ON_OVERLAP))
48306901Sjkennedy 						mirror_overlap_tree_remove(ps);
48310Sstevel@tonic-gate 				}
48320Sstevel@tonic-gate 			}
48330Sstevel@tonic-gate 			mutex_exit(&un->un_owner_mx);
48340Sstevel@tonic-gate 
48350Sstevel@tonic-gate 			/*
48360Sstevel@tonic-gate 			 * If this is the first RESYNC_NEXT message (i.e.
48370Sstevel@tonic-gate 			 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
48380Sstevel@tonic-gate 			 * issue RESYNC_START NOTIFY event
48390Sstevel@tonic-gate 			 */
48400Sstevel@tonic-gate 			if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
48410Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
48420Sstevel@tonic-gate 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
48430Sstevel@tonic-gate 				    MD_SID(un));
48440Sstevel@tonic-gate 			}
48450Sstevel@tonic-gate 
48460Sstevel@tonic-gate 			/* Ensure that our local resync thread is running */
48470Sstevel@tonic-gate 			if (un->un_rs_thread == NULL) {
48480Sstevel@tonic-gate 				(void) mirror_resync_unit(p->mnum, NULL,
48490Sstevel@tonic-gate 				    &p->mde, lockp);
48500Sstevel@tonic-gate 			}
48510Sstevel@tonic-gate 		}
48528452SJohn.Wren.Kennedy@Sun.COM 
48530Sstevel@tonic-gate 		break;
48540Sstevel@tonic-gate 	case MD_MN_MSG_RESYNC_FINISH:
48550Sstevel@tonic-gate 		/*
48560Sstevel@tonic-gate 		 * Complete the resync by stopping the resync thread.
48570Sstevel@tonic-gate 		 * Also release the previous overlap region field.
48580Sstevel@tonic-gate 		 * Update the resync_progress_thread by cv_signal'ing it so
48590Sstevel@tonic-gate 		 * that we mark the end of the resync as soon as possible.
48600Sstevel@tonic-gate 		 * This avoids an unnecessary delay should a panic occur
48610Sstevel@tonic-gate 		 * after resync completion.
48620Sstevel@tonic-gate 		 */
48630Sstevel@tonic-gate #ifdef DEBUG
48640Sstevel@tonic-gate 		if (!rs_active) {
48650Sstevel@tonic-gate 			if (mirror_debug_flag)
48660Sstevel@tonic-gate 				printf("RESYNC_FINISH (mnum = %x), "
48670Sstevel@tonic-gate 				    "Resync *NOT* active",
48680Sstevel@tonic-gate 				    p->mnum);
48690Sstevel@tonic-gate 		}
48700Sstevel@tonic-gate #endif
48710Sstevel@tonic-gate 
48720Sstevel@tonic-gate 		if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
48730Sstevel@tonic-gate 		    (p->rs_originator != md_mn_mynode_id)) {
48740Sstevel@tonic-gate 			mutex_enter(&un->un_rs_thread_mx);
48750Sstevel@tonic-gate 			un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
48760Sstevel@tonic-gate 			un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
48770Sstevel@tonic-gate 			un->un_rs_thread_flags &=
48780Sstevel@tonic-gate 			    ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
48790Sstevel@tonic-gate 			cv_signal(&un->un_rs_thread_cv);
48800Sstevel@tonic-gate 			mutex_exit(&un->un_rs_thread_mx);
48810Sstevel@tonic-gate 		}
48820Sstevel@tonic-gate 		if (is_ABR) {
48830Sstevel@tonic-gate 			/* Resync finished, if ABR set owner to NULL */
48840Sstevel@tonic-gate 			mutex_enter(&un->un_owner_mx);
48850Sstevel@tonic-gate 			un->un_mirror_owner = 0;
48860Sstevel@tonic-gate 			mutex_exit(&un->un_owner_mx);
48870Sstevel@tonic-gate 		}
48880Sstevel@tonic-gate 		(void) md_ioctl_writerlock(lockp, ui);
48896901Sjkennedy 		ps = un->un_rs_prev_overlap;
48900Sstevel@tonic-gate 		if (ps != NULL) {
48910Sstevel@tonic-gate 			/* Remove previous overlap range */
48920Sstevel@tonic-gate 			if (ps->ps_flags & MD_MPS_ON_OVERLAP)
48936901Sjkennedy 				mirror_overlap_tree_remove(ps);
48940Sstevel@tonic-gate 			/*
48950Sstevel@tonic-gate 			 * Release the overlap range reference
48960Sstevel@tonic-gate 			 */
48976901Sjkennedy 			un->un_rs_prev_overlap = NULL;
48980Sstevel@tonic-gate 			kmem_cache_free(mirror_parent_cache,
48990Sstevel@tonic-gate 			    ps);
49000Sstevel@tonic-gate 		}
49010Sstevel@tonic-gate 		md_ioctl_writerexit(lockp);
49020Sstevel@tonic-gate 
49030Sstevel@tonic-gate 		/* Mark the resync as complete in the metadb */
49040Sstevel@tonic-gate 		un->un_rs_resync_done = p->rs_done;
49050Sstevel@tonic-gate 		un->un_rs_resync_2_do = p->rs_2_do;
49060Sstevel@tonic-gate 		un->un_rs_type = p->rs_type;
49070Sstevel@tonic-gate 		mutex_enter(&un->un_rs_progress_mx);
49080Sstevel@tonic-gate 		cv_signal(&un->un_rs_progress_cv);
49090Sstevel@tonic-gate 		mutex_exit(&un->un_rs_progress_mx);
49100Sstevel@tonic-gate 
49110Sstevel@tonic-gate 		un = md_ioctl_writerlock(lockp, ui);
49120Sstevel@tonic-gate 		un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
49130Sstevel@tonic-gate 		/* Deal with any pending grow_unit */
49140Sstevel@tonic-gate 		if (un->c.un_status & MD_UN_GROW_PENDING) {
49150Sstevel@tonic-gate 			if ((mirror_grow_unit(un, &mde) != 0) ||
49160Sstevel@tonic-gate 			    (! mdismderror(&mde, MDE_GROW_DELAYED))) {
49170Sstevel@tonic-gate 				un->c.un_status &= ~MD_UN_GROW_PENDING;
49180Sstevel@tonic-gate 			}
49190Sstevel@tonic-gate 		}
49200Sstevel@tonic-gate 		md_ioctl_writerexit(lockp);
49210Sstevel@tonic-gate 		break;
49220Sstevel@tonic-gate 
49230Sstevel@tonic-gate 	case MD_MN_MSG_RESYNC_PHASE_DONE:
49240Sstevel@tonic-gate 		/*
49250Sstevel@tonic-gate 		 * A phase of the resync (optimized, component or
49260Sstevel@tonic-gate 		 * submirror) is complete. Update the mirror status.
49270Sstevel@tonic-gate 		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
49280Sstevel@tonic-gate 		 * mirror owner is performing a resync. If we have just snarfed
49290Sstevel@tonic-gate 		 * this set, then we must clear any of the flags set at snarf
49300Sstevel@tonic-gate 		 * time by unit_setup_resync().
49310Sstevel@tonic-gate 		 * Note that unit_setup_resync() sets up these flags to
49320Sstevel@tonic-gate 		 * indicate that an optimized resync is required. These flags
49330Sstevel@tonic-gate 		 * need to be reset because if we get here, the mirror owner
49340Sstevel@tonic-gate 		 * will have handled the optimized resync.
49350Sstevel@tonic-gate 		 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
49360Sstevel@tonic-gate 		 * MD_UN_WAR. In addition, for each submirror,
49370Sstevel@tonic-gate 		 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
49380Sstevel@tonic-gate 		 * set to SMS_OFFLINE.
49390Sstevel@tonic-gate 		 */
49400Sstevel@tonic-gate #ifdef DEBUG
49410Sstevel@tonic-gate 		if (mirror_debug_flag)
49420Sstevel@tonic-gate 			printf("phase done message received from %d, mnum=%x, "
49430Sstevel@tonic-gate 			    "type=%x, flags=%x\n", p->rs_originator, p->mnum,
49440Sstevel@tonic-gate 			    p->rs_type, p->rs_flags);
49450Sstevel@tonic-gate #endif
49460Sstevel@tonic-gate 		/*
49470Sstevel@tonic-gate 		 * Ignore the message if there is no active resync thread.
49480Sstevel@tonic-gate 		 */
49490Sstevel@tonic-gate 		if (!rs_active)
49500Sstevel@tonic-gate 			break;
49510Sstevel@tonic-gate 
49520Sstevel@tonic-gate 		broke_out = p->rs_flags & MD_MN_RS_ERR;
49530Sstevel@tonic-gate 		switch (RS_TYPE(p->rs_type)) {
49540Sstevel@tonic-gate 		case MD_RS_OPTIMIZED:
49550Sstevel@tonic-gate 			un = md_ioctl_writerlock(lockp, ui);
49560Sstevel@tonic-gate 			if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
49570Sstevel@tonic-gate 				/* If we are originator, just clear rs_type */
49580Sstevel@tonic-gate 				if (p->rs_originator == md_mn_mynode_id) {
49590Sstevel@tonic-gate 					SET_RS_TYPE_NONE(un->un_rs_type);
49600Sstevel@tonic-gate 					md_ioctl_writerexit(lockp);
49610Sstevel@tonic-gate 					break;
49620Sstevel@tonic-gate 				}
49630Sstevel@tonic-gate 				/*
49640Sstevel@tonic-gate 				 * If CLEAR_OPT_NOT_DONE is set, only clear the
49650Sstevel@tonic-gate 				 * flags if OPT_NOT_DONE is set *and* rs_type
49660Sstevel@tonic-gate 				 * is MD_RS_NONE.
49670Sstevel@tonic-gate 				 */
49680Sstevel@tonic-gate 				if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
49690Sstevel@tonic-gate 				    (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
49700Sstevel@tonic-gate 					/* No resync in progress */
49710Sstevel@tonic-gate 					un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
49720Sstevel@tonic-gate 					un->c.un_status &= ~MD_UN_WAR;
49730Sstevel@tonic-gate 				} else {
49740Sstevel@tonic-gate 					/*
49750Sstevel@tonic-gate 					 * We are in the middle of an
49760Sstevel@tonic-gate 					 * optimized resync and this message
49770Sstevel@tonic-gate 					 * should be ignored.
49780Sstevel@tonic-gate 					 */
49790Sstevel@tonic-gate 					md_ioctl_writerexit(lockp);
49800Sstevel@tonic-gate 					break;
49810Sstevel@tonic-gate 				}
49820Sstevel@tonic-gate 			} else {
49830Sstevel@tonic-gate 				/*
49840Sstevel@tonic-gate 				 * This is the end of an optimized resync,
49850Sstevel@tonic-gate 				 * clear the OPT_NOT_DONE and OFFLINE_SM flags
49860Sstevel@tonic-gate 				 */
49870Sstevel@tonic-gate 
49880Sstevel@tonic-gate 				un->c.un_status &= ~MD_UN_KEEP_DIRTY;
49890Sstevel@tonic-gate 				if (!broke_out)
49900Sstevel@tonic-gate 					un->c.un_status &= ~MD_UN_WAR;
49918452SJohn.Wren.Kennedy@Sun.COM 
49928452SJohn.Wren.Kennedy@Sun.COM 				/*
49938452SJohn.Wren.Kennedy@Sun.COM 				 * Clear our un_resync_bm for the regions
49948452SJohn.Wren.Kennedy@Sun.COM 				 * completed.  The owner (originator) will
49958452SJohn.Wren.Kennedy@Sun.COM 				 * take care of itself.
49968452SJohn.Wren.Kennedy@Sun.COM 				 */
49978452SJohn.Wren.Kennedy@Sun.COM 				if (p->rs_originator != md_mn_mynode_id &&
49988452SJohn.Wren.Kennedy@Sun.COM 				    (ps = un->un_rs_prev_overlap) != NULL) {
49998452SJohn.Wren.Kennedy@Sun.COM 					BLK_TO_RR(rr_start, ps->ps_firstblk,
50008452SJohn.Wren.Kennedy@Sun.COM 					    un);
50018452SJohn.Wren.Kennedy@Sun.COM 					BLK_TO_RR(rr_end, ps->ps_lastblk, un);
50028452SJohn.Wren.Kennedy@Sun.COM 					mutex_enter(&un->un_resync_mx);
50038452SJohn.Wren.Kennedy@Sun.COM 					for (rr = rr_start; rr <= rr_end;
50048452SJohn.Wren.Kennedy@Sun.COM 					    rr++) {
50058452SJohn.Wren.Kennedy@Sun.COM 						CLR_KEEPDIRTY(rr, un);
50068452SJohn.Wren.Kennedy@Sun.COM 					}
50078452SJohn.Wren.Kennedy@Sun.COM 					mutex_exit(&un->un_resync_mx);
50088452SJohn.Wren.Kennedy@Sun.COM 				}
50090Sstevel@tonic-gate 			}
50100Sstevel@tonic-gate 
50110Sstevel@tonic-gate 			/*
50120Sstevel@tonic-gate 			 * Set resync_completed to last resync type and then
50130Sstevel@tonic-gate 			 * clear resync_type to indicate no resync in progress
50140Sstevel@tonic-gate 			 */
50150Sstevel@tonic-gate 			un->un_resync_completed = un->un_rs_type;
50160Sstevel@tonic-gate 			SET_RS_TYPE_NONE(un->un_rs_type);
50170Sstevel@tonic-gate 
50180Sstevel@tonic-gate 			/*
50190Sstevel@tonic-gate 			 * If resync is as a result of a submirror ONLINE,
50200Sstevel@tonic-gate 			 * reset the submirror state to SMS_RUNNING if the
50210Sstevel@tonic-gate 			 * resync was ok else set back to SMS_OFFLINE.
50220Sstevel@tonic-gate 			 */
50230Sstevel@tonic-gate 			for (smi = 0; smi < NMIRROR; smi++) {
50240Sstevel@tonic-gate 				un->un_sm[smi].sm_flags &=
50250Sstevel@tonic-gate 				    ~MD_SM_RESYNC_TARGET;
50260Sstevel@tonic-gate 				if (SMS_BY_INDEX_IS(un, smi,
50270Sstevel@tonic-gate 				    SMS_OFFLINE_RESYNC)) {
50280Sstevel@tonic-gate 					if (p->rs_flags &
50290Sstevel@tonic-gate 					    MD_MN_RS_CLEAR_OPT_NOT_DONE) {
50300Sstevel@tonic-gate 						state = SMS_OFFLINE;
50310Sstevel@tonic-gate 					} else {
50320Sstevel@tonic-gate 						state = (broke_out ?
50330Sstevel@tonic-gate 						    SMS_OFFLINE : SMS_RUNNING);
50340Sstevel@tonic-gate 					}
50350Sstevel@tonic-gate 					mirror_set_sm_state(
50360Sstevel@tonic-gate 					    &un->un_sm[smi],
50370Sstevel@tonic-gate 					    &un->un_smic[smi], state,
50380Sstevel@tonic-gate 					    broke_out);
50390Sstevel@tonic-gate 					mirror_commit(un, NO_SUBMIRRORS,
50400Sstevel@tonic-gate 					    0);
50410Sstevel@tonic-gate 				}
50420Sstevel@tonic-gate 				/*
50430Sstevel@tonic-gate 				 * If we still have an offline submirror, set
50440Sstevel@tonic-gate 				 * the OFFLINE_SM flag in the mirror status
50450Sstevel@tonic-gate 				 */
50460Sstevel@tonic-gate 				if (SMS_BY_INDEX_IS(un, smi,
50470Sstevel@tonic-gate 				    SMS_OFFLINE))
50480Sstevel@tonic-gate 					un->c.un_status |=
50490Sstevel@tonic-gate 					    MD_UN_OFFLINE_SM;
50500Sstevel@tonic-gate 			}
50510Sstevel@tonic-gate 			md_ioctl_writerexit(lockp);
50520Sstevel@tonic-gate 			break;
50530Sstevel@tonic-gate 		case MD_RS_SUBMIRROR:
50540Sstevel@tonic-gate 			un = md_ioctl_writerlock(lockp, ui);
50550Sstevel@tonic-gate 			smi = RS_SMI(p->rs_type);
50560Sstevel@tonic-gate 			sm = &un->un_sm[smi];
50570Sstevel@tonic-gate 			smic = &un->un_smic[smi];
50580Sstevel@tonic-gate 			/* Clear RESYNC target */
50590Sstevel@tonic-gate 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
50600Sstevel@tonic-gate 			/*
50610Sstevel@tonic-gate 			 * Set resync_completed to last resync type and then
50620Sstevel@tonic-gate 			 * clear resync_type to indicate no resync in progress
50630Sstevel@tonic-gate 			 */
50640Sstevel@tonic-gate 			un->un_resync_completed = un->un_rs_type;
50650Sstevel@tonic-gate 			SET_RS_TYPE_NONE(un->un_rs_type);
50660Sstevel@tonic-gate 			/*
50670Sstevel@tonic-gate 			 * If the resync completed ok reset the submirror
50680Sstevel@tonic-gate 			 * state to SMS_RUNNING else reset it to SMS_ATTACHED
50690Sstevel@tonic-gate 			 */
50700Sstevel@tonic-gate 			state = (broke_out ?
50710Sstevel@tonic-gate 			    SMS_ATTACHED : SMS_RUNNING);
50720Sstevel@tonic-gate 			mirror_set_sm_state(sm, smic, state, broke_out);
50730Sstevel@tonic-gate 			un->c.un_status &= ~MD_UN_WAR;
50740Sstevel@tonic-gate 			mirror_commit(un, SMI2BIT(smi), 0);
50750Sstevel@tonic-gate 			md_ioctl_writerexit(lockp);
50760Sstevel@tonic-gate 			break;
50770Sstevel@tonic-gate 		case MD_RS_COMPONENT:
50780Sstevel@tonic-gate 			un = md_ioctl_writerlock(lockp, ui);
50790Sstevel@tonic-gate 			smi = RS_SMI(p->rs_type);
50800Sstevel@tonic-gate 			ci = RS_CI(p->rs_type);
50810Sstevel@tonic-gate 			sm = &un->un_sm[smi];
50820Sstevel@tonic-gate 			smic = &un->un_smic[smi];
50830Sstevel@tonic-gate 			shared = (md_m_shared_t *)
50840Sstevel@tonic-gate 			    (*(smic->sm_shared_by_indx))
50850Sstevel@tonic-gate 			    (sm->sm_dev, sm, ci);
50860Sstevel@tonic-gate 			un->c.un_status &= ~MD_UN_WAR;
50870Sstevel@tonic-gate 			/* Clear RESYNC target */
50880Sstevel@tonic-gate 			un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
50890Sstevel@tonic-gate 			/*
50900Sstevel@tonic-gate 			 * Set resync_completed to last resync type and then
50910Sstevel@tonic-gate 			 * clear resync_type to indicate no resync in progress
50920Sstevel@tonic-gate 			 */
50930Sstevel@tonic-gate 			un->un_resync_completed = un->un_rs_type;
50940Sstevel@tonic-gate 			SET_RS_TYPE_NONE(un->un_rs_type);
50950Sstevel@tonic-gate 
50960Sstevel@tonic-gate 			/*
50970Sstevel@tonic-gate 			 * If the resync completed ok, set the component state
50980Sstevel@tonic-gate 			 * to CS_OKAY.
50990Sstevel@tonic-gate 			 */
51000Sstevel@tonic-gate 			if (broke_out)
51010Sstevel@tonic-gate 				shared->ms_flags |= MDM_S_RS_TRIED;
51020Sstevel@tonic-gate 			else {
51030Sstevel@tonic-gate 				/*
51040Sstevel@tonic-gate 				 * As we don't transmit the changes,
51050Sstevel@tonic-gate 				 * no need to drop the lock.
51060Sstevel@tonic-gate 				 */
51070Sstevel@tonic-gate 				set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
51080Sstevel@tonic-gate 				    MD_STATE_NO_XMIT, (IOLOCK *)NULL);
51090Sstevel@tonic-gate 			}
51100Sstevel@tonic-gate 			md_ioctl_writerexit(lockp);
51110Sstevel@tonic-gate 		default:
51120Sstevel@tonic-gate 			break;
51130Sstevel@tonic-gate 		}
51140Sstevel@tonic-gate 		/*
51150Sstevel@tonic-gate 		 * If the purpose of this PHASE_DONE message is just to
51160Sstevel@tonic-gate 		 * indicate to all other nodes that the optimized resync
51170Sstevel@tonic-gate 		 * required (OPT_NOT_DONE) flag is to be cleared, there is
51180Sstevel@tonic-gate 		 * no need to generate a notify event as there has not
51190Sstevel@tonic-gate 		 * actually been a resync.
51200Sstevel@tonic-gate 		 */
51210Sstevel@tonic-gate 		if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
51220Sstevel@tonic-gate 			if (broke_out) {
51230Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
51240Sstevel@tonic-gate 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
51250Sstevel@tonic-gate 				    MD_SID(un));
51260Sstevel@tonic-gate 			} else {
51270Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
51280Sstevel@tonic-gate 				    SVM_TAG_METADEVICE, MD_UN2SET(un),
51290Sstevel@tonic-gate 				    MD_SID(un));
51300Sstevel@tonic-gate 			}
51310Sstevel@tonic-gate 		}
51320Sstevel@tonic-gate 		break;
51330Sstevel@tonic-gate 
51340Sstevel@tonic-gate 	default:
51350Sstevel@tonic-gate #ifdef DEBUG
51360Sstevel@tonic-gate 		cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
51370Sstevel@tonic-gate 		    " %x\n", p->msg_type);
51380Sstevel@tonic-gate #endif
51390Sstevel@tonic-gate 		return (EINVAL);
51400Sstevel@tonic-gate 	}
51410Sstevel@tonic-gate 	return (0);
51420Sstevel@tonic-gate }
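
/*
 * A compiled-out sketch of the block-range to resync-region arithmetic
 * that the RESYNC_NEXT and PHASE_DONE handlers above perform with
 * BLK_TO_RR/CLR_KEEPDIRTY (macros from md_mirror.h). The fixed region
 * size and the flat byte bitmap are hypothetical simplifications,
 * shown only to make the intent of those loops explicit.
 */
#if 0
#define	EX_BLKS_PER_RR	1024		/* hypothetical blocks per region */

static void
example_clear_region_range(uchar_t *bitmap, diskaddr_t firstblk,
    diskaddr_t lastblk)
{
	uint_t	rr_start = (uint_t)(firstblk / EX_BLKS_PER_RR);
	uint_t	rr_end = (uint_t)(lastblk / EX_BLKS_PER_RR);
	uint_t	rr;

	/* Clear the dirty bit of every region the block range touches */
	for (rr = rr_start; rr <= rr_end; rr++)
		bitmap[rr >> 3] &= ~(1 << (rr & 7));
}
#endif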
51430Sstevel@tonic-gate 
51440Sstevel@tonic-gate /* Return -1 if snarf of the optimized record failed and the set should be released */
51450Sstevel@tonic-gate static int
51460Sstevel@tonic-gate mirror_snarf(md_snarfcmd_t cmd, set_t setno)
51470Sstevel@tonic-gate {
51480Sstevel@tonic-gate 	mddb_recid_t	recid;
51490Sstevel@tonic-gate 	int		gotsomething;
51500Sstevel@tonic-gate 	int		all_mirrors_gotten;
51510Sstevel@tonic-gate 	mm_unit_t	*un;
51520Sstevel@tonic-gate 	mddb_type_t	typ1;
51530Sstevel@tonic-gate 	mddb_de_ic_t    *dep;
51540Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
51550Sstevel@tonic-gate 	size_t		newreqsize;
51560Sstevel@tonic-gate 	mm_unit_t	*big_un;
51570Sstevel@tonic-gate 	mm_unit32_od_t	*small_un;
51580Sstevel@tonic-gate 	int		retval;
51590Sstevel@tonic-gate 	mdi_unit_t	*ui;
51600Sstevel@tonic-gate 
51610Sstevel@tonic-gate 	if (cmd == MD_SNARF_CLEANUP) {
51620Sstevel@tonic-gate 		if (md_get_setstatus(setno) & MD_SET_STALE)
51630Sstevel@tonic-gate 			return (0);
51640Sstevel@tonic-gate 
51650Sstevel@tonic-gate 		recid = mddb_makerecid(setno, 0);
51660Sstevel@tonic-gate 		typ1 = (mddb_type_t)md_getshared_key(setno,
51670Sstevel@tonic-gate 		    mirror_md_ops.md_driver.md_drivername);
51680Sstevel@tonic-gate 		while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
51690Sstevel@tonic-gate 			if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
51700Sstevel@tonic-gate 				un = (mm_unit_t *)mddb_getrecaddr(recid);
51710Sstevel@tonic-gate 				mirror_cleanup(un);
51720Sstevel@tonic-gate 				recid = mddb_makerecid(setno, 0);
51730Sstevel@tonic-gate 			}
51740Sstevel@tonic-gate 		}
51750Sstevel@tonic-gate 		return (0);
51760Sstevel@tonic-gate 	}
51770Sstevel@tonic-gate 
51780Sstevel@tonic-gate 	all_mirrors_gotten = 1;
51790Sstevel@tonic-gate 	gotsomething = 0;
51800Sstevel@tonic-gate 
51810Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
51820Sstevel@tonic-gate 	typ1 = (mddb_type_t)md_getshared_key(setno,
51830Sstevel@tonic-gate 	    mirror_md_ops.md_driver.md_drivername);
51840Sstevel@tonic-gate 
51850Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
51860Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
51870Sstevel@tonic-gate 			continue;
51880Sstevel@tonic-gate 
51890Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
51900Sstevel@tonic-gate 		dep->de_flags = MDDB_F_MIRROR;
51910Sstevel@tonic-gate 		rbp = dep->de_rb;
51920Sstevel@tonic-gate 
51931623Stw21770 		switch (rbp->rb_revision) {
51941623Stw21770 		case MDDB_REV_RB:
51951623Stw21770 		case MDDB_REV_RBFN:
51961623Stw21770 			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
51971623Stw21770 				/*
51981623Stw21770 				 * This means, we have an old and small
51991623Stw21770 				 * record and this record hasn't already
52001623Stw21770 				 * been converted.  Before we create an
52011623Stw21770 				 * incore metadevice from this we have to
52021623Stw21770 				 * convert it to a big record.
52031623Stw21770 				 */
52041623Stw21770 				small_un =
52051623Stw21770 				    (mm_unit32_od_t *)mddb_getrecaddr(recid);
52061623Stw21770 				newreqsize = sizeof (mm_unit_t);
52071623Stw21770 				big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
52086901Sjkennedy 				    KM_SLEEP);
52091623Stw21770 				mirror_convert((caddr_t)small_un,
52106901Sjkennedy 				    (caddr_t)big_un, SMALL_2_BIG);
52111623Stw21770 				kmem_free(small_un, dep->de_reqsize);
52121623Stw21770 
52131623Stw21770 				/*
52141623Stw21770 				 * Update userdata and incore userdata;
52151623Stw21770 				 * incores are at the end of un.
52161623Stw21770 				 */
52171623Stw21770 				dep->de_rb_userdata_ic = big_un;
52181623Stw21770 				dep->de_rb_userdata = big_un;
52191623Stw21770 				dep->de_icreqsize = newreqsize;
52201623Stw21770 				un = big_un;
52211623Stw21770 				rbp->rb_private |= MD_PRV_CONVD;
52221623Stw21770 			} else {
52231623Stw21770 				/*
52241623Stw21770 				 * Unit already converted, just get the
52251623Stw21770 				 * record address.
52261623Stw21770 				 */
52271623Stw21770 				un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
52286901Sjkennedy 				    sizeof (*un), 0);
52291623Stw21770 			}
52301623Stw21770 			un->c.un_revision &= ~MD_64BIT_META_DEV;
52311623Stw21770 			break;
52321623Stw21770 		case MDDB_REV_RB64:
52331623Stw21770 		case MDDB_REV_RB64FN:
52340Sstevel@tonic-gate 			/* Big device */
52350Sstevel@tonic-gate 			un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
52366901Sjkennedy 			    sizeof (*un), 0);
52371623Stw21770 			un->c.un_revision |= MD_64BIT_META_DEV;
52381623Stw21770 			un->c.un_flag |= MD_EFILABEL;
52391623Stw21770 			break;
52400Sstevel@tonic-gate 		}
52412077Stw21770 		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
52420Sstevel@tonic-gate 
52430Sstevel@tonic-gate 		/*
52440Sstevel@tonic-gate 		 * Create minor device node for snarfed entry.
52450Sstevel@tonic-gate 		 */
52460Sstevel@tonic-gate 		(void) md_create_minor_node(setno, MD_SID(un));
52470Sstevel@tonic-gate 
52480Sstevel@tonic-gate 		if (MD_UNIT(MD_SID(un)) != NULL) {
52490Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
52500Sstevel@tonic-gate 			continue;
52510Sstevel@tonic-gate 		}
52520Sstevel@tonic-gate 		all_mirrors_gotten = 0;
52530Sstevel@tonic-gate 		retval = mirror_build_incore(un, 1);
52540Sstevel@tonic-gate 		if (retval == 0) {
52550Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_GOTIT);
52560Sstevel@tonic-gate 			md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
52570Sstevel@tonic-gate 			resync_start_timeout(setno);
52580Sstevel@tonic-gate 			gotsomething = 1;
52592063Shshaw 		} else {
52602063Shshaw 			return (retval);
52610Sstevel@tonic-gate 		}
52620Sstevel@tonic-gate 		/*
52630Sstevel@tonic-gate 		 * Set flag to indicate that the mirror has not yet
52640Sstevel@tonic-gate 		 * been through a reconfig. This flag is used for MN sets
52650Sstevel@tonic-gate 		 * when determining whether to update the mirror state from
52660Sstevel@tonic-gate 		 * the Master node.
52670Sstevel@tonic-gate 		 */
52680Sstevel@tonic-gate 		if (MD_MNSET_SETNO(setno)) {
52690Sstevel@tonic-gate 			ui = MDI_UNIT(MD_SID(un));
52700Sstevel@tonic-gate 			ui->ui_tstate |= MD_RESYNC_NOT_DONE;
52710Sstevel@tonic-gate 		}
52720Sstevel@tonic-gate 	}
52730Sstevel@tonic-gate 
52740Sstevel@tonic-gate 	if (!all_mirrors_gotten)
52750Sstevel@tonic-gate 		return (gotsomething);
52760Sstevel@tonic-gate 
52770Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
52780Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
52790Sstevel@tonic-gate 		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
52800Sstevel@tonic-gate 			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
52810Sstevel@tonic-gate 
52820Sstevel@tonic-gate 	return (0);
52830Sstevel@tonic-gate }
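
/*
 * A compiled-out sketch of the small-to-big record conversion that
 * mirror_snarf() performs via mirror_convert(): allocate the wide
 * in-core form and widen each field of the 32-bit on-disk record.
 * The ex_* structures are hypothetical stand-ins for the
 * mm_unit32_od_t/mm_unit_t pair.
 */
#if 0
struct ex_unit32_od {
	uint32_t	u_dev;
	uint32_t	u_total_blocks;
};

struct ex_unit {
	uint64_t	u_dev;
	uint64_t	u_total_blocks;
};

static struct ex_unit *
example_small_to_big(struct ex_unit32_od *small)
{
	struct ex_unit	*big = kmem_zalloc(sizeof (*big), KM_SLEEP);

	big->u_dev = (uint64_t)small->u_dev;
	big->u_total_blocks = (uint64_t)small->u_total_blocks;
	return (big);
}
#endif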
52840Sstevel@tonic-gate 
52850Sstevel@tonic-gate static int
52860Sstevel@tonic-gate mirror_halt(md_haltcmd_t cmd, set_t setno)
52870Sstevel@tonic-gate {
52880Sstevel@tonic-gate 	unit_t		i;
52890Sstevel@tonic-gate 	mdi_unit_t	*ui;
52900Sstevel@tonic-gate 	minor_t		mnum;
52910Sstevel@tonic-gate 	int		reset_mirror_flag = 0;
52920Sstevel@tonic-gate 
52930Sstevel@tonic-gate 	if (cmd == MD_HALT_CLOSE)
52940Sstevel@tonic-gate 		return (0);
52950Sstevel@tonic-gate 
52960Sstevel@tonic-gate 	if (cmd == MD_HALT_OPEN)
52970Sstevel@tonic-gate 		return (0);
52980Sstevel@tonic-gate 
52990Sstevel@tonic-gate 	if (cmd == MD_HALT_UNLOAD)
53000Sstevel@tonic-gate 		return (0);
53010Sstevel@tonic-gate 
53020Sstevel@tonic-gate 	if (cmd == MD_HALT_CHECK) {
53030Sstevel@tonic-gate 		for (i = 0; i < md_nunits; i++) {
53040Sstevel@tonic-gate 			mnum = MD_MKMIN(setno, i);
53050Sstevel@tonic-gate 			if ((ui = MDI_UNIT(mnum)) == NULL)
53060Sstevel@tonic-gate 				continue;
53070Sstevel@tonic-gate 			if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
53080Sstevel@tonic-gate 				continue;
53090Sstevel@tonic-gate 			if (md_unit_isopen(ui))
53100Sstevel@tonic-gate 				return (1);
53110Sstevel@tonic-gate 		}
53120Sstevel@tonic-gate 		return (0);
53130Sstevel@tonic-gate 	}
53140Sstevel@tonic-gate 
53150Sstevel@tonic-gate 	if (cmd != MD_HALT_DOIT)
53160Sstevel@tonic-gate 		return (1);
53170Sstevel@tonic-gate 
53180Sstevel@tonic-gate 	for (i = 0; i < md_nunits; i++) {
53190Sstevel@tonic-gate 		mnum = MD_MKMIN(setno, i);
53200Sstevel@tonic-gate 		if ((ui = MDI_UNIT(mnum)) == NULL)
53210Sstevel@tonic-gate 			continue;
53220Sstevel@tonic-gate 		if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
53230Sstevel@tonic-gate 			continue;
53240Sstevel@tonic-gate 		reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
53250Sstevel@tonic-gate 
53260Sstevel@tonic-gate 		/* Set a flag if there is at least one mirror metadevice. */
53270Sstevel@tonic-gate 		reset_mirror_flag = 1;
53280Sstevel@tonic-gate 	}
53290Sstevel@tonic-gate 
53300Sstevel@tonic-gate 	/*
53310Sstevel@tonic-gate 	 * Only wait for the global dr_timeout to finish
53320Sstevel@tonic-gate 	 *  - if there are mirror metadevices in this diskset or
53330Sstevel@tonic-gate 	 *  - if this is the local set since an unload of the md_mirror
53340Sstevel@tonic-gate 	 *    driver could follow a successful mirror halt in the local set.
53350Sstevel@tonic-gate 	 */
53360Sstevel@tonic-gate 	if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
53370Sstevel@tonic-gate 		while ((mirror_md_ops.md_head == NULL) &&
53380Sstevel@tonic-gate 		    (mirror_timeout.dr_timeout_id != 0))
53390Sstevel@tonic-gate 			delay(md_hz);
53400Sstevel@tonic-gate 	}
53410Sstevel@tonic-gate 
53420Sstevel@tonic-gate 	return (0);
53430Sstevel@tonic-gate }
53440Sstevel@tonic-gate 
53450Sstevel@tonic-gate /*ARGSUSED3*/
53460Sstevel@tonic-gate static int
53470Sstevel@tonic-gate mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
53480Sstevel@tonic-gate {
53490Sstevel@tonic-gate 	IOLOCK	lock;
535046Sskamm 	minor_t		mnum = getminor(*dev);
535146Sskamm 	set_t		setno;
535246Sskamm 
535346Sskamm 	/*
535446Sskamm 	 * When doing an open of a multi owner metadevice, check to see if this
535546Sskamm 	 * node is a starting node and if a reconfig cycle is underway.
535646Sskamm 	 * If so, the system isn't sufficiently set up to handle the
535746Sskamm 	 * open (which involves I/O during sp_validate), so fail with ENXIO.
535846Sskamm 	 */
535946Sskamm 	setno = MD_MIN2SET(mnum);
536046Sskamm 	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
536146Sskamm 	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
536246Sskamm 			return (ENXIO);
536346Sskamm 		return (ENXIO);
53640Sstevel@tonic-gate 
53650Sstevel@tonic-gate 	if (md_oflags & MD_OFLG_FROMIOCTL) {
53660Sstevel@tonic-gate 		/*
53670Sstevel@tonic-gate 		 * This indicates that the caller is an ioctl service routine.
53680Sstevel@tonic-gate 		 * In this case we initialise our stack-based IOLOCK and pass
53690Sstevel@tonic-gate 		 * this into the internal open routine. This allows multi-owner
53700Sstevel@tonic-gate 		 * metadevices to avoid deadlocking if an error is encountered
53710Sstevel@tonic-gate 		 * during the open() attempt. The failure case is:
53720Sstevel@tonic-gate 		 * s-p -> mirror -> s-p (with error). Attempting to metaclear
53730Sstevel@tonic-gate 		 * this configuration would deadlock as the mirror code has to
53740Sstevel@tonic-gate 		 * send a state-update to the other nodes when it detects the
53750Sstevel@tonic-gate 		 * failure of the underlying submirror with an errored soft-part
53760Sstevel@tonic-gate 		 * on it. As there is a class1 message in progress (metaclear)
53770Sstevel@tonic-gate 		 * set_sm_comp_state() cannot send another class1 message;
53780Sstevel@tonic-gate 		 * instead we do not send a state_update message as the
53790Sstevel@tonic-gate 		 * metaclear is distributed and the failed submirror will be
53800Sstevel@tonic-gate 		 * cleared from the configuration by the metaclear.
53810Sstevel@tonic-gate 		 */
53820Sstevel@tonic-gate 		IOLOCK_INIT(&lock);
53830Sstevel@tonic-gate 		return (mirror_internal_open(getminor(*dev), flag, otyp,
53840Sstevel@tonic-gate 		    md_oflags, &lock));
53850Sstevel@tonic-gate 	} else {
53860Sstevel@tonic-gate 		return (mirror_internal_open(getminor(*dev), flag, otyp,
53870Sstevel@tonic-gate 		    md_oflags, (IOLOCK *)NULL));
53880Sstevel@tonic-gate 	}
53890Sstevel@tonic-gate }
53900Sstevel@tonic-gate 
53910Sstevel@tonic-gate 
53920Sstevel@tonic-gate /*ARGSUSED1*/
53930Sstevel@tonic-gate static int
53940Sstevel@tonic-gate mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
53950Sstevel@tonic-gate {
53960Sstevel@tonic-gate 	return (mirror_internal_close(getminor(dev), otyp, md_cflags,
53976901Sjkennedy 	    (IOLOCK *)NULL));
53980Sstevel@tonic-gate }
53990Sstevel@tonic-gate 
54000Sstevel@tonic-gate 
54010Sstevel@tonic-gate /*
54020Sstevel@tonic-gate  * This routine dumps memory to the disk.  It assumes that the memory has
54030Sstevel@tonic-gate  * already been mapped into mainbus space.  It is called at disk interrupt
54040Sstevel@tonic-gate  * priority when the system is in trouble.
54050Sstevel@tonic-gate  *
54060Sstevel@tonic-gate  */
54070Sstevel@tonic-gate static int
54080Sstevel@tonic-gate mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
54090Sstevel@tonic-gate {
54100Sstevel@tonic-gate 	mm_unit_t	*un;
54110Sstevel@tonic-gate 	dev_t		mapdev;
54120Sstevel@tonic-gate 	int		result;
54130Sstevel@tonic-gate 	int		smi;
54140Sstevel@tonic-gate 	int		any_succeed = 0;
54150Sstevel@tonic-gate 	int		save_result = 0;
54160Sstevel@tonic-gate 
54170Sstevel@tonic-gate 	/*
54180Sstevel@tonic-gate 	 * There is no need to grab the unit lock, because nothing else
54190Sstevel@tonic-gate 	 * is supposed to be happening. Also, dump is not supposed to
54200Sstevel@tonic-gate 	 * sleep.
54210Sstevel@tonic-gate 	 */
54220Sstevel@tonic-gate 	un = (mm_unit_t *)MD_UNIT(getminor(dev));
54230Sstevel@tonic-gate 
54240Sstevel@tonic-gate 	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
54250Sstevel@tonic-gate 		return (EINVAL);
54260Sstevel@tonic-gate 
54270Sstevel@tonic-gate 	if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
54280Sstevel@tonic-gate 		return (EINVAL);
54290Sstevel@tonic-gate 
54300Sstevel@tonic-gate 	for (smi = 0; smi < NMIRROR; smi++) {
54310Sstevel@tonic-gate 		if (!SUBMIRROR_IS_WRITEABLE(un, smi))
54320Sstevel@tonic-gate 			continue;
54330Sstevel@tonic-gate 		mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
54340Sstevel@tonic-gate 		result = bdev_dump(mapdev, addr, blkno, nblk);
54350Sstevel@tonic-gate 		if (result)
54360Sstevel@tonic-gate 			save_result = result;
54370Sstevel@tonic-gate 
54380Sstevel@tonic-gate 		if (result == 0)
54390Sstevel@tonic-gate 			any_succeed++;
54400Sstevel@tonic-gate 	}
54410Sstevel@tonic-gate 
54420Sstevel@tonic-gate 	if (any_succeed)
54430Sstevel@tonic-gate 		return (0);
54440Sstevel@tonic-gate 
54450Sstevel@tonic-gate 	return (save_result);
54460Sstevel@tonic-gate }
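
/*
 * A compiled-out sketch of the success-aggregation rule mirror_dump()
 * uses: the dump succeeds if at least one submirror accepted the data,
 * and the last non-zero result is reported only when every submirror
 * failed. 'write_one' and 'n_targets' are hypothetical stand-ins for
 * the bdev_dump() loop over writeable submirrors.
 */
#if 0
static int
example_any_succeed(int (*write_one)(int), int n_targets)
{
	int	i, result, save_result = 0, any_succeed = 0;

	for (i = 0; i < n_targets; i++) {
		result = write_one(i);
		if (result == 0)
			any_succeed++;
		else
			save_result = result;
	}
	return (any_succeed ? 0 : save_result);
}
#endif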
54470Sstevel@tonic-gate 
54480Sstevel@tonic-gate /*
54490Sstevel@tonic-gate  * NAME: mirror_probe_dev
54500Sstevel@tonic-gate  *
54510Sstevel@tonic-gate  * DESCRIPTION: forcibly opens every component of a mirror.
54520Sstevel@tonic-gate  *
54530Sstevel@tonic-gate  * On entry the unit writerlock is held
54540Sstevel@tonic-gate  */
54550Sstevel@tonic-gate static int
54560Sstevel@tonic-gate mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
54570Sstevel@tonic-gate {
54580Sstevel@tonic-gate 	int		i;
54590Sstevel@tonic-gate 	int		smi;
54600Sstevel@tonic-gate 	int		ci;
54610Sstevel@tonic-gate 	mm_unit_t	*un;
54620Sstevel@tonic-gate 	int		md_devopen = 0;
54630Sstevel@tonic-gate 	set_t		setno;
54640Sstevel@tonic-gate 	int		sm_cnt;
54650Sstevel@tonic-gate 	int		sm_unavail_cnt;
54660Sstevel@tonic-gate 
54670Sstevel@tonic-gate 	if (md_unit_isopen(ui))
54680Sstevel@tonic-gate 		md_devopen++;
54690Sstevel@tonic-gate 
54700Sstevel@tonic-gate 	un = MD_UNIT(mnum);
54710Sstevel@tonic-gate 	setno = MD_UN2SET(un);
54720Sstevel@tonic-gate 
54730Sstevel@tonic-gate 	sm_cnt = 0;
54740Sstevel@tonic-gate 	sm_unavail_cnt = 0;
54750Sstevel@tonic-gate 	for (i = 0; i < NMIRROR; i++) {
54760Sstevel@tonic-gate 		md_dev64_t tmpdev;
54770Sstevel@tonic-gate 		mdi_unit_t	*sm_ui;
54780Sstevel@tonic-gate 
54790Sstevel@tonic-gate 		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
54800Sstevel@tonic-gate 			continue;
54810Sstevel@tonic-gate 		}
54820Sstevel@tonic-gate 
54830Sstevel@tonic-gate 		sm_cnt++;
54840Sstevel@tonic-gate 		tmpdev = un->un_sm[i].sm_dev;
54850Sstevel@tonic-gate 		(void) md_layered_open(mnum, &tmpdev,
54866901Sjkennedy 		    MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
54870Sstevel@tonic-gate 		un->un_sm[i].sm_dev = tmpdev;
54880Sstevel@tonic-gate 
54890Sstevel@tonic-gate 		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
54900Sstevel@tonic-gate 
54910Sstevel@tonic-gate 		/*
54920Sstevel@tonic-gate 		 * Logic similar to that in mirror_open_all_devs.  We set or
54930Sstevel@tonic-gate 		 * clear the submirror Unavailable bit.
54940Sstevel@tonic-gate 		 */
54950Sstevel@tonic-gate 		(void) md_unit_writerlock(sm_ui);
54960Sstevel@tonic-gate 		if (submirror_unavailable(un, i, 1)) {
54970Sstevel@tonic-gate 			sm_ui->ui_tstate |= MD_INACCESSIBLE;
54980Sstevel@tonic-gate 			sm_unavail_cnt++;
54990Sstevel@tonic-gate 		} else {
55000Sstevel@tonic-gate 			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
55010Sstevel@tonic-gate 		}
55020Sstevel@tonic-gate 		md_unit_writerexit(sm_ui);
55030Sstevel@tonic-gate 	}
55040Sstevel@tonic-gate 
55050Sstevel@tonic-gate 	/*
55060Sstevel@tonic-gate 	 * If all of the submirrors are unavailable, the mirror is also
55070Sstevel@tonic-gate 	 * unavailable.
55080Sstevel@tonic-gate 	 */
55090Sstevel@tonic-gate 	if (sm_cnt == sm_unavail_cnt) {
55100Sstevel@tonic-gate 		ui->ui_tstate |= MD_INACCESSIBLE;
55110Sstevel@tonic-gate 	} else {
55120Sstevel@tonic-gate 		ui->ui_tstate &= ~MD_INACCESSIBLE;
55130Sstevel@tonic-gate 	}
55140Sstevel@tonic-gate 
55150Sstevel@tonic-gate 	/*
55160Sstevel@tonic-gate 	 * Start checking for probe failures. If failures occur we
55170Sstevel@tonic-gate 	 * set the appropriate erred state only if the metadevice is in
55180Sstevel@tonic-gate 	 * use. This is specifically to prevent unnecessary resyncs.
55190Sstevel@tonic-gate 	 * For instance if the disks were accidentally disconnected when
55200Sstevel@tonic-gate 	 * the system booted up then until the metadevice is accessed
55210Sstevel@tonic-gate 	 * (like file system mount) the user can shutdown, recable and
55220Sstevel@tonic-gate 	 * reboot w/o incurring a potentially huge resync.
55230Sstevel@tonic-gate 	 */
55240Sstevel@tonic-gate 
55250Sstevel@tonic-gate 	smi = 0;
55260Sstevel@tonic-gate 	ci = 0;
55270Sstevel@tonic-gate 	while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
55280Sstevel@tonic-gate 
55290Sstevel@tonic-gate 		if (mirror_other_sources(un, smi, ci, 0) == 1) {
55300Sstevel@tonic-gate 			/*
55310Sstevel@tonic-gate 			 * Note that for a MN set, there is no need to call
55320Sstevel@tonic-gate 			 * SE_NOTIFY as that is done when processing the
55330Sstevel@tonic-gate 			 * state change
55340Sstevel@tonic-gate 			 */
55350Sstevel@tonic-gate 			if (md_devopen) {
55360Sstevel@tonic-gate 				/*
55370Sstevel@tonic-gate 				 * Never called from ioctl context,
55380Sstevel@tonic-gate 				 * so (IOLOCK *)NULL
55390Sstevel@tonic-gate 				 */
55400Sstevel@tonic-gate 				set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
55410Sstevel@tonic-gate 				    0, MD_STATE_XMIT, (IOLOCK *)NULL);
55420Sstevel@tonic-gate 				if (!MD_MNSET_SETNO(setno)) {
55430Sstevel@tonic-gate 					SE_NOTIFY(EC_SVM_STATE,
55440Sstevel@tonic-gate 					    ESC_SVM_LASTERRED,
55450Sstevel@tonic-gate 					    SVM_TAG_METADEVICE, setno,
55460Sstevel@tonic-gate 					    MD_SID(un));
55470Sstevel@tonic-gate 				}
55480Sstevel@tonic-gate 				continue;
55490Sstevel@tonic-gate 			} else {
55500Sstevel@tonic-gate 				(void) mirror_close_all_devs(un,
55510Sstevel@tonic-gate 				    MD_OFLG_PROBEDEV);
55520Sstevel@tonic-gate 				if (!MD_MNSET_SETNO(setno)) {
55530Sstevel@tonic-gate 					SE_NOTIFY(EC_SVM_STATE,
55540Sstevel@tonic-gate 					    ESC_SVM_OPEN_FAIL,
55550Sstevel@tonic-gate 					    SVM_TAG_METADEVICE, setno,
55560Sstevel@tonic-gate 					    MD_SID(un));
55570Sstevel@tonic-gate 				}
55580Sstevel@tonic-gate 				mirror_openfail_console_info(un, smi, ci);
55590Sstevel@tonic-gate 				return (ENXIO);
55600Sstevel@tonic-gate 			}
55610Sstevel@tonic-gate 		}
55620Sstevel@tonic-gate 
55630Sstevel@tonic-gate 		/*
55640Sstevel@tonic-gate 		 * Note that for a MN set, there is no need to call
55650Sstevel@tonic-gate 		 * SE_NOTIFY as that is done when processing the
55660Sstevel@tonic-gate 		 * state change
55670Sstevel@tonic-gate 		 */
55680Sstevel@tonic-gate 		if (md_devopen) {
55690Sstevel@tonic-gate 			/* Never called from ioctl context, so (IOLOCK *)NULL */
55700Sstevel@tonic-gate 			set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
55710Sstevel@tonic-gate 			    MD_STATE_XMIT, (IOLOCK *)NULL);
55720Sstevel@tonic-gate 			if (!MD_MNSET_SETNO(setno)) {
55730Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
55740Sstevel@tonic-gate 				    SVM_TAG_METADEVICE, setno,
55750Sstevel@tonic-gate 				    MD_SID(un));
55760Sstevel@tonic-gate 			}
55770Sstevel@tonic-gate 		}
55780Sstevel@tonic-gate 		mirror_openfail_console_info(un, smi, ci);
55790Sstevel@tonic-gate 		ci++;
55800Sstevel@tonic-gate 	}
55810Sstevel@tonic-gate 
55820Sstevel@tonic-gate 	if (MD_MNSET_SETNO(setno)) {
55830Sstevel@tonic-gate 		send_poke_hotspares(setno);
55840Sstevel@tonic-gate 	} else {
55850Sstevel@tonic-gate 		(void) poke_hotspares();
55860Sstevel@tonic-gate 	}
55870Sstevel@tonic-gate 	(void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
55880Sstevel@tonic-gate 
55890Sstevel@tonic-gate 	return (0);
55900Sstevel@tonic-gate }
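
/*
 * Editorial summary of the probe error handling above: for each errored
 * component found by mirror_geterror(), mirror_other_sources() decides
 * whether any other submirror can still supply the data.  If none can,
 * the component goes to CS_LAST_ERRED (or, when the metadevice has never
 * been opened, the probe fails outright with ENXIO); otherwise it goes
 * to CS_ERRED and the scan continues.  Hot spares are poked afterwards
 * so replacement can begin for any newly erred components.
 */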
55910Sstevel@tonic-gate 
55920Sstevel@tonic-gate 
55930Sstevel@tonic-gate static int
55940Sstevel@tonic-gate mirror_imp_set(
55950Sstevel@tonic-gate 	set_t	setno
55960Sstevel@tonic-gate )
55970Sstevel@tonic-gate {
55990Sstevel@tonic-gate 	mddb_recid_t	recid;
56000Sstevel@tonic-gate 	int		gotsomething, i;
56010Sstevel@tonic-gate 	mddb_type_t	typ1;
56020Sstevel@tonic-gate 	mddb_de_ic_t	*dep;
56030Sstevel@tonic-gate 	mddb_rb32_t	*rbp;
56040Sstevel@tonic-gate 	mm_unit32_od_t	*un32;
56050Sstevel@tonic-gate 	mm_unit_t	*un64;
56061623Stw21770 	md_dev64_t	self_devt;
56070Sstevel@tonic-gate 	minor_t		*self_id;	/* minor needs to be updated */
56080Sstevel@tonic-gate 	md_parent_t	*parent_id;	/* parent needs to be updated */
56090Sstevel@tonic-gate 	mddb_recid_t	*record_id;	/* record id needs to be updated */
56100Sstevel@tonic-gate 	mddb_recid_t	*optrec_id;
56110Sstevel@tonic-gate 	md_dev64_t	tmpdev;
56120Sstevel@tonic-gate 
56140Sstevel@tonic-gate 	gotsomething = 0;
56150Sstevel@tonic-gate 
56160Sstevel@tonic-gate 	typ1 = (mddb_type_t)md_getshared_key(setno,
56170Sstevel@tonic-gate 	    mirror_md_ops.md_driver.md_drivername);
56180Sstevel@tonic-gate 	recid = mddb_makerecid(setno, 0);
56190Sstevel@tonic-gate 
56200Sstevel@tonic-gate 	while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
56210Sstevel@tonic-gate 		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
56220Sstevel@tonic-gate 			continue;
56230Sstevel@tonic-gate 
56240Sstevel@tonic-gate 		dep = mddb_getrecdep(recid);
56250Sstevel@tonic-gate 		rbp = dep->de_rb;
56260Sstevel@tonic-gate 
56271623Stw21770 		switch (rbp->rb_revision) {
56281623Stw21770 		case MDDB_REV_RB:
56291623Stw21770 		case MDDB_REV_RBFN:
56300Sstevel@tonic-gate 			/*
56310Sstevel@tonic-gate 			 * Small device
56320Sstevel@tonic-gate 			 */
56330Sstevel@tonic-gate 			un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
56340Sstevel@tonic-gate 			self_id = &(un32->c.un_self_id);
56350Sstevel@tonic-gate 			parent_id = &(un32->c.un_parent);
56360Sstevel@tonic-gate 			record_id = &(un32->c.un_record_id);
56370Sstevel@tonic-gate 			optrec_id = &(un32->un_rr_dirty_recid);
56380Sstevel@tonic-gate 
56390Sstevel@tonic-gate 			for (i = 0; i < un32->un_nsm; i++) {
56406901Sjkennedy 				tmpdev = md_expldev(un32->un_sm[i].sm_dev);
56416901Sjkennedy 				un32->un_sm[i].sm_dev = md_cmpldev
56426901Sjkennedy 				    (md_makedevice(md_major, MD_MKMIN(setno,
56436901Sjkennedy 				    MD_MIN2UNIT(md_getminor(tmpdev)))));
56446901Sjkennedy 
56456901Sjkennedy 				if (!md_update_minor(setno, mddb_getsidenum
56466901Sjkennedy 				    (setno), un32->un_sm[i].sm_key))
56470Sstevel@tonic-gate 					goto out;
56480Sstevel@tonic-gate 			}
56491623Stw21770 			break;
56501623Stw21770 		case MDDB_REV_RB64:
56511623Stw21770 		case MDDB_REV_RB64FN:
56520Sstevel@tonic-gate 			un64 = (mm_unit_t *)mddb_getrecaddr(recid);
56530Sstevel@tonic-gate 			self_id = &(un64->c.un_self_id);
56540Sstevel@tonic-gate 			parent_id = &(un64->c.un_parent);
56550Sstevel@tonic-gate 			record_id = &(un64->c.un_record_id);
56560Sstevel@tonic-gate 			optrec_id = &(un64->un_rr_dirty_recid);
56570Sstevel@tonic-gate 
56580Sstevel@tonic-gate 			for (i = 0; i < un64->un_nsm; i++) {
56596901Sjkennedy 				tmpdev = un64->un_sm[i].sm_dev;
56606901Sjkennedy 				un64->un_sm[i].sm_dev = md_makedevice
56616901Sjkennedy 				    (md_major, MD_MKMIN(setno, MD_MIN2UNIT
56626901Sjkennedy 				    (md_getminor(tmpdev))));
56636901Sjkennedy 
56646901Sjkennedy 				if (!md_update_minor(setno, mddb_getsidenum
56656901Sjkennedy 				    (setno), un64->un_sm[i].sm_key))
56660Sstevel@tonic-gate 					goto out;
56670Sstevel@tonic-gate 			}
56681623Stw21770 			break;
56691623Stw21770 		}
56701623Stw21770 
56711623Stw21770 		/*
56721623Stw21770 		 * If this is a top level and a friendly name metadevice,
56731623Stw21770 		 * update its minor in the namespace.
56741623Stw21770 		 */
56751623Stw21770 		if ((*parent_id == MD_NO_PARENT) &&
56761623Stw21770 		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
56771623Stw21770 		    (rbp->rb_revision == MDDB_REV_RB64FN))) {
56781623Stw21770 
56791623Stw21770 			self_devt = md_makedevice(md_major, *self_id);
56801623Stw21770 			if (!md_update_top_device_minor(setno,
56811623Stw21770 			    mddb_getsidenum(setno), self_devt))
56821623Stw21770 				goto out;
56830Sstevel@tonic-gate 		}
56840Sstevel@tonic-gate 
56850Sstevel@tonic-gate 		/*
56860Sstevel@tonic-gate 		 * Update unit with the imported setno.
56880Sstevel@tonic-gate 		 */
56890Sstevel@tonic-gate 		mddb_setrecprivate(recid, MD_PRV_GOTIT);
56900Sstevel@tonic-gate 
56910Sstevel@tonic-gate 		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
56920Sstevel@tonic-gate 		if (*parent_id != MD_NO_PARENT)
56930Sstevel@tonic-gate 			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
56940Sstevel@tonic-gate 		*record_id = MAKERECID(setno, DBID(*record_id));
56950Sstevel@tonic-gate 		*optrec_id = MAKERECID(setno, DBID(*optrec_id));
56960Sstevel@tonic-gate 
56970Sstevel@tonic-gate 		gotsomething = 1;
56980Sstevel@tonic-gate 	}
56990Sstevel@tonic-gate 
57000Sstevel@tonic-gate out:
57010Sstevel@tonic-gate 	return (gotsomething);
57020Sstevel@tonic-gate }
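
/*
 * Editorial sketch (not part of the driver): the minor re-basing that
 * mirror_imp_set() performs on each submirror device.  This assumes, as
 * the code above implies, that MD_MKMIN() packs the set number into the
 * high bits of the minor and MD_MIN2UNIT() recovers the per-set unit
 * number, so that on import only the set field changes and the unit
 * number is preserved.
 */
#if 0
static md_dev64_t
example_rebase_dev(md_dev64_t olddev, set_t newset)
{
	/* Extract the unit number, ignoring whichever set it came from. */
	minor_t	unit = MD_MIN2UNIT(md_getminor(olddev));

	/* Rebuild the dev_t for the same unit under the imported set. */
	return (md_makedevice(md_major, MD_MKMIN(newset, unit)));
}
#endif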
57030Sstevel@tonic-gate 
57040Sstevel@tonic-gate /*
57050Sstevel@tonic-gate  * NAME: mirror_check_offline
57060Sstevel@tonic-gate  *
57070Sstevel@tonic-gate  * DESCRIPTION: return offline_status = 1 if any submirrors are offline
57080Sstevel@tonic-gate  *
57090Sstevel@tonic-gate  * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
57100Sstevel@tonic-gate  * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
57110Sstevel@tonic-gate  * ioctl.
57120Sstevel@tonic-gate  */
57130Sstevel@tonic-gate int
57140Sstevel@tonic-gate mirror_check_offline(md_dev64_t dev, int *offline_status)
57150Sstevel@tonic-gate {
57160Sstevel@tonic-gate 	mm_unit_t		*un;
57170Sstevel@tonic-gate 	md_error_t		mde = mdnullerror;
57180Sstevel@tonic-gate 
57190Sstevel@tonic-gate 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
57200Sstevel@tonic-gate 		return (EINVAL);
57210Sstevel@tonic-gate 	*offline_status = 0;
57220Sstevel@tonic-gate 	if (un->c.un_status & MD_UN_OFFLINE_SM)
57230Sstevel@tonic-gate 		*offline_status = 1;
57240Sstevel@tonic-gate 	return (0);
57250Sstevel@tonic-gate }
57260Sstevel@tonic-gate 
57270Sstevel@tonic-gate /*
57280Sstevel@tonic-gate  * NAME: mirror_inc_abr_count
57290Sstevel@tonic-gate  *
57300Sstevel@tonic-gate  * DESCRIPTION: increment the count of layered soft parts with ABR set
57310Sstevel@tonic-gate  *
57320Sstevel@tonic-gate  * Called from ioctl, so access to un_abr_count is protected by the global
57330Sstevel@tonic-gate  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
57340Sstevel@tonic-gate  */
57350Sstevel@tonic-gate int
57360Sstevel@tonic-gate mirror_inc_abr_count(md_dev64_t dev)
57370Sstevel@tonic-gate {
57380Sstevel@tonic-gate 	mm_unit_t		*un;
57390Sstevel@tonic-gate 	md_error_t		mde = mdnullerror;
57400Sstevel@tonic-gate 
57410Sstevel@tonic-gate 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
57420Sstevel@tonic-gate 		return (EINVAL);
57430Sstevel@tonic-gate 	un->un_abr_count++;
57440Sstevel@tonic-gate 	return (0);
57450Sstevel@tonic-gate }
57460Sstevel@tonic-gate 
57470Sstevel@tonic-gate /*
57480Sstevel@tonic-gate  * NAME: mirror_dec_abr_count
57490Sstevel@tonic-gate  *
57500Sstevel@tonic-gate  * DESCRIPTION: decrement the count of layered soft parts with ABR set
57510Sstevel@tonic-gate  *
57520Sstevel@tonic-gate  * Called from ioctl, so access to un_abr_count is protected by the global
57530Sstevel@tonic-gate  * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
57540Sstevel@tonic-gate  */
57550Sstevel@tonic-gate int
57560Sstevel@tonic-gate mirror_dec_abr_count(md_dev64_t dev)
57570Sstevel@tonic-gate {
57580Sstevel@tonic-gate 	mm_unit_t		*un;
57590Sstevel@tonic-gate 	md_error_t		mde = mdnullerror;
57600Sstevel@tonic-gate 
57610Sstevel@tonic-gate 	if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
57620Sstevel@tonic-gate 		return (EINVAL);
57630Sstevel@tonic-gate 	un->un_abr_count--;
57640Sstevel@tonic-gate 	return (0);
57650Sstevel@tonic-gate }
57660Sstevel@tonic-gate 
57670Sstevel@tonic-gate static md_named_services_t mirror_named_services[] = {
57680Sstevel@tonic-gate 	{(intptr_t (*)()) poke_hotspares,		"poke hotspares"    },
57690Sstevel@tonic-gate 	{(intptr_t (*)()) mirror_rename_listkids,	MDRNM_LIST_URKIDS   },
57700Sstevel@tonic-gate 	{mirror_rename_check,				MDRNM_CHECK	    },
57710Sstevel@tonic-gate 	{(intptr_t (*)()) mirror_renexch_update_kids,	MDRNM_UPDATE_KIDS   },
57720Sstevel@tonic-gate 	{(intptr_t (*)()) mirror_exchange_parent_update_to,
57730Sstevel@tonic-gate 			MDRNM_PARENT_UPDATE_TO},
57740Sstevel@tonic-gate 	{(intptr_t (*)()) mirror_exchange_self_update_from_down,
57750Sstevel@tonic-gate 			MDRNM_SELF_UPDATE_FROM_DOWN },
57760Sstevel@tonic-gate 	{(intptr_t (*)())mirror_probe_dev,		"probe open test" },
57770Sstevel@tonic-gate 	{(intptr_t (*)())mirror_check_offline,		MD_CHECK_OFFLINE },
57780Sstevel@tonic-gate 	{(intptr_t (*)())mirror_inc_abr_count,		MD_INC_ABR_COUNT },
57790Sstevel@tonic-gate 	{(intptr_t (*)())mirror_dec_abr_count,		MD_DEC_ABR_COUNT },
57800Sstevel@tonic-gate 	{ NULL,						0		    }
57810Sstevel@tonic-gate };
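
/*
 * Editorial sketch (not part of the driver): how a layered consumer such
 * as a soft partition might resolve and call one of the named services
 * in the table above.  md_get_named_service() and its argument order
 * follow its use elsewhere in md; treat the exact signature here as an
 * assumption.
 */
#if 0
static int
example_query_offline(md_dev64_t mirror_dev)
{
	intptr_t	(*funcp)();
	int		offline = 0;

	/* Look up the MD_CHECK_OFFLINE service exported by the table. */
	funcp = md_get_named_service(mirror_dev, 0, MD_CHECK_OFFLINE, 0);
	if (funcp != NULL)
		(void) (*funcp)(mirror_dev, &offline);

	/* 1 if any submirror is offline, else 0 (see mirror_check_offline) */
	return (offline);
}
#endif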
57820Sstevel@tonic-gate 
57830Sstevel@tonic-gate md_ops_t mirror_md_ops = {
57840Sstevel@tonic-gate 	mirror_open,		/* open */
57850Sstevel@tonic-gate 	mirror_close,		/* close */
57860Sstevel@tonic-gate 	md_mirror_strategy,	/* strategy */
57870Sstevel@tonic-gate 	NULL,			/* print */
57880Sstevel@tonic-gate 	mirror_dump,		/* dump */
57890Sstevel@tonic-gate 	NULL,			/* read */
57900Sstevel@tonic-gate 	NULL,			/* write */
57910Sstevel@tonic-gate 	md_mirror_ioctl,	/* ioctl */
57920Sstevel@tonic-gate 	mirror_snarf,		/* snarf */
57930Sstevel@tonic-gate 	mirror_halt,		/* halt */
57940Sstevel@tonic-gate 	NULL,			/* aread */
57950Sstevel@tonic-gate 	NULL,			/* awrite */
57960Sstevel@tonic-gate 	mirror_imp_set,		/* import set */
57970Sstevel@tonic-gate 	mirror_named_services
57980Sstevel@tonic-gate };
57990Sstevel@tonic-gate 
58000Sstevel@tonic-gate /* module specific initialization */
58010Sstevel@tonic-gate static void
58020Sstevel@tonic-gate init_init()
58030Sstevel@tonic-gate {
58040Sstevel@tonic-gate 	md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
58050Sstevel@tonic-gate 
58060Sstevel@tonic-gate 	/* Initialize the parent and child save memory pools */
58070Sstevel@tonic-gate 	mirror_parent_cache = kmem_cache_create("md_mirror_parent",
58080Sstevel@tonic-gate 	    sizeof (md_mps_t), 0, mirror_parent_constructor,
58090Sstevel@tonic-gate 	    mirror_parent_destructor, mirror_run_queue, NULL, NULL,
58100Sstevel@tonic-gate 	    0);
58110Sstevel@tonic-gate 
58120Sstevel@tonic-gate 	mirror_child_cache = kmem_cache_create("md_mirror_child",
58130Sstevel@tonic-gate 	    sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
58140Sstevel@tonic-gate 	    mirror_child_constructor, mirror_child_destructor,
58150Sstevel@tonic-gate 	    mirror_run_queue, NULL, NULL, 0);
58160Sstevel@tonic-gate 
58170Sstevel@tonic-gate 	/*
58180Sstevel@tonic-gate 	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
58190Sstevel@tonic-gate 	 * then initialize wowbuf memory pool.
58200Sstevel@tonic-gate 	 */
58210Sstevel@tonic-gate 	md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
58220Sstevel@tonic-gate 	if (md_wowbuf_size <= 0)
58230Sstevel@tonic-gate 		md_wowbuf_size = 2 * DEV_BSIZE;
58240Sstevel@tonic-gate 	if (md_wowbuf_size > (32 * DEV_BSIZE))
58250Sstevel@tonic-gate 		md_wowbuf_size = (32 * DEV_BSIZE);
58260Sstevel@tonic-gate 
58270Sstevel@tonic-gate 	md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
58280Sstevel@tonic-gate 	mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
58290Sstevel@tonic-gate 	    md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
58300Sstevel@tonic-gate 
58310Sstevel@tonic-gate 	mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
58320Sstevel@tonic-gate 	mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
58330Sstevel@tonic-gate 
58340Sstevel@tonic-gate 	mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
58350Sstevel@tonic-gate }
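
/*
 * Worked example of the wowbuf sizing in init_init() (DEV_BSIZE is 512):
 * a tuned md_wowbuf_size of 1000 rounds up to 1024 (two blocks); zero or
 * a negative value falls back to 2 * DEV_BSIZE = 1024; anything above
 * 32 * DEV_BSIZE is clamped to 16384.  md_wowblk_size then adds room for
 * the wowhdr_t header in front of each buffer.
 */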
58360Sstevel@tonic-gate 
58370Sstevel@tonic-gate /* module specific uninitialization (undo init_init()) */
58380Sstevel@tonic-gate static void
58390Sstevel@tonic-gate fini_uninit()
58400Sstevel@tonic-gate {
58410Sstevel@tonic-gate 	kmem_cache_destroy(mirror_parent_cache);
58420Sstevel@tonic-gate 	kmem_cache_destroy(mirror_child_cache);
58430Sstevel@tonic-gate 	kmem_cache_destroy(mirror_wowblk_cache);
58440Sstevel@tonic-gate 	mirror_parent_cache = mirror_child_cache =
58450Sstevel@tonic-gate 	    mirror_wowblk_cache = NULL;
58460Sstevel@tonic-gate 
58470Sstevel@tonic-gate 	mutex_destroy(&mirror_timeout.dr_mx);
58480Sstevel@tonic-gate 	mutex_destroy(&hotspare_request.dr_mx);
58490Sstevel@tonic-gate 	mutex_destroy(&non_ff_drv_mutex);
58500Sstevel@tonic-gate }
58510Sstevel@tonic-gate 
58520Sstevel@tonic-gate /* define the module linkage */
58534932Spetede MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())