/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/dklabel.h>
#include <vm/hat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_mddb.h>
#include <sys/esunddi.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/avl.h>

md_ops_t		mirror_md_ops;
#ifndef	lint
char			_depends_on[] = "drv/md";
md_ops_t		*md_interface_ops = &mirror_md_ops;
#endif

extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_mstr_daemon;
extern mdq_anchor_t	md_mirror_daemon;
extern mdq_anchor_t	md_mirror_io_daemon;
extern mdq_anchor_t	md_mirror_rs_daemon;
extern mdq_anchor_t	md_mhs_daemon;

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern int		md_status;
extern clock_t		md_hz;

extern md_krwlock_t	md_unit_array_rw;
extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;
extern int		md_mtioctl_cnt;

daemon_request_t	mirror_timeout;
static daemon_request_t	hotspare_request;
static daemon_request_t	mn_hs_request[MD_MAXSETS];	/* Multinode hs req */

int	md_mirror_mcs_buf_off;

/* Flags for mdmn_ksend_message to allow debugging */
int	md_mirror_msg_flags;

#ifdef DEBUG
/* Flag to switch on debug messages */
int	mirror_debug_flag = 0;
#endif

/*
 * Struct used to hold the count of DMR reads and the timestamp of the last
 * DMR read. It is used to verify, using a debugger, that the DMR read ioctl
 * has been executed.
 */
dmr_stats_t	mirror_dmr_stats = {0, 0};

/*
 * Mutex protecting list of non-failfast drivers.
 */
static kmutex_t	non_ff_drv_mutex;
extern char	**non_ff_drivers;

extern major_t	md_major;

/*
 * Write-On-Write memory pool.
 */
static void		copy_write_cont(wowhdr_t *wowhdr);
static kmem_cache_t	*mirror_wowblk_cache = NULL;
static int		md_wowbuf_size = 16384;
static size_t		md_wowblk_size;

/*
 * md_mirror_wow_flg is a flag that allows:
 * - disabling the write-on-write mechanism.
 * - logging occurrences of write-on-write.
 * - switching write-on-write handling procedure processing.
 * md_mirror_wow_cnt is a counter for occurrences of write-on-write.
 */
static uint_t	md_mirror_wow_flg = 0;
static int	md_mirror_wow_cnt = 0;

/*
 * Tunable to enable/disable dirty region
 * processing when closing down a mirror.
 */
static int	new_resync = 1;
kmem_cache_t	*mirror_parent_cache = NULL;
kmem_cache_t	*mirror_child_cache = NULL;

extern int	md_ff_disable;		/* disable failfast */

static int	mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
static void	mirror_read_strategy(buf_t *, int, void *);
static void	mirror_write_strategy(buf_t *, int, void *);
static void	become_owner(daemon_queue_t *);
static int	mirror_done(struct buf *cb);
static int	mirror_done_common(struct buf *cb);
static void	clear_retry_error(struct buf *cb);

/*
 * patchables
 */
int	md_min_rr_size = 200;	/* 2000 blocks, or 100k */
int	md_def_num_rr = 1000;	/* Default number of dirty regions */

/*
 * patchable to change delay before rescheduling mirror ownership request.
 * Value is clock ticks, default 0.5 seconds
 */
clock_t	md_mirror_owner_to = 500000;

/*ARGSUSED1*/
static int
mirror_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
mirror_parent_init(md_mps_t *ps)
{
	bzero(ps, offsetof(md_mps_t, ps_mx));
	bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}

/*ARGSUSED1*/
static void
mirror_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_mps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
mirror_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_mcs_t *)p)->cs_buf);
	return (0);
}

void
mirror_child_init(md_mcs_t *cs)
{
	cs->cs_ps = NULL;
	cs->cs_mdunit = 0;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
mirror_child_destructor(void *p, void *d)
{
	biofini(&((md_mcs_t *)p)->cs_buf);
}

static void
mirror_wowblk_init(wowhdr_t *p)
{
	bzero(p, md_wowblk_size);
}

static void
send_poke_hotspares_msg(daemon_request_t *drq)
{
	int			rval;
	int			nretries = 0;
	md_mn_msg_pokehsp_t	pokehsp;
	md_mn_kresult_t		*kresult;
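	/*
	 * dq.qlen is overloaded by send_poke_hotspares() to carry the
	 * set number for this request; see send_poke_hotspares() below.
	 */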
	set_t			setno = (set_t)drq->dq.qlen;

	pokehsp.pokehsp_setno = setno;

	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

retry_sphmsg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
	    MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
	    sizeof (pokehsp), kresult);

	if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
		mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
		/* If we're shutting down already, pause things here. */
		if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd has become reachable again, so retry once.
			 * If this fails we'll panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto retry_sphmsg;
		}
		cmn_err(CE_PANIC,
		    "ksend_message failure: POKE_HOTSPARES");
	}
	kmem_free(kresult, sizeof (md_mn_kresult_t));

	/* Allow further requests to use this set's queue structure */
	mutex_enter(&drq->dr_mx);
	drq->dr_pending = 0;
	mutex_exit(&drq->dr_mx);
}

/*
 * Send a poke_hotspares message to the master node. To avoid swamping the
 * commd handler with requests we only send a message if there is not one
 * already outstanding. We punt the request to a separate thread context as
 * we cannot afford to block waiting on the request to be serviced. This is
 * essential when a reconfig cycle is in progress as any open() of a multinode
 * metadevice may result in a livelock.
 */
static void
send_poke_hotspares(set_t setno)
{
	daemon_request_t	*drq = &mn_hs_request[setno];

	mutex_enter(&drq->dr_mx);
	if (drq->dr_pending == 0) {
		drq->dr_pending = 1;
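		/* Pass the set number to the daemon via the dq.qlen field */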
		drq->dq.qlen = (int)setno;
		daemon_request(&md_mhs_daemon,
		    send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
	}
	mutex_exit(&drq->dr_mx);
}

void
mirror_set_sm_state(
	mm_submirror_t		*sm,
	mm_submirror_ic_t	*smic,
	sm_state_t		newstate,
	int			force)
{
	int			compcnt;
	int			i;
	int			errcnt;
	sm_state_t		origstate;
	md_m_shared_t		*shared;

	if (force) {
		sm->sm_state = newstate;
		uniqtime32(&sm->sm_timestamp);
		return;
	}

	origstate = newstate;

	compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
	for (i = 0, errcnt = 0; i < compcnt; i++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, i);
		if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
			newstate |= SMS_COMP_ERRED;
		if (shared->ms_state & (CS_RESYNC))
			newstate |= SMS_COMP_RESYNC;
		if (shared->ms_state & CS_ERRED)
			errcnt++;
	}

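	/*
	 * If the scan found errored or resyncing components, keep only the
	 * component-derived state bits and drop the state that was
	 * originally requested by the caller.
	 */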
	if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
		newstate &= ~origstate;

	if (errcnt == compcnt)
		newstate |= SMS_ALL_ERRED;
	else
		newstate &= ~SMS_ALL_ERRED;

	sm->sm_state = newstate;
	uniqtime32(&sm->sm_timestamp);
}

static int
mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
	int frm_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			i;
	int			compcnt;
	int			open_comp; /* flag for open component */

	for (i = *smi; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = *cip; ci < compcnt; ci++) {
			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);
			/*
			 * If called from any routine but probe, we check for
			 * the MDM_S_ISOPEN flag. Since probe does a pseudo
			 * open, it sets the MDM_S_PROBEOPEN flag and we test
			 * for that flag instead. The two tests are mutually
			 * exclusive.
			 */
			open_comp = (frm_probe) ?
			    (shared->ms_flags & MDM_S_PROBEOPEN):
			    (shared->ms_flags & MDM_S_ISOPEN);
			if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
			    ((shared->ms_state == CS_OKAY) ||
			    (shared->ms_state == CS_RESYNC))) ||
			    (!open_comp &&
			    (shared->ms_state == CS_LAST_ERRED))) {
				if (clr_error) {
					shared->ms_flags &= ~MDM_S_IOERR;
				}
				*cip = ci;
				*smi = i;
				return (1);
			}

			if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
				shared->ms_flags &= ~MDM_S_IOERR;
			}
		}

		*cip = 0;
	}
	return (0);
}

/*ARGSUSED*/
static void
mirror_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}
/*
 * check_comp_4_hotspares
 *
 * This function attempts to allocate a hotspare for this component if the
 * component is in error. In a MN set, the function can be called in 2 modes.
 * It can be called either when a component error has been detected or when a
 * new hotspare has been allocated. In either case, MD_HOTSPARE_XMIT is set
 * in flags and the request is sent to all nodes.
 * The handler on each of the nodes then calls this function with
 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
 *
 * For non-MN sets the function simply attempts to allocate a hotspare.
 *
 * On entry, the following locks are held
 *	mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
 *	md_unit_writerlock
 *
 * Returns	0 if ok
 *		1 if the unit containing the component has been cleared while
 *		  the mdmn_ksend_message() was being executed
 */
extern int
check_comp_4_hotspares(
	mm_unit_t	*un,
	int		smi,
	int		ci,
	uint_t		flags,
	mddb_recid_t	hs_id,	/* Only used by MN disksets */
	IOLOCK		*lockp	/* can be NULL */
)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	mddb_recid_t		recids[6];
	minor_t			mnum;
	intptr_t		(*hs_dev)();
	void			(*hs_done)();
	void			*hs_data;
	md_error_t		mde = mdnullerror;
	set_t			setno;
	md_mn_msg_allochsp_t	allochspmsg;
	md_mn_kresult_t		*kresult;
	mm_unit_t		*new_un;
	int			rval;
	int			nretries = 0;

	mnum = MD_SID(un);
	setno = MD_UN2SET(un);
	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];
	shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
	    (sm->sm_dev, sm, ci);

	if (shared->ms_state != CS_ERRED)
		return (0);

	/* Don't start a new component resync if a resync is already running */
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
		uint_t		msgflags;
		md_mn_msgtype_t	msgtype;

		/* Send allocate hotspare message to all nodes */

		allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
		allochspmsg.msg_allochsp_sm = smi;
		allochspmsg.msg_allochsp_comp = ci;
		allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;

		/*
		 * Before calling mdmn_ksend_message(), release locks.
		 * Can never be in the context of an ioctl.
		 */
		md_unit_writerexit(MDI_UNIT(mnum));
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_exit(&mirror_md_ops.md_link_rw.lock);
#ifdef DEBUG
		if (mirror_debug_flag)
			printf("send alloc hotspare, flags="
			    "0x%x %x, %x, %x, %x\n", flags,
			    allochspmsg.msg_allochsp_mnum,
			    allochspmsg.msg_allochsp_sm,
			    allochspmsg.msg_allochsp_comp,
			    allochspmsg.msg_allochsp_hs_id);
#endif
		if (flags & MD_HOTSPARE_WMUPDATE) {
			msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2;
			/*
			 * When coming from an update of watermarks, there
			 * must already be a message logged that triggered
			 * this action. So, no need to log this message, too.
			 */
			msgflags = MD_MSGF_NO_LOG;
		} else {
			msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE;
			msgflags = MD_MSGF_DEFAULT_FLAGS;
		}

		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

cc4hs_msg:
		rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
		    (char *)&allochspmsg, sizeof (allochspmsg),
		    kresult);

		if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
#ifdef DEBUG
			if (mirror_debug_flag)
				mdmn_ksend_show_error(rval, kresult,
				    "ALLOCATE HOTSPARE");
#endif
			/*
			 * If the message is sent ok but the exitval indicates
			 * an error, it must be because the mirror has been
			 * cleared. In this case re-obtain the lock and return
			 * an error.
			 */
			if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
				if (flags & MD_HOTSPARE_LINKHELD) {
					rw_enter(&mirror_md_ops.md_link_rw.lock,
					    RW_READER);
				}
				kmem_free(kresult, sizeof (md_mn_kresult_t));
				return (1);
			}
			/* If we're shutting down already, pause things here. */
			if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
				while (!md_mn_is_commd_present()) {
					delay(md_hz);
				}
				/*
				 * commd has become reachable again, so retry
				 * once. If this fails we'll panic as the
				 * system is in an unexpected state.
				 */
				if (nretries++ == 0)
					goto cc4hs_msg;
			}
			cmn_err(CE_PANIC,
			    "ksend_message failure: ALLOCATE_HOTSPARE");
		}
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/*
		 * re-obtain the locks
		 */
		if (flags & MD_HOTSPARE_LINKHELD)
			rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
		new_un = md_unit_writerlock(MDI_UNIT(mnum));

		/*
		 * As we had to release the locks in order to send the
		 * message to all nodes, we need to check to see if the
		 * unit has changed. If it has we release the writerlock
		 * and return fail.
		 */
		if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
			md_unit_writerexit(MDI_UNIT(mnum));
			return (1);
		}
	} else {
		if (MD_MNSET_SETNO(setno)) {
			/*
			 * If 2 or more nodes simultaneously see a
			 * component failure, these nodes will each
			 * send an ALLOCATE_HOTSPARE[2] message.
			 * The first message will allocate the hotspare
			 * and the subsequent messages should do nothing.
			 *
			 * If a slave node doesn't have a hotspare allocated
			 * at the time the message is initiated, then the
			 * passed in hs_id will be 0. If the node
			 * executing this routine has a component shared
			 * ms_hs_id of non-zero, but the message shows a
			 * hs_id of 0, then just return since a hotspare
			 * has already been allocated for this failing
			 * component. When the slave node returns from
			 * the ksend_message the hotspare will have
			 * already been allocated.
			 *
			 * If the slave node does send an hs_id of non-zero,
			 * and the slave node's hs_id matches this node's
			 * ms_hs_id, then the hotspare has error'd and
			 * should be replaced.
			 *
			 * If the slave node sends an hs_id of non-zero and
			 * this node has a different shared ms_hs_id, then
			 * just return since this hotspare has already
			 * been hotspared.
			 */
			if (shared->ms_hs_id != 0) {
				if (hs_id == 0) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
				if (hs_id != shared->ms_hs_id) {
#ifdef DEBUG
					if (mirror_debug_flag) {
						printf("check_comp_4_hotspares"
						    "(NOXMIT), short circuit2 "
						    "hs_id=0x%x, "
						    "ms_hs_id=0x%x\n",
						    hs_id, shared->ms_hs_id);
					}
#endif
					return (0);
				}
			}
		}

		sm = &un->un_sm[smi];
		hs_dev = md_get_named_service(sm->sm_dev, 0,
		    "hotspare device", 0);
		if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
		    &hs_data) != 0)
			return (0);

		/*
		 * set_sm_comp_state() commits the modified records.
		 * As we don't transmit the changes, no need to drop the lock.
		 */
		set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
		    MD_STATE_NO_XMIT, (IOLOCK *)NULL);

		(*hs_done)(sm->sm_dev, hs_data);

		mirror_check_failfast(mnum);

		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));

		/*
		 * For a multi-node set we need to reset the un_rs_type,
		 * un_rs_resync_done and un_rs_resync_2_do fields as the
		 * hot-spare resync must copy all applicable data.
		 */
		if (MD_MNSET_SETNO(setno)) {
			un->un_rs_type = MD_RS_NONE;
			un->un_rs_resync_done = 0;
			un->un_rs_resync_2_do = 0;
		}

		/*
		 * Must drop writer lock since mirror_resync_unit will
		 * open devices and must be able to grab readerlock.
		 * Don't need to drop IOLOCK since any descendent routines
		 * calling ksend_messages will drop the IOLOCK as needed.
		 */
		if (lockp) {
			md_ioctl_writerexit(lockp);
		} else {
			md_unit_writerexit(MDI_UNIT(mnum));
		}

		/* start resync */
		(void) mirror_resync_unit(mnum, NULL, &mde, lockp);

		if (lockp) {
			new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
		} else {
			new_un = md_unit_writerlock(MDI_UNIT(mnum));
		}
	}
	return (0);
}

/*
 * check_unit_4_hotspares
 *
 * For a given mirror, allocate hotspares, if available, for any components
 * that are in error
 *
 * Returns	0 if ok
 *		1 if check_comp_4_hotspares returns non-zero. This will only
 *		  happen for a MN unit where the unit has been cleared while
 *		  the allocate hotspare message is sent to all nodes.
 */
static int
check_unit_4_hotspares(mm_unit_t *un, int flags)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			ci;
	int			i;
	int			compcnt;

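	/* Defer hotspare allocation while a resync is already in progress */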
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (0);

	for (i = 0; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
		for (ci = 0; ci < compcnt; ci++) {
			md_m_shared_t	*shared;

			shared = (md_m_shared_t *)
			    (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
			/*
			 * Never called from ioctl context, so pass in
			 * (IOLOCK *)NULL. Pass through flags from calling
			 * routine, also setting XMIT flag.
			 */
			if (check_comp_4_hotspares(un, i, ci,
			    (MD_HOTSPARE_XMIT | flags),
			    shared->ms_hs_id, (IOLOCK *)NULL) != 0)
				return (1);
		}
	}
	return (0);
}

static void
check_4_hotspares(daemon_request_t *drq)
{
	mdi_unit_t	*ui;
	mm_unit_t	*un;
	md_link_t	*next;
	int		x;

	mutex_enter(&drq->dr_mx);	/* clear up front so can poke */
	drq->dr_pending = 0;		/* again in low level routine if */
	mutex_exit(&drq->dr_mx);	/* something found to do */

	/*
	 * Used to have a problem here. The disksets weren't marked as being
	 * MNHOLD. This opened a window where we could be searching for
	 * hotspares and have the disk set unloaded (released) from under
	 * us causing a panic in stripe_component_count().
	 * The way to prevent that is to mark the set MNHOLD which prevents
	 * any diskset from being released while we are scanning the mirrors,
	 * submirrors and components.
	 */

	for (x = 0; x < md_nsets; x++)
		md_holdset_enter(x);

	rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
	for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
		ui = MDI_UNIT(next->ln_id);

		un = (mm_unit_t *)md_unit_readerlock(ui);

		/*
		 * Only check the unit if we are the master for this set.
		 * For an MN set, poke_hotspares() is only effective on the
		 * master.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
			md_unit_readerexit(ui);
			continue;
		}
		if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		un = (mm_unit_t *)md_unit_writerlock(ui);
		/*
		 * check_unit_4_hotspares will exit 1 if the unit has been
		 * removed during the process of allocating the hotspare.
		 * This can only happen for a MN metadevice. If the unit no
		 * longer exists, there is no need to release the writerlock.
		 */
		if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
			md_unit_writerexit(ui);
		else {
			/*
			 * If check_unit_4_hotspares failed, queue another
			 * request and break out of this one
			 */
			(void) poke_hotspares();
			break;
		}
	}
	rw_exit(&mirror_md_ops.md_link_rw.lock);

	for (x = 0; x < md_nsets; x++)
		md_holdset_exit(x);
}

/*
 * poke_hotspares
 *
 * If there is not already a poke_hotspares request pending, queue a request
 * to call check_4_hotspares(). This will scan all mirrors and attempt to
 * allocate hotspares for all components in error.
 */
int
poke_hotspares()
{
	mutex_enter(&hotspare_request.dr_mx);
	if (hotspare_request.dr_pending == 0) {
		hotspare_request.dr_pending = 1;
		daemon_request(&md_mhs_daemon,
		    check_4_hotspares, (daemon_queue_t *)&hotspare_request,
		    REQ_OLD);
	}
	mutex_exit(&hotspare_request.dr_mx);
	return (0);
}

static void
free_all_ecomps(err_comp_t *ecomp)
{
	err_comp_t	*d;

	while (ecomp != NULL) {
		d = ecomp;
		ecomp = ecomp->ec_next;
		kmem_free(d, sizeof (err_comp_t));
	}
}

/*
 * NAME:	mirror_openfail_console_info
 *
 * DESCRIPTION:	Prints an informative message to the console when a mirror
 *		cannot be opened.
 *
 * PARAMETERS:	mm_unit_t	un - pointer to mirror unit structure
 *		int		smi - submirror index
 *		int		ci - component index
 */

void
mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
{
	void		(*get_dev)();
	ms_cd_info_t	cd;
	md_dev64_t	tmpdev;

	tmpdev = un->un_sm[smi].sm_dev;
	get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
	if (get_dev != NULL) {
		(void) (*get_dev)(tmpdev, smi, ci, &cd);
		cmn_err(CE_WARN, "md %s: open error on %s",
		    md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
		    cd.cd_dev, NULL, 0));
	} else {
		cmn_err(CE_WARN, "md %s: open error",
		    md_shortname(MD_SID(un)));
	}
}

static int
mirror_close_all_devs(mm_unit_t *un, int md_cflags)
{
	int		i;
	md_dev64_t	dev;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		dev = un->un_sm[i].sm_dev;
		md_layered_close(dev, md_cflags);
	}
	return (0);
}

/*
 * Keep track of drivers that don't support failfast. We use this so that
 * we only log one diagnostic message for each of these drivers, no matter
 * how many times we run the mirror_check_failfast function.
 * Return 1 if this is a new driver that does not support failfast,
 * return 0 if we have already seen this non-failfast driver.
 */
static int
new_non_ff_driver(const char *s)
{
	mutex_enter(&non_ff_drv_mutex);
	if (non_ff_drivers == NULL) {
		non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
		    KM_NOSLEEP);
		if (non_ff_drivers == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
		    KM_NOSLEEP);
		if (non_ff_drivers[0] == NULL) {
			kmem_free(non_ff_drivers, 2 * sizeof (char *));
			non_ff_drivers = NULL;
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(non_ff_drivers[0], s);
		non_ff_drivers[1] = NULL;

	} else {
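		/* Search the existing list; append the driver name if new */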
		int	i;
		char	**tnames;
		char	**tmp;

		for (i = 0; non_ff_drivers[i] != NULL; i++) {
			if (strcmp(s, non_ff_drivers[i]) == 0) {
				mutex_exit(&non_ff_drv_mutex);
				return (0);
			}
		}

		/* allow for new element and null */
		i += 2;
		tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
		if (tnames == NULL) {
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		for (i = 0; non_ff_drivers[i] != NULL; i++)
			tnames[i] = non_ff_drivers[i];

		tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
		if (tnames[i] == NULL) {
			/* adjust i so that it is the right count to free */
			kmem_free(tnames, (i + 2) * sizeof (char *));
			mutex_exit(&non_ff_drv_mutex);
			return (1);
		}

		(void) strcpy(tnames[i++], s);
		tnames[i] = NULL;

		tmp = non_ff_drivers;
		non_ff_drivers = tnames;
		/* i now represents the count we previously alloced */
		kmem_free(tmp, i * sizeof (char *));
	}
	mutex_exit(&non_ff_drv_mutex);

	return (1);
}

/*
 * Check for the "ddi-failfast-supported" devtree property on each submirror
 * component to indicate if we should do I/O to that submirror with the
 * B_FAILFAST flag set or not. This check is made at various state transitions
 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we
 * only need to check one drive (e.g. hotspare) but since the check is
 * fast and infrequent and sometimes needs to be done on all components we
 * just check all components on each call.
 */
void
mirror_check_failfast(minor_t mnum)
{
	int		i;
	mm_unit_t	*un;

	if (md_ff_disable)
		return;

	un = MD_UNIT(mnum);

	for (i = 0; i < NMIRROR; i++) {
		int			ci;
		int			cnt;
		int			ff = 1;
		mm_submirror_t		*sm;
		mm_submirror_ic_t	*smic;
		void			(*get_dev)();

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
		    "get device", 0);

		cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
		for (ci = 0; ci < cnt; ci++) {
			int		found = 0;
			dev_t		ci_dev;
			major_t		major;
			dev_info_t	*devi;
			ms_cd_info_t	cd;

			/*
			 * this already returns the hs
			 * dev if the device is spared
			 */
			(void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

			ci_dev = md_dev64_to_dev(cd.cd_dev);
			major = getmajor(ci_dev);

			if (major == md_major) {
				/*
				 * this component must be a soft
				 * partition; get the real dev
				 */
				minor_t		dev_mnum;
				mdi_unit_t	*ui;
				mp_unit_t	*un;
				set_t		setno;
				side_t		side;
				md_dev64_t	tmpdev;

				ui = MDI_UNIT(getminor(ci_dev));

				/* grab necessary lock */
				un = (mp_unit_t *)md_unit_readerlock(ui);

				dev_mnum = MD_SID(un);
				setno = MD_MIN2SET(dev_mnum);
				side = mddb_getsidenum(setno);

				tmpdev = un->un_dev;

				/* Get dev by device id */
				if (md_devid_found(setno, side,
				    un->un_key) == 1) {
					tmpdev = md_resolve_bydevid(dev_mnum,
					    tmpdev, un->un_key);
				}

				md_unit_readerexit(ui);

				ci_dev = md_dev64_to_dev(tmpdev);
				major = getmajor(ci_dev);
			}

			if (ci_dev != NODEV32 &&
			    (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
			    != NULL) {
				ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
				int		propvalue = 0;
				int		proplength = sizeof (int);
				int		error;
				struct cb_ops	*cb;

				if ((cb = devopsp[major]->devo_cb_ops) !=
				    NULL) {
					error = (*cb->cb_prop_op)
					    (DDI_DEV_T_ANY, devi, prop_op,
					    DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
					    "ddi-failfast-supported",
					    (caddr_t)&propvalue, &proplength);

					if (error == DDI_PROP_SUCCESS)
						found = 1;
				}

				if (!found && new_non_ff_driver(
				    ddi_driver_name(devi))) {
					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
					    "disabled on %s",
					    ddi_driver_name(devi));
				}

				ddi_release_devi(devi);
			}

			/*
			 * All components must support
			 * failfast in the submirror.
			 */
			if (!found) {
				ff = 0;
				break;
			}
		}

		if (ff) {
			sm->sm_flags |= MD_SM_FAILFAST;
		} else {
			sm->sm_flags &= ~MD_SM_FAILFAST;
		}
	}
}

/*
 * Return true if the submirror is unavailable.
 * If any of the submirror components are opened then the submirror cannot
 * be unavailable (MD_INACCESSIBLE).
 * If any of the components are already in the errored state, then the
 * submirror cannot be unavailable (MD_INACCESSIBLE).
 */
static bool_t
submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	md_m_shared_t		*shared;
	int			ci;
	int			compcnt;

	sm = &un->un_sm[smi];
	smic = &un->un_smic[smi];

	compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
	for (ci = 0; ci < compcnt; ci++) {
		shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
		    (sm->sm_dev, sm, ci);
		if (from_probe) {
			if (shared->ms_flags & MDM_S_PROBEOPEN)
				return (B_FALSE);
		} else {
			if (shared->ms_flags & MDM_S_ISOPEN)
				return (B_FALSE);
		}
		if (shared->ms_state == CS_ERRED ||
		    shared->ms_state == CS_LAST_ERRED)
			return (B_FALSE);
	}

	return (B_TRUE);
}

static int
mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
{
	int		i;
	mm_unit_t	*un;
	mdi_unit_t	*ui;
	int		err;
	int		smi;
	int		ci;
	err_comp_t	*c;
	err_comp_t	*ecomps = NULL;
	int		smmask = 0;
	set_t		setno;
	int		sm_cnt;
	int		sm_unavail_cnt;

	mirror_check_failfast(mnum);

	un = MD_UNIT(mnum);
	ui = MDI_UNIT(mnum);
	setno = MD_UN2SET(un);

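	/*
	 * Attempt a first open of each in-use submirror and record any
	 * submirror whose open fails in smmask.
	 */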
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;
		if (md_layered_open(mnum, &tmpdev, md_oflags))
			smmask |= SMI2BIT(i);
		un->un_sm[i].sm_dev = tmpdev;
	}

	/*
	 * If smmask is clear, all submirrors are accessible. Clear the
	 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the
	 * mirror device. If smmask is set, we have to determine which of the
	 * submirrors are in error. If no submirror is accessible we mark the
	 * whole mirror as MD_INACCESSIBLE.
	 */
	if (smmask == 0) {
		if (lockp) {
			md_ioctl_readerexit(lockp);
			(void) md_ioctl_writerlock(lockp, ui);
		} else {
			md_unit_readerexit(ui);
			(void) md_unit_writerlock(ui);
		}
		ui->ui_tstate &= ~MD_INACCESSIBLE;
		if (lockp) {
			md_ioctl_writerexit(lockp);
			(void) md_ioctl_readerlock(lockp, ui);
		} else {
			md_unit_writerexit(ui);
			(void) md_unit_readerlock(ui);
		}

		for (i = 0; i < NMIRROR; i++) {
			md_dev64_t	tmpdev;
			mdi_unit_t	*sm_ui;

			if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
				continue;

			tmpdev = un->un_sm[i].sm_dev;
			sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
			(void) md_unit_writerlock(sm_ui);
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
			md_unit_writerexit(sm_ui);
		}

		return (0);
	}

	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev;

		if (!(smmask & SMI2BIT(i)))
			continue;

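		/*
		 * Retry the open with MD_OFLG_CONT_ERRS; this reopen is
		 * expected to succeed even on errored components (hence
		 * the ASSERT below).
		 */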
11940Sstevel@tonic-gate tmpdev = un->un_sm[i].sm_dev;
11950Sstevel@tonic-gate err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
11960Sstevel@tonic-gate un->un_sm[i].sm_dev = tmpdev;
11970Sstevel@tonic-gate ASSERT(err == 0);
11980Sstevel@tonic-gate }
11990Sstevel@tonic-gate
12000Sstevel@tonic-gate if (lockp) {
12010Sstevel@tonic-gate md_ioctl_readerexit(lockp);
12020Sstevel@tonic-gate un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
12030Sstevel@tonic-gate } else {
12040Sstevel@tonic-gate md_unit_readerexit(ui);
12050Sstevel@tonic-gate un = (mm_unit_t *)md_unit_writerlock(ui);
12060Sstevel@tonic-gate }
12070Sstevel@tonic-gate
	/*
	 * We want to make sure the unavailable flag is not masking a real
	 * error on the submirror.
	 * For each submirror: if all of the submirror components could not
	 * be opened and there are no errors on the submirror, set the
	 * unavailable flag; otherwise, clear it.
	 */
	sm_cnt = 0;
	sm_unavail_cnt = 0;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t	tmpdev;
		mdi_unit_t	*sm_ui;

		if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
			continue;

		sm_cnt++;
		tmpdev = un->un_sm[i].sm_dev;
		sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));

		(void) md_unit_writerlock(sm_ui);
		if (submirror_unavailable(un, i, 0)) {
			sm_ui->ui_tstate |= MD_INACCESSIBLE;
			sm_unavail_cnt++;
		} else {
			sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
		}
		md_unit_writerexit(sm_ui);
	}

	/*
	 * If all of the submirrors are unavailable, the mirror is also
	 * unavailable.
	 */
	if (sm_cnt == sm_unavail_cnt) {
		ui->ui_tstate |= MD_INACCESSIBLE;
	} else {
		ui->ui_tstate &= ~MD_INACCESSIBLE;
	}

	smi = 0;
	ci = 0;
	while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
		if (mirror_other_sources(un, smi, ci, 1) == 1) {

			free_all_ecomps(ecomps);
			(void) mirror_close_all_devs(un, md_oflags);
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
			mirror_openfail_console_info(un, smi, ci);
			if (lockp) {
				md_ioctl_writerexit(lockp);
				(void) md_ioctl_readerlock(lockp, ui);
			} else {
				md_unit_writerexit(ui);
				(void) md_unit_readerlock(ui);
			}
			return (ENXIO);
		}

		/* track all component states that need changing */
		c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
		c->ec_next = ecomps;
		c->ec_smi = smi;
		c->ec_ci = ci;
		ecomps = c;
		ci++;
	}

	/* Make all state changes and commit them */
	for (c = ecomps; c != NULL; c = c->ec_next) {
		/*
		 * If lockp is set, then we are entering the kernel through an
		 * ioctl. For a MN set, the only ioctl path is via a commd
		 * message (ALLOCATE_HOTSPARE or *RESYNC* messages) that is
		 * already being sent to each node. In this case, set NO_XMIT
		 * so that set_sm_comp_state won't attempt to send a message
		 * while one is already being processed.
		 *
		 * In !MN sets, the xmit flag is ignored, so it doesn't matter
		 * which flag is passed.
		 */
		if (lockp) {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    MD_STATE_NO_XMIT, lockp);
		} else {
			set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
			    (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
		}
		/*
		 * For a MN set, the NOTIFY is done when the state change is
		 * processed on each node
		 */
		if (!MD_MNSET_SETNO(setno)) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	}

	if (lockp) {
		md_ioctl_writerexit(lockp);
		(void) md_ioctl_readerlock(lockp, ui);
	} else {
		md_unit_writerexit(ui);
		(void) md_unit_readerlock(ui);
	}

	free_all_ecomps(ecomps);

	/* allocate hotspares for all errored components */
	if (MD_MNSET_SETNO(setno)) {
		/*
		 * If we're called from an ioctl (lockp set) then we cannot
		 * directly call send_poke_hotspares as this will block until
		 * the message gets dispatched to all nodes. If the cluster is
		 * going through a reconfig cycle then the message will block
		 * until the cycle is complete, and as we originate from a
		 * service call from commd we will livelock.
		 */
		if (lockp == NULL) {
			md_unit_readerexit(ui);
			send_poke_hotspares(setno);
			(void) md_unit_readerlock(ui);
		}
	} else {
		(void) poke_hotspares();
	}
	return (0);
}

void
mirror_overlap_tree_remove(md_mps_t *ps)
{
	mm_unit_t	*un;

	if (panicstr)
		return;

	VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
	un = ps->ps_un;

	mutex_enter(&un->un_overlap_tree_mx);
	avl_remove(&un->un_overlap_root, ps);
	ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
	if (un->un_overlap_tree_flag != 0) {
		un->un_overlap_tree_flag = 0;
		cv_broadcast(&un->un_overlap_tree_cv);
	}
	mutex_exit(&un->un_overlap_tree_mx);
}


/*
 * wait_for_overlaps:
 * -----------------
 * Check that a given I/O request does not overlap with already pending I/O.
 * If it does, block until the overlapping I/O completes.
 *
 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
 * it must not already be in the tree.
 */
static void
wait_for_overlaps(md_mps_t *ps, int flags)
{
	mm_unit_t	*un;
	avl_index_t	where;
	md_mps_t	*ps1;

	if (panicstr)
		return;

	un = ps->ps_un;
	mutex_enter(&un->un_overlap_tree_mx);
	if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
	    (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
		mutex_exit(&un->un_overlap_tree_mx);
		return;
	}

	VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));

	do {
		ps1 = avl_find(&un->un_overlap_root, ps, &where);
		if (ps1 == NULL) {
			/*
			 * The candidate range does not overlap with any
			 * range in the tree. Insert it and be done.
			 */
			avl_insert(&un->un_overlap_root, ps, where);
			ps->ps_flags |= MD_MPS_ON_OVERLAP;
		} else {
			/*
			 * The candidate range would overlap. Set the flag
			 * indicating we need to be woken up, and sleep
			 * until another thread removes a range. If upon
			 * waking up we find this mps was put on the tree
			 * by another thread, the loop terminates.
			 */
			un->un_overlap_tree_flag = 1;
			cv_wait(&un->un_overlap_tree_cv,
			    &un->un_overlap_tree_mx);
		}
	} while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
	mutex_exit(&un->un_overlap_tree_mx);
}

/*
 * This function is called from mirror_done to check whether any pages have
 * been modified while a mirrored write was in progress. Returns 0 if
 * all pages associated with bp are clean, 1 otherwise.
 */
static int
any_pages_dirty(struct buf *bp)
{
	int	rval;

	rval = biomodified(bp);
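	/* biomodified() returns -1 if bp has no page mappings; treat as clean */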
	if (rval == -1)
		rval = 0;

	return (rval);
}

#define	MAX_EXTRAS	10

void
mirror_commit(
	mm_unit_t	*un,
	int		smmask,
	mddb_recid_t	*extras
)
{
	mm_submirror_t		*sm;
	md_unit_t		*su;
	int			i;

	/* +2 = the mirror unit record and the terminating null id */
	mddb_recid_t		recids[NMIRROR+2+MAX_EXTRAS];

	int			ri = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Add two, this includes the mirror unit and the null recid */
	if (extras != NULL) {
		int	nrecids = 0;
		while (extras[nrecids] != 0) {
			nrecids++;
		}
		ASSERT(nrecids <= MAX_EXTRAS);
	}

	if (un != NULL)
		recids[ri++] = un->c.un_record_id;
	for (i = 0; i < NMIRROR; i++) {
		if (!(smmask & SMI2BIT(i)))
			continue;
		sm = &un->un_sm[i];
		if (!SMS_IS(sm, SMS_INUSE))
			continue;
		if (md_getmajor(sm->sm_dev) != md_major)
			continue;
		su = MD_UNIT(md_getminor(sm->sm_dev));
		recids[ri++] = su->c.un_record_id;
	}

	if (extras != NULL)
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}

	if (ri == 0)
		return;
	recids[ri] = 0;

	/*
	 * Ok to hold ioctl lock across record commit to mddb as
	 * long as the record(s) being committed aren't resync records.
	 */
	mddb_commitrecs_wrapper(recids);
}


/*
 * This routine sets a bit in the writable_bm bitmap for each writable
 * submirror of the mirror. The bitmap is stored in ps->ps_writable_sm
 * and the number of writable submirrors is recorded in ps->ps_active_cnt.
 */

static void
select_write_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i)) {
			/* set bit of all writable units */
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
}

static
unsigned
select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
{

	int		i;
	unsigned	writable_bm = 0;
	unsigned	nunits = 0;

	for (i = 0; i < NMIRROR; i++) {
		if (SUBMIRROR_IS_WRITEABLE(un, i) &&
		    un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
			writable_bm |= SMI2BIT(i);
			nunits++;
		}
	}
	if ((writable_bm & ps->ps_allfrom_sm) != 0) {
		writable_bm &= ~ps->ps_allfrom_sm;
		nunits--;
	}
	ps->ps_writable_sm = writable_bm;
	ps->ps_active_cnt = nunits;
	ps->ps_current_sm = 0;
	return (nunits);
}

static md_dev64_t
select_read_unit(
	mm_unit_t	*un,
	diskaddr_t	blkno,
	u_longlong_t	reqcount,
	u_longlong_t	*cando,
	int		must_be_opened,
	md_m_shared_t	**shared,
	md_mcs_t	*cs)
{
	int			i;
	md_m_shared_t		*s;
	uint_t			lasterrcnt = 0;
	md_dev64_t		dev = 0;
	u_longlong_t		cnt;
	u_longlong_t		mincnt;
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	mdi_unit_t		*ui;

	mincnt = reqcount;
	for (i = 0; i < NMIRROR; i++) {
		if (!SUBMIRROR_IS_READABLE(un, i))
			continue;
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];
		cnt = reqcount;

		/*
		 * If the current submirror is marked as inaccessible, do not
		 * try to access it.
		 */
		ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
		(void) md_unit_readerlock(ui);
		if (ui->ui_tstate & MD_INACCESSIBLE) {
			md_unit_readerexit(ui);
			continue;
		}
		md_unit_readerexit(ui);

		s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
		    (sm->sm_dev, sm, blkno, &cnt);

		if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
			continue;
		if (s->ms_state == CS_OKAY) {
			*cando = cnt;
			if (shared != NULL)
				*shared = s;

			if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
			    cs != NULL) {
				cs->cs_buf.b_flags |= B_FAILFAST;
			}

			return (un->un_sm[i].sm_dev);
		}
		if (s->ms_state != CS_LAST_ERRED)
			continue;

		/* don't use B_FAILFAST since we're Last Erred */

		if (mincnt > cnt)
			mincnt = cnt;
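		/* remember the Last Erred submirror with the highest error count */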
		if (s->ms_lasterrcnt > lasterrcnt) {
			lasterrcnt = s->ms_lasterrcnt;
			if (shared != NULL)
				*shared = s;
			dev = un->un_sm[i].sm_dev;
		}
	}
	*cando = mincnt;
	return (dev);
}

/*
 * Given a 32-bit bitmap, this routine will return the bit number
 * of the nth bit set. The nth bit set is passed via the index integer.
 *
 * This routine is used to run through the writable submirror bitmap
 * when starting all of the writes. The value returned is the index of
 * the appropriate submirror structure in the un_sm array of the mirror.
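 *
 * For example, md_find_nth_unit(0xB, 2) returns 3: the bits set in
 * 0xB (binary 1011) are bits 0, 1 and 3, and the one at index 2
 * (counting from 0) is bit 3.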
 */
static int
md_find_nth_unit(uint_t mask, int index)
{
	int	bit, nfound;

	for (bit = -1, nfound = -1; nfound != index; bit++) {
		ASSERT(mask != 0);
		nfound += (mask & 1);
		mask >>= 1;
	}
	return (bit);
}

static int
fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
{
	mm_unit_t	*un;
	buf_t		*bp;
	int		i;
	unsigned	nunits = 0;
	int		iunit;
	uint_t		running_bm = 0;
	uint_t		sm_index;

	bp = &cs->cs_buf;
	un = ps->ps_un;

	for (i = 0; i < NMIRROR; i++) {
		if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
			continue;
		running_bm |= SMI2BIT(i);
		nunits++;
	}
	if (nunits == 0)
		return (1);

	/*
	 * For directed mirror read (DMR) we only use the specified side and
	 * do not compute the source of the read.
	 * If we're running with MD_MPS_DIRTY_RD set we always return the
	 * first mirror side (this prevents unnecessary ownership switching).
	 * Otherwise we return the submirror according to the mirror read
	 * option.
	 */
	if (ps->ps_flags & MD_MPS_DMR) {
		sm_index = un->un_dmr_last_read;
	} else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
		sm_index = md_find_nth_unit(running_bm, 0);
	} else {
		/* Normal (non-DMR) operation */
		switch (un->un_read_option) {
		case RD_GEOMETRY:
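			/*
			 * Map the block address onto one of the running
			 * submirrors by dividing the mirror's block range
			 * into nunits equally-sized regions.
			 */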
			iunit = (int)(bp->b_lblkno /
			    howmany(un->c.un_total_blocks, nunits));
			sm_index = md_find_nth_unit(running_bm, iunit);
			break;
		case RD_FIRST:
			sm_index = md_find_nth_unit(running_bm, 0);
			break;
		case RD_LOAD_BAL:
			/* this is intentional to fall into the default */
		default:
			un->un_last_read = (un->un_last_read + 1) % nunits;
			sm_index = md_find_nth_unit(running_bm,
			    un->un_last_read);
			break;
		}
	}
	bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
	ps->ps_allfrom_sm = SMI2BIT(sm_index);

	if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
		bp->b_flags |= B_FAILFAST;
	}

	return (0);
}

static
int
mirror_are_submirrors_available(mm_unit_t *un)
{
	int i;
	for (i = 0; i < NMIRROR; i++) {
		md_dev64_t tmpdev = un->un_sm[i].sm_dev;

		if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
		    md_getmajor(tmpdev) != md_major)
			continue;

		if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
		    (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
			return (0);

		if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
			return (0);
	}
	return (1);
}

void
build_submirror(mm_unit_t *un, int i, int snarfing)
{
	struct mm_submirror	*sm;
	struct mm_submirror_ic	*smic;
	md_unit_t		*su;
	set_t			setno;

	sm = &un->un_sm[i];
	smic = &un->un_smic[i];

	sm->sm_flags = 0;	/* sometime we may need to do more here */

	setno = MD_UN2SET(un);

	if (!SMS_IS(sm, SMS_INUSE))
		return;
	if (snarfing) {
		sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
		    sm->sm_key, MD_NOTRUST_DEVT);
	} else {
		if (md_getmajor(sm->sm_dev) == md_major) {
			su = MD_UNIT(md_getminor(sm->sm_dev));
			un->c.un_flag |= (su->c.un_flag & MD_LABELED);
			/* submirror can no longer be soft partitioned */
			MD_CAPAB(su) &= (~MD_CAN_SP);
		}
	}
	smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
	    0, "shared by blk", 0);
	smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
	    0, "shared by indx", 0);
	smic->sm_get_component_count = (int (*)())md_get_named_service(
	    sm->sm_dev, 0, "get component count", 0);
	smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
	    "get block count skip size", 0);
	sm->sm_state &= ~SMS_IGNORE;
	if (SMS_IS(sm, SMS_OFFLINE))
		MD_STATUS(un) |= MD_UN_OFFLINE_SM;
	md_set_parent(sm->sm_dev, MD_SID(un));
}

static void
mirror_cleanup(mm_unit_t *un)
{
	mddb_recid_t	recid;
	int		smi;
	sv_dev_t	sv[NMIRROR];
	int		nsv = 0;

	/*
	 * If a MN diskset and this node is not the master, do
	 * not delete any records on snarf of the mirror records.
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    md_set[MD_UN2SET(un)].s_am_i_master == 0) {
		return;
	}

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		sv[nsv].setno = MD_UN2SET(un);
		sv[nsv++].key = un->un_sm[smi].sm_key;
	}

	recid = un->un_rr_dirty_recid;
	mddb_deleterec_wrapper(un->c.un_record_id);
	if (recid > 0)
		mddb_deleterec_wrapper(recid);

	md_rem_names(sv, nsv);
}

/*
 * Comparison function for the avl tree which tracks
 * outstanding writes on submirrors.
 *
 * Returns:
 *	-1: ps1 < ps2
 *	 0: ps1 and ps2 overlap
 *	 1: ps1 > ps2
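 *
 * For example, the ranges [100, 199] and [150, 249] overlap (0),
 * while [100, 199] sorts entirely before [200, 299] (-1).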
 */
static int
mirror_overlap_compare(const void *p1, const void *p2)
{
	const md_mps_t *ps1 = (md_mps_t *)p1;
	const md_mps_t *ps2 = (md_mps_t *)p2;

	if (ps1->ps_firstblk < ps2->ps_firstblk) {
		if (ps1->ps_lastblk >= ps2->ps_firstblk)
			return (0);
		return (-1);
	}

	if (ps1->ps_firstblk > ps2->ps_firstblk) {
		if (ps1->ps_firstblk <= ps2->ps_lastblk)
			return (0);
		return (1);
	}

	return (0);
}

/*
 * Collapse any sparse submirror entries snarfed from the on-disk replica.
 * Only the in-core entries are updated. The replica will be updated on-disk
 * when the in-core replica is committed on shutdown of the SVM subsystem.
 */
static void
collapse_submirrors(mm_unit_t *un)
{
	int			smi, nremovals, smiremove;
	mm_submirror_t		*sm, *new_sm, *old_sm;
	mm_submirror_ic_t	*smic;
	int			nsmidx = un->un_nsm - 1;

rescan:
	nremovals = 0;
	smiremove = -1;

	for (smi = 0; smi <= nsmidx; smi++) {
		sm = &un->un_sm[smi];

		/*
		 * Check to see if this submirror is marked as in-use.
		 * If it isn't then it is a potential sparse entry and
		 * may need to be cleared from the configuration.
		 * The records should _already_ have been cleared by the
		 * original mirror_detach() code, but we need to shuffle
		 * any NULL entries in un_sm[] to the end of the array.
		 * Any NULL un_smic[] entries need to be reset to the underlying
		 * submirror/slice accessor functions.
		 */
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
			nremovals++;
			smiremove = smi;
			break;
		}
	}

	if (nremovals == 0) {
		/*
		 * Ensure that we have a matching contiguous set of un_smic[]
		 * entries for the corresponding un_sm[] entries
		 */
		for (smi = 0; smi <= nsmidx; smi++) {
			smic = &un->un_smic[smi];
			sm = &un->un_sm[smi];

			smic->sm_shared_by_blk =
			    md_get_named_service(sm->sm_dev, 0,
			    "shared by blk", 0);
			smic->sm_shared_by_indx =
			    md_get_named_service(sm->sm_dev, 0,
			    "shared by indx", 0);
			smic->sm_get_component_count =
			    (int (*)())md_get_named_service(sm->sm_dev, 0,
			    "get component count", 0);
			smic->sm_get_bcss =
			    (int (*)())md_get_named_service(sm->sm_dev, 0,
			    "get block count skip size", 0);
		}
		return;
	}

	/*
	 * Reshuffle the submirror devices so that we do not have a dead record
	 * in the middle of the array. Once we've done this we need to rescan
	 * the mirror to check for any other holes.
	 */
	for (smi = 0; smi < NMIRROR; smi++) {
		if (smi < smiremove)
			continue;
		if (smi > smiremove) {
			old_sm = &un->un_sm[smi];
			new_sm = &un->un_sm[smi - 1];
			bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
			bzero(old_sm, sizeof (mm_submirror_t));
		}
	}

	/*
	 * Now we need to rescan the array to find the next potential dead
	 * entry.
	 */
	goto rescan;
}

/* Return a -1 if optimized record unavailable and set should be released */
int
mirror_build_incore(mm_unit_t *un, int snarfing)
{
	int		i;

	if (MD_STATUS(un) & MD_UN_BEING_RESET) {
		mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
		return (1);
	}

	if (mirror_are_submirrors_available(un) == 0)
		return (1);

	if (MD_UNIT(MD_SID(un)) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	/* pre-4.1 didn't define CAN_META_CHILD capability */
	MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;

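	/* create the AVL tree used to track and detect overlapping writes */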
	un->un_overlap_tree_flag = 0;
	avl_create(&un->un_overlap_root, mirror_overlap_compare,
	    sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));

	/*
	 * We need to collapse any sparse submirror entries into a non-sparse
	 * array. This is to cover the case where we have an old replica image
	 * which has not been updated (i.e. snarfed) since being modified.
	 * The new code expects all submirror access to be sequential (i.e.
	 * both the un_sm[] and un_smic[] entries correspond to non-empty
	 * submirrors).
	 */

	collapse_submirrors(un);

	for (i = 0; i < NMIRROR; i++)
		build_submirror(un, i, snarfing);

	if (unit_setup_resync(un, snarfing) != 0) {
		if (snarfing) {
			mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
			/*
			 * If a MN set and set is not stale, then return -1
			 * which will force the caller to unload the set.
			 * The MN diskset nodes will return failure if
			 * unit_setup_resync fails so that nodes won't
			 * get out of sync.
			 *
			 * If set is STALE, the master node can't allocate
			 * a resync record (if needed), but node needs to
			 * join the set so that user can delete broken mddbs.
			 * So, if set is STALE, just continue on.
			 */
			if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
			    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
				return (-1);
			}
		} else
			return (1);
	}

	mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);

	un->un_suspend_wr_flag = 0;
	mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutexes for mirror-owner and resync-owner changes.
	 * All references to the owner message state field must be guarded
	 * by this mutex.
	 */
	mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for resync thread manipulation. These
	 * will be used by mirror_resync_unit/mirror_ioctl_resync
	 */
	mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for resync progress thread manipulation.
	 * This allows resyncs to be continued across an intervening reboot.
	 */
	mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
	 * provides synchronization between a user-ioctl and the resulting
	 * strategy() call that performs the read().
	 */
	mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);

	/*
	 * Allocate rwlocks for un_pernode_dirty_bm accessing.
	 */
	for (i = 0; i < MD_MNMAXSIDES; i++) {
		rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
	MD_UNIT(MD_SID(un)) = un;

	return (0);
}


void
reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
{
	mddb_recid_t	recid, vtoc_id;
	size_t		bitcnt;
	size_t		shortcnt;
	int		smi;
	sv_dev_t	sv[NMIRROR];
	int		nsv = 0;
	uint_t		bits = 0;
	minor_t		selfid;
	md_unit_t	*su;
	int		i;

	md_destroy_unit_incore(mnum, &mirror_md_ops);

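	/* free the dirty-region bookkeeping arrays, sized from un_rrd_num */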
	shortcnt = un->un_rrd_num * sizeof (short);
	bitcnt = howmany(un->un_rrd_num, NBBY);

	if (un->un_outstanding_writes)
		kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
	if (un->un_goingclean_bm)
		kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
	if (un->un_goingdirty_bm)
		kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
	if (un->un_resync_bm)
		kmem_free((caddr_t)un->un_resync_bm, bitcnt);
	if (un->un_pernode_dirty_sum)
		kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);

	/*
	 * Destroy the taskq for deferred processing of DRL clean requests.
	 * This taskq will only be present for Multi Owner mirrors.
	 */
	if (un->un_drl_task != NULL)
		ddi_taskq_destroy(un->un_drl_task);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of its minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	for (smi = 0; smi < NMIRROR; smi++) {
		if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
			continue;
		/* reallow soft partitioning of submirror and reset parent */
		su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
		MD_CAPAB(su) |= MD_CAN_SP;
		md_reset_parent(un->un_sm[smi].sm_dev);
		reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);

		sv[nsv].setno = MD_MIN2SET(mnum);
		sv[nsv++].key = un->un_sm[smi].sm_key;
		bits |= SMI2BIT(smi);
	}

	MD_STATUS(un) |= MD_UN_BEING_RESET;
	recid = un->un_rr_dirty_recid;
	vtoc_id = un->c.un_vtoc_id;
	selfid = MD_SID(un);

	mirror_commit(un, bits, 0);

	avl_destroy(&un->un_overlap_root);

	/* Destroy all mutexes and condvars before returning. */
	mutex_destroy(&un->un_suspend_wr_mx);
	cv_destroy(&un->un_suspend_wr_cv);
	mutex_destroy(&un->un_overlap_tree_mx);
	cv_destroy(&un->un_overlap_tree_cv);
	mutex_destroy(&un->un_owner_mx);
	mutex_destroy(&un->un_rs_thread_mx);
	cv_destroy(&un->un_rs_thread_cv);
	mutex_destroy(&un->un_rs_progress_mx);
	cv_destroy(&un->un_rs_progress_cv);
	mutex_destroy(&un->un_dmr_mx);
	cv_destroy(&un->un_dmr_cv);

	for (i = 0; i < MD_MNMAXSIDES; i++) {
		rw_destroy(&un->un_pernode_dirty_mx[i]);
		if (un->un_pernode_dirty_bm[i])
			kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
	}

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* This frees the unit structure. */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (recid != 0)
		mddb_deleterec_wrapper(recid);

	/* Remove the vtoc, if present */
	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	md_rem_names(sv, nsv);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
	    MD_MIN2SET(selfid), selfid);
}

int
mirror_internal_open(
	minor_t	mnum,
	int	flag,
	int	otyp,
	int	md_oflags,
	IOLOCK	*lockp		/* can be NULL */
)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	int		err = 0;

tryagain:
	/* single thread */
	if (lockp) {
		/*
		 * If ioctl lock is held, use openclose_enter
		 * routine that will set the ioctl flag when
		 * grabbing the readerlock.
		 */
		(void) md_ioctl_openclose_enter(lockp, ui);
	} else {
		(void) md_unit_openclose_enter(ui);
	}

	/*
	 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
	 * message in a MN diskset and this requires that the openclose
	 * lock is dropped in order to send this message. So, another
	 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
	 * attempting an open while this thread has an open in progress.
	 * Call the *_lh version of the lock exit routines since the ui_mx
	 * mutex must be held from checking for OPENINPROGRESS until
	 * after the cv_wait call.
	 */
	mutex_enter(&ui->ui_mx);
	if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
		if (lockp) {
			(void) md_ioctl_openclose_exit_lh(lockp);
		} else {
			md_unit_openclose_exit_lh(ui);
		}
		cv_wait(&ui->ui_cv, &ui->ui_mx);
		mutex_exit(&ui->ui_mx);
		goto tryagain;
	}

	ui->ui_lock |= MD_UL_OPENINPROGRESS;
	mutex_exit(&ui->ui_mx);

	/* open devices, if necessary */
	if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
		if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
			goto out;
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

	/* unlock, return success */
out:
	mutex_enter(&ui->ui_mx);
	ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
	mutex_exit(&ui->ui_mx);

	if (lockp) {
		/*
		 * If ioctl lock is held, use openclose_exit
		 * routine that will clear the lockp reader flag.
		 */
		(void) md_ioctl_openclose_exit(lockp);
	} else {
		md_unit_openclose_exit(ui);
	}
	return (err);
}

int
mirror_internal_close(
	minor_t	mnum,
	int	otyp,
	int	md_cflags,
	IOLOCK	*lockp		/* can be NULL */
)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mm_unit_t	*un;
	int		err = 0;

	/* single thread */
	if (lockp) {
		/*
		 * If ioctl lock is held, use openclose_enter
		 * routine that will set the ioctl flag when
		 * grabbing the readerlock.
		 */
		un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
	} else {
		un = (mm_unit_t *)md_unit_openclose_enter(ui);
	}

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		/*
		 * Clean up dirty bitmap for this unit. Do this
		 * before closing the underlying devices to avoid
		 * race conditions with reset_mirror() as a
		 * result of a 'metaset -r' command running in
		 * parallel. This might cause deallocation of
		 * dirty region bitmaps; with underlying metadevices
		 * in place this can't happen.
		 * Don't do this if a MN set and ABR not set
		 */
		if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
			if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
			    !(ui->ui_tstate & MD_ABR_CAP))
				mirror_process_unit_resync(un);
		}
		(void) mirror_close_all_devs(un, md_cflags);

		/*
		 * For a MN set with transient capabilities (e.g. ABR/DMR)
		 * set, clear these capabilities if this was the last open
		 * in the cluster. To do this we send a message to all nodes
		 * to see if the device is open.
		 */
		if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
		    (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
			if (lockp) {
				(void) md_ioctl_openclose_exit(lockp);
			} else {
				md_unit_openclose_exit(ui);
			}

			/*
			 * if we are in the context of an ioctl, drop the
			 * ioctl lock.
			 * Otherwise, no other locks should be held.
			 */
			if (lockp) {
				IOLOCK_RETURN_RELEASE(0, lockp);
			}

			mdmn_clear_all_capabilities(mnum);

			/* if dropped the lock previously, regain it */
			if (lockp) {
				IOLOCK_RETURN_REACQUIRE(lockp);
			}
			return (0);
		}
		/* unlock and return success */
	}
out:
	/* Call whether lockp is NULL or not. */
	if (lockp) {
		md_ioctl_openclose_exit(lockp);
	} else {
		md_unit_openclose_exit(ui);
	}
	return (err);
}

/*
 * When a component has completed resyncing and is now ok, check if the
 * corresponding component in the other submirrors is in the Last Erred
 * state. If it is, we want to change that to the Erred state so we stop
 * using that component and start using this good component instead.
 *
 * This is called from set_sm_comp_state and recursively calls
 * set_sm_comp_state if it needs to change the Last Erred state.
 */
static void
reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
    IOLOCK *lockp)
{
	mm_submirror_t		*sm;
	mm_submirror_ic_t	*smic;
	int			ci;
	int			i;
	int			compcnt;
	int			changed = 0;

	for (i = 0; i < NMIRROR; i++) {
		sm = &un->un_sm[i];
		smic = &un->un_smic[i];

		if (!SMS_IS(sm, SMS_INUSE))
			continue;

		/* ignore the submirror that we just made ok */
		if (i == smi)
			continue;

		compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
		for (ci = 0; ci < compcnt; ci++) {
			md_m_shared_t	*shared;

			shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
			    (sm->sm_dev, sm, ci);

			if ((shared->ms_state & CS_LAST_ERRED) &&
			    !mirror_other_sources(un, i, ci, 1)) {

				set_sm_comp_state(un, i, ci, CS_ERRED, extras,
				    flags, lockp);
				changed = 1;
			}
		}
	}

	/* maybe there is a hotspare for this newly erred component */
	if (changed) {
		set_t	setno;

		setno = MD_UN2SET(un);
		if (MD_MNSET_SETNO(setno)) {
			send_poke_hotspares(setno);
		} else {
			(void) poke_hotspares();
		}
	}
}
/*
 * set_sm_comp_state
 *
 * Set the state of a submirror component to the specified new state.
 * If the mirror is in a multi-node set, send messages to all nodes to
 * block all writes to the mirror, then update the state and release the
 * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
 * MD_STATE_XMIT will be unset in 2 cases:
 * 1. When the state is changed to CS_RESYNC, as this state change
 * will already have been updated on each node by the processing of the
 * distributed metasync command, hence no need to xmit.
 * 2. When the state is changed to CS_OKAY after a resync has completed. Again
 * the resync completion will already have been processed on each node by
 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
 * resync, hence no need to xmit.
 *
 * In case we are called from the updates of a watermark
 * (MD_STATE_WMUPDATE will then be set in the ps->flags), this is due to
 * a metainit or similar. In this case the message that we send to propagate
 * the state change must not be a class1 message, as that would deadlock with
 * the metainit command that is still being processed.
 * We achieve this by creating a class2 message, MD_MN_MSG_STATE_UPDATE2,
 * instead. This also makes the submessage generator create a class2
 * submessage rather than a class1 (which would also block).
 *
 * On entry, unit_writerlock is held.
 * If MD_STATE_OCHELD is set in flags, then the unit_openclose lock is
 * also held.
 */
void
set_sm_comp_state(
        mm_unit_t       *un,
        int             smi,
        int             ci,
        int             newstate,
        mddb_recid_t    *extras,
        uint_t          flags,
        IOLOCK          *lockp
)
{
        mm_submirror_t *sm;
        mm_submirror_ic_t *smic;
        md_m_shared_t *shared;
        int origstate;
        void (*get_dev)();
        ms_cd_info_t cd;
        char devname[MD_MAX_CTDLEN];
        int err;
        set_t setno = MD_UN2SET(un);
        md_mn_msg_stch_t stchmsg;
        mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
        md_mn_kresult_t *kresult;
        int rval;
        uint_t msgflags;
        md_mn_msgtype_t msgtype;
        int save_lock = 0;
        mdi_unit_t *ui_sm;
        int nretries = 0;

        sm = &un->un_sm[smi];
        smic = &un->un_smic[smi];

        /* If we have a real error status then turn off MD_INACCESSIBLE. */
        ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
        if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
            ui_sm->ui_tstate & MD_INACCESSIBLE) {
                ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
        }

        shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
            (sm->sm_dev, sm, ci);
        origstate = shared->ms_state;

        /*
         * If the new state is an error and the old one wasn't, generate
         * a console message. We do this before we send the state to other
         * nodes in a MN set because the state change may change the component
         * name if a hotspare is allocated.
         */
        if ((!(origstate & (CS_ERRED|CS_LAST_ERRED))) &&
            (newstate & (CS_ERRED|CS_LAST_ERRED))) {

                get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
                    "get device", 0);
                (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

                err = md_getdevname(setno, mddb_getsidenum(setno), 0,
                    cd.cd_dev, devname, sizeof (devname));

                if (err == ENOENT) {
                        (void) md_devname(setno, cd.cd_dev, devname,
                            sizeof (devname));
                }

                cmn_err(CE_WARN, "md: %s: %s needs maintenance",
                    md_shortname(md_getminor(sm->sm_dev)), devname);

                if (newstate & CS_LAST_ERRED) {
                        cmn_err(CE_WARN, "md: %s: %s last erred",
                            md_shortname(md_getminor(sm->sm_dev)),
                            devname);

                } else if (shared->ms_flags & MDM_S_ISOPEN) {
                        /*
                         * Close the broken device and clear the open flag on
                         * it. Closing the device means the RCM framework will
                         * be able to unconfigure the device if required.
                         *
                         * We have to check that the device is open, otherwise
                         * the first open on it has resulted in the error that
                         * is being processed and the actual cd.cd_dev will be
                         * NODEV64.
                         *
                         * If this is a multi-node mirror, then the multinode
                         * state checks following this code will cause the
                         * slave nodes to close the mirror in the function
                         * mirror_set_state().
                         */
                        md_layered_close(cd.cd_dev, MD_OFLG_NULL);
                        shared->ms_flags &= ~MDM_S_ISOPEN;
                }

        } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
            (shared->ms_flags & MDM_S_ISOPEN)) {
                /*
                 * Similar to the logic above, except no log messages since we
                 * are just transitioning from Last Erred to Erred.
                 */
                get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
                    "get device", 0);
                (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);

                md_layered_close(cd.cd_dev, MD_OFLG_NULL);
                shared->ms_flags &= ~MDM_S_ISOPEN;
        }

        if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
            (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
                /*
                 * For a multi-node mirror, send the state change to the
                 * master, which broadcasts to all nodes, including this
                 * one. Once the message is received, the state is set
                 * in-core and the master commits the change to disk.
                 * There is a case, comp_replace, where this function
                 * can be called from within an ioctl and therefore in this
                 * case, as the ioctl will already be called on each node,
                 * there is no need to xmit the state change to the master for
                 * distribution to the other nodes. The MD_STATE_XMIT flag is
                 * used to indicate whether an xmit is required. The mirror's
                 * transient state is set to MD_ERR_PENDING to avoid sending
                 * multiple messages.
                 */
                if (newstate & (CS_ERRED|CS_LAST_ERRED))
                        ui->ui_tstate |= MD_ERR_PENDING;

                /*
                 * Send a state update message to all nodes. This message
                 * will generate 2 submessages, the first one to suspend
                 * all writes to the mirror and the second to update the
                 * state and resume writes.
                 */
                stchmsg.msg_stch_mnum = un->c.un_self_id;
                stchmsg.msg_stch_sm = smi;
                stchmsg.msg_stch_comp = ci;
                stchmsg.msg_stch_new_state = newstate;
                stchmsg.msg_stch_hs_id = shared->ms_hs_id;
#ifdef DEBUG
                if (mirror_debug_flag)
                        printf("send set state, %x, %x, %x, %x, %x\n",
                            stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
                            stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
                            stchmsg.msg_stch_hs_id);
#endif
                if (flags & MD_STATE_WMUPDATE) {
                        msgtype = MD_MN_MSG_STATE_UPDATE2;
                        /*
                         * When coming from an update of watermarks, there
                         * must already be a message logged that triggered
                         * this action. So, no need to log this message, too.
                         */
                        msgflags = MD_MSGF_NO_LOG;
                } else {
                        msgtype = MD_MN_MSG_STATE_UPDATE;
                        msgflags = MD_MSGF_DEFAULT_FLAGS;
                }

                /*
                 * If we are in the context of an ioctl, drop the ioctl lock.
                 * lockp holds the list of locks held.
                 *
                 * Otherwise, increment the appropriate reacquire counters.
                 * If the openclose lock is held, we must reacquire the
                 * reader lock before releasing the openclose lock.
                 * Do not drop the ARRAY_WRITER lock as we may not be able
                 * to reacquire it.
                 */
                if (lockp) {
                        if (lockp->l_flags & MD_ARRAY_WRITER) {
                                save_lock = MD_ARRAY_WRITER;
                                lockp->l_flags &= ~MD_ARRAY_WRITER;
                        } else if (lockp->l_flags & MD_ARRAY_READER) {
                                save_lock = MD_ARRAY_READER;
                                lockp->l_flags &= ~MD_ARRAY_READER;
                        }
                        IOLOCK_RETURN_RELEASE(0, lockp);
                } else {
                        if (flags & MD_STATE_OCHELD) {
                                md_unit_writerexit(ui);
                                (void) md_unit_readerlock(ui);
                                md_unit_openclose_exit(ui);
                        } else {
                                md_unit_writerexit(ui);
                        }
                }

                kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
sscs_msg:
                rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
                    (char *)&stchmsg, sizeof (stchmsg), kresult);

                if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
                        mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
                        /* If we're shutting down already, pause things here. */
                        if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
                                while (!md_mn_is_commd_present()) {
                                        delay(md_hz);
                                }
                                /*
                                 * commd is now available; retry the message
                                 * one time. If that fails we fall through and
                                 * panic as the system is in an unexpected
                                 * state.
                                 */
                                if (nretries++ == 0)
                                        goto sscs_msg;
                        }
                        cmn_err(CE_PANIC,
                            "ksend_message failure: STATE_UPDATE");
                }
                kmem_free(kresult, sizeof (md_mn_kresult_t));

                /* if we dropped the lock previously, regain it */
                if (lockp) {
                        IOLOCK_RETURN_REACQUIRE(lockp);
                        lockp->l_flags |= save_lock;
                } else {
                        /*
                         * Reacquire dropped locks and update acquirecnts
                         * appropriately.
                         */
                        if (flags & MD_STATE_OCHELD) {
                                /*
                                 * openclose also grabs readerlock.
                                 */
                                (void) md_unit_openclose_enter(ui);
                                md_unit_readerexit(ui);
                                (void) md_unit_writerlock(ui);
                        } else {
                                (void) md_unit_writerlock(ui);
                        }
                }

                ui->ui_tstate &= ~MD_ERR_PENDING;
        } else {
                shared->ms_state = newstate;
                uniqtime32(&shared->ms_timestamp);

                if (newstate == CS_ERRED)
                        shared->ms_flags |= MDM_S_NOWRITE;
                else
                        shared->ms_flags &= ~MDM_S_NOWRITE;

                shared->ms_flags &= ~MDM_S_IOERR;
                un->un_changecnt++;
                shared->ms_lasterrcnt = un->un_changecnt;

                mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
                mirror_commit(un, SMI2BIT(smi), extras);
        }

        if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
                /*
                 * Resetting the Last Erred state will recursively call back
                 * into this function (set_sm_comp_state) to update the state.
                 */
                reset_lasterred(un, smi, extras, flags, lockp);
        }
}

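/*
 * A hedged sketch of the two calling contexts described above. Daemon
 * context holds the unit writerlock and passes a NULL IOLOCK; ioctl
 * context passes its IOLOCK list so set_sm_comp_state() can drop and
 * reacquire it around the ksend. The wrapper names are hypothetical and
 * the fragment is compiled out behind the never-defined MD_MIRROR_EXAMPLES
 * guard.
 */
#ifdef MD_MIRROR_EXAMPLES
static void
example_mark_erred_from_daemon(mm_unit_t *un, int smi, int ci)
{
        /* Never called from ioctl context, so (IOLOCK *)NULL */
        set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT,
            (IOLOCK *)NULL);
}

static void
example_mark_erred_from_ioctl(mm_unit_t *un, int smi, int ci, IOLOCK *lockp)
{
        /* lockp lets set_sm_comp_state() drop/reacquire the ioctl locks */
        set_sm_comp_state(un, smi, ci, CS_ERRED, 0, MD_STATE_XMIT, lockp);
}
#endif /* MD_MIRROR_EXAMPLES */
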
static int
find_another_logical(
        mm_unit_t       *un,
        mm_submirror_t  *esm,
        diskaddr_t      blk,
        u_longlong_t    cnt,
        int             must_be_open,
        int             state,
        int             err_cnt)
{
        u_longlong_t cando;
        md_dev64_t dev;
        md_m_shared_t *s;

        esm->sm_state |= SMS_IGNORE;
        while (cnt != 0) {
                u_longlong_t mcnt;

                mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */

                dev = select_read_unit(un, blk, mcnt, &cando,
                    must_be_open, &s, NULL);
                if (dev == (md_dev64_t)0)
                        break;

                if ((state == CS_LAST_ERRED) &&
                    (s->ms_state == CS_LAST_ERRED) &&
                    (err_cnt > s->ms_lasterrcnt))
                        break;

                cnt -= cando;
                blk += cando;
        }
        esm->sm_state &= ~SMS_IGNORE;
        return (cnt != 0);
}

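/*
 * find_another_logical() clamps each probe to 1 GB worth of blocks so a
 * single select_read_unit() call never covers an unbounded range. Below is
 * a minimal sketch of that clamp-and-advance loop with a hypothetical
 * visitor callback, compiled out behind the never-defined
 * MD_MIRROR_EXAMPLES guard.
 */
#ifdef MD_MIRROR_EXAMPLES
static void
example_chunked_walk(diskaddr_t blk, u_longlong_t cnt,
    void (*visit)(diskaddr_t, u_longlong_t))
{
        while (cnt != 0) {
                /* lbtodb() converts the 1 GB byte count to disk blocks */
                u_longlong_t mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024));

                (*visit)(blk, mcnt);
                blk += mcnt;
                cnt -= mcnt;
        }
}
#endif /* MD_MIRROR_EXAMPLES */
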
int
mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
{
        mm_submirror_t *sm;
        mm_submirror_ic_t *smic;
        size_t count;
        diskaddr_t block;
        u_longlong_t skip;
        u_longlong_t size;
        md_dev64_t dev;
        int cnt;
        md_m_shared_t *s;
        int not_found;

        sm = &un->un_sm[smi];
        smic = &un->un_smic[smi];
        dev = sm->sm_dev;

        /*
         * Make sure every component of the submirror
         * has other sources.
         */
        if (ci < 0) {
                /* Find the highest lasterrcnt */
                cnt = (*(smic->sm_get_component_count))(dev, sm);
                for (ci = 0; ci < cnt; ci++) {
                        not_found = mirror_other_sources(un, smi, ci,
                            must_be_open);
                        if (not_found)
                                return (1);
                }
                return (0);
        }

        /*
         * Make sure this component has other sources
         */
        (void) (*(smic->sm_get_bcss))
            (dev, sm, ci, &block, &count, &skip, &size);

        if (count == 0)
                return (1);

        s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);

        while (count--) {
                if (block >= un->c.un_total_blocks)
                        return (0);

                if ((block + size) > un->c.un_total_blocks)
                        size = un->c.un_total_blocks - block;

                not_found = find_another_logical(un, sm, block, size,
                    must_be_open, s->ms_state, s->ms_lasterrcnt);
                if (not_found)
                        return (1);

                block += size + skip;
        }
        return (0);
}

static void
finish_error(md_mps_t *ps)
{
        struct buf *pb;
        mm_unit_t *un;
        mdi_unit_t *ui;
        uint_t new_str_flags;

        pb = ps->ps_bp;
        un = ps->ps_un;
        ui = ps->ps_ui;

        /*
         * Must flag any error to the resync originator if we're performing
         * a Write-after-Read. This corresponds to an i/o error on a resync
         * target device and in this case we ought to abort the resync as
         * there is nothing that can be done to recover from this without
         * operator intervention. If we don't set the B_ERROR flag we will
         * continue reading from the mirror but won't write to the target
         * (as it will have been placed into an errored state).
         * To handle the case of multiple components within a submirror we
         * only set the B_ERROR bit if explicitly requested to via
         * MD_MPS_FLAG_ERROR. The originator of the resync read will cause
         * this bit to be set if the underlying component count is one for a
         * submirror resync. All other resync types will have the flag set as
         * there is no underlying resync which can be performed on a contained
         * metadevice for these resync types (optimized or component).
         */

        if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
                if (ps->ps_flags & MD_MPS_FLAG_ERROR)
                        pb->b_flags |= B_ERROR;
                md_kstat_done(ui, pb,
                    (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
                MPS_FREE(mirror_parent_cache, ps);
                md_unit_readerexit(ui);
                md_biodone(pb);
                return;
        }
        /*
         * Set the MD_IO_COUNTED flag as we are retrying the same I/O
         * operation; this I/O request has therefore already been counted,
         * and the I/O count variable will be decremented by mirror_done()'s
         * call to md_biodone().
         */
        if (ps->ps_changecnt != un->un_changecnt) {
                new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
                if (ps->ps_flags & MD_MPS_WOW)
                        new_str_flags |= MD_STR_WOW;
                if (ps->ps_flags & MD_MPS_MAPPED)
                        new_str_flags |= MD_STR_MAPPED;
                /*
                 * If this I/O request was a read that was part of a resync,
                 * set MD_STR_WAR for the retried read to ensure that the
                 * resync write (i.e. write-after-read) will be performed.
                 */
                if (ps->ps_flags & MD_MPS_RESYNC_READ)
                        new_str_flags |= MD_STR_WAR;
                md_kstat_done(ui, pb,
                    (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
                MPS_FREE(mirror_parent_cache, ps);
                md_unit_readerexit(ui);
                (void) md_mirror_strategy(pb, new_str_flags, NULL);
                return;
        }

        pb->b_flags |= B_ERROR;
        md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
        MPS_FREE(mirror_parent_cache, ps);
        md_unit_readerexit(ui);
        md_biodone(pb);
}

static void
error_update_unit(md_mps_t *ps)
{
        mm_unit_t *un;
        mdi_unit_t *ui;
        int smi;                /* sub mirror index */
        int ci;                 /* errored component */
        set_t setno;
        uint_t flags;           /* for set_sm_comp_state() */
        uint_t hspflags;        /* for check_comp_4_hotspares() */

        ui = ps->ps_ui;
        un = (mm_unit_t *)md_unit_writerlock(ui);
        setno = MD_UN2SET(un);

        /* All of these updates have to be propagated in case of MN set */
        flags = MD_STATE_XMIT;
        hspflags = MD_HOTSPARE_XMIT;

        /* special treatment if we are called during updating watermarks */
        if (ps->ps_flags & MD_MPS_WMUPDATE) {
                flags |= MD_STATE_WMUPDATE;
                hspflags |= MD_HOTSPARE_WMUPDATE;
        }
        smi = 0;
        ci = 0;
        while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
                if (mirror_other_sources(un, smi, ci, 0) == 1) {

                        /* Never called from ioctl context: (IOLOCK *)NULL */
                        set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
                            (IOLOCK *)NULL);
                        /*
                         * For a MN set, the NOTIFY is done when the state
                         * change is processed on each node
                         */
                        if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
                                SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
                                    SVM_TAG_METADEVICE, setno, MD_SID(un));
                        }
                        continue;
                }
                /* Never called from ioctl context, so (IOLOCK *)NULL */
                set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
                    (IOLOCK *)NULL);
                /*
                 * For a MN set, the NOTIFY is done when the state
                 * change is processed on each node
                 */
                if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
                        SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
                            SVM_TAG_METADEVICE, setno, MD_SID(un));
                }
                smi = 0;
                ci = 0;
        }

        md_unit_writerexit(ui);
        if (MD_MNSET_SETNO(setno)) {
                send_poke_hotspares(setno);
        } else {
                (void) poke_hotspares();
        }
        (void) md_unit_readerlock(ui);

        finish_error(ps);
}

/*
 * When we have a B_FAILFAST IO error on a Last Erred component we need to
 * retry the IO without B_FAILFAST set so that we try to ensure that the
 * component "sees" each IO.
 */
static void
last_err_retry(md_mcs_t *cs)
{
        struct buf *cb;
        md_mps_t *ps;
        uint_t flags;

        cb = &cs->cs_buf;
        cb->b_flags &= ~B_FAILFAST;

        /* if we're panicking just let this I/O error out */
        if (panicstr) {
                (void) mirror_done(cb);
                return;
        }

        /* reissue the I/O */

        ps = cs->cs_ps;

        bioerror(cb, 0);

        mutex_enter(&ps->ps_mx);

        flags = MD_STR_NOTTOP;
        if (ps->ps_flags & MD_MPS_MAPPED)
                flags |= MD_STR_MAPPED;
        if (ps->ps_flags & MD_MPS_NOBLOCK)
                flags |= MD_NOBLOCK;

        mutex_exit(&ps->ps_mx);

        clear_retry_error(cb);

        cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
            md_shortname(getminor(cb->b_edev)));

        md_call_strategy(cb, flags, NULL);
}

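/*
 * The retry path above strips B_FAILFAST and reissues the same child buf
 * exactly once; whether a retry is warranted at all is decided by
 * mirror_done() via submirror_is_lasterred(). The following is a compact,
 * assumption-laden sketch of the overall policy: the issue callback is
 * hypothetical and treated as synchronous. Compiled out behind the
 * never-defined MD_MIRROR_EXAMPLES guard.
 */
#ifdef MD_MIRROR_EXAMPLES
static void
example_failfast_policy(struct buf *cb, int sm_lasterred,
    void (*issue)(struct buf *))
{
        /* fail fast so a flaky path errors quickly */
        cb->b_flags |= B_FAILFAST;
        (*issue)(cb);

        if ((cb->b_flags & B_ERROR) && sm_lasterred) {
                /* Last Erred: make sure the component "sees" the I/O */
                cb->b_flags &= ~B_FAILFAST;
                bioerror(cb, 0);        /* clear stale error state */
                (*issue)(cb);           /* exactly one retry */
        }
}
#endif /* MD_MIRROR_EXAMPLES */
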
static void
mirror_error(md_mps_t *ps)
{
        int smi;        /* sub mirror index */
        int ci;         /* errored component */

        if (panicstr) {
                finish_error(ps);
                return;
        }

        if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                mirror_overlap_tree_remove(ps);

        smi = 0;
        ci = 0;
        if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
                md_unit_readerexit(ps->ps_ui);
                daemon_request(&md_mstr_daemon, error_update_unit,
                    (daemon_queue_t *)ps, REQ_OLD);
                return;
        }

        finish_error(ps);
}

static int
copy_write_done(struct buf *cb)
{
        md_mps_t *ps;
        buf_t *pb;
        char *wowbuf;
        wowhdr_t *wowhdr;
        ssize_t wow_resid;

        /* get wowbuf and save structure */
        wowbuf = cb->b_un.b_addr;
        wowhdr = WOWBUF_HDR(wowbuf);
        ps = wowhdr->wow_ps;
        pb = ps->ps_bp;

        /* Save error information, then free cb */
        if (cb->b_flags & B_ERROR)
                pb->b_flags |= B_ERROR;

        if (cb->b_flags & B_REMAPPED)
                bp_mapout(cb);

        freerbuf(cb);

        /* update residual and continue if needed */
        if ((pb->b_flags & B_ERROR) == 0) {
                wow_resid = pb->b_bcount - wowhdr->wow_offset;
                pb->b_resid = wow_resid;
                if (wow_resid > 0) {
                        daemon_request(&md_mstr_daemon, copy_write_cont,
                            (daemon_queue_t *)wowhdr, REQ_OLD);
                        return (1);
                }
        }

        /* Write is complete, release resources. */
        kmem_cache_free(mirror_wowblk_cache, wowhdr);
        ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
        md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
        MPS_FREE(mirror_parent_cache, ps);
        md_biodone(pb);
        return (0);
}

static void
copy_write_cont(wowhdr_t *wowhdr)
{
        buf_t *pb;
        buf_t *cb;
        char *wowbuf;
        int wow_offset;
        size_t wow_resid;
        diskaddr_t wow_blkno;

        wowbuf = WOWHDR_BUF(wowhdr);
        pb = wowhdr->wow_ps->ps_bp;

        /* get data on current location */
        wow_offset = wowhdr->wow_offset;
        wow_resid = pb->b_bcount - wow_offset;
        wow_blkno = pb->b_lblkno + lbtodb(wow_offset);

        /* setup child buffer */
        cb = getrbuf(KM_SLEEP);
        cb->b_flags = B_WRITE;
        cb->b_edev = pb->b_edev;
        cb->b_un.b_addr = wowbuf;       /* change to point at WOWBUF */
        cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
        cb->b_iodone = copy_write_done;
        cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
        cb->b_lblkno = wow_blkno;

        /* move offset to next section */
        wowhdr->wow_offset += cb->b_bcount;

        /* copy and setup write for current section */
        bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);

        /* do it */
        /*
         * Do not set the MD_IO_COUNTED flag as this is a new I/O request
         * that handles the WOW condition. The resultant increment on the
         * I/O count variable is cleared by copy_write_done()'s call to
         * md_biodone().
         */
        (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
            | MD_STR_MAPPED, NULL);
}

static void
md_mirror_copy_write(md_mps_t *ps)
{
        wowhdr_t *wowhdr;

        wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
        mirror_wowblk_init(wowhdr);
        wowhdr->wow_ps = ps;
        wowhdr->wow_offset = 0;
        copy_write_cont(wowhdr);
}

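/*
 * The three routines above form a simple chunking state machine:
 * wow_offset records how much of the parent buf has been privately copied
 * and written so far. For a 40 KB parent write with a 16 KB wowbuf (the
 * size is tunable via md_wowbuf_size; 16 KB is only an assumed value for
 * illustration) the child writes carry 16 KB, 16 KB and 8 KB, each
 * starting lbtodb(wow_offset) blocks past the parent's b_lblkno. A sketch
 * of one step of that arithmetic, compiled out behind the never-defined
 * MD_MIRROR_EXAMPLES guard:
 */
#ifdef MD_MIRROR_EXAMPLES
static size_t
example_wow_step(size_t parent_bcount, size_t wowbuf_size, int *offsetp,
    diskaddr_t parent_lblkno, diskaddr_t *child_lblknop)
{
        size_t resid = parent_bcount - *offsetp;
        size_t bcount = MIN(wowbuf_size, resid);

        /* the child starts where the parent's copied prefix ends */
        *child_lblknop = parent_lblkno + lbtodb(*offsetp);
        *offsetp += (int)bcount;
        return (bcount);
}
#endif /* MD_MIRROR_EXAMPLES */
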
static void
handle_wow(md_mps_t *ps)
{
        buf_t *pb;

        pb = ps->ps_bp;

        bp_mapin(pb);

        md_mirror_wow_cnt++;
        if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
                cmn_err(CE_NOTE,
                    "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
                    md_shortname(getminor(pb->b_edev)),
                    (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
        }

        /*
         * Set the MD_IO_COUNTED flag as we are retrying the same I/O
         * operation; this I/O request has therefore already been counted,
         * and the I/O count variable will be decremented by mirror_done()'s
         * call to md_biodone().
         */
        if (md_mirror_wow_flg & WOW_NOCOPY)
                (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
                    MD_STR_MAPPED | MD_IO_COUNTED, ps);
        else
                md_mirror_copy_write(ps);
}

/*
 * Return true if the specified submirror is either in the Last Erred
 * state or is transitioning into the Last Erred state.
 */
static bool_t
submirror_is_lasterred(mm_unit_t *un, int smi)
{
        mm_submirror_t *sm;
        mm_submirror_ic_t *smic;
        md_m_shared_t *shared;
        int ci;
        int compcnt;

        sm = &un->un_sm[smi];
        smic = &un->un_smic[smi];

        compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
        for (ci = 0; ci < compcnt; ci++) {
                shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
                    (sm->sm_dev, sm, ci);

                if (shared->ms_state == CS_LAST_ERRED)
                        return (B_TRUE);

                /*
                 * It is not currently Last Erred; check if it is entering
                 * Last Erred.
                 */
                if ((shared->ms_flags & MDM_S_IOERR) &&
                    ((shared->ms_state == CS_OKAY) ||
                    (shared->ms_state == CS_RESYNC))) {
                        if (mirror_other_sources(un, smi, ci, 0) == 1)
                                return (B_TRUE);
                }
        }

        return (B_FALSE);
}

static int
mirror_done(struct buf *cb)
{
        md_mps_t *ps;
        md_mcs_t *cs;

        /*LINTED*/
        cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
        ps = cs->cs_ps;

        mutex_enter(&ps->ps_mx);

        /* check if we need to retry an errored failfast I/O */
        if (cb->b_flags & B_ERROR) {
                struct buf *pb = ps->ps_bp;

                if (cb->b_flags & B_FAILFAST) {
                        int i;
                        mm_unit_t *un = ps->ps_un;

                        for (i = 0; i < NMIRROR; i++) {
                                if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
                                        continue;

                                if (cb->b_edev ==
                                    md_dev64_to_dev(un->un_sm[i].sm_dev)) {

                                        /*
                                         * This is the submirror that had the
                                         * error. Check if it is Last Erred.
                                         */
                                        if (submirror_is_lasterred(un, i)) {
                                                daemon_queue_t *dqp;

                                                mutex_exit(&ps->ps_mx);
                                                dqp = (daemon_queue_t *)cs;
                                                dqp->dq_prev = NULL;
                                                dqp->dq_next = NULL;
                                                daemon_request(&md_done_daemon,
                                                    last_err_retry, dqp,
                                                    REQ_OLD);
                                                return (1);
                                        }
                                        break;
                                }
                        }
                }

                /* continue to process the buf without doing a retry */
                ps->ps_flags |= MD_MPS_ERROR;
                pb->b_error = cb->b_error;
        }

        return (mirror_done_common(cb));
}

/*
 * Split from the original mirror_done function so we can handle bufs after a
 * retry.
 * ps->ps_mx is already held in the caller of this function and the cb error
 * has already been checked and handled in the caller.
 */
static int
mirror_done_common(struct buf *cb)
{
        struct buf *pb;
        mm_unit_t *un;
        mdi_unit_t *ui;
        md_mps_t *ps;
        md_mcs_t *cs;
        size_t end_rr, start_rr, current_rr;

        /*LINTED*/
        cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
        ps = cs->cs_ps;
        pb = ps->ps_bp;

        if (cb->b_flags & B_REMAPPED)
                bp_mapout(cb);

        ps->ps_frags--;
        if (ps->ps_frags != 0) {
                mutex_exit(&ps->ps_mx);
                kmem_cache_free(mirror_child_cache, cs);
                return (1);
        }
        un = ps->ps_un;
        ui = ps->ps_ui;

        /*
         * Do not update outstanding_writes if we're running with ABR
         * set for this mirror or the write() was issued with MD_STR_ABR set.
         * Also a resync initiated write() has no outstanding_writes update
         * either.
         */
        if (((cb->b_flags & B_READ) == 0) &&
            (un->un_nsm >= 2) &&
            (ps->ps_call == NULL) &&
            !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
            !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
                BLK_TO_RR(end_rr, ps->ps_lastblk, un);
                BLK_TO_RR(start_rr, ps->ps_firstblk, un);
                mutex_enter(&un->un_resync_mx);
                for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
                        un->un_outstanding_writes[current_rr]--;
                mutex_exit(&un->un_resync_mx);
        }
        kmem_cache_free(mirror_child_cache, cs);
        mutex_exit(&ps->ps_mx);

        if (ps->ps_call != NULL) {
                daemon_request(&md_done_daemon, ps->ps_call,
                    (daemon_queue_t *)ps, REQ_OLD);
                return (1);
        }

        if ((ps->ps_flags & MD_MPS_ERROR)) {
                daemon_request(&md_done_daemon, mirror_error,
                    (daemon_queue_t *)ps, REQ_OLD);
                return (1);
        }

        if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                mirror_overlap_tree_remove(ps);

        /*
         * Handle the Write-on-Write problem.
         * Skip in the case of raw and direct I/O, as those are
         * handled earlier.
         */
        if (!(md_mirror_wow_flg & WOW_DISABLE) &&
            !(pb->b_flags & B_READ) &&
            !(ps->ps_flags & MD_MPS_WOW) &&
            !(pb->b_flags & B_PHYS) &&
            any_pages_dirty(pb)) {
                md_unit_readerexit(ps->ps_ui);
                daemon_request(&md_mstr_daemon, handle_wow,
                    (daemon_queue_t *)ps, REQ_OLD);
                return (1);
        }

        md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
        MPS_FREE(mirror_parent_cache, ps);
        md_unit_readerexit(ui);
        md_biodone(pb);
        return (0);
}

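/*
 * mirror_done_common() releases the per-resync-region write counts that
 * the write path takes out before issuing a non-ABR write: BLK_TO_RR()
 * maps a block number to its resync-region index, and every region the
 * request touches is adjusted under un_resync_mx. A sketch of the
 * increment side implied by the decrement loop above (the helper name is
 * hypothetical; compiled out behind the never-defined MD_MIRROR_EXAMPLES
 * guard). The two loops must cover identical region ranges, or a region
 * would appear permanently busy.
 */
#ifdef MD_MIRROR_EXAMPLES
static void
example_count_region_writes(mm_unit_t *un, md_mps_t *ps)
{
        size_t start_rr, end_rr, rr;

        BLK_TO_RR(end_rr, ps->ps_lastblk, un);
        BLK_TO_RR(start_rr, ps->ps_firstblk, un);

        mutex_enter(&un->un_resync_mx);
        for (rr = start_rr; rr <= end_rr; rr++)
                un->un_outstanding_writes[rr]++;
        mutex_exit(&un->un_resync_mx);
}
#endif /* MD_MIRROR_EXAMPLES */
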
/*
 * Clear error state in submirror component if the retry worked after
 * a failfast error.
 */
static void
clear_retry_error(struct buf *cb)
{
        int smi;
        md_mcs_t *cs;
        mm_unit_t *un;
        mdi_unit_t *ui_sm;
        mm_submirror_t *sm;
        mm_submirror_ic_t *smic;
        u_longlong_t cnt;
        md_m_shared_t *shared;

        /*LINTED*/
        cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
        un = cs->cs_ps->ps_un;

        for (smi = 0; smi < NMIRROR; smi++) {
                if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
                        continue;

                if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
                        break;
        }

        if (smi >= NMIRROR)
                return;

        sm = &un->un_sm[smi];
        smic = &un->un_smic[smi];
        cnt = cb->b_bcount;

        ui_sm = MDI_UNIT(getminor(cb->b_edev));
        (void) md_unit_writerlock(ui_sm);

        shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
            cb->b_blkno, &cnt);

        if (shared->ms_flags & MDM_S_IOERR) {
                shared->ms_flags &= ~MDM_S_IOERR;

        } else {
                /* the buf spans components and the first one is not erred */
                int cnt;
                int i;

                cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
                for (i = 0; i < cnt; i++) {
                        shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
                            (sm->sm_dev, sm, i);

                        if (shared->ms_flags & MDM_S_IOERR &&
                            shared->ms_state == CS_OKAY) {

                                shared->ms_flags &= ~MDM_S_IOERR;
                                break;
                        }
                }
        }

        md_unit_writerexit(ui_sm);
}

static size_t
mirror_map_read(
        md_mps_t        *ps,
        md_mcs_t        *cs,
        diskaddr_t      blkno,
        u_longlong_t    count
)
{
        mm_unit_t *un;
        buf_t *bp;
        u_longlong_t cando;

        bp = &cs->cs_buf;
        un = ps->ps_un;

        bp->b_lblkno = blkno;
        if (fast_select_read_unit(ps, cs) == 0) {
                bp->b_bcount = ldbtob(count);
                return (0);
        }
        bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
            count, &cando, 0, NULL, cs));
        bp->b_bcount = ldbtob(cando);
        if (count != cando)
                return (cando);
        return (0);
}

static void
write_after_read(md_mps_t *ps)
{
        struct buf *pb;
        int flags;

        if (ps->ps_flags & MD_MPS_ERROR) {
                mirror_error(ps);
                return;
        }

        pb = ps->ps_bp;
        md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
        ps->ps_call = NULL;
        ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
        flags = MD_STR_NOTTOP | MD_STR_WAR;
        if (ps->ps_flags & MD_MPS_MAPPED)
                flags |= MD_STR_MAPPED;
        if (ps->ps_flags & MD_MPS_NOBLOCK)
                flags |= MD_NOBLOCK;
        if (ps->ps_flags & MD_MPS_DIRTY_RD)
                flags |= MD_STR_DIRTY_RD;
        (void) mirror_write_strategy(pb, flags, ps);
}

static void
continue_serial(md_mps_t *ps)
{
        md_mcs_t *cs;
        buf_t *cb;
        mm_unit_t *un;
        int flags;

        un = ps->ps_un;
        cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
        mirror_child_init(cs);
        cb = &cs->cs_buf;
        ps->ps_call = NULL;
        ps->ps_frags = 1;
        (void) mirror_map_write(un, cs, ps, 0);
        flags = MD_STR_NOTTOP;
        if (ps->ps_flags & MD_MPS_MAPPED)
                flags |= MD_STR_MAPPED;
        md_call_strategy(cb, flags, NULL);
}

static int
mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
{
        int i;
        dev_t dev;      /* needed for bioclone, so not md_dev64_t */
        buf_t *cb;
        buf_t *pb;
        diskaddr_t blkno;
        size_t bcount;
        off_t offset;

        pb = ps->ps_bp;
        cb = &cs->cs_buf;
        cs->cs_ps = ps;

        i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);

        dev = md_dev64_to_dev(un->un_sm[i].sm_dev);

        blkno = pb->b_lblkno;
        bcount = pb->b_bcount;
        offset = 0;
        if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
                blkno = DK_LABEL_LOC + 1;
                /*
                 * This handles the case where we're requesting
                 * a write to block 0 on a label partition
                 * and the request size was smaller than the
                 * size of the label. If this is the case
                 * then we'll return -1. Failure to do so will
                 * either cause the calling thread to hang due to
                 * an ssd bug, or worse if the bcount were allowed
                 * to go negative (i.e. large).
                 */
                if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
                        return (-1);
                bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
                offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
        }

        cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
            cb, KM_NOSLEEP);
        if (war)
                cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;

        /*
         * If the submirror is in the erred state, check if any component is
         * in the Last Erred state. If so, we don't want to use the B_FAILFAST
         * flag on the IO.
         *
         * Provide a fast path for the non-erred case (which should be the
         * normal case).
         */
        if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
                if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
                        mm_submirror_t *sm;
                        mm_submirror_ic_t *smic;
                        int ci;
                        int compcnt;

                        sm = &un->un_sm[i];
                        smic = &un->un_smic[i];

                        compcnt = (*(smic->sm_get_component_count))
                            (sm->sm_dev, un);
                        for (ci = 0; ci < compcnt; ci++) {
                                md_m_shared_t *shared;

                                shared = (md_m_shared_t *)
                                    (*(smic->sm_shared_by_indx))(sm->sm_dev,
                                    sm, ci);

                                if (shared->ms_state == CS_LAST_ERRED)
                                        break;
                        }
                        if (ci >= compcnt)
                                cb->b_flags |= B_FAILFAST;

                } else {
                        cb->b_flags |= B_FAILFAST;
                }
        }

        ps->ps_current_sm++;
        if (ps->ps_current_sm != ps->ps_active_cnt) {
                if (un->un_write_option == WR_SERIAL) {
                        ps->ps_call = continue_serial;
                        return (0);
                }
                return (1);
        }
        return (0);
}

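/*
 * The label-protection branch above rewrites a write-after-read request
 * that starts at block 0 of a labeled metadevice so it cannot clobber the
 * disk label. Assuming DK_LABEL_LOC is 0 (label in the first sector) and
 * DEV_BSIZE is 512, a 4096-byte request at block 0 becomes a 3584-byte
 * write at block 1 with a 512-byte offset into the parent buffer, and a
 * request no larger than the label area is rejected with -1. A worked
 * sketch of that arithmetic, compiled out behind the never-defined
 * MD_MIRROR_EXAMPLES guard:
 */
#ifdef MD_MIRROR_EXAMPLES
static int
example_skip_label(diskaddr_t *blknop, size_t *bcountp, off_t *offsetp)
{
        size_t label_bytes = DEV_BSIZE * (DK_LABEL_LOC + 1);

        if (*bcountp <= label_bytes)
                return (-1);    /* too small; b_bcount would underflow */

        *blknop = DK_LABEL_LOC + 1;
        *bcountp -= label_bytes;
        *offsetp = (off_t)label_bytes;
        return (0);
}
#endif /* MD_MIRROR_EXAMPLES */
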
/*
 * directed_read_done:
 * ------------------
 * Completion routine called when a DMR request has been returned from the
 * underlying driver. Wake up the original ioctl() and return the data to
 * the user.
 */
static void
directed_read_done(md_mps_t *ps)
{
        mm_unit_t *un;
        mdi_unit_t *ui;

        un = ps->ps_un;
        ui = ps->ps_ui;

        md_unit_readerexit(ui);
        md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
        ps->ps_call = NULL;

        mutex_enter(&un->un_dmr_mx);
        cv_signal(&un->un_dmr_cv);
        mutex_exit(&un->un_dmr_mx);

        /* release the parent structure */
        kmem_cache_free(mirror_parent_cache, ps);
}

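/*
 * directed_read_done() is only the signalling half of the DMR handshake;
 * the ioctl path that issued the directed read blocks on the same
 * condition variable until this completion fires. A hedged sketch of what
 * that waiter looks like (the issue callback is hypothetical, and a real
 * waiter would also guard against spurious or early wakeups). Compiled
 * out behind the never-defined MD_MIRROR_EXAMPLES guard.
 */
#ifdef MD_MIRROR_EXAMPLES
static void
example_dmr_wait(mm_unit_t *un, struct buf *bp,
    void (*issue_directed_read)(struct buf *))
{
        mutex_enter(&un->un_dmr_mx);
        (*issue_directed_read)(bp);  /* completion runs directed_read_done */
        cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
        mutex_exit(&un->un_dmr_mx);
}
#endif /* MD_MIRROR_EXAMPLES */
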
/*
 * daemon_io:
 * ------------
 * Called to issue a mirror_write_strategy() or mirror_read_strategy()
 * call from a blockable context. NOTE: no mutex can be held on entry to
 * this routine.
 */
static void
daemon_io(daemon_queue_t *dq)
{
        md_mps_t *ps = (md_mps_t *)dq;
        int flag = MD_STR_NOTTOP;
        buf_t *pb = ps->ps_bp;

        if (ps->ps_flags & MD_MPS_MAPPED)
                flag |= MD_STR_MAPPED;
        if (ps->ps_flags & MD_MPS_WOW)
                flag |= MD_STR_WOW;
        if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
                flag |= MD_STR_WAR;
        if (ps->ps_flags & MD_MPS_ABR)
                flag |= MD_STR_ABR;
        if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
                flag |= MD_STR_BLOCK_OK;

        /*
         * If this is a resync read, i.e. MD_STR_DIRTY_RD is not set, set
         * MD_STR_WAR before calling mirror_read_strategy.
         */
        if (pb->b_flags & B_READ) {
                if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
                        flag |= MD_STR_WAR;
                mirror_read_strategy(pb, flag, ps);
        } else
                mirror_write_strategy(pb, flag, ps);
}

35930Sstevel@tonic-gate /*
35940Sstevel@tonic-gate * update_resync:
35950Sstevel@tonic-gate * -------------
35960Sstevel@tonic-gate * Called to update the in-core version of the resync record with the latest
35970Sstevel@tonic-gate * version that was committed to disk when the previous mirror owner
35980Sstevel@tonic-gate * relinquished ownership. This call is likely to block as we must hold-off
35990Sstevel@tonic-gate * any current resync processing that may be occurring.
36000Sstevel@tonic-gate * On completion of the resync record update we issue the mirror_write_strategy
36010Sstevel@tonic-gate * call to complete the i/o that first started this sequence. To remove a race
36020Sstevel@tonic-gate * condition between a new write() request which is submitted and the resync
36030Sstevel@tonic-gate * record update we acquire the writerlock. This will hold off all i/o to the
36040Sstevel@tonic-gate * mirror until the resync update has completed.
36050Sstevel@tonic-gate * NOTE: no mutex can be held on entry to this routine
36060Sstevel@tonic-gate */
36070Sstevel@tonic-gate static void
update_resync(daemon_queue_t * dq)36080Sstevel@tonic-gate update_resync(daemon_queue_t *dq)
36090Sstevel@tonic-gate {
36100Sstevel@tonic-gate md_mps_t *ps = (md_mps_t *)dq;
36110Sstevel@tonic-gate buf_t *pb = ps->ps_bp;
36120Sstevel@tonic-gate mdi_unit_t *ui = ps->ps_ui;
36138452SJohn.Wren.Kennedy@Sun.COM mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id);
36140Sstevel@tonic-gate set_t setno;
36150Sstevel@tonic-gate int restart_resync;
36160Sstevel@tonic-gate
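        /*
         * Take un_rrp_inflight_mx ahead of the unit writerlock so that
         * only one resync record update can be in flight at a time,
         * while the writerlock itself holds off all other i/o to the
         * mirror until the update is complete.
         */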
        mutex_enter(&un->un_rrp_inflight_mx);
        (void) md_unit_writerlock(ui);
        ps->ps_un = un;
        setno = MD_MIN2SET(getminor(pb->b_edev));
        if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
                /*
                 * Synchronize our in-core view of what regions need to be
                 * resync'd with the on-disk version.
                 */
                mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
                    un->un_dirty_bm);

                /* Region dirty map is now up to date */
        }
        restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
        md_unit_writerexit(ui);
        mutex_exit(&un->un_rrp_inflight_mx);

        /* Restart the resync thread if it was previously blocked */
        if (restart_resync) {
                mutex_enter(&un->un_rs_thread_mx);
                un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
                cv_signal(&un->un_rs_thread_cv);
                mutex_exit(&un->un_rs_thread_mx);
        }
        /* Continue with original deferred i/o */
        daemon_io(dq);
}

/*
 * owner_timeout:
 * -------------
 * Called if the original mdmn_ksend_message() failed and the request is to be
 * retried. Reattempt the original ownership change.
 *
 * NOTE: called at interrupt context (see timeout(9f)).
 */
static void
owner_timeout(void *arg)
{
        daemon_queue_t  *dq = (daemon_queue_t *)arg;

        daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
}

/*
 * become_owner:
 * ------------
 * Called to issue an RPC request to become the owner of the mirror
 * associated with this i/o request. We assume that the ownership request
 * is synchronous, so if it succeeds we will issue the request via
 * mirror_write_strategy().
 * If multiple i/o's are outstanding we will be called from the mirror_daemon
 * service thread.
 * NOTE: no mutex should be held on entry to this routine.
 */
static void
become_owner(daemon_queue_t *dq)
{
        md_mps_t        *ps = (md_mps_t *)dq;
        mm_unit_t       *un = ps->ps_un;
        buf_t           *pb = ps->ps_bp;
        set_t           setno;
        md_mn_kresult_t *kres;
        int             msg_flags = md_mirror_msg_flags;
        md_mps_t        *ps1;

        ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);

        /*
         * If we're already the mirror owner we do not need to send a message
         * but can simply process the i/o request immediately.
         * If we've already sent the request to become owner we requeue the
         * request as we're waiting for the synchronous ownership message to
         * be processed.
         */
        if (MD_MN_MIRROR_OWNER(un)) {
                /*
                 * As the strategy() call will potentially block we need to
                 * punt this to a separate thread and complete this request
                 * as quickly as possible. Note: if we're a read request
                 * this must be a resync; we cannot afford to be queued
                 * behind any intervening i/o requests. In this case we put the
                 * request on the md_mirror_rs_daemon queue.
                 */
                if (pb->b_flags & B_READ) {
                        daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
                            REQ_OLD);
                } else {
                        daemon_request(&md_mirror_io_daemon, daemon_io, dq,
                            REQ_OLD);
                }
        } else {
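                /*
                 * We are not the owner. un_owner_state tracks the
                 * ownership protocol: MM_MN_OWNER_SENT means an
                 * ownership request is already outstanding, and
                 * MM_MN_BECOME_OWNER is set once the message handler
                 * has granted this node ownership.
                 */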
                mutex_enter(&un->un_owner_mx);
                if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
                        md_mn_req_owner_t       *msg;
                        int                     rval = 0;

                        /*
                         * Check to see that we haven't exceeded the maximum
                         * retry count. If we have we fail the i/o as the
                         * comms mechanism has become wedged beyond recovery.
                         */
                        if (dq->qlen++ >= MD_OWNER_RETRIES) {
                                mutex_exit(&un->un_owner_mx);
                                cmn_err(CE_WARN,
                                    "md_mirror: Request exhausted ownership "
                                    "retry limit of %d attempts", dq->qlen);
                                pb->b_error = EIO;
                                pb->b_flags |= B_ERROR;
                                pb->b_resid = pb->b_bcount;
                                kmem_cache_free(mirror_parent_cache, ps);
                                md_biodone(pb);
                                return;
                        }

                        /*
                         * Issue request to change ownership. The call is
                         * synchronous so when it returns we can complete the
                         * i/o (if successful), or enqueue it again so that
                         * the operation will be retried.
                         */
                        un->un_owner_state |= MM_MN_OWNER_SENT;
                        mutex_exit(&un->un_owner_mx);

                        msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
                        setno = MD_MIN2SET(getminor(pb->b_edev));
                        msg->mnum = MD_SID(un);
                        msg->owner = md_mn_mynode_id;
                        msg_flags |= MD_MSGF_NO_LOG;
                        /*
                         * If this IO is triggered by updating a watermark,
                         * it might be issued by the creation of a softpartition
                         * while the commd subsystem is suspended.
                         * We don't want this message to block.
                         */
                        if (ps->ps_flags & MD_MPS_WMUPDATE) {
                                msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
                        }

                        kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
                        rval = mdmn_ksend_message(setno,
                            MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
                            (char *)msg, sizeof (md_mn_req_owner_t), kres);

                        kmem_free(msg, sizeof (md_mn_req_owner_t));

                        if (MDMN_KSEND_MSG_OK(rval, kres)) {
                                dq->qlen = 0;
                                /*
                                 * Successfully changed owner, reread the
                                 * resync record so that we have a valid idea of
                                 * any previously committed incomplete write()s.
                                 * NOTE: As we need to acquire the resync mutex
                                 * this may block, so we defer it to a separate
                                 * thread handler. This makes us (effectively)
                                 * non-blocking once the ownership message
                                 * handling has completed.
                                 */
                                mutex_enter(&un->un_owner_mx);
                                if (un->un_owner_state & MM_MN_BECOME_OWNER) {
                                        un->un_mirror_owner = md_mn_mynode_id;
                                        /* Sets owner of un_rr_dirty record */
                                        if (un->un_rr_dirty_recid)
                                                (void) mddb_setowner(
                                                    un->un_rr_dirty_recid,
                                                    md_mn_mynode_id);
                                        un->un_owner_state &=
                                            ~MM_MN_BECOME_OWNER;
                                        /*
                                         * Release the block on the current
                                         * resync region if it is blocked
                                         */
                                        ps1 = un->un_rs_prev_overlap;
                                        if ((ps1 != NULL) &&
                                            (ps1->ps_flags & MD_MPS_ON_OVERLAP))
                                                mirror_overlap_tree_remove(ps1);
                                        mutex_exit(&un->un_owner_mx);

                                        /*
                                         * If we're a read, this must be a
                                         * resync request, issue
                                         * the i/o request on the
                                         * md_mirror_rs_daemon queue. This is
                                         * to avoid a deadlock between the
                                         * resync_unit thread and
                                         * subsequent i/o requests that may
                                         * block on the resync region.
                                         */
                                        if (pb->b_flags & B_READ) {
                                                daemon_request(
                                                    &md_mirror_rs_daemon,
                                                    update_resync, dq, REQ_OLD);
                                        } else {
                                                daemon_request(
                                                    &md_mirror_io_daemon,
                                                    update_resync, dq, REQ_OLD);
                                        }
                                        kmem_free(kres,
                                            sizeof (md_mn_kresult_t));
                                        return;
                                } else {
                                        /*
                                         * Some other node has beaten us to
                                         * obtain ownership. We need to
                                         * reschedule our ownership request
                                         */
                                        mutex_exit(&un->un_owner_mx);
                                }
                        } else {
                                mdmn_ksend_show_error(rval, kres,
                                    "MD_MN_MSG_REQUIRE_OWNER");
                                /*
                                 * Message transport failure is handled by the
                                 * comms layer. If the ownership change request
                                 * does not succeed we need to flag the error to
                                 * the initiator of the i/o. This is handled by
                                 * the retry logic above. As the request failed
                                 * we do not know _who_ the owner of the mirror
                                 * currently is. We reset our idea of the owner
                                 * to None so that any further write()s will
                                 * attempt to become the owner again. This stops
                                 * multiple nodes writing to the same mirror
                                 * simultaneously.
                                 */
                                mutex_enter(&un->un_owner_mx);
                                un->un_owner_state &=
                                    ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
                                un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
                                mutex_exit(&un->un_owner_mx);
                        }
                        kmem_free(kres, sizeof (md_mn_kresult_t));
                } else
                        mutex_exit(&un->un_owner_mx);

                /*
                 * Re-enqueue this request on the deferred i/o list. Delay the
                 * request for md_mirror_owner_to usecs to stop thrashing.
                 */
                (void) timeout(owner_timeout, dq,
                    drv_usectohz(md_mirror_owner_to));
        }
}

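/*
 * mirror_write_strategy:
 * ---------------------
 * Handle a write request to the mirror: writes are duplicated to all
 * active submirrors via mirror_map_write(). Unless this is an ABR or
 * resync-generated write, the resync region is marked dirty first. For
 * multi-node sets we may also have to become the mirror owner, which is
 * deferred to become_owner() on the md_mirror_daemon queue.
 */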
static void
mirror_write_strategy(buf_t *pb, int flag, void *private)
{
        md_mps_t        *ps;
        md_mcs_t        *cs;
        int             more;
        mm_unit_t       *un;
        mdi_unit_t      *ui;
        buf_t           *cb;            /* child buf pointer */
        set_t           setno;
        int             rs_on_overlap = 0;

        ui = MDI_UNIT(getminor(pb->b_edev));
        un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));

        md_kstat_waitq_enter(ui);

        /*
         * If a state change is in progress for this mirror in a MN set,
         * suspend all non-resync writes until the state change is complete.
         * The objective of this suspend is to ensure that it is not possible
         * for one node to read data from a submirror that another node has
         * not written to because of the state change. Therefore we suspend
         * all writes until the state change has been made. As it is not
         * possible to read from the target of a resync, there is no need to
         * suspend resync writes.
         * Note that we only block here if the caller can tolerate a blocking
         * wait. The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o
         * only.
         */
        if (!(flag & MD_STR_WAR)) {
                if (flag & MD_STR_BLOCK_OK) {
                        mutex_enter(&un->un_suspend_wr_mx);
                        while (un->un_suspend_wr_flag) {
                                cv_wait(&un->un_suspend_wr_cv,
                                    &un->un_suspend_wr_mx);
                        }
                        mutex_exit(&un->un_suspend_wr_mx);
                }
                (void) md_unit_readerlock(ui);
        }

        if (!(flag & MD_STR_NOTTOP)) {
                if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
                        md_kstat_waitq_exit(ui);
                        return;
                }
        }

        setno = MD_MIN2SET(getminor(pb->b_edev));

        /* If an ABR write has been requested, set MD_STR_ABR flag */
        if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
                flag |= MD_STR_ABR;

        if (private == NULL) {
                ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
                mirror_parent_init(ps);
        } else {
                ps = private;
                private = NULL;
        }
        if (flag & MD_STR_MAPPED)
                ps->ps_flags |= MD_MPS_MAPPED;

        if (flag & MD_STR_WOW)
                ps->ps_flags |= MD_MPS_WOW;

        if (flag & MD_STR_ABR)
                ps->ps_flags |= MD_MPS_ABR;

        if (flag & MD_STR_WMUPDATE)
                ps->ps_flags |= MD_MPS_WMUPDATE;

        /*
         * Save essential information from the original buffhdr
         * in the md_save structure.
         */
        ps->ps_un = un;
        ps->ps_ui = ui;
        ps->ps_bp = pb;
        ps->ps_addr = pb->b_un.b_addr;
        ps->ps_firstblk = pb->b_lblkno;
        ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
        ps->ps_changecnt = un->un_changecnt;

        /*
         * Check for suspended writes here. This is where we can defer the
         * write request to the daemon_io queue which will then call us with
         * the MD_STR_BLOCK_OK flag set and we'll block (if necessary) at
         * the top of this routine.
         */
        if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
                mutex_enter(&un->un_suspend_wr_mx);
                if (un->un_suspend_wr_flag) {
                        ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
                        mutex_exit(&un->un_suspend_wr_mx);
                        md_unit_readerexit(ui);
                        daemon_request(&md_mirror_daemon, daemon_io,
                            (daemon_queue_t *)ps, REQ_OLD);
                        return;
                }
                mutex_exit(&un->un_suspend_wr_mx);
        }

        /*
         * If not MN owner and this is an ABR write, make sure the current
         * resync region is in the overlaps tree
         */
        mutex_enter(&un->un_owner_mx);
        if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
            ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
                md_mps_t        *ps1;
                /* Block the current resync region, if not already blocked */
                ps1 = un->un_rs_prev_overlap;

                if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
                    (ps1->ps_lastblk != 0))) {
                        /* Drop locks to avoid deadlock */
                        mutex_exit(&un->un_owner_mx);
                        md_unit_readerexit(ui);
                        wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
                        rs_on_overlap = 1;
                        (void) md_unit_readerlock(ui);
                        mutex_enter(&un->un_owner_mx);
                        /*
                         * Check to see if we have obtained ownership
                         * while waiting for overlaps. If we have, remove
                         * the resync_region entry from the overlap tree
                         */
                        if (MD_MN_MIRROR_OWNER(un) &&
                            (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
                                mirror_overlap_tree_remove(ps1);
                                rs_on_overlap = 0;
                        }
                }
        }
        mutex_exit(&un->un_owner_mx);

        /*
         * The following keeps a write-after-read from writing to the
         * source in the case where it all came from one place.
         */
        if (flag & MD_STR_WAR) {
                int     abort_write = 0;
                /*
                 * We are performing a write-after-read. This is either as a
                 * result of a resync read or as a result of a read in a
                 * dirty resync region when the optimized resync is not
                 * complete. If in a MN set and this is a resync generated
                 * i/o, and the current block is not in the current resync
                 * region, terminate the write as another node must have
                 * completed this resync region.
                 */
                if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
                    !(flag & MD_STR_DIRTY_RD)) {
                        if (!IN_RESYNC_REGION(un, ps))
                                abort_write = 1;
                }
                if ((select_write_after_read_units(un, ps) == 0) ||
                    (abort_write)) {
#ifdef DEBUG
                        if (mirror_debug_flag)
                                printf("Abort resync write on %x, block %lld\n",
                                    MD_SID(un), ps->ps_firstblk);
#endif
                        if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                                mirror_overlap_tree_remove(ps);
                        kmem_cache_free(mirror_parent_cache, ps);
                        md_kstat_waitq_exit(ui);
                        md_unit_readerexit(ui);
                        md_biodone(pb);
                        return;
                }
        } else {
                select_write_units(un, ps);

                /* Drop readerlock to avoid deadlock */
                md_unit_readerexit(ui);
                wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
                un = md_unit_readerlock(ui);
                /*
                 * For a MN set with an ABR write, if we are now the
                 * owner and we have a resync region in the overlap
                 * tree, remove the entry from overlaps and retry the write.
                 */
                if (MD_MNSET_SETNO(setno) &&
                    ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
                        mutex_enter(&un->un_owner_mx);
                        if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
                                mirror_overlap_tree_remove(ps);
                                md_kstat_waitq_exit(ui);
                                mutex_exit(&un->un_owner_mx);
                                md_unit_readerexit(ui);
                                daemon_request(&md_mirror_daemon, daemon_io,
                                    (daemon_queue_t *)ps, REQ_OLD);
                                return;
                        }
                        mutex_exit(&un->un_owner_mx);
                }
        }

        /*
         * For Multinode mirrors with no owner and a Resync Region (not ABR)
         * we need to become the mirror owner before continuing with the
         * write(). For ABR mirrors we check that we 'own' the resync if
         * we're in write-after-read mode. We do this _after_ ensuring that
         * there are no overlaps to ensure that once we know that we are
         * the owner, the readerlock will not be released until the write is
         * complete. As a change of ownership in a MN set requires the
         * writerlock, this ensures that ownership cannot be changed until
         * the write is complete.
         */
        if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
            (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
                if (MD_MN_NO_MIRROR_OWNER(un)) {
                        if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                                mirror_overlap_tree_remove(ps);
                        md_kstat_waitq_exit(ui);
                        ASSERT(!(flag & MD_STR_WAR));
                        md_unit_readerexit(ui);
                        daemon_request(&md_mirror_daemon, become_owner,
                            (daemon_queue_t *)ps, REQ_OLD);
                        return;
                }
        }

        /*
         * Mark resync region if mirror has a Resync Region _and_ we are not
         * a resync initiated write(). Don't mark region if we're flagged as
         * an ABR write.
         */
        if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
            !(flag & MD_STR_WAR)) {
                if (mirror_mark_resync_region(un, ps->ps_firstblk,
                    ps->ps_lastblk, md_mn_mynode_id)) {
                        pb->b_flags |= B_ERROR;
                        pb->b_resid = pb->b_bcount;
                        if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                                mirror_overlap_tree_remove(ps);
                        kmem_cache_free(mirror_parent_cache, ps);
                        md_kstat_waitq_exit(ui);
                        md_unit_readerexit(ui);
                        md_biodone(pb);
                        return;
                }
        }

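        /*
         * Build the flag settings for the child (submirror) buffers:
         * force write semantics, and clear B_PAGEIO for mapped i/o
         * (MD_STR_MAPPED) where the transfer no longer uses a page list.
         */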
        ps->ps_childbflags = pb->b_flags | B_WRITE;
        ps->ps_childbflags &= ~B_READ;
        if (flag & MD_STR_MAPPED)
                ps->ps_childbflags &= ~B_PAGEIO;

        if (!(flag & MD_STR_NOTTOP) && panicstr)
                /* Disable WOW and don't free ps */
                ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);

        md_kstat_waitq_to_runq(ui);

        /*
         * Treat Raw and Direct I/O as Write-on-Write always
         */
        if (!(md_mirror_wow_flg & WOW_DISABLE) &&
            (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
            (pb->b_flags & B_PHYS) &&
            !(ps->ps_flags & MD_MPS_WOW)) {
                if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                        mirror_overlap_tree_remove(ps);
                md_unit_readerexit(ui);
                daemon_request(&md_mstr_daemon, handle_wow,
                    (daemon_queue_t *)ps, REQ_OLD);
                return;
        }

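        /*
         * Issue the child writes. mirror_map_write() sets up each child
         * buf and returns non-zero while further fragments remain, so we
         * keep allocating child structures and calling the underlying
         * strategy routine until the whole request has been issued.
         */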
        ps->ps_frags = 1;
        do {
                cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
                mirror_child_init(cs);
                cb = &cs->cs_buf;
                more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));

                /*
                 * This handles the case where we're requesting
                 * a write to block 0 on a label partition. (more < 0)
                 * means that the request size was smaller than the
                 * size of the label. If so this request is done.
                 */
                if (more < 0) {
                        if (ps->ps_flags & MD_MPS_ON_OVERLAP)
                                mirror_overlap_tree_remove(ps);
                        md_kstat_runq_exit(ui);
                        kmem_cache_free(mirror_child_cache, cs);
                        kmem_cache_free(mirror_parent_cache, ps);
                        md_unit_readerexit(ui);
                        md_biodone(pb);
                        return;
                }
                if (more) {
                        mutex_enter(&ps->ps_mx);
                        ps->ps_frags++;
                        mutex_exit(&ps->ps_mx);
                }
                md_call_strategy(cb, flag, private);
        } while (more);

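        /*
         * When panicking we cannot rely on normal completion processing,
         * so poll the done daemon until all child i/o has completed and
         * then free the parent structure ourselves.
         */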
        if (!(flag & MD_STR_NOTTOP) && panicstr) {
                while (!(ps->ps_flags & MD_MPS_DONE)) {
                        md_daemon(1, &md_done_daemon);
                        drv_usecwait(10);
                }
                kmem_cache_free(mirror_parent_cache, ps);
        }
}

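/*
 * mirror_read_strategy:
 * --------------------
 * Handle a read request to the mirror. A read is normally satisfied from
 * a single submirror, but a resync-generated read, or a read of a dirty
 * region while an optimized resync is outstanding, schedules a
 * write-after-read so that all submirrors converge on the same data.
 * DMR requests are directed at the submirror recorded in
 * un->un_dmr_last_read.
 */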
static void
mirror_read_strategy(buf_t *pb, int flag, void *private)
{
        md_mps_t        *ps;
        md_mcs_t        *cs;
        size_t          more;
        mm_unit_t       *un;
        mdi_unit_t      *ui;
        size_t          current_count;
        diskaddr_t      current_blkno;
        off_t           current_offset;
        buf_t           *cb;            /* child buf pointer */
        set_t           setno;

        ui = MDI_UNIT(getminor(pb->b_edev));

        md_kstat_waitq_enter(ui);

        un = (mm_unit_t *)md_unit_readerlock(ui);

        if (!(flag & MD_STR_NOTTOP)) {
                if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
                        md_kstat_waitq_exit(ui);
                        return;
                }
        }

        if (private == NULL) {
                ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
                mirror_parent_init(ps);
        } else {
                ps = private;
                private = NULL;
        }

        if (flag & MD_STR_MAPPED)
                ps->ps_flags |= MD_MPS_MAPPED;
        if (flag & MD_NOBLOCK)
                ps->ps_flags |= MD_MPS_NOBLOCK;
        if (flag & MD_STR_WMUPDATE)
                ps->ps_flags |= MD_MPS_WMUPDATE;

        /*
         * Check to see if this is a DMR driven read. If so we need to use the
         * specified side (in un->un_dmr_last_read) for the source of the data.
         */
        if (flag & MD_STR_DMR)
                ps->ps_flags |= MD_MPS_DMR;

        /*
         * Save essential information from the original buffhdr
         * in the md_save structure.
         */
        ps->ps_un = un;
        ps->ps_ui = ui;
        ps->ps_bp = pb;
        ps->ps_addr = pb->b_un.b_addr;
        ps->ps_firstblk = pb->b_lblkno;
        ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
        ps->ps_changecnt = un->un_changecnt;

        current_count = btodb(pb->b_bcount);
        current_blkno = pb->b_lblkno;
        current_offset = 0;

        /*
         * If flag has MD_STR_WAR set this means that the read is issued by a
         * resync thread which may or may not be an optimized resync.
         *
         * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
         * code has not completed; either a resync has not started since snarf,
         * or there is an optimized resync in progress.
         *
         * We need to generate a write after this read in the following two
         * cases,
         *
         * 1. Any Resync-Generated read
         *
         * 2. Any read to a DIRTY REGION if there is an optimized resync
         *    pending or in progress.
         *
         * The write after read is done in these cases to ensure that all sides
         * of the mirror are in sync with the read data and that it is not
         * possible for an application to read the same block multiple times
         * and get different data.
         *
         * This would be possible if the block was in a dirty region.
         *
         * If we're performing a directed read we don't write the data out as
         * the application is responsible for restoring the mirror to a known
         * state.
         */
        if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
            !(flag & MD_STR_DMR)) {
                size_t  start_rr, i, end_rr;
                int     region_dirty = 1;

                /*
                 * We enter here under three circumstances,
                 *
                 *      MD_UN_OPT_NOT_DONE      MD_STR_WAR
                 *              0                   1
                 *              1                   0
                 *              1                   1
                 *
                 * To be optimal we only care to explicitly check for dirty
                 * regions in the second case since if MD_STR_WAR is set we
                 * always do the write after read.
                 */
                if (!(flag & MD_STR_WAR)) {
                        BLK_TO_RR(end_rr, ps->ps_lastblk, un);
                        BLK_TO_RR(start_rr, ps->ps_firstblk, un);

                        for (i = start_rr; i <= end_rr; i++)
                                if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
                                        break;
                }

                if ((region_dirty) &&
                    !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
                        ps->ps_call = write_after_read;
                        /*
                         * Mark this as a RESYNC_READ in ps_flags.
                         * This is used if the read fails during a
                         * resync of a 3-way mirror to ensure that
                         * the retried read to the remaining
                         * good submirror has MD_STR_WAR set. This
                         * is needed to ensure that the resync write
                         * (write-after-read) takes place.
                         */
                        ps->ps_flags |= MD_MPS_RESYNC_READ;

                        /*
                         * If MD_STR_FLAG_ERR is set in the flags we
                         * set MD_MPS_FLAG_ERROR so that an error on the resync
                         * write (issued by write_after_read) will be flagged
                         * to the biowait'ing resync thread. This allows us to
                         * avoid issuing further resync requests to a device
                         * that has had a write failure.
                         */
                        if (flag & MD_STR_FLAG_ERR)
                                ps->ps_flags |= MD_MPS_FLAG_ERROR;

                        setno = MD_UN2SET(un);
                        /*
                         * Drop the readerlock to avoid
                         * deadlock
                         */
                        md_unit_readerexit(ui);
                        wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
                        un = md_unit_readerlock(ui);
                        /*
                         * Ensure that we are owner
                         */
                        if (MD_MNSET_SETNO(setno)) {
                                /*
                                 * For a non-resync read that requires a
                                 * write-after-read to be done, set a flag
                                 * in the parent structure, so that the
                                 * write_strategy routine can omit the
                                 * test that the write is still within the
                                 * resync region
                                 */
                                if (!(flag & MD_STR_WAR))
                                        ps->ps_flags |= MD_MPS_DIRTY_RD;

                                /*
                                 * Before reading the buffer, see if
                                 * there is an owner.
                                 */
                                if (MD_MN_NO_MIRROR_OWNER(un)) {
                                        ps->ps_call = NULL;
                                        mirror_overlap_tree_remove(ps);
                                        md_kstat_waitq_exit(ui);
                                        md_unit_readerexit(ui);
                                        daemon_request(
                                            &md_mirror_daemon,
                                            become_owner,
                                            (daemon_queue_t *)ps,
                                            REQ_OLD);
                                        return;
                                }
                                /*
                                 * For a resync read, check to see if I/O is
                                 * outside of the current resync region, or
                                 * the resync has finished. If so
                                 * just terminate the I/O
                                 */
                                if ((flag & MD_STR_WAR) &&
                                    (!(un->c.un_status & MD_UN_WAR) ||
                                    (!IN_RESYNC_REGION(un, ps)))) {
#ifdef DEBUG
                                        if (mirror_debug_flag)
                                                printf("Abort resync read "
                                                    "%x: %lld\n",
                                                    MD_SID(un),
                                                    ps->ps_firstblk);
#endif
                                        mirror_overlap_tree_remove(ps);
                                        kmem_cache_free(mirror_parent_cache,
                                            ps);
                                        md_kstat_waitq_exit(ui);
                                        md_unit_readerexit(ui);
                                        md_biodone(pb);
                                        return;
                                }
                        }
                }
        }

        if (flag & MD_STR_DMR) {
                ps->ps_call = directed_read_done;
        }

        if (!(flag & MD_STR_NOTTOP) && panicstr)
                ps->ps_flags |= MD_MPS_DONTFREE;

        md_kstat_waitq_to_runq(ui);

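        /*
         * Fragment the read if it cannot be satisfied by a single child
         * i/o: md_bioclone() carves each child buf out of the parent
         * request and mirror_map_read() returns non-zero while further
         * fragments remain to be issued.
         */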
        ps->ps_frags++;
        do {
                cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
                mirror_child_init(cs);
                cb = &cs->cs_buf;
                cs->cs_ps = ps;

                cb = md_bioclone(pb, current_offset, current_count, NODEV,
                    current_blkno, mirror_done, cb, KM_NOSLEEP);

                more = mirror_map_read(ps, cs, current_blkno,
                    (u_longlong_t)current_count);
                if (more) {
                        mutex_enter(&ps->ps_mx);
                        ps->ps_frags++;
                        mutex_exit(&ps->ps_mx);
                }

                /*
                 * Do these calculations now,
                 * so that we pick up a valid b_bcount from the child bp.
                 */
                current_count -= more;
                current_offset += cb->b_bcount;
                current_blkno += more;
                md_call_strategy(cb, flag, private);
        } while (more);

        if (!(flag & MD_STR_NOTTOP) && panicstr) {
                while (!(ps->ps_flags & MD_MPS_DONE)) {
                        md_daemon(1, &md_done_daemon);
                        drv_usecwait(10);
                }
                kmem_cache_free(mirror_parent_cache, ps);
        }
}

void
md_mirror_strategy(buf_t *bp, int flag, void *private)
{
        set_t   setno = MD_MIN2SET(getminor(bp->b_edev));

        /*
         * When doing IO to a multi owner meta device, check if set is halted.
         * We do this check without the needed lock held, for performance
         * reasons.
         * If an IO just slips through while the set is locked via an
         * MD_MN_SUSPEND_SET, we don't care about it.
         * Only check for suspension if we are a top-level i/o request
         * (MD_STR_NOTTOP is cleared in 'flag').
         */
        if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
            (MD_SET_HALTED | MD_SET_MNSET)) {
                if ((flag & MD_STR_NOTTOP) == 0) {
                        mutex_enter(&md_mx);
                        /* Here we loop until the set is no longer halted */
                        while (md_set[setno].s_status & MD_SET_HALTED) {
                                cv_wait(&md_cv, &md_mx);
                        }
                        mutex_exit(&md_mx);
                }
        }

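        /*
         * Account this request against the set's i/o count (used to
         * drain outstanding i/o when a set is suspended). If the count
         * cannot be taken, the request fails with ENXIO; MD_NOBLOCK
         * requests use the non-blocking form of the accounting.
         */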
        if ((flag & MD_IO_COUNTED) == 0) {
                if ((flag & MD_NOBLOCK) == 0) {
                        if (md_inc_iocount(setno) != 0) {
                                bp->b_flags |= B_ERROR;
                                bp->b_error = ENXIO;
                                bp->b_resid = bp->b_bcount;
                                biodone(bp);
                                return;
                        }
                } else {
                        md_inc_iocount_noblock(setno);
                }
        }

        if (bp->b_flags & B_READ)
                mirror_read_strategy(bp, flag, private);
        else
                mirror_write_strategy(bp, flag, private);
}

/*
 * mirror_directed_read:
 * --------------------
 * Entry point for the DKIOCDMR ioctl. We issue a read to a specified
 * submirror so that the application can determine what (if any) resync
 * needs to be performed. The data is copied out to the user-supplied buffer.
 *
 * Parameters:
 *      mdev    - dev_t for the mirror device
 *      vdr     - directed read parameters specifying location and submirror
 *                to perform the read from
 *      mode    - used to ddi_copyout() any resulting data from the read
 *
 * Returns:
 *      0       success
 *      !0      error code
 *      EINVAL  - invalid request format
 */
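/*
 * Illustrative sketch (not taken from the original source) of a
 * user-level loop driving this ioctl, after vdr_data, vdr_nbytes and
 * vdr_offset have been initialized:
 *
 *      vdr.vdr_side = DKV_SIDE_INIT;
 *      do {
 *              vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *              if (ioctl(fd, DKIOCDMR, &vdr) < 0)
 *                      break;
 *              (compare vdr.vdr_data across sides here)
 *      } while (!(vdr.vdr_flags & (DKV_DMR_DONE | DKV_DMR_ERROR)));
 */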
int
mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
{
        buf_t           *bp;
        minor_t         mnum = getminor(mdev);
        mdi_unit_t      *ui = MDI_UNIT(mnum);
        mm_unit_t       *un;
        mm_submirror_t  *sm;
        char            *sm_nm;
        uint_t          next_side;
        void            *kbuffer;

        if (ui == NULL)
                return (ENXIO);

        if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
                return (EINVAL);
        }

        /* Check for aligned block access. We disallow non-aligned requests. */
        if (vdr->vdr_offset % DEV_BSIZE) {
                return (EINVAL);
        }

        /*
         * Allocate kernel buffer for target of read(). If we had a reliable
         * (sorry functional) DDI this wouldn't be needed.
         */
        kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
        if (kbuffer == NULL) {
                cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
                    " bytes\n", vdr->vdr_nbytes);
                return (ENOMEM);
        }

        bp = getrbuf(KM_SLEEP);

        bp->b_un.b_addr = kbuffer;
        bp->b_flags = B_READ;
        bp->b_bcount = vdr->vdr_nbytes;
        bp->b_lblkno = lbtodb(vdr->vdr_offset);
        bp->b_edev = mdev;

        un = md_unit_readerlock(ui);

        /*
         * If DKV_SIDE_INIT is set we need to determine the first available
         * side to start reading from. If it isn't set we increment to the
         * next readable submirror.
         * If there are no readable submirrors we error out with DKV_DMR_ERROR.
         * Note: we check for a readable submirror on completion of the i/o so
         * we should _always_ have one available. If this becomes unavailable
         * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
         * a metadetach is made between the completion of one DKIOCDMR ioctl
         * and the start of the next (i.e. a sys-admin 'accident' occurred).
         * The chance of this is small, but not non-existent.
         */
        if (vdr->vdr_side == DKV_SIDE_INIT) {
                next_side = 0;
        } else {
                next_side = vdr->vdr_side + 1;
        }
        while ((next_side < NMIRROR) &&
            !SUBMIRROR_IS_READABLE(un, next_side))
                next_side++;
        if (next_side >= NMIRROR) {
                vdr->vdr_flags |= DKV_DMR_ERROR;
                freerbuf(bp);
                /* Also release the kernel read buffer to avoid leaking it */
                kmem_free(kbuffer, vdr->vdr_nbytes);
                vdr->vdr_bytesread = 0;
                md_unit_readerexit(ui);
                return (0);
        }

        /* Set the side to read from */
        un->un_dmr_last_read = next_side;

        md_unit_readerexit(ui);

        /*
         * Save timestamp for verification purposes. Can be read by debugger
         * to verify that this ioctl has been executed and to find the number
         * of DMR reads and the time of the last DMR read.
         */
        uniqtime(&mirror_dmr_stats.dmr_timestamp);
        mirror_dmr_stats.dmr_count++;

        /* Issue READ request and wait for completion */
        mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);

        mutex_enter(&un->un_dmr_mx);
        cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
        mutex_exit(&un->un_dmr_mx);

        /*
         * Check to see if we encountered an error during the read. If so we
         * can make no guarantee about any possibly returned data.
         */
        if ((bp->b_flags & B_ERROR) == 0) {
                vdr->vdr_flags &= ~DKV_DMR_ERROR;
                if (bp->b_resid) {
                        vdr->vdr_flags |= DKV_DMR_SHORT;
                        vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
                } else {
                        vdr->vdr_flags |= DKV_DMR_SUCCESS;
                        vdr->vdr_bytesread = vdr->vdr_nbytes;
                }
                /* Copy the data read back out to the user supplied buffer */
                if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
                    mode)) {
                        /* Release the raw buf as well to avoid a leak */
                        freerbuf(bp);
                        kmem_free(kbuffer, vdr->vdr_nbytes);
                        return (EFAULT);
                }

        } else {
                /* Error out with DKV_DMR_ERROR */
                vdr->vdr_flags |= DKV_DMR_ERROR;
                vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
        }
        /*
         * Update the DMR parameters with the side and name of submirror that
         * we have just read from (un->un_dmr_last_read)
         */
        un = md_unit_readerlock(ui);

        vdr->vdr_side = un->un_dmr_last_read;
        sm = &un->un_sm[un->un_dmr_last_read];
        sm_nm = md_shortname(md_getminor(sm->sm_dev));

        (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));

        /*
         * Determine if we've completed the read cycle. This is true iff the
         * next computed submirror (side) equals or exceeds NMIRROR. We cannot
         * use un_nsm as we need to handle a sparse array of submirrors (which
         * can occur if a submirror is metadetached).
         */
        next_side = un->un_dmr_last_read + 1;
        while ((next_side < NMIRROR) &&
            !SUBMIRROR_IS_READABLE(un, next_side))
                next_side++;
        if (next_side >= NMIRROR) {
                /* We've finished */
                vdr->vdr_flags |= DKV_DMR_DONE;
        }

        md_unit_readerexit(ui);
        freerbuf(bp);
        kmem_free(kbuffer, vdr->vdr_nbytes);

        return (0);
}

/*
 * mirror_resync_message:
 * ---------------------
 * Handle the multi-node resync messages that keep all nodes within a given
 * disk-set in sync with their view of a mirror's resync status.
 *
 * The message types dealt with are:
 * MD_MN_MSG_RESYNC_STARTING    - start a resync thread for a unit
 * MD_MN_MSG_RESYNC_NEXT        - specified next region to be resynced
 * MD_MN_MSG_RESYNC_FINISH      - stop the resync thread for a unit
 * MD_MN_MSG_RESYNC_PHASE_DONE  - end of a resync phase, opt, submirror or comp
 *
 * Returns:
 *      0       Success
 *      >0      Failure error number
 */
46670Sstevel@tonic-gate int
mirror_resync_message(md_mn_rs_params_t * p,IOLOCK * lockp)46680Sstevel@tonic-gate mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
46690Sstevel@tonic-gate {
46700Sstevel@tonic-gate mdi_unit_t *ui;
46710Sstevel@tonic-gate mm_unit_t *un;
46720Sstevel@tonic-gate set_t setno;
46730Sstevel@tonic-gate int is_ABR;
46740Sstevel@tonic-gate int smi;
46750Sstevel@tonic-gate int ci;
46760Sstevel@tonic-gate sm_state_t state;
46770Sstevel@tonic-gate int broke_out;
46780Sstevel@tonic-gate mm_submirror_t *sm;
46790Sstevel@tonic-gate mm_submirror_ic_t *smic;
46800Sstevel@tonic-gate md_m_shared_t *shared;
46810Sstevel@tonic-gate md_error_t mde = mdnullerror;
46820Sstevel@tonic-gate md_mps_t *ps;
46830Sstevel@tonic-gate int rs_active;
46848452SJohn.Wren.Kennedy@Sun.COM int rr, rr_start, rr_end;
46850Sstevel@tonic-gate
46860Sstevel@tonic-gate /* Check that the given device is part of a multi-node set */
46870Sstevel@tonic-gate setno = MD_MIN2SET(p->mnum);
46880Sstevel@tonic-gate if (setno >= md_nsets) {
46890Sstevel@tonic-gate return (ENXIO);
46900Sstevel@tonic-gate }
46910Sstevel@tonic-gate if (!MD_MNSET_SETNO(setno)) {
46920Sstevel@tonic-gate return (EINVAL);
46930Sstevel@tonic-gate }
46940Sstevel@tonic-gate
46950Sstevel@tonic-gate if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
46960Sstevel@tonic-gate return (EINVAL);
46970Sstevel@tonic-gate if ((ui = MDI_UNIT(p->mnum)) == NULL)
46980Sstevel@tonic-gate return (EINVAL);
46990Sstevel@tonic-gate is_ABR = (ui->ui_tstate & MD_ABR_CAP);
47000Sstevel@tonic-gate
47010Sstevel@tonic-gate /* Obtain the current resync status */
47020Sstevel@tonic-gate (void) md_ioctl_readerlock(lockp, ui);
47030Sstevel@tonic-gate rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
47040Sstevel@tonic-gate md_ioctl_readerexit(lockp);
47050Sstevel@tonic-gate
47060Sstevel@tonic-gate switch ((md_mn_msgtype_t)p->msg_type) {
47070Sstevel@tonic-gate case MD_MN_MSG_RESYNC_STARTING:
47080Sstevel@tonic-gate /* Start the resync thread for the mirror */
47090Sstevel@tonic-gate (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
47100Sstevel@tonic-gate break;
47110Sstevel@tonic-gate
47120Sstevel@tonic-gate case MD_MN_MSG_RESYNC_NEXT:
47130Sstevel@tonic-gate /*
47140Sstevel@tonic-gate * We have to release any previously marked overlap regions
47150Sstevel@tonic-gate * so that i/o can resume. Then we need to block the region
47160Sstevel@tonic-gate * from [rs_start..rs_start+rs_size) so that no i/o is issued.
47170Sstevel@tonic-gate * Update un_rs_resync_done and un_rs_resync_2_do.
47180Sstevel@tonic-gate */
47190Sstevel@tonic-gate (void) md_ioctl_readerlock(lockp, ui);
47200Sstevel@tonic-gate /*
47210Sstevel@tonic-gate * Ignore the message if there is no active resync thread or
47220Sstevel@tonic-gate * if it is for a resync type that we have already completed.
47230Sstevel@tonic-gate * un_resync_completed is set to the last resync completed
47240Sstevel@tonic-gate * when processing a PHASE_DONE message.
47250Sstevel@tonic-gate */
47260Sstevel@tonic-gate if (!rs_active || (p->rs_type == un->un_resync_completed))
47270Sstevel@tonic-gate break;
47280Sstevel@tonic-gate /*
47290Sstevel@tonic-gate * If this message is for the same resync and is for an earlier
47300Sstevel@tonic-gate * resync region, just ignore it. This can only occur if this
47310Sstevel@tonic-gate * node has progressed on to the next resync region before
47320Sstevel@tonic-gate * we receive this message. This can occur if the class for
47330Sstevel@tonic-gate * this message is busy and the originator has to retry thus
47340Sstevel@tonic-gate * allowing this node to move onto the next resync_region.
47350Sstevel@tonic-gate */
47360Sstevel@tonic-gate if ((p->rs_type == un->un_rs_type) &&
47370Sstevel@tonic-gate (p->rs_start < un->un_resync_startbl))
47380Sstevel@tonic-gate break;
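/*
 * Hypothetical illustration of the check above: if this node has already
 * blocked the region starting at block 4096 (un_resync_startbl == 4096)
 * and a retried RESYNC_NEXT for the same rs_type arrives with
 * rs_start == 2048, the message describes an earlier region and is
 * ignored.
 */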
47396901Sjkennedy ps = un->un_rs_prev_overlap;
47400Sstevel@tonic-gate
47410Sstevel@tonic-gate /* Allocate previous overlap reference if needed */
47420Sstevel@tonic-gate if (ps == NULL) {
47430Sstevel@tonic-gate ps = kmem_cache_alloc(mirror_parent_cache,
47446901Sjkennedy MD_ALLOCFLAGS);
47450Sstevel@tonic-gate ps->ps_un = un;
47460Sstevel@tonic-gate ps->ps_ui = ui;
47470Sstevel@tonic-gate ps->ps_firstblk = 0;
47480Sstevel@tonic-gate ps->ps_lastblk = 0;
47490Sstevel@tonic-gate ps->ps_flags = 0;
47500Sstevel@tonic-gate md_ioctl_readerexit(lockp);
47510Sstevel@tonic-gate (void) md_ioctl_writerlock(lockp, ui);
47526901Sjkennedy un->un_rs_prev_overlap = ps;
47530Sstevel@tonic-gate md_ioctl_writerexit(lockp);
47540Sstevel@tonic-gate } else
47550Sstevel@tonic-gate md_ioctl_readerexit(lockp);
47560Sstevel@tonic-gate
47570Sstevel@tonic-gate if (p->rs_originator != md_mn_mynode_id) {
47580Sstevel@tonic-gate /*
47598452SJohn.Wren.Kennedy@Sun.COM * Clear our un_resync_bm for the regions completed.
47608452SJohn.Wren.Kennedy@Sun.COM * The owner (originator) will take care of itself.
47618452SJohn.Wren.Kennedy@Sun.COM */
47628452SJohn.Wren.Kennedy@Sun.COM BLK_TO_RR(rr_end, ps->ps_lastblk, un);
47638452SJohn.Wren.Kennedy@Sun.COM BLK_TO_RR(rr_start, p->rs_start, un);
47648452SJohn.Wren.Kennedy@Sun.COM if (ps->ps_lastblk && rr_end < rr_start) {
47658452SJohn.Wren.Kennedy@Sun.COM BLK_TO_RR(rr_start, ps->ps_firstblk, un);
47668452SJohn.Wren.Kennedy@Sun.COM mutex_enter(&un->un_resync_mx);
47678452SJohn.Wren.Kennedy@Sun.COM /*
47688452SJohn.Wren.Kennedy@Sun.COM * Update our resync bitmap to reflect that
47698452SJohn.Wren.Kennedy@Sun.COM * another node has synchronized this range.
47708452SJohn.Wren.Kennedy@Sun.COM */
47718452SJohn.Wren.Kennedy@Sun.COM for (rr = rr_start; rr <= rr_end; rr++) {
47728452SJohn.Wren.Kennedy@Sun.COM CLR_KEEPDIRTY(rr, un);
47738452SJohn.Wren.Kennedy@Sun.COM }
47748452SJohn.Wren.Kennedy@Sun.COM mutex_exit(&un->un_resync_mx);
47758452SJohn.Wren.Kennedy@Sun.COM }
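/*
 * Worked example, assuming BLK_TO_RR maps a block number to its
 * resync-region index (block number divided by the region size): with
 * 1024-block regions, a previous overlap of ps_firstblk 2048 to
 * ps_lastblk 4095 spans regions 2..3, so CLR_KEEPDIRTY is applied to
 * regions 2 and 3.
 */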
47768452SJohn.Wren.Kennedy@Sun.COM
47778452SJohn.Wren.Kennedy@Sun.COM /*
47780Sstevel@tonic-gate * On all but the originating node, first update
47790Sstevel@tonic-gate * the resync state, then unblock the previous
47800Sstevel@tonic-gate * region and block the next one. No need
47810Sstevel@tonic-gate * to do this if the region is already blocked.
47820Sstevel@tonic-gate * Update the submirror state and flags from the
47830Sstevel@tonic-gate * originator. This keeps the cluster in sync with
47840Sstevel@tonic-gate * regards to the resync status.
47850Sstevel@tonic-gate */
47860Sstevel@tonic-gate
47870Sstevel@tonic-gate (void) md_ioctl_writerlock(lockp, ui);
47880Sstevel@tonic-gate un->un_rs_resync_done = p->rs_done;
47890Sstevel@tonic-gate un->un_rs_resync_2_do = p->rs_2_do;
47900Sstevel@tonic-gate un->un_rs_type = p->rs_type;
47910Sstevel@tonic-gate un->un_resync_startbl = p->rs_start;
47920Sstevel@tonic-gate md_ioctl_writerexit(lockp);
47930Sstevel@tonic-gate /*
47940Sstevel@tonic-gate * Use un_owner_mx to ensure that an ownership change
47950Sstevel@tonic-gate * cannot happen at the same time as this message
47960Sstevel@tonic-gate */
47970Sstevel@tonic-gate mutex_enter(&un->un_owner_mx);
47980Sstevel@tonic-gate if (MD_MN_MIRROR_OWNER(un)) {
47990Sstevel@tonic-gate ps->ps_firstblk = p->rs_start;
48000Sstevel@tonic-gate ps->ps_lastblk = ps->ps_firstblk +
48010Sstevel@tonic-gate p->rs_size - 1;
48020Sstevel@tonic-gate } else {
48030Sstevel@tonic-gate if ((ps->ps_firstblk != p->rs_start) ||
48040Sstevel@tonic-gate (ps->ps_lastblk != p->rs_start +
48050Sstevel@tonic-gate p->rs_size - 1)) {
48060Sstevel@tonic-gate /* Remove previous overlap range */
48070Sstevel@tonic-gate if (ps->ps_flags & MD_MPS_ON_OVERLAP)
48086901Sjkennedy mirror_overlap_tree_remove(ps);
48090Sstevel@tonic-gate
48100Sstevel@tonic-gate ps->ps_firstblk = p->rs_start;
48110Sstevel@tonic-gate ps->ps_lastblk = ps->ps_firstblk +
48120Sstevel@tonic-gate p->rs_size - 1;
48130Sstevel@tonic-gate
48140Sstevel@tonic-gate mutex_exit(&un->un_owner_mx);
48150Sstevel@tonic-gate /* Block this range from all i/o. */
48160Sstevel@tonic-gate if (ps->ps_firstblk != 0 ||
48170Sstevel@tonic-gate ps->ps_lastblk != 0)
48180Sstevel@tonic-gate wait_for_overlaps(ps,
48190Sstevel@tonic-gate MD_OVERLAP_ALLOW_REPEAT);
48200Sstevel@tonic-gate mutex_enter(&un->un_owner_mx);
48210Sstevel@tonic-gate /*
48220Sstevel@tonic-gate * Check to see if we have obtained
48230Sstevel@tonic-gate * ownership while waiting for
48240Sstevel@tonic-gate * overlaps. If we have, remove
48250Sstevel@tonic-gate * the resync_region entry from the
48266901Sjkennedy * overlap tree
48270Sstevel@tonic-gate */
48280Sstevel@tonic-gate if (MD_MN_MIRROR_OWNER(un) &&
48290Sstevel@tonic-gate (ps->ps_flags & MD_MPS_ON_OVERLAP))
48306901Sjkennedy mirror_overlap_tree_remove(ps);
48310Sstevel@tonic-gate }
48320Sstevel@tonic-gate }
48330Sstevel@tonic-gate mutex_exit(&un->un_owner_mx);
48340Sstevel@tonic-gate
48350Sstevel@tonic-gate /*
48360Sstevel@tonic-gate * If this is the first RESYNC_NEXT message (i.e.
48370Sstevel@tonic-gate * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
48380Sstevel@tonic-gate * issue RESYNC_START NOTIFY event
48390Sstevel@tonic-gate */
48400Sstevel@tonic-gate if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
48410Sstevel@tonic-gate SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
48420Sstevel@tonic-gate SVM_TAG_METADEVICE, MD_UN2SET(un),
48430Sstevel@tonic-gate MD_SID(un));
48440Sstevel@tonic-gate }
48450Sstevel@tonic-gate
48460Sstevel@tonic-gate /* Ensure that our local resync thread is running */
48470Sstevel@tonic-gate if (un->un_rs_thread == NULL) {
48480Sstevel@tonic-gate (void) mirror_resync_unit(p->mnum, NULL,
48490Sstevel@tonic-gate &p->mde, lockp);
48500Sstevel@tonic-gate }
48510Sstevel@tonic-gate }
48528452SJohn.Wren.Kennedy@Sun.COM
48530Sstevel@tonic-gate break;
48540Sstevel@tonic-gate case MD_MN_MSG_RESYNC_FINISH:
48550Sstevel@tonic-gate /*
48560Sstevel@tonic-gate * Complete the resync by stopping the resync thread.
48570Sstevel@tonic-gate * Also release the previous overlap region field.
48580Sstevel@tonic-gate * Wake the resync_progress_thread by cv_signal'ing it so
48590Sstevel@tonic-gate * that we mark the end of the resync as soon as possible. This
48600Sstevel@tonic-gate * avoids an unnecessary delay should we panic after resync
48610Sstevel@tonic-gate * completion.
48620Sstevel@tonic-gate */
48630Sstevel@tonic-gate #ifdef DEBUG
48640Sstevel@tonic-gate if (!rs_active) {
48650Sstevel@tonic-gate if (mirror_debug_flag)
48660Sstevel@tonic-gate printf("RESYNC_FINISH (mnum = %x), "
48670Sstevel@tonic-gate "Resync *NOT* active",
48680Sstevel@tonic-gate p->mnum);
48690Sstevel@tonic-gate }
48700Sstevel@tonic-gate #endif
48710Sstevel@tonic-gate
48720Sstevel@tonic-gate if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
48730Sstevel@tonic-gate (p->rs_originator != md_mn_mynode_id)) {
48740Sstevel@tonic-gate mutex_enter(&un->un_rs_thread_mx);
48750Sstevel@tonic-gate un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
48760Sstevel@tonic-gate un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
48770Sstevel@tonic-gate un->un_rs_thread_flags &=
48780Sstevel@tonic-gate ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
48790Sstevel@tonic-gate cv_signal(&un->un_rs_thread_cv);
48800Sstevel@tonic-gate mutex_exit(&un->un_rs_thread_mx);
48810Sstevel@tonic-gate }
48820Sstevel@tonic-gate if (is_ABR) {
48830Sstevel@tonic-gate /* Resync finished, if ABR set owner to NULL */
48840Sstevel@tonic-gate mutex_enter(&un->un_owner_mx);
48850Sstevel@tonic-gate un->un_mirror_owner = 0;
48860Sstevel@tonic-gate mutex_exit(&un->un_owner_mx);
48870Sstevel@tonic-gate }
48880Sstevel@tonic-gate (void) md_ioctl_writerlock(lockp, ui);
48896901Sjkennedy ps = un->un_rs_prev_overlap;
48900Sstevel@tonic-gate if (ps != NULL) {
48910Sstevel@tonic-gate /* Remove previous overlap range */
48920Sstevel@tonic-gate if (ps->ps_flags & MD_MPS_ON_OVERLAP)
48936901Sjkennedy mirror_overlap_tree_remove(ps);
48940Sstevel@tonic-gate /*
48950Sstevel@tonic-gate * Release the overlap range reference
48960Sstevel@tonic-gate */
48976901Sjkennedy un->un_rs_prev_overlap = NULL;
48980Sstevel@tonic-gate kmem_cache_free(mirror_parent_cache,
48990Sstevel@tonic-gate ps);
49000Sstevel@tonic-gate }
49010Sstevel@tonic-gate md_ioctl_writerexit(lockp);
49020Sstevel@tonic-gate
49030Sstevel@tonic-gate /* Mark the resync as complete in the metadb */
49040Sstevel@tonic-gate un->un_rs_resync_done = p->rs_done;
49050Sstevel@tonic-gate un->un_rs_resync_2_do = p->rs_2_do;
49060Sstevel@tonic-gate un->un_rs_type = p->rs_type;
49070Sstevel@tonic-gate mutex_enter(&un->un_rs_progress_mx);
49080Sstevel@tonic-gate cv_signal(&un->un_rs_progress_cv);
49090Sstevel@tonic-gate mutex_exit(&un->un_rs_progress_mx);
49100Sstevel@tonic-gate
49110Sstevel@tonic-gate un = md_ioctl_writerlock(lockp, ui);
49120Sstevel@tonic-gate un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
49130Sstevel@tonic-gate /* Deal with any pending grow_unit */
49140Sstevel@tonic-gate if (un->c.un_status & MD_UN_GROW_PENDING) {
49150Sstevel@tonic-gate if ((mirror_grow_unit(un, &mde) != 0) ||
49160Sstevel@tonic-gate (! mdismderror(&mde, MDE_GROW_DELAYED))) {
49170Sstevel@tonic-gate un->c.un_status &= ~MD_UN_GROW_PENDING;
49180Sstevel@tonic-gate }
49190Sstevel@tonic-gate }
49200Sstevel@tonic-gate md_ioctl_writerexit(lockp);
49210Sstevel@tonic-gate break;
49220Sstevel@tonic-gate
49230Sstevel@tonic-gate case MD_MN_MSG_RESYNC_PHASE_DONE:
49240Sstevel@tonic-gate /*
49250Sstevel@tonic-gate * A phase of the resync (optimized, component or
49260Sstevel@tonic-gate * submirror) is complete. Update mirror status.
49270Sstevel@tonic-gate * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
49280Sstevel@tonic-gate * mirror owner is performing a resync. If we have just snarfed
49290Sstevel@tonic-gate * this set, then we must clear any of the flags set at snarf
49300Sstevel@tonic-gate * time by unit_setup_resync().
49310Sstevel@tonic-gate * Note that unit_setup_resync() sets up these flags to
49320Sstevel@tonic-gate * indicate that an optimized resync is required. These flags
49330Sstevel@tonic-gate * need to be reset because if we get here, the mirror owner
49340Sstevel@tonic-gate * will have handled the optimized resync.
49350Sstevel@tonic-gate * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
49360Sstevel@tonic-gate * MD_UN_WAR. In addition, for each submirror,
49370Sstevel@tonic-gate * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
49380Sstevel@tonic-gate * set to SMS_OFFLINE.
49390Sstevel@tonic-gate */
49400Sstevel@tonic-gate #ifdef DEBUG
49410Sstevel@tonic-gate if (mirror_debug_flag)
49420Sstevel@tonic-gate printf("phase done mess received from %d, mnum=%x,"
49430Sstevel@tonic-gate "type=%x, flags=%x\n", p->rs_originator, p->mnum,
49440Sstevel@tonic-gate p->rs_type, p->rs_flags);
49450Sstevel@tonic-gate #endif
49460Sstevel@tonic-gate /*
49470Sstevel@tonic-gate * Ignore the message if there is no active resync thread.
49480Sstevel@tonic-gate */
49490Sstevel@tonic-gate if (!rs_active)
49500Sstevel@tonic-gate break;
49510Sstevel@tonic-gate
49520Sstevel@tonic-gate broke_out = p->rs_flags & MD_MN_RS_ERR;
49530Sstevel@tonic-gate switch (RS_TYPE(p->rs_type)) {
49540Sstevel@tonic-gate case MD_RS_OPTIMIZED:
49550Sstevel@tonic-gate un = md_ioctl_writerlock(lockp, ui);
49560Sstevel@tonic-gate if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
49570Sstevel@tonic-gate /* If we are originator, just clear rs_type */
49580Sstevel@tonic-gate if (p->rs_originator == md_mn_mynode_id) {
49590Sstevel@tonic-gate SET_RS_TYPE_NONE(un->un_rs_type);
49600Sstevel@tonic-gate md_ioctl_writerexit(lockp);
49610Sstevel@tonic-gate break;
49620Sstevel@tonic-gate }
49630Sstevel@tonic-gate /*
49640Sstevel@tonic-gate * If CLEAR_OPT_NOT_DONE is set, only clear the
49650Sstevel@tonic-gate * flags if OPT_NOT_DONE is set *and* rs_type
49660Sstevel@tonic-gate * is MD_RS_NONE.
49670Sstevel@tonic-gate */
49680Sstevel@tonic-gate if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
49690Sstevel@tonic-gate (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
49700Sstevel@tonic-gate /* No resync in progress */
49710Sstevel@tonic-gate un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
49720Sstevel@tonic-gate un->c.un_status &= ~MD_UN_WAR;
49730Sstevel@tonic-gate } else {
49740Sstevel@tonic-gate /*
49750Sstevel@tonic-gate * We are in the middle of an
49760Sstevel@tonic-gate * optimized resync and this message
49770Sstevel@tonic-gate * should be ignored.
49780Sstevel@tonic-gate */
49790Sstevel@tonic-gate md_ioctl_writerexit(lockp);
49800Sstevel@tonic-gate break;
49810Sstevel@tonic-gate }
49820Sstevel@tonic-gate } else {
49830Sstevel@tonic-gate /*
49840Sstevel@tonic-gate * This is the end of an optimized resync,
49850Sstevel@tonic-gate * clear the OPT_NOT_DONE and OFFLINE_SM flags
49860Sstevel@tonic-gate */
49870Sstevel@tonic-gate
49880Sstevel@tonic-gate un->c.un_status &= ~MD_UN_KEEP_DIRTY;
49890Sstevel@tonic-gate if (!broke_out)
49900Sstevel@tonic-gate un->c.un_status &= ~MD_UN_WAR;
49918452SJohn.Wren.Kennedy@Sun.COM
49928452SJohn.Wren.Kennedy@Sun.COM /*
49938452SJohn.Wren.Kennedy@Sun.COM * Clear our un_resync_bm for the regions
49948452SJohn.Wren.Kennedy@Sun.COM * completed. The owner (originator) will
49958452SJohn.Wren.Kennedy@Sun.COM * take care of itself.
49968452SJohn.Wren.Kennedy@Sun.COM */
49978452SJohn.Wren.Kennedy@Sun.COM if (p->rs_originator != md_mn_mynode_id &&
49988452SJohn.Wren.Kennedy@Sun.COM (ps = un->un_rs_prev_overlap) != NULL) {
49998452SJohn.Wren.Kennedy@Sun.COM BLK_TO_RR(rr_start, ps->ps_firstblk,
50008452SJohn.Wren.Kennedy@Sun.COM un);
50018452SJohn.Wren.Kennedy@Sun.COM BLK_TO_RR(rr_end, ps->ps_lastblk, un);
50028452SJohn.Wren.Kennedy@Sun.COM mutex_enter(&un->un_resync_mx);
50038452SJohn.Wren.Kennedy@Sun.COM for (rr = rr_start; rr <= rr_end;
50048452SJohn.Wren.Kennedy@Sun.COM rr++) {
50058452SJohn.Wren.Kennedy@Sun.COM CLR_KEEPDIRTY(rr, un);
50068452SJohn.Wren.Kennedy@Sun.COM }
50078452SJohn.Wren.Kennedy@Sun.COM mutex_exit(&un->un_resync_mx);
50088452SJohn.Wren.Kennedy@Sun.COM }
50090Sstevel@tonic-gate }
50100Sstevel@tonic-gate
50110Sstevel@tonic-gate /*
50120Sstevel@tonic-gate * Set resync_completed to last resync type and then
50130Sstevel@tonic-gate * clear resync_type to indicate no resync in progress
50140Sstevel@tonic-gate */
50150Sstevel@tonic-gate un->un_resync_completed = un->un_rs_type;
50160Sstevel@tonic-gate SET_RS_TYPE_NONE(un->un_rs_type);
50170Sstevel@tonic-gate
50180Sstevel@tonic-gate /*
50190Sstevel@tonic-gate * If resync is as a result of a submirror ONLINE,
50200Sstevel@tonic-gate * reset the submirror state to SMS_RUNNING if the
50210Sstevel@tonic-gate * resync was ok else set back to SMS_OFFLINE.
50220Sstevel@tonic-gate */
50230Sstevel@tonic-gate for (smi = 0; smi < NMIRROR; smi++) {
50240Sstevel@tonic-gate un->un_sm[smi].sm_flags &=
50250Sstevel@tonic-gate ~MD_SM_RESYNC_TARGET;
50260Sstevel@tonic-gate if (SMS_BY_INDEX_IS(un, smi,
50270Sstevel@tonic-gate SMS_OFFLINE_RESYNC)) {
50280Sstevel@tonic-gate if (p->rs_flags &
50290Sstevel@tonic-gate MD_MN_RS_CLEAR_OPT_NOT_DONE) {
50300Sstevel@tonic-gate state = SMS_OFFLINE;
50310Sstevel@tonic-gate } else {
50320Sstevel@tonic-gate state = (broke_out ?
50330Sstevel@tonic-gate SMS_OFFLINE : SMS_RUNNING);
50340Sstevel@tonic-gate }
50350Sstevel@tonic-gate mirror_set_sm_state(
50360Sstevel@tonic-gate &un->un_sm[smi],
50370Sstevel@tonic-gate &un->un_smic[smi], state,
50380Sstevel@tonic-gate broke_out);
50390Sstevel@tonic-gate mirror_commit(un, NO_SUBMIRRORS,
50400Sstevel@tonic-gate 0);
50410Sstevel@tonic-gate }
50420Sstevel@tonic-gate /*
50430Sstevel@tonic-gate * If we still have an offline submirror, reset
50440Sstevel@tonic-gate * the OFFLINE_SM flag in the mirror status
50450Sstevel@tonic-gate */
50460Sstevel@tonic-gate if (SMS_BY_INDEX_IS(un, smi,
50470Sstevel@tonic-gate SMS_OFFLINE))
50480Sstevel@tonic-gate un->c.un_status |=
50490Sstevel@tonic-gate MD_UN_OFFLINE_SM;
50500Sstevel@tonic-gate }
50510Sstevel@tonic-gate md_ioctl_writerexit(lockp);
50520Sstevel@tonic-gate break;
50530Sstevel@tonic-gate case MD_RS_SUBMIRROR:
50540Sstevel@tonic-gate un = md_ioctl_writerlock(lockp, ui);
50550Sstevel@tonic-gate smi = RS_SMI(p->rs_type);
50560Sstevel@tonic-gate sm = &un->un_sm[smi];
50570Sstevel@tonic-gate smic = &un->un_smic[smi];
50580Sstevel@tonic-gate /* Clear RESYNC target */
50590Sstevel@tonic-gate un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
50600Sstevel@tonic-gate /*
50610Sstevel@tonic-gate * Set resync_completed to last resync type and then
50620Sstevel@tonic-gate * clear resync_type to indicate no resync in progress
50630Sstevel@tonic-gate */
50640Sstevel@tonic-gate un->un_resync_completed = un->un_rs_type;
50650Sstevel@tonic-gate SET_RS_TYPE_NONE(un->un_rs_type);
50660Sstevel@tonic-gate /*
50670Sstevel@tonic-gate * If the resync completed ok reset the submirror
50680Sstevel@tonic-gate * state to SMS_RUNNING else reset it to SMS_ATTACHED
50690Sstevel@tonic-gate */
50700Sstevel@tonic-gate state = (broke_out ?
50710Sstevel@tonic-gate SMS_ATTACHED : SMS_RUNNING);
50720Sstevel@tonic-gate mirror_set_sm_state(sm, smic, state, broke_out);
50730Sstevel@tonic-gate un->c.un_status &= ~MD_UN_WAR;
50740Sstevel@tonic-gate mirror_commit(un, SMI2BIT(smi), 0);
50750Sstevel@tonic-gate md_ioctl_writerexit(lockp);
50760Sstevel@tonic-gate break;
50770Sstevel@tonic-gate case MD_RS_COMPONENT:
50780Sstevel@tonic-gate un = md_ioctl_writerlock(lockp, ui);
50790Sstevel@tonic-gate smi = RS_SMI(p->rs_type);
50800Sstevel@tonic-gate ci = RS_CI(p->rs_type);
50810Sstevel@tonic-gate sm = &un->un_sm[smi];
50820Sstevel@tonic-gate smic = &un->un_smic[smi];
50830Sstevel@tonic-gate shared = (md_m_shared_t *)
50840Sstevel@tonic-gate (*(smic->sm_shared_by_indx))
50850Sstevel@tonic-gate (sm->sm_dev, sm, ci);
50860Sstevel@tonic-gate un->c.un_status &= ~MD_UN_WAR;
50870Sstevel@tonic-gate /* Clear RESYNC target */
50880Sstevel@tonic-gate un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
50890Sstevel@tonic-gate /*
50900Sstevel@tonic-gate * Set resync_completed to last resync type and then
50910Sstevel@tonic-gate * clear resync_type to indicate no resync in progress
50920Sstevel@tonic-gate */
50930Sstevel@tonic-gate un->un_resync_completed = un->un_rs_type;
50940Sstevel@tonic-gate SET_RS_TYPE_NONE(un->un_rs_type);
50950Sstevel@tonic-gate
50960Sstevel@tonic-gate /*
50970Sstevel@tonic-gate * If the resync completed ok, set the component state
50980Sstevel@tonic-gate * to CS_OKAY.
50990Sstevel@tonic-gate */
51000Sstevel@tonic-gate if (broke_out)
51010Sstevel@tonic-gate shared->ms_flags |= MDM_S_RS_TRIED;
51020Sstevel@tonic-gate else {
51030Sstevel@tonic-gate /*
51040Sstevel@tonic-gate * As we don't transmit the changes,
51050Sstevel@tonic-gate * no need to drop the lock.
51060Sstevel@tonic-gate */
51070Sstevel@tonic-gate set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
51080Sstevel@tonic-gate MD_STATE_NO_XMIT, (IOLOCK *)NULL);
51090Sstevel@tonic-gate }
51100Sstevel@tonic-gate md_ioctl_writerexit(lockp);
break;
51110Sstevel@tonic-gate default:
51120Sstevel@tonic-gate break;
51130Sstevel@tonic-gate }
51140Sstevel@tonic-gate /*
51150Sstevel@tonic-gate * If the purpose of this PHASE_DONE message is just to
51160Sstevel@tonic-gate * indicate to all other nodes that the optimized resync
51170Sstevel@tonic-gate * required (OPT_NOT_DONE) flag is to be cleared, there is
51180Sstevel@tonic-gate * no need to generate a notify event as there has not
51190Sstevel@tonic-gate * actually been a resync.
51200Sstevel@tonic-gate */
51210Sstevel@tonic-gate if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
51220Sstevel@tonic-gate if (broke_out) {
51230Sstevel@tonic-gate SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
51240Sstevel@tonic-gate SVM_TAG_METADEVICE, MD_UN2SET(un),
51250Sstevel@tonic-gate MD_SID(un));
51260Sstevel@tonic-gate } else {
51270Sstevel@tonic-gate SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
51280Sstevel@tonic-gate SVM_TAG_METADEVICE, MD_UN2SET(un),
51290Sstevel@tonic-gate MD_SID(un));
51300Sstevel@tonic-gate }
51310Sstevel@tonic-gate }
51320Sstevel@tonic-gate break;
51330Sstevel@tonic-gate
51340Sstevel@tonic-gate default:
51350Sstevel@tonic-gate #ifdef DEBUG
51360Sstevel@tonic-gate cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
51370Sstevel@tonic-gate " %x\n", p->msg_type);
51380Sstevel@tonic-gate #endif
51390Sstevel@tonic-gate return (EINVAL);
51400Sstevel@tonic-gate }
51410Sstevel@tonic-gate return (0);
51420Sstevel@tonic-gate }
51430Sstevel@tonic-gate
51440Sstevel@tonic-gate /* Return -1 if the snarf of an optimized record failed and the set should be released */
51450Sstevel@tonic-gate static int
51460Sstevel@tonic-gate mirror_snarf(md_snarfcmd_t cmd, set_t setno)
51470Sstevel@tonic-gate {
51480Sstevel@tonic-gate mddb_recid_t recid;
51490Sstevel@tonic-gate int gotsomething;
51500Sstevel@tonic-gate int all_mirrors_gotten;
51510Sstevel@tonic-gate mm_unit_t *un;
51520Sstevel@tonic-gate mddb_type_t typ1;
51530Sstevel@tonic-gate mddb_de_ic_t *dep;
51540Sstevel@tonic-gate mddb_rb32_t *rbp;
51550Sstevel@tonic-gate size_t newreqsize;
51560Sstevel@tonic-gate mm_unit_t *big_un;
51570Sstevel@tonic-gate mm_unit32_od_t *small_un;
51580Sstevel@tonic-gate int retval;
51590Sstevel@tonic-gate mdi_unit_t *ui;
51600Sstevel@tonic-gate
51610Sstevel@tonic-gate if (cmd == MD_SNARF_CLEANUP) {
51620Sstevel@tonic-gate if (md_get_setstatus(setno) & MD_SET_STALE)
51630Sstevel@tonic-gate return (0);
51640Sstevel@tonic-gate
51650Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
51660Sstevel@tonic-gate typ1 = (mddb_type_t)md_getshared_key(setno,
51670Sstevel@tonic-gate mirror_md_ops.md_driver.md_drivername);
51680Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
51690Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
51700Sstevel@tonic-gate un = (mm_unit_t *)mddb_getrecaddr(recid);
51710Sstevel@tonic-gate mirror_cleanup(un);
51720Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
51730Sstevel@tonic-gate }
51740Sstevel@tonic-gate }
51750Sstevel@tonic-gate return (0);
51760Sstevel@tonic-gate }
51770Sstevel@tonic-gate
51780Sstevel@tonic-gate all_mirrors_gotten = 1;
51790Sstevel@tonic-gate gotsomething = 0;
51800Sstevel@tonic-gate
51810Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
51820Sstevel@tonic-gate typ1 = (mddb_type_t)md_getshared_key(setno,
51830Sstevel@tonic-gate mirror_md_ops.md_driver.md_drivername);
51840Sstevel@tonic-gate
51850Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
51860Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
51870Sstevel@tonic-gate continue;
51880Sstevel@tonic-gate
51890Sstevel@tonic-gate dep = mddb_getrecdep(recid);
51900Sstevel@tonic-gate dep->de_flags = MDDB_F_MIRROR;
51910Sstevel@tonic-gate rbp = dep->de_rb;
51920Sstevel@tonic-gate
51931623Stw21770 switch (rbp->rb_revision) {
51941623Stw21770 case MDDB_REV_RB:
51951623Stw21770 case MDDB_REV_RBFN:
51961623Stw21770 if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
51971623Stw21770 /*
51981623Stw21770 * This means, we have an old and small
51991623Stw21770 * record and this record hasn't already
52001623Stw21770 * been converted. Before we create an
52011623Stw21770 * incore metadevice from this we have to
52021623Stw21770 * convert it to a big record.
52031623Stw21770 */
52041623Stw21770 small_un =
52051623Stw21770 (mm_unit32_od_t *)mddb_getrecaddr(recid);
52061623Stw21770 newreqsize = sizeof (mm_unit_t);
52071623Stw21770 big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
52086901Sjkennedy KM_SLEEP);
52091623Stw21770 mirror_convert((caddr_t)small_un,
52106901Sjkennedy (caddr_t)big_un, SMALL_2_BIG);
52111623Stw21770 kmem_free(small_un, dep->de_reqsize);
52121623Stw21770
52131623Stw21770 /*
52141623Stw21770 * Update userdata and incore userdata
52151623Stw21770 * incores are at the end of un
52161623Stw21770 */
52171623Stw21770 dep->de_rb_userdata_ic = big_un;
52181623Stw21770 dep->de_rb_userdata = big_un;
52191623Stw21770 dep->de_icreqsize = newreqsize;
52201623Stw21770 un = big_un;
52211623Stw21770 rbp->rb_private |= MD_PRV_CONVD;
52221623Stw21770 } else {
52231623Stw21770 /*
52241623Stw21770 * Unit already converted, just get the
52251623Stw21770 * record address.
52261623Stw21770 */
52271623Stw21770 un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
52286901Sjkennedy sizeof (*un), 0);
52291623Stw21770 }
52301623Stw21770 un->c.un_revision &= ~MD_64BIT_META_DEV;
52311623Stw21770 break;
52321623Stw21770 case MDDB_REV_RB64:
52331623Stw21770 case MDDB_REV_RB64FN:
52340Sstevel@tonic-gate /* Big device */
52350Sstevel@tonic-gate un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
52366901Sjkennedy sizeof (*un), 0);
52371623Stw21770 un->c.un_revision |= MD_64BIT_META_DEV;
52381623Stw21770 un->c.un_flag |= MD_EFILABEL;
52391623Stw21770 break;
52400Sstevel@tonic-gate }
52412077Stw21770 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
52420Sstevel@tonic-gate
52430Sstevel@tonic-gate /*
52440Sstevel@tonic-gate * Create minor device node for snarfed entry.
52450Sstevel@tonic-gate */
52460Sstevel@tonic-gate (void) md_create_minor_node(setno, MD_SID(un));
52470Sstevel@tonic-gate
52480Sstevel@tonic-gate if (MD_UNIT(MD_SID(un)) != NULL) {
52490Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL);
52500Sstevel@tonic-gate continue;
52510Sstevel@tonic-gate }
52520Sstevel@tonic-gate all_mirrors_gotten = 0;
52530Sstevel@tonic-gate retval = mirror_build_incore(un, 1);
52540Sstevel@tonic-gate if (retval == 0) {
52550Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT);
52560Sstevel@tonic-gate md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
52570Sstevel@tonic-gate resync_start_timeout(setno);
52580Sstevel@tonic-gate gotsomething = 1;
52592063Shshaw } else {
52602063Shshaw return (retval);
52610Sstevel@tonic-gate }
52620Sstevel@tonic-gate /*
52630Sstevel@tonic-gate * Set flag to indicate that the mirror has not yet
52640Sstevel@tonic-gate * been through a reconfig. This flag is used for MN sets
52650Sstevel@tonic-gate * when determining whether to update the mirror state from
52660Sstevel@tonic-gate * the Master node.
52670Sstevel@tonic-gate */
52680Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) {
52690Sstevel@tonic-gate ui = MDI_UNIT(MD_SID(un));
52700Sstevel@tonic-gate ui->ui_tstate |= MD_RESYNC_NOT_DONE;
52710Sstevel@tonic-gate }
52720Sstevel@tonic-gate }
52730Sstevel@tonic-gate
52740Sstevel@tonic-gate if (!all_mirrors_gotten)
52750Sstevel@tonic-gate return (gotsomething);
52760Sstevel@tonic-gate
52770Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
52780Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
52790Sstevel@tonic-gate if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
52800Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_PENDDEL);
52810Sstevel@tonic-gate
52820Sstevel@tonic-gate return (0);
52830Sstevel@tonic-gate }
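/*
 * Return semantics of mirror_snarf(), as inferred from the code above:
 * 1 while at least one new mirror record was built in-core on this pass,
 * 0 once every record has been snarfed (or for MD_SNARF_CLEANUP), and a
 * nonzero mirror_build_incore() error (-1 per the header comment)
 * propagates to the caller to indicate that the set should be released.
 */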
52840Sstevel@tonic-gate
52850Sstevel@tonic-gate static int
52860Sstevel@tonic-gate mirror_halt(md_haltcmd_t cmd, set_t setno)
52870Sstevel@tonic-gate {
52880Sstevel@tonic-gate unit_t i;
52890Sstevel@tonic-gate mdi_unit_t *ui;
52900Sstevel@tonic-gate minor_t mnum;
52910Sstevel@tonic-gate int reset_mirror_flag = 0;
52920Sstevel@tonic-gate
52930Sstevel@tonic-gate if (cmd == MD_HALT_CLOSE)
52940Sstevel@tonic-gate return (0);
52950Sstevel@tonic-gate
52960Sstevel@tonic-gate if (cmd == MD_HALT_OPEN)
52970Sstevel@tonic-gate return (0);
52980Sstevel@tonic-gate
52990Sstevel@tonic-gate if (cmd == MD_HALT_UNLOAD)
53000Sstevel@tonic-gate return (0);
53010Sstevel@tonic-gate
53020Sstevel@tonic-gate if (cmd == MD_HALT_CHECK) {
53030Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) {
53040Sstevel@tonic-gate mnum = MD_MKMIN(setno, i);
53050Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL)
53060Sstevel@tonic-gate continue;
53070Sstevel@tonic-gate if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
53080Sstevel@tonic-gate continue;
53090Sstevel@tonic-gate if (md_unit_isopen(ui))
53100Sstevel@tonic-gate return (1);
53110Sstevel@tonic-gate }
53120Sstevel@tonic-gate return (0);
53130Sstevel@tonic-gate }
53140Sstevel@tonic-gate
53150Sstevel@tonic-gate if (cmd != MD_HALT_DOIT)
53160Sstevel@tonic-gate return (1);
53170Sstevel@tonic-gate
53180Sstevel@tonic-gate for (i = 0; i < md_nunits; i++) {
53190Sstevel@tonic-gate mnum = MD_MKMIN(setno, i);
53200Sstevel@tonic-gate if ((ui = MDI_UNIT(mnum)) == NULL)
53210Sstevel@tonic-gate continue;
53220Sstevel@tonic-gate if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
53230Sstevel@tonic-gate continue;
53240Sstevel@tonic-gate reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
53250Sstevel@tonic-gate
53260Sstevel@tonic-gate /* Set a flag if there is at least one mirror metadevice. */
53270Sstevel@tonic-gate reset_mirror_flag = 1;
53280Sstevel@tonic-gate }
53290Sstevel@tonic-gate
53300Sstevel@tonic-gate /*
53310Sstevel@tonic-gate * Only wait for the global dr_timeout to finish
53320Sstevel@tonic-gate * - if there are mirror metadevices in this diskset or
53330Sstevel@tonic-gate * - if this is the local set since an unload of the md_mirror
53340Sstevel@tonic-gate * driver could follow a successful mirror halt in the local set.
53350Sstevel@tonic-gate */
53360Sstevel@tonic-gate if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
53370Sstevel@tonic-gate while ((mirror_md_ops.md_head == NULL) &&
53380Sstevel@tonic-gate (mirror_timeout.dr_timeout_id != 0))
53390Sstevel@tonic-gate delay(md_hz);
53400Sstevel@tonic-gate }
53410Sstevel@tonic-gate
53420Sstevel@tonic-gate return (0);
53430Sstevel@tonic-gate }
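/*
 * Summary of the command handling above: MD_HALT_OPEN, MD_HALT_CLOSE and
 * MD_HALT_UNLOAD are no-ops; MD_HALT_CHECK returns 1 (busy) if any mirror
 * in the set is currently open; MD_HALT_DOIT resets every mirror unit and
 * may then wait for the global resync timeout to drain.
 */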
53440Sstevel@tonic-gate
53450Sstevel@tonic-gate /*ARGSUSED3*/
53460Sstevel@tonic-gate static int
53470Sstevel@tonic-gate mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
53480Sstevel@tonic-gate {
53490Sstevel@tonic-gate IOLOCK lock;
535046Sskamm minor_t mnum = getminor(*dev);
535146Sskamm set_t setno;
535246Sskamm
535346Sskamm /*
535446Sskamm * When doing an open of a multi owner metadevice, check to see if this
535546Sskamm * node is a starting node and if a reconfig cycle is underway.
535646Sskamm * If so, the system isn't sufficiently set up to handle the
535746Sskamm * open (which involves I/O during sp_validate), so fail with ENXIO.
535846Sskamm */
535946Sskamm setno = MD_MIN2SET(mnum);
536046Sskamm if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
536146Sskamm (MD_SET_MNSET | MD_SET_MN_START_RC)) {
536246Sskamm return (ENXIO);
536346Sskamm }
53640Sstevel@tonic-gate
53650Sstevel@tonic-gate if (md_oflags & MD_OFLG_FROMIOCTL) {
53660Sstevel@tonic-gate /*
53670Sstevel@tonic-gate * This indicates that the caller is an ioctl service routine.
53680Sstevel@tonic-gate * In this case we initialise our stack-based IOLOCK and pass
53690Sstevel@tonic-gate * this into the internal open routine. This allows multi-owner
53700Sstevel@tonic-gate * metadevices to avoid deadlocking if an error is encountered
53710Sstevel@tonic-gate * during the open() attempt. The failure case is:
53720Sstevel@tonic-gate * s-p -> mirror -> s-p (with error). Attempting to metaclear
53730Sstevel@tonic-gate * this configuration would deadlock as the mirror code has to
53740Sstevel@tonic-gate * send a state-update to the other nodes when it detects the
53750Sstevel@tonic-gate * failure of the underlying submirror with an errored soft-part
53760Sstevel@tonic-gate * on it. As there is a class1 message in progress (metaclear)
53770Sstevel@tonic-gate * set_sm_comp_state() cannot send another class1 message;
53780Sstevel@tonic-gate * instead we do not send a state_update message as the
53790Sstevel@tonic-gate * metaclear is distributed and the failed submirror will be
53800Sstevel@tonic-gate * cleared from the configuration by the metaclear.
53810Sstevel@tonic-gate */
53820Sstevel@tonic-gate IOLOCK_INIT(&lock);
53830Sstevel@tonic-gate return (mirror_internal_open(getminor(*dev), flag, otyp,
53840Sstevel@tonic-gate md_oflags, &lock));
53850Sstevel@tonic-gate } else {
53860Sstevel@tonic-gate return (mirror_internal_open(getminor(*dev), flag, otyp,
53870Sstevel@tonic-gate md_oflags, (IOLOCK *)NULL));
53880Sstevel@tonic-gate }
53890Sstevel@tonic-gate }
53900Sstevel@tonic-gate
53910Sstevel@tonic-gate
53920Sstevel@tonic-gate /*ARGSUSED1*/
53930Sstevel@tonic-gate static int
53940Sstevel@tonic-gate mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
53950Sstevel@tonic-gate {
53960Sstevel@tonic-gate return (mirror_internal_close(getminor(dev), otyp, md_cflags,
53976901Sjkennedy (IOLOCK *)NULL));
53980Sstevel@tonic-gate }
53990Sstevel@tonic-gate
54000Sstevel@tonic-gate
54010Sstevel@tonic-gate /*
54020Sstevel@tonic-gate * This routine dumps memory to the disk. It assumes that the memory has
54030Sstevel@tonic-gate * already been mapped into mainbus space. It is called at disk interrupt
54040Sstevel@tonic-gate * priority when the system is in trouble.
54050Sstevel@tonic-gate *
54060Sstevel@tonic-gate */
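/*
 * Error semantics, inferred from the loop below: the dump is attempted on
 * every writeable submirror and the routine returns 0 if at least one
 * bdev_dump() call succeeds; otherwise the last nonzero result is
 * returned.
 */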
54070Sstevel@tonic-gate static int
54080Sstevel@tonic-gate mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
54090Sstevel@tonic-gate {
54100Sstevel@tonic-gate mm_unit_t *un;
54110Sstevel@tonic-gate dev_t mapdev;
54120Sstevel@tonic-gate int result;
54130Sstevel@tonic-gate int smi;
54140Sstevel@tonic-gate int any_succeed = 0;
54150Sstevel@tonic-gate int save_result = 0;
54160Sstevel@tonic-gate
54170Sstevel@tonic-gate /*
54180Sstevel@tonic-gate * We don't need to grab the unit lock,
54190Sstevel@tonic-gate * because nothing else is supposed to be happening.
54200Sstevel@tonic-gate * Also, dump is not supposed to sleep.
54210Sstevel@tonic-gate */
54220Sstevel@tonic-gate un = (mm_unit_t *)MD_UNIT(getminor(dev));
54230Sstevel@tonic-gate
54240Sstevel@tonic-gate if ((diskaddr_t)blkno >= un->c.un_total_blocks)
54250Sstevel@tonic-gate return (EINVAL);
54260Sstevel@tonic-gate
54270Sstevel@tonic-gate if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
54280Sstevel@tonic-gate return (EINVAL);
54290Sstevel@tonic-gate
54300Sstevel@tonic-gate for (smi = 0; smi < NMIRROR; smi++) {
54310Sstevel@tonic-gate if (!SUBMIRROR_IS_WRITEABLE(un, smi))
54320Sstevel@tonic-gate continue;
54330Sstevel@tonic-gate mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
54340Sstevel@tonic-gate result = bdev_dump(mapdev, addr, blkno, nblk);
54350Sstevel@tonic-gate if (result)
54360Sstevel@tonic-gate save_result = result;
54370Sstevel@tonic-gate
54380Sstevel@tonic-gate if (result == 0)
54390Sstevel@tonic-gate any_succeed++;
54400Sstevel@tonic-gate }
54410Sstevel@tonic-gate
54420Sstevel@tonic-gate if (any_succeed)
54430Sstevel@tonic-gate return (0);
54440Sstevel@tonic-gate
54450Sstevel@tonic-gate return (save_result);
54460Sstevel@tonic-gate }
54470Sstevel@tonic-gate
54480Sstevel@tonic-gate /*
54490Sstevel@tonic-gate * NAME: mirror_probe_dev
54500Sstevel@tonic-gate *
54510Sstevel@tonic-gate * DESCRIPTION: force-opens every component of a mirror.
54520Sstevel@tonic-gate *
54530Sstevel@tonic-gate * On entry the unit writerlock is held
54540Sstevel@tonic-gate */
54550Sstevel@tonic-gate static int
54560Sstevel@tonic-gate mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
54570Sstevel@tonic-gate {
54580Sstevel@tonic-gate int i;
54590Sstevel@tonic-gate int smi;
54600Sstevel@tonic-gate int ci;
54610Sstevel@tonic-gate mm_unit_t *un;
54620Sstevel@tonic-gate int md_devopen = 0;
54630Sstevel@tonic-gate set_t setno;
54640Sstevel@tonic-gate int sm_cnt;
54650Sstevel@tonic-gate int sm_unavail_cnt;
54660Sstevel@tonic-gate
54670Sstevel@tonic-gate if (md_unit_isopen(ui))
54680Sstevel@tonic-gate md_devopen++;
54690Sstevel@tonic-gate
54700Sstevel@tonic-gate un = MD_UNIT(mnum);
54710Sstevel@tonic-gate setno = MD_UN2SET(un);
54720Sstevel@tonic-gate
54730Sstevel@tonic-gate sm_cnt = 0;
54740Sstevel@tonic-gate sm_unavail_cnt = 0;
54750Sstevel@tonic-gate for (i = 0; i < NMIRROR; i++) {
54760Sstevel@tonic-gate md_dev64_t tmpdev;
54770Sstevel@tonic-gate mdi_unit_t *sm_ui;
54780Sstevel@tonic-gate
54790Sstevel@tonic-gate if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
54800Sstevel@tonic-gate continue;
54810Sstevel@tonic-gate }
54820Sstevel@tonic-gate
54830Sstevel@tonic-gate sm_cnt++;
54840Sstevel@tonic-gate tmpdev = un->un_sm[i].sm_dev;
54850Sstevel@tonic-gate (void) md_layered_open(mnum, &tmpdev,
54866901Sjkennedy MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
54870Sstevel@tonic-gate un->un_sm[i].sm_dev = tmpdev;
54880Sstevel@tonic-gate
54890Sstevel@tonic-gate sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
54900Sstevel@tonic-gate
54910Sstevel@tonic-gate /*
54920Sstevel@tonic-gate * Logic similar to that in mirror_open_all_devs. We set or
54930Sstevel@tonic-gate * clear the submirror Unavailable bit.
54940Sstevel@tonic-gate */
54950Sstevel@tonic-gate (void) md_unit_writerlock(sm_ui);
54960Sstevel@tonic-gate if (submirror_unavailable(un, i, 1)) {
54970Sstevel@tonic-gate sm_ui->ui_tstate |= MD_INACCESSIBLE;
54980Sstevel@tonic-gate sm_unavail_cnt++;
54990Sstevel@tonic-gate } else {
55000Sstevel@tonic-gate sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
55010Sstevel@tonic-gate }
55020Sstevel@tonic-gate md_unit_writerexit(sm_ui);
55030Sstevel@tonic-gate }
55040Sstevel@tonic-gate
55050Sstevel@tonic-gate /*
55060Sstevel@tonic-gate * If all of the submirrors are unavailable, the mirror is also
55070Sstevel@tonic-gate * unavailable.
55080Sstevel@tonic-gate */
55090Sstevel@tonic-gate if (sm_cnt == sm_unavail_cnt) {
55100Sstevel@tonic-gate ui->ui_tstate |= MD_INACCESSIBLE;
55110Sstevel@tonic-gate } else {
55120Sstevel@tonic-gate ui->ui_tstate &= ~MD_INACCESSIBLE;
55130Sstevel@tonic-gate }
55140Sstevel@tonic-gate
55150Sstevel@tonic-gate /*
55160Sstevel@tonic-gate * Start checking from probe failures. If failures occur we
55170Sstevel@tonic-gate * set the appropriate erred state only if the metadevice is in
55180Sstevel@tonic-gate * use. This is specifically to prevent unnecessary resyncs.
55190Sstevel@tonic-gate * For instance if the disks were accidentally disconnected when
55200Sstevel@tonic-gate * the system booted up then until the metadevice is accessed
55210Sstevel@tonic-gate * (like file system mount) the user can shutdown, recable and
55220Sstevel@tonic-gate * reboot w/o incurring a potentially huge resync.
55230Sstevel@tonic-gate */
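/*
 * Hypothetical example of the above: if a disk enclosure is powered off
 * when the system boots, the probe fails, but while the mirror is still
 * unopened the components are simply closed again and ENXIO returned
 * rather than being marked erred, so recabling and rebooting avoids a
 * full resync.
 */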
55240Sstevel@tonic-gate
55250Sstevel@tonic-gate smi = 0;
55260Sstevel@tonic-gate ci = 0;
55270Sstevel@tonic-gate while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
55280Sstevel@tonic-gate
55290Sstevel@tonic-gate if (mirror_other_sources(un, smi, ci, 0) == 1) {
55300Sstevel@tonic-gate /*
55310Sstevel@tonic-gate * Note that for a MN set, there is no need to call
55320Sstevel@tonic-gate * SE_NOTIFY as that is done when processing the
55330Sstevel@tonic-gate * state change
55340Sstevel@tonic-gate */
55350Sstevel@tonic-gate if (md_devopen) {
55360Sstevel@tonic-gate /*
55370Sstevel@tonic-gate * Never called from ioctl context,
55380Sstevel@tonic-gate * so (IOLOCK *)NULL
55390Sstevel@tonic-gate */
55400Sstevel@tonic-gate set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
55410Sstevel@tonic-gate 0, MD_STATE_XMIT, (IOLOCK *)NULL);
55420Sstevel@tonic-gate if (!MD_MNSET_SETNO(setno)) {
55430Sstevel@tonic-gate SE_NOTIFY(EC_SVM_STATE,
55440Sstevel@tonic-gate ESC_SVM_LASTERRED,
55450Sstevel@tonic-gate SVM_TAG_METADEVICE, setno,
55460Sstevel@tonic-gate MD_SID(un));
55470Sstevel@tonic-gate }
55480Sstevel@tonic-gate continue;
55490Sstevel@tonic-gate } else {
55500Sstevel@tonic-gate (void) mirror_close_all_devs(un,
55510Sstevel@tonic-gate MD_OFLG_PROBEDEV);
55520Sstevel@tonic-gate if (!MD_MNSET_SETNO(setno)) {
55530Sstevel@tonic-gate SE_NOTIFY(EC_SVM_STATE,
55540Sstevel@tonic-gate ESC_SVM_OPEN_FAIL,
55550Sstevel@tonic-gate SVM_TAG_METADEVICE, setno,
55560Sstevel@tonic-gate MD_SID(un));
55570Sstevel@tonic-gate }
55580Sstevel@tonic-gate mirror_openfail_console_info(un, smi, ci);
55590Sstevel@tonic-gate return (ENXIO);
55600Sstevel@tonic-gate }
55610Sstevel@tonic-gate }
55620Sstevel@tonic-gate
55630Sstevel@tonic-gate /*
55640Sstevel@tonic-gate * Note that for a MN set, there is no need to call
55650Sstevel@tonic-gate * SE_NOTIFY as that is done when processing the
55660Sstevel@tonic-gate * state change
55670Sstevel@tonic-gate */
55680Sstevel@tonic-gate if (md_devopen) {
55690Sstevel@tonic-gate /* Never called from ioctl context, so (IOLOCK *)NULL */
55700Sstevel@tonic-gate set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
55710Sstevel@tonic-gate MD_STATE_XMIT, (IOLOCK *)NULL);
55720Sstevel@tonic-gate if (!MD_MNSET_SETNO(setno)) {
55730Sstevel@tonic-gate SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
55740Sstevel@tonic-gate SVM_TAG_METADEVICE, setno,
55750Sstevel@tonic-gate MD_SID(un));
55760Sstevel@tonic-gate }
55770Sstevel@tonic-gate }
55780Sstevel@tonic-gate mirror_openfail_console_info(un, smi, ci);
55790Sstevel@tonic-gate ci++;
55800Sstevel@tonic-gate }
55810Sstevel@tonic-gate
55820Sstevel@tonic-gate if (MD_MNSET_SETNO(setno)) {
55830Sstevel@tonic-gate send_poke_hotspares(setno);
55840Sstevel@tonic-gate } else {
55850Sstevel@tonic-gate (void) poke_hotspares();
55860Sstevel@tonic-gate }
55870Sstevel@tonic-gate (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
55880Sstevel@tonic-gate
55890Sstevel@tonic-gate return (0);
55900Sstevel@tonic-gate }
55910Sstevel@tonic-gate
55920Sstevel@tonic-gate
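/*
 * NAME: mirror_imp_set
 *
 * DESCRIPTION: (inferred from the code below) walk every mirror record in
 * an imported set and rewrite the unit's self/parent minors, record ids
 * and submirror devices to reference the newly assigned set number.
 * Returns 1 if any record was updated, 0 otherwise.
 */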
55930Sstevel@tonic-gate static int
55940Sstevel@tonic-gate mirror_imp_set(
55950Sstevel@tonic-gate set_t setno
55960Sstevel@tonic-gate )
55970Sstevel@tonic-gate {
55980Sstevel@tonic-gate
55990Sstevel@tonic-gate mddb_recid_t recid;
56000Sstevel@tonic-gate int gotsomething, i;
56010Sstevel@tonic-gate mddb_type_t typ1;
56020Sstevel@tonic-gate mddb_de_ic_t *dep;
56030Sstevel@tonic-gate mddb_rb32_t *rbp;
56040Sstevel@tonic-gate mm_unit32_od_t *un32;
56050Sstevel@tonic-gate mm_unit_t *un64;
56061623Stw21770 md_dev64_t self_devt;
56070Sstevel@tonic-gate minor_t *self_id; /* minor needs to be updated */
56080Sstevel@tonic-gate md_parent_t *parent_id; /* parent needs to be updated */
56090Sstevel@tonic-gate mddb_recid_t *record_id; /* record id needs to be updated */
56100Sstevel@tonic-gate mddb_recid_t *optrec_id;
56110Sstevel@tonic-gate md_dev64_t tmpdev;
56120Sstevel@tonic-gate
56130Sstevel@tonic-gate
56140Sstevel@tonic-gate gotsomething = 0;
56150Sstevel@tonic-gate
56160Sstevel@tonic-gate typ1 = (mddb_type_t)md_getshared_key(setno,
56170Sstevel@tonic-gate mirror_md_ops.md_driver.md_drivername);
56180Sstevel@tonic-gate recid = mddb_makerecid(setno, 0);
56190Sstevel@tonic-gate
56200Sstevel@tonic-gate while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
56210Sstevel@tonic-gate if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
56220Sstevel@tonic-gate continue;
56230Sstevel@tonic-gate
56240Sstevel@tonic-gate dep = mddb_getrecdep(recid);
56250Sstevel@tonic-gate rbp = dep->de_rb;
56260Sstevel@tonic-gate
56271623Stw21770 switch (rbp->rb_revision) {
56281623Stw21770 case MDDB_REV_RB:
56291623Stw21770 case MDDB_REV_RBFN:
56300Sstevel@tonic-gate /*
56310Sstevel@tonic-gate * Small device
56320Sstevel@tonic-gate */
56330Sstevel@tonic-gate un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
56340Sstevel@tonic-gate self_id = &(un32->c.un_self_id);
56350Sstevel@tonic-gate parent_id = &(un32->c.un_parent);
56360Sstevel@tonic-gate record_id = &(un32->c.un_record_id);
56370Sstevel@tonic-gate optrec_id = &(un32->un_rr_dirty_recid);
56380Sstevel@tonic-gate
56390Sstevel@tonic-gate for (i = 0; i < un32->un_nsm; i++) {
56406901Sjkennedy tmpdev = md_expldev(un32->un_sm[i].sm_dev);
56416901Sjkennedy un32->un_sm[i].sm_dev = md_cmpldev
56426901Sjkennedy (md_makedevice(md_major, MD_MKMIN(setno,
56436901Sjkennedy MD_MIN2UNIT(md_getminor(tmpdev)))));
56446901Sjkennedy
56456901Sjkennedy if (!md_update_minor(setno, mddb_getsidenum
56466901Sjkennedy (setno), un32->un_sm[i].sm_key))
56470Sstevel@tonic-gate goto out;
56480Sstevel@tonic-gate }
56491623Stw21770 break;
56501623Stw21770 case MDDB_REV_RB64:
56511623Stw21770 case MDDB_REV_RB64FN:
56520Sstevel@tonic-gate un64 = (mm_unit_t *)mddb_getrecaddr(recid);
56530Sstevel@tonic-gate self_id = &(un64->c.un_self_id);
56540Sstevel@tonic-gate parent_id = &(un64->c.un_parent);
56550Sstevel@tonic-gate record_id = &(un64->c.un_record_id);
56560Sstevel@tonic-gate optrec_id = &(un64->un_rr_dirty_recid);
56570Sstevel@tonic-gate
56580Sstevel@tonic-gate for (i = 0; i < un64->un_nsm; i++) {
56596901Sjkennedy tmpdev = un64->un_sm[i].sm_dev;
56606901Sjkennedy un64->un_sm[i].sm_dev = md_makedevice
56616901Sjkennedy (md_major, MD_MKMIN(setno, MD_MIN2UNIT
56626901Sjkennedy (md_getminor(tmpdev))));
56636901Sjkennedy
56646901Sjkennedy if (!md_update_minor(setno, mddb_getsidenum
56656901Sjkennedy (setno), un64->un_sm[i].sm_key))
56660Sstevel@tonic-gate goto out;
56670Sstevel@tonic-gate }
56681623Stw21770 break;
56691623Stw21770 }
56701623Stw21770
56711623Stw21770 /*
56721623Stw21770 * If this is a top level and a friendly name metadevice,
56731623Stw21770 * update its minor in the namespace.
56741623Stw21770 */
56751623Stw21770 if ((*parent_id == MD_NO_PARENT) &&
56761623Stw21770 ((rbp->rb_revision == MDDB_REV_RBFN) ||
56771623Stw21770 (rbp->rb_revision == MDDB_REV_RB64FN))) {
56781623Stw21770
56791623Stw21770 self_devt = md_makedevice(md_major, *self_id);
56801623Stw21770 if (!md_update_top_device_minor(setno,
56811623Stw21770 mddb_getsidenum(setno), self_devt))
56821623Stw21770 goto out;
56830Sstevel@tonic-gate }
56840Sstevel@tonic-gate
56850Sstevel@tonic-gate /*
56860Sstevel@tonic-gate * Update unit with the imported setno
56870Sstevel@tonic-gate *
56880Sstevel@tonic-gate */
56890Sstevel@tonic-gate mddb_setrecprivate(recid, MD_PRV_GOTIT);
56900Sstevel@tonic-gate
56910Sstevel@tonic-gate *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
56920Sstevel@tonic-gate if (*parent_id != MD_NO_PARENT)
56930Sstevel@tonic-gate *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
56940Sstevel@tonic-gate *record_id = MAKERECID(setno, DBID(*record_id));
56950Sstevel@tonic-gate *optrec_id = MAKERECID(setno, DBID(*optrec_id));
56960Sstevel@tonic-gate
56970Sstevel@tonic-gate gotsomething = 1;
56980Sstevel@tonic-gate }
56990Sstevel@tonic-gate
57000Sstevel@tonic-gate out:
57010Sstevel@tonic-gate return (gotsomething);
57020Sstevel@tonic-gate }
57030Sstevel@tonic-gate
57040Sstevel@tonic-gate /*
57050Sstevel@tonic-gate * NAME: mirror_check_offline
57060Sstevel@tonic-gate *
57070Sstevel@tonic-gate * DESCRIPTION: return offline_status = 1 if any submirrors are offline
57080Sstevel@tonic-gate *
57090Sstevel@tonic-gate * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
57100Sstevel@tonic-gate * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
57110Sstevel@tonic-gate * ioctl.
57120Sstevel@tonic-gate */
57130Sstevel@tonic-gate int
57140Sstevel@tonic-gate mirror_check_offline(md_dev64_t dev, int *offline_status)
57150Sstevel@tonic-gate {
57160Sstevel@tonic-gate mm_unit_t *un;
57170Sstevel@tonic-gate md_error_t mde = mdnullerror;
57180Sstevel@tonic-gate
57190Sstevel@tonic-gate if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
57200Sstevel@tonic-gate return (EINVAL);
57210Sstevel@tonic-gate *offline_status = 0;
57220Sstevel@tonic-gate if (un->c.un_status & MD_UN_OFFLINE_SM)
57230Sstevel@tonic-gate *offline_status = 1;
57240Sstevel@tonic-gate return (0);
57250Sstevel@tonic-gate }
57260Sstevel@tonic-gate
57270Sstevel@tonic-gate /*
57280Sstevel@tonic-gate * NAME: mirror_inc_abr_count
57290Sstevel@tonic-gate *
57300Sstevel@tonic-gate * DESCRIPTION: increment the count of layered soft parts with ABR set
57310Sstevel@tonic-gate *
57320Sstevel@tonic-gate * Called from ioctl, so access to un_abr_count is protected by the global
57330Sstevel@tonic-gate * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
57340Sstevel@tonic-gate */
57350Sstevel@tonic-gate int
57360Sstevel@tonic-gate mirror_inc_abr_count(md_dev64_t dev)
57370Sstevel@tonic-gate {
57380Sstevel@tonic-gate mm_unit_t *un;
57390Sstevel@tonic-gate md_error_t mde = mdnullerror;
57400Sstevel@tonic-gate
57410Sstevel@tonic-gate if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
57420Sstevel@tonic-gate return (EINVAL);
57430Sstevel@tonic-gate un->un_abr_count++;
57440Sstevel@tonic-gate return (0);
57450Sstevel@tonic-gate }
57460Sstevel@tonic-gate
57470Sstevel@tonic-gate /*
57480Sstevel@tonic-gate * NAME: mirror_dec_abr_count
57490Sstevel@tonic-gate *
57500Sstevel@tonic-gate * DESCRIPTION: decrement the count of layered soft parts with ABR set
57510Sstevel@tonic-gate *
57520Sstevel@tonic-gate * Called from ioctl, so access to un_abr_count is protected by the global
57530Sstevel@tonic-gate * ioctl lock. It is only referenced in the MD_IOCOFFLINE ioctl.
57540Sstevel@tonic-gate */
57550Sstevel@tonic-gate int
57560Sstevel@tonic-gate mirror_dec_abr_count(md_dev64_t dev)
57570Sstevel@tonic-gate {
57580Sstevel@tonic-gate mm_unit_t *un;
57590Sstevel@tonic-gate md_error_t mde = mdnullerror;
57600Sstevel@tonic-gate
57610Sstevel@tonic-gate if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
57620Sstevel@tonic-gate return (EINVAL);
57630Sstevel@tonic-gate un->un_abr_count--;
57640Sstevel@tonic-gate return (0);
57650Sstevel@tonic-gate }
57660Sstevel@tonic-gate
57670Sstevel@tonic-gate static md_named_services_t mirror_named_services[] = {
57680Sstevel@tonic-gate {(intptr_t (*)()) poke_hotspares, "poke hotspares" },
57690Sstevel@tonic-gate {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS },
57700Sstevel@tonic-gate {mirror_rename_check, MDRNM_CHECK },
57710Sstevel@tonic-gate {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS },
57720Sstevel@tonic-gate {(intptr_t (*)()) mirror_exchange_parent_update_to,
57730Sstevel@tonic-gate MDRNM_PARENT_UPDATE_TO},
57740Sstevel@tonic-gate {(intptr_t (*)()) mirror_exchange_self_update_from_down,
57750Sstevel@tonic-gate MDRNM_SELF_UPDATE_FROM_DOWN },
57760Sstevel@tonic-gate {(intptr_t (*)())mirror_probe_dev, "probe open test" },
57770Sstevel@tonic-gate {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE },
57780Sstevel@tonic-gate {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT },
57790Sstevel@tonic-gate {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT },
57800Sstevel@tonic-gate { NULL, 0 }
57810Sstevel@tonic-gate };
57820Sstevel@tonic-gate
57830Sstevel@tonic-gate md_ops_t mirror_md_ops = {
57840Sstevel@tonic-gate mirror_open, /* open */
57850Sstevel@tonic-gate mirror_close, /* close */
57860Sstevel@tonic-gate md_mirror_strategy, /* strategy */
57870Sstevel@tonic-gate NULL, /* print */
57880Sstevel@tonic-gate mirror_dump, /* dump */
57890Sstevel@tonic-gate NULL, /* read */
57900Sstevel@tonic-gate NULL, /* write */
57910Sstevel@tonic-gate md_mirror_ioctl, /* mirror_ioctl, */
57920Sstevel@tonic-gate mirror_snarf, /* mirror_snarf */
57930Sstevel@tonic-gate mirror_halt, /* mirror_halt */
57940Sstevel@tonic-gate NULL, /* aread */
57950Sstevel@tonic-gate NULL, /* awrite */
57960Sstevel@tonic-gate mirror_imp_set, /* import set */
57970Sstevel@tonic-gate mirror_named_services
57980Sstevel@tonic-gate };
57990Sstevel@tonic-gate
58000Sstevel@tonic-gate /* module-specific initialization */
58010Sstevel@tonic-gate static void
58020Sstevel@tonic-gate init_init()
58030Sstevel@tonic-gate {
58040Sstevel@tonic-gate md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
58050Sstevel@tonic-gate
58060Sstevel@tonic-gate /* Initialize the parent and child save memory pools */
58070Sstevel@tonic-gate mirror_parent_cache = kmem_cache_create("md_mirror_parent",
58080Sstevel@tonic-gate sizeof (md_mps_t), 0, mirror_parent_constructor,
58090Sstevel@tonic-gate mirror_parent_destructor, mirror_run_queue, NULL, NULL,
58100Sstevel@tonic-gate 0);
58110Sstevel@tonic-gate
58120Sstevel@tonic-gate mirror_child_cache = kmem_cache_create("md_mirror_child",
58130Sstevel@tonic-gate sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
58140Sstevel@tonic-gate mirror_child_constructor, mirror_child_destructor,
58150Sstevel@tonic-gate mirror_run_queue, NULL, NULL, 0);
58160Sstevel@tonic-gate
58170Sstevel@tonic-gate /*
58180Sstevel@tonic-gate * Ensure wowbuf_size is a multiple of DEV_BSIZE,
58190Sstevel@tonic-gate * then initialize the wowbuf memory pool.
58200Sstevel@tonic-gate */
58210Sstevel@tonic-gate md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
58220Sstevel@tonic-gate if (md_wowbuf_size <= 0)
58230Sstevel@tonic-gate md_wowbuf_size = 2 * DEV_BSIZE;
58240Sstevel@tonic-gate if (md_wowbuf_size > (32 * DEV_BSIZE))
58250Sstevel@tonic-gate md_wowbuf_size = (32 * DEV_BSIZE);
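/*
 * With the usual DEV_BSIZE of 512 bytes, the above clamps md_wowbuf_size
 * to the range 1KB .. 16KB, rounded to a whole number of disk blocks.
 */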
58260Sstevel@tonic-gate
58270Sstevel@tonic-gate md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
58280Sstevel@tonic-gate mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
58290Sstevel@tonic-gate md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
58300Sstevel@tonic-gate
58310Sstevel@tonic-gate mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
58320Sstevel@tonic-gate mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
58330Sstevel@tonic-gate
58340Sstevel@tonic-gate mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
58350Sstevel@tonic-gate }
58360Sstevel@tonic-gate
58370Sstevel@tonic-gate /* module-specific uninitialization (undo init_init()) */
58380Sstevel@tonic-gate static void
58390Sstevel@tonic-gate fini_uninit()
58400Sstevel@tonic-gate {
58410Sstevel@tonic-gate kmem_cache_destroy(mirror_parent_cache);
58420Sstevel@tonic-gate kmem_cache_destroy(mirror_child_cache);
58430Sstevel@tonic-gate kmem_cache_destroy(mirror_wowblk_cache);
58440Sstevel@tonic-gate mirror_parent_cache = mirror_child_cache =
58450Sstevel@tonic-gate mirror_wowblk_cache = NULL;
58460Sstevel@tonic-gate
58470Sstevel@tonic-gate mutex_destroy(&mirror_timeout.dr_mx);
58480Sstevel@tonic-gate mutex_destroy(&hotspare_request.dr_mx);
58490Sstevel@tonic-gate mutex_destroy(&non_ff_drv_mutex);
58500Sstevel@tonic-gate }
58510Sstevel@tonic-gate
58520Sstevel@tonic-gate /* define the module linkage */
58534932Spetede MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())