xref: /onnv-gate/usr/src/cmd/lvm/util/metaclust.c (revision 1623)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*1623Stw21770  * Common Development and Distribution License (the "License").
6*1623Stw21770  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate 
220Sstevel@tonic-gate /*
23*1623Stw21770  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
240Sstevel@tonic-gate  * Use is subject to license terms.
250Sstevel@tonic-gate  */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
280Sstevel@tonic-gate 
290Sstevel@tonic-gate #include <meta.h>
300Sstevel@tonic-gate #include <sdssc.h>
310Sstevel@tonic-gate #include <signal.h>
320Sstevel@tonic-gate #include <syslog.h>
330Sstevel@tonic-gate #include <sys/types.h>
340Sstevel@tonic-gate #include <sys/wait.h>
350Sstevel@tonic-gate #include <sys/lvm/md_mirror.h>
360Sstevel@tonic-gate #include <metad.h>
370Sstevel@tonic-gate 
/* Version handling and logging limits */
#define	MY_VERSION		"1.0"	/* the highest supported version */
#define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */

/*
 * Action bits for the 'mode' argument of reset_state().  Each value is a
 * separate bit, so reset_state() tests them individually with '&'.
 */
#define	RESET_OWNER		0x0001	/* drop owners not in membership */
#define	CHOOSE_OWNER		0x0002	/* choose/re-establish mirror owner */
#define	RESET_ABR		0x0004	/* open+close device to reset ABR */
#define	UPDATE_ABR		0x0008	/* copy ABR state from the master */
#define	GET_MIRROR_STATE	0x0010	/* fetch mirror state from master */

/*
 * Per-set information bits.
 * NOTE(review): presumably stored in main()'s set_info[] array; the code
 * that uses them is outside the visible part of this file - confirm.
 */
#define	SET_INFO_NO_WR	0x0002
#define	SET_INFO_MN	0x0004
490Sstevel@tonic-gate 
/*
 * Every metaclust reconfiguration step we understand.
 *
 * MC_UNK is zero and is the initial value of the global 'stepnum', i.e.
 * "no step identified".  The numeric values are spelled out so that the
 * mapping stays obvious; they match the implicit numbering used before.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
640Sstevel@tonic-gate 
650Sstevel@tonic-gate /*
660Sstevel@tonic-gate  * Structure for step_name -> step_number mapping
670Sstevel@tonic-gate  */
680Sstevel@tonic-gate struct step_t {
690Sstevel@tonic-gate 	char		*step_nam;
700Sstevel@tonic-gate 	stepnum_t	step_num;
710Sstevel@tonic-gate };
720Sstevel@tonic-gate 
730Sstevel@tonic-gate /*
740Sstevel@tonic-gate  * Step name to step number mapping table
750Sstevel@tonic-gate  * This table MUST be sorted alphabetically in ascending order of step name
760Sstevel@tonic-gate  */
770Sstevel@tonic-gate static struct step_t step_table[] = {
780Sstevel@tonic-gate 	{ "abort",	MC_ABORT },
790Sstevel@tonic-gate 	{ "return",	MC_RETURN },
800Sstevel@tonic-gate 	{ "start",	MC_START },
810Sstevel@tonic-gate 	{ "step1",	MC_STEP1 },
820Sstevel@tonic-gate 	{ "step2",	MC_STEP2 },
830Sstevel@tonic-gate 	{ "step3",	MC_STEP3 },
840Sstevel@tonic-gate 	{ "step4",	MC_STEP4 },
850Sstevel@tonic-gate 	{ "stop",	MC_STOP }
860Sstevel@tonic-gate };
870Sstevel@tonic-gate 
880Sstevel@tonic-gate /*
890Sstevel@tonic-gate  * If support for a different version is added, the new version number should
900Sstevel@tonic-gate  * be appended to the version_table below. This list will be searched to
910Sstevel@tonic-gate  * determine if a version requested via the -V option is supported or not.
920Sstevel@tonic-gate  */
930Sstevel@tonic-gate static char *version_table[] = {
940Sstevel@tonic-gate 	MY_VERSION
950Sstevel@tonic-gate };
960Sstevel@tonic-gate 
970Sstevel@tonic-gate uint_t	timeout = 0;			/* disable timeout by default */
980Sstevel@tonic-gate char	*version = MY_VERSION;		/* use latest version by default */
990Sstevel@tonic-gate int	stepnum = MC_UNK;		/* reconfiguration step number */
1000Sstevel@tonic-gate pid_t	c_pid;				/* child process id */
1010Sstevel@tonic-gate 
1020Sstevel@tonic-gate /*
1030Sstevel@tonic-gate  * Binary search comparison routine
1040Sstevel@tonic-gate  */
1050Sstevel@tonic-gate static int
1060Sstevel@tonic-gate mc_compare(const void *stp1, const void *stp2)
1070Sstevel@tonic-gate {
1080Sstevel@tonic-gate 	return (strcmp((const char *)stp1,
1090Sstevel@tonic-gate 	    ((const struct step_t *)stp2)->step_nam));
1100Sstevel@tonic-gate }
1110Sstevel@tonic-gate 
1120Sstevel@tonic-gate /*
1130Sstevel@tonic-gate  * Timeout expiry alarm signal handler
1140Sstevel@tonic-gate  */
1150Sstevel@tonic-gate /*ARGSUSED*/
1160Sstevel@tonic-gate static void
1170Sstevel@tonic-gate sigalarmhandler(int sig)
1180Sstevel@tonic-gate {
1190Sstevel@tonic-gate 	int	i, n, ret, stat_loc = 0;
1200Sstevel@tonic-gate 
1210Sstevel@tonic-gate 	n = sizeof (step_table) / sizeof (step_table[0]);
1220Sstevel@tonic-gate 	for (i = 0; i < n; i++) {
1230Sstevel@tonic-gate 		if (stepnum == step_table[i].step_num)
1240Sstevel@tonic-gate 			break;
1250Sstevel@tonic-gate 	}
1260Sstevel@tonic-gate 
1270Sstevel@tonic-gate 	assert(i != n);
1280Sstevel@tonic-gate 
1290Sstevel@tonic-gate 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
1300Sstevel@tonic-gate 	    step_table[i].step_nam,
1310Sstevel@tonic-gate 	    meta_print_hrtime(gethrtime() - start_time));
1320Sstevel@tonic-gate 
1330Sstevel@tonic-gate 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
1340Sstevel@tonic-gate 		/*
1350Sstevel@tonic-gate 		 * The child will wait forever until the status is retrieved
1360Sstevel@tonic-gate 		 * so get it now. Keep retrying if the call is interrupted.
1370Sstevel@tonic-gate 		 *
1380Sstevel@tonic-gate 		 * The possible results are,
1390Sstevel@tonic-gate 		 *
1400Sstevel@tonic-gate 		 *	- child killed successfully
1410Sstevel@tonic-gate 		 *	- signal sent but child not killed
1420Sstevel@tonic-gate 		 *	- waitpid failed/interrupted
1430Sstevel@tonic-gate 		 */
1440Sstevel@tonic-gate 		sleep(2);
1450Sstevel@tonic-gate 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
1460Sstevel@tonic-gate 			if (errno != EINTR) {
1470Sstevel@tonic-gate 				break;
1480Sstevel@tonic-gate 			}
1490Sstevel@tonic-gate 		}
1500Sstevel@tonic-gate 		if ((ret == c_pid) || (errno == ECHILD)) {
1510Sstevel@tonic-gate 			ret = 0;
1520Sstevel@tonic-gate 		} else {
1530Sstevel@tonic-gate 			ret = 1;
1540Sstevel@tonic-gate 		}
1550Sstevel@tonic-gate 	} else if (errno == ESRCH) {
1560Sstevel@tonic-gate 		/*
1570Sstevel@tonic-gate 		 * If the kill did not catch the child then it means the child
1580Sstevel@tonic-gate 		 * exited immediately after the timeout occured.
1590Sstevel@tonic-gate 		 */
1600Sstevel@tonic-gate 		ret = 0;
1610Sstevel@tonic-gate 	}
1620Sstevel@tonic-gate 
1630Sstevel@tonic-gate 	/*
1640Sstevel@tonic-gate 	 * make sure not to exit with 205 for any steps other than step1-step4.
1650Sstevel@tonic-gate 	 * Suncluster reconfiguration can't handle it otherwise.
1660Sstevel@tonic-gate 	 */
1670Sstevel@tonic-gate 	switch (stepnum) {
1680Sstevel@tonic-gate 	case MC_STEP1:
1690Sstevel@tonic-gate 	case MC_STEP2:
1700Sstevel@tonic-gate 	case MC_STEP3:
1710Sstevel@tonic-gate 	case MC_STEP4:
1720Sstevel@tonic-gate 		/*
1730Sstevel@tonic-gate 		 * If the child was killed successfully return 205 for a
1740Sstevel@tonic-gate 		 * new reconfig cycle otherwise send 1 to panic the node.
1750Sstevel@tonic-gate 		 */
1760Sstevel@tonic-gate 		if (ret != 0) {
1770Sstevel@tonic-gate 			md_eprintf(gettext("Could not kill child\n"));
1780Sstevel@tonic-gate 			exit(1);
1790Sstevel@tonic-gate 		} else {
1800Sstevel@tonic-gate 			exit(205);
1810Sstevel@tonic-gate 		}
1820Sstevel@tonic-gate 		break;
1830Sstevel@tonic-gate 	case MC_START:
1840Sstevel@tonic-gate 	case MC_STOP:
1850Sstevel@tonic-gate 	case MC_ABORT:
1860Sstevel@tonic-gate 	case MC_RETURN:
1870Sstevel@tonic-gate 	default:
1880Sstevel@tonic-gate 		exit(1);
1890Sstevel@tonic-gate 		break;
1900Sstevel@tonic-gate 	}
1910Sstevel@tonic-gate }
1920Sstevel@tonic-gate 
1930Sstevel@tonic-gate /*
1940Sstevel@tonic-gate  * Attempt to load local set.
1950Sstevel@tonic-gate  * Returns:
1960Sstevel@tonic-gate  *	pointer to mdsetname_t for local set (local_sp) is successful.
1970Sstevel@tonic-gate  *	0 if failure
1980Sstevel@tonic-gate  *		if there are no local set mddbs, no error message is printed.
1990Sstevel@tonic-gate  *		Otherwise, error message is printed so that user
2000Sstevel@tonic-gate  *		can determine why the local set didn't start.
2010Sstevel@tonic-gate  */
2020Sstevel@tonic-gate mdsetname_t *
2030Sstevel@tonic-gate load_local_set(md_error_t *ep)
2040Sstevel@tonic-gate {
2050Sstevel@tonic-gate 	mdsetname_t	*local_sp = NULL;
2060Sstevel@tonic-gate 
2070Sstevel@tonic-gate 	/* Does local set exist? If not, give no error */
2080Sstevel@tonic-gate 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
2090Sstevel@tonic-gate 		return (0);
2100Sstevel@tonic-gate 	}
2110Sstevel@tonic-gate 
2120Sstevel@tonic-gate 	/*
2130Sstevel@tonic-gate 	 * snarf local set
2140Sstevel@tonic-gate 	 * If fails with MDE_DB_NODB, then just return 1 printing
2150Sstevel@tonic-gate 	 * no failure.
2160Sstevel@tonic-gate 	 * Otherwise, print error message, and return 1.
2170Sstevel@tonic-gate 	 */
2180Sstevel@tonic-gate 	if (meta_setup_db_locations(ep) != 0) {
2190Sstevel@tonic-gate 		if (!(mdismddberror(ep, MDE_DB_NODB)))
2200Sstevel@tonic-gate 			mde_perror(ep, "");
2210Sstevel@tonic-gate 		return (0);
2220Sstevel@tonic-gate 	}
2230Sstevel@tonic-gate 
2240Sstevel@tonic-gate 	/* local set loaded successfully */
2250Sstevel@tonic-gate 	return (local_sp);
2260Sstevel@tonic-gate }
2270Sstevel@tonic-gate 
2280Sstevel@tonic-gate /*
2290Sstevel@tonic-gate  * Purpose:	Compose a full path name for a metadevice
2300Sstevel@tonic-gate  *
2310Sstevel@tonic-gate  * On entry:	sp	- setname pointer
2320Sstevel@tonic-gate  *		mnum	- minor number of metadevice
2330Sstevel@tonic-gate  *		pathname - pointer to array to return path string
2340Sstevel@tonic-gate  *		pathlen	- max length of pathname array
2350Sstevel@tonic-gate  */
2360Sstevel@tonic-gate static int
2370Sstevel@tonic-gate compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
2380Sstevel@tonic-gate {
2390Sstevel@tonic-gate 	int	rtn;
240*1623Stw21770 	mdname_t	*np;
241*1623Stw21770 	md_error_t	status = mdnullerror;
2420Sstevel@tonic-gate 
2430Sstevel@tonic-gate 	if (MD_MIN2SET(mnum) != sp->setno) {
2440Sstevel@tonic-gate 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
2450Sstevel@tonic-gate 		    mnum, sp->setno);
2460Sstevel@tonic-gate 		return (-1);
2470Sstevel@tonic-gate 	}
248*1623Stw21770 
249*1623Stw21770 	if ((np = metamnumname(&sp, mnum, 0, &status)) == NULL) {
250*1623Stw21770 		return (-1);
251*1623Stw21770 	}
252*1623Stw21770 
253*1623Stw21770 	rtn = snprintf(pathname, pathlen, "%s", np->rname);
2540Sstevel@tonic-gate 
2550Sstevel@tonic-gate 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
2560Sstevel@tonic-gate 		md_eprintf(gettext(
257*1623Stw21770 		    "Could not create path for device %s\n"),
258*1623Stw21770 		    get_mdname(sp, mnum));
2590Sstevel@tonic-gate 		return (-1);
2600Sstevel@tonic-gate 	}
2610Sstevel@tonic-gate 	return (0);
2620Sstevel@tonic-gate }
2630Sstevel@tonic-gate 
2640Sstevel@tonic-gate /*
2650Sstevel@tonic-gate  * Purpose:	Walk through all the devices specified for the given set
2660Sstevel@tonic-gate  *		and do the action specified in mode
2670Sstevel@tonic-gate  */
2680Sstevel@tonic-gate static int
2690Sstevel@tonic-gate reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
2700Sstevel@tonic-gate {
2710Sstevel@tonic-gate 	mdnamelist_t			*devnlp = NULL;
2720Sstevel@tonic-gate 	mdnamelist_t			*p;
2730Sstevel@tonic-gate 	mdname_t			*devnp = NULL;
2740Sstevel@tonic-gate 	md_set_mmown_params_t		ownpar_p;
2750Sstevel@tonic-gate 	md_set_mmown_params_t		*ownpar = &ownpar_p;
2760Sstevel@tonic-gate 	md_unit_t			*mm;
2770Sstevel@tonic-gate 	int				mirror_dev = 0;
2780Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl;
2790Sstevel@tonic-gate 	int				cnt;
2800Sstevel@tonic-gate 	int				has_parent;
2810Sstevel@tonic-gate 	md_mn_get_mir_state_t		mir_state_p;
2820Sstevel@tonic-gate 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	/*
2850Sstevel@tonic-gate 	 * if we are choosing or resetting the owners then make sure
2860Sstevel@tonic-gate 	 * we are only doing it for mirror devices
2870Sstevel@tonic-gate 	 */
2880Sstevel@tonic-gate 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
2890Sstevel@tonic-gate 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
2900Sstevel@tonic-gate 		return (-1);
2910Sstevel@tonic-gate 	}
2920Sstevel@tonic-gate 
2930Sstevel@tonic-gate 	/* get a list of all the metadevices for current set */
2940Sstevel@tonic-gate 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
2950Sstevel@tonic-gate 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
2960Sstevel@tonic-gate 		    sp->setname);
2970Sstevel@tonic-gate 		return (-1);
2980Sstevel@tonic-gate 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
2990Sstevel@tonic-gate 		mde_perror(ep, gettext(
3000Sstevel@tonic-gate 		    "Could not get soft partitions for set %s"), sp->setname);
3010Sstevel@tonic-gate 		return (-1);
3020Sstevel@tonic-gate 	}
3030Sstevel@tonic-gate 
3040Sstevel@tonic-gate 	/* If resetting the owner, get the known membership list */
3050Sstevel@tonic-gate 	if (mode & RESET_OWNER) {
3060Sstevel@tonic-gate 		if (meta_read_nodelist(&cnt, &nl, ep)) {
3070Sstevel@tonic-gate 			mde_perror(ep, "Could not get nodelist");
3080Sstevel@tonic-gate 			return (-1);
3090Sstevel@tonic-gate 		}
3100Sstevel@tonic-gate 	}
3110Sstevel@tonic-gate 
3120Sstevel@tonic-gate 	/* for each metadevice */
3130Sstevel@tonic-gate 	for (p = devnlp; (p != NULL); p = p->next) {
3140Sstevel@tonic-gate 		devnp = p->namep;
3150Sstevel@tonic-gate 
3160Sstevel@tonic-gate 		/*
3170Sstevel@tonic-gate 		 * Get the current setting for mirror ABR state and all of the
3180Sstevel@tonic-gate 		 * submirror state and flags from the master node. We only
3190Sstevel@tonic-gate 		 * perform this when going through a 'start' cycle.
3200Sstevel@tonic-gate 		 */
3210Sstevel@tonic-gate 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
3220Sstevel@tonic-gate 			char	*miscname;
3230Sstevel@tonic-gate 
3240Sstevel@tonic-gate 			/*
3250Sstevel@tonic-gate 			 * Ensure that we ignore soft-parts that are returned
3260Sstevel@tonic-gate 			 * from the meta_get_mirror_names() call
3270Sstevel@tonic-gate 			 */
3280Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
3290Sstevel@tonic-gate 				goto out;
3300Sstevel@tonic-gate 			if (strcmp(miscname, MD_MIRROR) != 0)
3310Sstevel@tonic-gate 				continue;
3320Sstevel@tonic-gate 
3330Sstevel@tonic-gate 			mir_state->mnum = meta_getminor(devnp->dev);
3340Sstevel@tonic-gate 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
3350Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
336*1623Stw21770 			    " for %s: %s"), get_mdname(sp, mir_state->mnum),
3370Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
3380Sstevel@tonic-gate 
3390Sstevel@tonic-gate 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
3400Sstevel@tonic-gate 			    "MD_MN_GET_MIRROR_STATE") != 0) {
3410Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get "
342*1623Stw21770 				    "mirror state for %s"),
343*1623Stw21770 				    get_mdname(sp, mir_state->mnum));
3440Sstevel@tonic-gate 				goto out;
3450Sstevel@tonic-gate 			} else {
3460Sstevel@tonic-gate 				continue;
3470Sstevel@tonic-gate 			}
3480Sstevel@tonic-gate 		}
3490Sstevel@tonic-gate 
3500Sstevel@tonic-gate 		/* check if this is a top level metadevice */
3510Sstevel@tonic-gate 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
3520Sstevel@tonic-gate 			goto out;
3530Sstevel@tonic-gate 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
3540Sstevel@tonic-gate 			has_parent = 1;
3550Sstevel@tonic-gate 		} else {
3560Sstevel@tonic-gate 			has_parent = 0;
3570Sstevel@tonic-gate 		}
3580Sstevel@tonic-gate 		Free(mm);
3590Sstevel@tonic-gate 
3600Sstevel@tonic-gate 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
3610Sstevel@tonic-gate 			char	*miscname;
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate 			/*
3640Sstevel@tonic-gate 			 * we can only do these for mirrors so make sure we
3650Sstevel@tonic-gate 			 * really have a mirror device and not a softpartition
3660Sstevel@tonic-gate 			 * imitating one. meta_get_mirror_names seems to think
3670Sstevel@tonic-gate 			 * softparts on top of a mirror are mirrors!
3680Sstevel@tonic-gate 			 */
3690Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
3700Sstevel@tonic-gate 				goto out;
3710Sstevel@tonic-gate 			if (strcmp(miscname, MD_MIRROR) != 0)
3720Sstevel@tonic-gate 				continue;
3730Sstevel@tonic-gate 
3740Sstevel@tonic-gate 			(void) memset(ownpar, 0, sizeof (*ownpar));
3750Sstevel@tonic-gate 			ownpar->d.mnum = meta_getminor(devnp->dev);
3760Sstevel@tonic-gate 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
3770Sstevel@tonic-gate 
3780Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Setting owner "
379*1623Stw21770 			    "for %s: %s"), get_mdname(sp, ownpar->d.mnum),
3800Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
3810Sstevel@tonic-gate 
3820Sstevel@tonic-gate 			/* get the current owner id */
3830Sstevel@tonic-gate 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
3840Sstevel@tonic-gate 			    "MD_MN_GET_MM_OWNER") != 0) {
3850Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get "
386*1623Stw21770 				    "mirror owner for %s"),
387*1623Stw21770 				    get_mdname(sp, ownpar->d.mnum));
3880Sstevel@tonic-gate 				goto out;
3890Sstevel@tonic-gate 			}
3900Sstevel@tonic-gate 		}
3910Sstevel@tonic-gate 
3920Sstevel@tonic-gate 		if (mode & RESET_OWNER) {
3930Sstevel@tonic-gate 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
3940Sstevel@tonic-gate 				mdclrerror(ep);
3950Sstevel@tonic-gate 				continue;
3960Sstevel@tonic-gate 			}
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate 			/*
3990Sstevel@tonic-gate 			 * reset owner only if the current owner is
4000Sstevel@tonic-gate 			 * not in the membership list
4010Sstevel@tonic-gate 			 * Also kill the resync thread so that when the resync
4020Sstevel@tonic-gate 			 * is started, it will perform an optimized resync
4030Sstevel@tonic-gate 			 * for any resync regions that were dirty when the
4040Sstevel@tonic-gate 			 * current owner left the membership.
4050Sstevel@tonic-gate 			 */
4060Sstevel@tonic-gate 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
4070Sstevel@tonic-gate 				if (meta_mn_change_owner(&ownpar,
4080Sstevel@tonic-gate 				    sp->setno, ownpar->d.mnum,
4090Sstevel@tonic-gate 				    MD_MN_MIRROR_UNOWNED,
4100Sstevel@tonic-gate 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
4110Sstevel@tonic-gate 					md_eprintf(gettext(
4120Sstevel@tonic-gate 					    "Unable to reset mirror owner "
413*1623Stw21770 					    "for %s\n"),
414*1623Stw21770 					    get_mdname(sp, ownpar->d.mnum));
4150Sstevel@tonic-gate 					goto out;
4160Sstevel@tonic-gate 				}
4170Sstevel@tonic-gate 				if (meta_mirror_resync(sp, devnp, 0, ep,
4180Sstevel@tonic-gate 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
4190Sstevel@tonic-gate 					md_eprintf(gettext(
4200Sstevel@tonic-gate 					    "Unable to kill resync for"
421*1623Stw21770 					    " %s\n"),
422*1623Stw21770 					    get_mdname(sp, ownpar->d.mnum));
4230Sstevel@tonic-gate 					goto out;
4240Sstevel@tonic-gate 				}
4250Sstevel@tonic-gate 			}
4260Sstevel@tonic-gate 		}
4270Sstevel@tonic-gate 
4280Sstevel@tonic-gate 		if (mode & CHOOSE_OWNER) {
4290Sstevel@tonic-gate 			/*
4300Sstevel@tonic-gate 			 * only orphaned resyncs will have no owner.
4310Sstevel@tonic-gate 			 * if that is the case choose a new owner. Otherwise
4320Sstevel@tonic-gate 			 * re-establish the existing owner. This covers the
4330Sstevel@tonic-gate 			 * case where a node that owned the mirror
4340Sstevel@tonic-gate 			 * reboots/panics and comes back into the cluster before
4350Sstevel@tonic-gate 			 * the reconfig cycle has completed. In this case the
4360Sstevel@tonic-gate 			 * other cluster nodes will have the mirror owner marked
4370Sstevel@tonic-gate 			 * as the rebooted node while it has the owner marked
4380Sstevel@tonic-gate 			 * as 'None'. We have to reestablish the ownership so
4390Sstevel@tonic-gate 			 * that the subsequent resync can continue.
4400Sstevel@tonic-gate 			 */
4410Sstevel@tonic-gate 			if (meta_mn_change_owner(&ownpar, sp->setno,
4420Sstevel@tonic-gate 			    ownpar->d.mnum, ownpar->d.owner,
4430Sstevel@tonic-gate 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
4440Sstevel@tonic-gate 				md_eprintf(gettext("Unable to choose "
445*1623Stw21770 				    "mirror owner for %s\n"),
446*1623Stw21770 				    get_mdname(sp, ownpar->d.mnum));
4470Sstevel@tonic-gate 				goto out;
4480Sstevel@tonic-gate 			}
4490Sstevel@tonic-gate 		}
4500Sstevel@tonic-gate 
4510Sstevel@tonic-gate 		/*
4520Sstevel@tonic-gate 		 * For RESET_ABR and UPDATE_ABR - only handle top
4530Sstevel@tonic-gate 		 * level metadevices.
4540Sstevel@tonic-gate 		 */
4550Sstevel@tonic-gate 		if (has_parent)
4560Sstevel@tonic-gate 			continue;
4570Sstevel@tonic-gate 
4580Sstevel@tonic-gate 		if (mode & RESET_ABR) {
4590Sstevel@tonic-gate 			/*
4600Sstevel@tonic-gate 			 * Reset the ABR (application based recovery)
4610Sstevel@tonic-gate 			 * value on all nodes. We are dealing with
4620Sstevel@tonic-gate 			 * the possibility that we have ABR set but the
4630Sstevel@tonic-gate 			 * only node that had the device open with ABR has
4640Sstevel@tonic-gate 			 * left the cluster. We simply open and close the
4650Sstevel@tonic-gate 			 * device and if this is the last close in the
4660Sstevel@tonic-gate 			 * cluster, ABR will be cleared on all nodes.
4670Sstevel@tonic-gate 			 */
4680Sstevel@tonic-gate 			char		*miscname;
469*1623Stw21770 			char		name[MAXPATHLEN];
4700Sstevel@tonic-gate 			int		mnum, fd;
4710Sstevel@tonic-gate 
4720Sstevel@tonic-gate 			name[0] = '\0';
4730Sstevel@tonic-gate 			mnum = meta_getminor(devnp->dev);
4740Sstevel@tonic-gate 
4750Sstevel@tonic-gate 			/*
4760Sstevel@tonic-gate 			 * Ensure that we don't include soft-parts in the
4770Sstevel@tonic-gate 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
4780Sstevel@tonic-gate 			 * returns a bogus list that includes all soft-parts
4790Sstevel@tonic-gate 			 * built on mirrors.
4800Sstevel@tonic-gate 			 */
4810Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
4820Sstevel@tonic-gate 				goto out;
4830Sstevel@tonic-gate 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
4840Sstevel@tonic-gate 				continue;
4850Sstevel@tonic-gate 
4860Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
487*1623Stw21770 			    "for %s: %s"), get_mdname(sp, mnum),
4880Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
4890Sstevel@tonic-gate 
4900Sstevel@tonic-gate 			/* compose the absolute device path and open it */
4910Sstevel@tonic-gate 			if (compose_path(sp, mnum, &name[0],
4920Sstevel@tonic-gate 			    sizeof (name)) != 0)
4930Sstevel@tonic-gate 				goto out;
4940Sstevel@tonic-gate 			if ((fd = open(name, O_RDWR, 0)) < 0) {
4950Sstevel@tonic-gate 				md_perror(gettext("Could not open device %s"),
4960Sstevel@tonic-gate 				    name);
4970Sstevel@tonic-gate 				continue;
4980Sstevel@tonic-gate 			}
4990Sstevel@tonic-gate 
5000Sstevel@tonic-gate 			(void) close(fd);
5010Sstevel@tonic-gate 		}
5020Sstevel@tonic-gate 
5030Sstevel@tonic-gate 		if (mode & UPDATE_ABR) {
5040Sstevel@tonic-gate 			/*
5050Sstevel@tonic-gate 			 * Update the ABR value on this node. We obtain the
5060Sstevel@tonic-gate 			 * current ABR state from the master node.
5070Sstevel@tonic-gate 			 */
5080Sstevel@tonic-gate 
5090Sstevel@tonic-gate 			char		*miscname;
510*1623Stw21770 			char		name[MAXPATHLEN];
5110Sstevel@tonic-gate 			int		mnum, fd;
5120Sstevel@tonic-gate 			volcap_t	vc;
5130Sstevel@tonic-gate 			uint_t		tstate;
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate 			name[0] = '\0';
5160Sstevel@tonic-gate 			mnum = meta_getminor(devnp->dev);
5170Sstevel@tonic-gate 
5180Sstevel@tonic-gate 			/*
5190Sstevel@tonic-gate 			 * Ensure that we don't include soft-parts in the
5200Sstevel@tonic-gate 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
5210Sstevel@tonic-gate 			 * returns a bogus list that includes all soft-parts
5220Sstevel@tonic-gate 			 * built on mirrors.
5230Sstevel@tonic-gate 			 */
5240Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
5250Sstevel@tonic-gate 				goto out;
5260Sstevel@tonic-gate 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
5270Sstevel@tonic-gate 				continue;
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 			/* Get tstate from Master */
5300Sstevel@tonic-gate 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
5310Sstevel@tonic-gate 			    != 0)
5320Sstevel@tonic-gate 				continue;
5330Sstevel@tonic-gate 			/* If not set on the master, nothing to do */
5340Sstevel@tonic-gate 			if (!(tstate & MD_ABR_CAP))
5350Sstevel@tonic-gate 				continue;
5360Sstevel@tonic-gate 
5370Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
538*1623Stw21770 			    "for %s: %s"), get_mdname(sp, mnum),
5390Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
5400Sstevel@tonic-gate 
5410Sstevel@tonic-gate 			/* compose the absolute device path and open it */
5420Sstevel@tonic-gate 			if (compose_path(sp, mnum, &name[0],
5430Sstevel@tonic-gate 			    sizeof (name)) != 0)
5440Sstevel@tonic-gate 				goto out;
5450Sstevel@tonic-gate 			if ((fd = open(name, O_RDWR, 0)) < 0) {
5460Sstevel@tonic-gate 				md_perror(gettext("Could not open device %s"),
5470Sstevel@tonic-gate 				    name);
5480Sstevel@tonic-gate 				continue;
5490Sstevel@tonic-gate 			}
5500Sstevel@tonic-gate 
5510Sstevel@tonic-gate 			/* set ABR state */
5520Sstevel@tonic-gate 			vc.vc_info = 0;
5530Sstevel@tonic-gate 			vc.vc_set = 0;
5540Sstevel@tonic-gate 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
5550Sstevel@tonic-gate 				/*
5560Sstevel@tonic-gate 				 * Ignore if device does not support this
5570Sstevel@tonic-gate 				 * ioctl
5580Sstevel@tonic-gate 				 */
5590Sstevel@tonic-gate 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
5600Sstevel@tonic-gate 					md_perror(gettext("Could not get "
5610Sstevel@tonic-gate 					    "ABR/DMR state for device %s"),
5620Sstevel@tonic-gate 					    name);
5630Sstevel@tonic-gate 				}
5640Sstevel@tonic-gate 				(void) close(fd);
5650Sstevel@tonic-gate 				continue;
5660Sstevel@tonic-gate 			}
5670Sstevel@tonic-gate 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
5680Sstevel@tonic-gate 				(void) close(fd);
5690Sstevel@tonic-gate 				continue;
5700Sstevel@tonic-gate 			}
5710Sstevel@tonic-gate 
5720Sstevel@tonic-gate 			vc.vc_set = DKV_ABR_CAP;
5730Sstevel@tonic-gate 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
5740Sstevel@tonic-gate 				md_perror(gettext(
5750Sstevel@tonic-gate 				    "Could not set ABR state for "
5760Sstevel@tonic-gate 				    "device %s"), name);
5770Sstevel@tonic-gate 				(void) close(fd);
5780Sstevel@tonic-gate 				goto out;
5790Sstevel@tonic-gate 			} else {
5800Sstevel@tonic-gate 				md_eprintf(gettext(
5810Sstevel@tonic-gate 				    "Setting ABR state on device %s\n"), name);
5820Sstevel@tonic-gate 			}
5830Sstevel@tonic-gate 
5840Sstevel@tonic-gate 			(void) close(fd);
5850Sstevel@tonic-gate 		}
5860Sstevel@tonic-gate 	}
5870Sstevel@tonic-gate 
5880Sstevel@tonic-gate 	/* cleanup */
5890Sstevel@tonic-gate 	if (mode & RESET_OWNER) {
5900Sstevel@tonic-gate 		meta_free_nodelist(nl);
5910Sstevel@tonic-gate 	}
5920Sstevel@tonic-gate 	metafreenamelist(devnlp);
5930Sstevel@tonic-gate 	return (0);
5940Sstevel@tonic-gate 
5950Sstevel@tonic-gate out:
5960Sstevel@tonic-gate 	/* cleanup */
5970Sstevel@tonic-gate 	if (mode & RESET_OWNER) {
5980Sstevel@tonic-gate 		meta_free_nodelist(nl);
5990Sstevel@tonic-gate 	}
6000Sstevel@tonic-gate 	metafreenamelist(devnlp);
6010Sstevel@tonic-gate 	return (-1);
6020Sstevel@tonic-gate }
6030Sstevel@tonic-gate 
6040Sstevel@tonic-gate /*
6050Sstevel@tonic-gate  * Print usage message
6060Sstevel@tonic-gate  */
6070Sstevel@tonic-gate static void
6080Sstevel@tonic-gate usage(mdsetname_t *sp, int eval)
6090Sstevel@tonic-gate {
6100Sstevel@tonic-gate 	(void) fprintf(stderr, gettext("usage:"
6110Sstevel@tonic-gate 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
6120Sstevel@tonic-gate 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
6130Sstevel@tonic-gate 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
6140Sstevel@tonic-gate 	    "\t%s [-V | -? | -h]\n"),
6150Sstevel@tonic-gate 	    myname, myname, myname, myname);
6160Sstevel@tonic-gate 	if (!eval) {
6170Sstevel@tonic-gate 		fprintf(stderr, gettext("\n"
6180Sstevel@tonic-gate 		    "\tValid debug (-d) levels are 1-%d for increasing "
6190Sstevel@tonic-gate 		    "verbosity.\n\tDefault is -d 3.\n\n"
6200Sstevel@tonic-gate 		    "\tValid step values are: return | step1 | step2 | "
6210Sstevel@tonic-gate 		    "step3 | step4\n\n"
6220Sstevel@tonic-gate 		    "\tNodelist is a space-separated list of node id's\n\n"),
6230Sstevel@tonic-gate 		    MAX_DEBUG_LEVEL);
6240Sstevel@tonic-gate 	}
6250Sstevel@tonic-gate 	md_exit(sp, eval);
6260Sstevel@tonic-gate }
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate /*
6290Sstevel@tonic-gate  * Input:	Input takes a config step name followed by a list of
6300Sstevel@tonic-gate  *		possible node id's.
6310Sstevel@tonic-gate  *
6320Sstevel@tonic-gate  * Returns:	  0 - Success
6330Sstevel@tonic-gate  *		  1 - Fail
6340Sstevel@tonic-gate  *			Node will be removed from cluster membership
6350Sstevel@tonic-gate  *			by forcing node to panic.
6360Sstevel@tonic-gate  *		205 - Unsuccessful. Start another reconfig cycle.
6370Sstevel@tonic-gate  *			Problem was encountered that could be fixed by
6380Sstevel@tonic-gate  *			running another reconfig cycle.
6390Sstevel@tonic-gate  *			Problem could be a result of a failure to read
6400Sstevel@tonic-gate  *			the nodelist file or that all work could not be
6410Sstevel@tonic-gate  *			accomplished in a reconfig step in the amount of
6420Sstevel@tonic-gate  *			time given so another reconfig cycle is needed in
6430Sstevel@tonic-gate  *			order to finish the current step.
6440Sstevel@tonic-gate  */
6450Sstevel@tonic-gate int
6460Sstevel@tonic-gate main(int argc, char **argv)
6470Sstevel@tonic-gate {
6480Sstevel@tonic-gate 	mdsetname_t		*sp = NULL;
6490Sstevel@tonic-gate 	md_error_t		status = mdnullerror;
6500Sstevel@tonic-gate 	md_error_t		*ep = &status;
6510Sstevel@tonic-gate 	set_t			max_sets, setno;
6520Sstevel@tonic-gate 	int			c, clust = 0;
6530Sstevel@tonic-gate 	struct sigaction	nsa, osa;
6540Sstevel@tonic-gate 	struct step_t		*step_ptr;
6550Sstevel@tonic-gate 	mdsetname_t		*local_sp = NULL;
6560Sstevel@tonic-gate 	md_drive_desc		*dd;
6570Sstevel@tonic-gate 	int			rval = 0;
6580Sstevel@tonic-gate 	md_set_desc		*sd;
6590Sstevel@tonic-gate 	mddb_block_parm_t	mbp;
6600Sstevel@tonic-gate 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
6610Sstevel@tonic-gate 	int			version_table_size;
6620Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
6630Sstevel@tonic-gate 	int			ret_val;
6640Sstevel@tonic-gate 	mddb_config_t		cfg;
6650Sstevel@tonic-gate 	int			set_info[MD_MAXSETS];
6660Sstevel@tonic-gate 
6670Sstevel@tonic-gate 	/*
6680Sstevel@tonic-gate 	 * Get the locale set up before calling any other routines
6690Sstevel@tonic-gate 	 * with messages to output.  Just in case we're not in a build
6700Sstevel@tonic-gate 	 * environment, make sure that TEXT_DOMAIN gets set to
6710Sstevel@tonic-gate 	 * something.
6720Sstevel@tonic-gate 	 */
6730Sstevel@tonic-gate #if !defined(TEXT_DOMAIN)
6740Sstevel@tonic-gate #define	TEXT_DOMAIN "SYS_TEST"
6750Sstevel@tonic-gate #endif
6760Sstevel@tonic-gate 	(void) setlocale(LC_ALL, "");
6770Sstevel@tonic-gate 	(void) textdomain(TEXT_DOMAIN);
6780Sstevel@tonic-gate 
6790Sstevel@tonic-gate 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
6800Sstevel@tonic-gate 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
6810Sstevel@tonic-gate 		exit(1);
6820Sstevel@tonic-gate 	}
6830Sstevel@tonic-gate 
6840Sstevel@tonic-gate 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
6850Sstevel@tonic-gate 		mde_perror(ep, "");
6860Sstevel@tonic-gate 		md_exit(sp, 1);
6870Sstevel@tonic-gate 	}
6880Sstevel@tonic-gate 
6890Sstevel@tonic-gate 	/*
6900Sstevel@tonic-gate 	 * open log and enable libmeta logging. Do it here explicitly
6910Sstevel@tonic-gate 	 * rather than letting md_init() do it because we are not really
6920Sstevel@tonic-gate 	 * a daemon and that is what md_init() opens the log as.
6930Sstevel@tonic-gate 	 */
6940Sstevel@tonic-gate 	openlog("metaclust", LOG_CONS, LOG_USER);
6950Sstevel@tonic-gate 
6960Sstevel@tonic-gate 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
6970Sstevel@tonic-gate 
6980Sstevel@tonic-gate 	optind = 1;
6990Sstevel@tonic-gate 	opterr = 0;
7000Sstevel@tonic-gate 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
7010Sstevel@tonic-gate 		switch (c) {
7020Sstevel@tonic-gate 		case 'h':
7030Sstevel@tonic-gate 			usage(sp, 0);
7040Sstevel@tonic-gate 			break;
7050Sstevel@tonic-gate 
7060Sstevel@tonic-gate 		case 'd':
7070Sstevel@tonic-gate 			if (sscanf(optarg, "%u", &debug) != 1) {
7080Sstevel@tonic-gate 				md_eprintf(gettext("Invalid debug level\n"));
7090Sstevel@tonic-gate 				md_exit(sp, 1);
7100Sstevel@tonic-gate 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
7110Sstevel@tonic-gate 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
7120Sstevel@tonic-gate 				md_eprintf(gettext("Debug level must be "
7130Sstevel@tonic-gate 				    "between 1 and %d inclusive.\n"),
7140Sstevel@tonic-gate 				    MAX_DEBUG_LEVEL);
7150Sstevel@tonic-gate 				md_eprintf(gettext("Debug level set to %d.\n"),
7160Sstevel@tonic-gate 				    debug);
7170Sstevel@tonic-gate 			}
7180Sstevel@tonic-gate 			break;
7190Sstevel@tonic-gate 
7200Sstevel@tonic-gate 		case 'V':
7210Sstevel@tonic-gate 			version = Strdup(optarg);
7220Sstevel@tonic-gate 			break;
7230Sstevel@tonic-gate 
7240Sstevel@tonic-gate 		case 't':
7250Sstevel@tonic-gate 			if (sscanf(optarg, "%u", &timeout) != 1) {
7260Sstevel@tonic-gate 				md_eprintf(gettext("Invalid timeout value\n"));
7270Sstevel@tonic-gate 				md_exit(sp, 1);
7280Sstevel@tonic-gate 			}
7290Sstevel@tonic-gate 			break;
7300Sstevel@tonic-gate 
7310Sstevel@tonic-gate 		case '?':
7320Sstevel@tonic-gate 			if (optopt == '?') {
7330Sstevel@tonic-gate 				usage(sp, 0);
7340Sstevel@tonic-gate 			} else if (optopt == 'V') {
7350Sstevel@tonic-gate 				int	i;
7360Sstevel@tonic-gate 
7370Sstevel@tonic-gate 				fprintf(stdout, gettext(
7380Sstevel@tonic-gate 				    "%s: Versions Supported:"), myname);
7390Sstevel@tonic-gate 				for (i = 0; i < version_table_size; i++) {
7400Sstevel@tonic-gate 					fprintf(stdout, " %s",
7410Sstevel@tonic-gate 					    version_table[i]);
7420Sstevel@tonic-gate 				}
7430Sstevel@tonic-gate 				fprintf(stdout, "\n");
7440Sstevel@tonic-gate 				md_exit(sp, 0);
7450Sstevel@tonic-gate 			}
7460Sstevel@tonic-gate 			/*FALLTHROUGH*/
7470Sstevel@tonic-gate 
7480Sstevel@tonic-gate 		default:
7490Sstevel@tonic-gate 			usage(sp, 1);
7500Sstevel@tonic-gate 			break;
7510Sstevel@tonic-gate 		}
7520Sstevel@tonic-gate 	}
7530Sstevel@tonic-gate 
7540Sstevel@tonic-gate 	/* initialise the debug level and start time */
7550Sstevel@tonic-gate 	setup_mc_log(debug);
7560Sstevel@tonic-gate 
7570Sstevel@tonic-gate 	/*
7580Sstevel@tonic-gate 	 * check that the version specified (if any) is supported.
7590Sstevel@tonic-gate 	 */
7600Sstevel@tonic-gate 	if (version != NULL) {
7610Sstevel@tonic-gate 		int	i, found = 0;
7620Sstevel@tonic-gate 
7630Sstevel@tonic-gate 		for (i = 0; i < version_table_size; i++) {
7640Sstevel@tonic-gate 			if (strcmp(version, version_table[i]) == 0) {
7650Sstevel@tonic-gate 				found = 1;
7660Sstevel@tonic-gate 				break;
7670Sstevel@tonic-gate 			}
7680Sstevel@tonic-gate 		}
7690Sstevel@tonic-gate 		if (!found) {
7700Sstevel@tonic-gate 			md_eprintf(gettext("Version %s not supported\n"),
7710Sstevel@tonic-gate 			    version);
7720Sstevel@tonic-gate 			md_exit(sp, 1);
7730Sstevel@tonic-gate 		}
7740Sstevel@tonic-gate 	}
7750Sstevel@tonic-gate 
7760Sstevel@tonic-gate 	argc -= optind;
7770Sstevel@tonic-gate 	argv += optind;
7780Sstevel@tonic-gate 
7790Sstevel@tonic-gate 	/* parse arguments */
7800Sstevel@tonic-gate 	if (argc <= 0) {
7810Sstevel@tonic-gate 		usage(sp, 1);
7820Sstevel@tonic-gate 	}
7830Sstevel@tonic-gate 
7840Sstevel@tonic-gate 	/* convert the step name to the corresponding number */
7850Sstevel@tonic-gate 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
7860Sstevel@tonic-gate 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
7870Sstevel@tonic-gate 	if (step_ptr != NULL) {
7880Sstevel@tonic-gate 		stepnum = step_ptr->step_num;
7890Sstevel@tonic-gate 	}
7900Sstevel@tonic-gate 
7910Sstevel@tonic-gate 	--argc;
7920Sstevel@tonic-gate 	++argv;
7930Sstevel@tonic-gate 
7940Sstevel@tonic-gate 	/* set timeout alarm signal, a value of 0 will disable timeout */
7950Sstevel@tonic-gate 	if (timeout > 0) {
7960Sstevel@tonic-gate 		int	stat_loc = 0;
7970Sstevel@tonic-gate 
7980Sstevel@tonic-gate 		c_pid = fork();
7990Sstevel@tonic-gate 
8000Sstevel@tonic-gate 		if (c_pid == (pid_t)-1) {
8010Sstevel@tonic-gate 			md_perror(gettext("Unable to fork"));
8020Sstevel@tonic-gate 			md_exit(sp, 1);
8030Sstevel@tonic-gate 		} else if (c_pid) {
8040Sstevel@tonic-gate 			/* parent */
8050Sstevel@tonic-gate 			nsa.sa_flags = 0;
8060Sstevel@tonic-gate 			if (sigfillset(&nsa.sa_mask) < 0) {
8070Sstevel@tonic-gate 				md_perror(gettext("Unable to set signal mask"));
8080Sstevel@tonic-gate 				md_exit(sp, 1);
8090Sstevel@tonic-gate 			}
8100Sstevel@tonic-gate 
8110Sstevel@tonic-gate 			nsa.sa_handler = sigalarmhandler;
8120Sstevel@tonic-gate 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
8130Sstevel@tonic-gate 				md_perror(gettext("Unable to set alarm "
8140Sstevel@tonic-gate 				    "handler"));
8150Sstevel@tonic-gate 				md_exit(sp, 1);
8160Sstevel@tonic-gate 			}
8170Sstevel@tonic-gate 
8180Sstevel@tonic-gate 			(void) alarm(timeout);
8190Sstevel@tonic-gate 
8200Sstevel@tonic-gate 			/*
8210Sstevel@tonic-gate 			 * wait for child to exit or timeout to expire.
8220Sstevel@tonic-gate 			 * keep retrying if the call is interrupted
8230Sstevel@tonic-gate 			 */
8240Sstevel@tonic-gate 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
8250Sstevel@tonic-gate 				if (errno != EINTR) {
8260Sstevel@tonic-gate 					break;
8270Sstevel@tonic-gate 				}
8280Sstevel@tonic-gate 			}
8290Sstevel@tonic-gate 			if (ret_val == c_pid) {
8300Sstevel@tonic-gate 				/* exit with the child's exit value */
8310Sstevel@tonic-gate 				exit(WEXITSTATUS(stat_loc));
8320Sstevel@tonic-gate 			} else if (errno == ECHILD) {
8330Sstevel@tonic-gate 				md_exit(sp, 0);
8340Sstevel@tonic-gate 			} else {
8350Sstevel@tonic-gate 				perror(myname);
8360Sstevel@tonic-gate 				md_exit(sp, 1);
8370Sstevel@tonic-gate 			}
8380Sstevel@tonic-gate 		}
8390Sstevel@tonic-gate 	}
8400Sstevel@tonic-gate 
8410Sstevel@tonic-gate 	/*
8420Sstevel@tonic-gate 	 * If a timeout value is given, everything from this point onwards is
8430Sstevel@tonic-gate 	 * executed in the child process.
8440Sstevel@tonic-gate 	 */
8450Sstevel@tonic-gate 
8460Sstevel@tonic-gate 	switch (stepnum) {
8470Sstevel@tonic-gate 	case MC_START:
8480Sstevel@tonic-gate 		/*
8490Sstevel@tonic-gate 		 * Start Step
8500Sstevel@tonic-gate 		 *
8510Sstevel@tonic-gate 		 * - Suspend all rpc.mdcommd messages
8520Sstevel@tonic-gate 		 */
8530Sstevel@tonic-gate 
8540Sstevel@tonic-gate 		/* expect the local node id to be given only */
8550Sstevel@tonic-gate 		if (argc != 1)
8560Sstevel@tonic-gate 			usage(sp, 1);
8570Sstevel@tonic-gate 
8580Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
8590Sstevel@tonic-gate 		    meta_print_hrtime(0));
8600Sstevel@tonic-gate 
8610Sstevel@tonic-gate 		/*
8620Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
8630Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
8640Sstevel@tonic-gate 		 * the local set cannot be started.
8650Sstevel@tonic-gate 		 */
8660Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
8670Sstevel@tonic-gate 			md_exit(local_sp, 0);
8680Sstevel@tonic-gate 		}
8690Sstevel@tonic-gate 
8700Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
8710Sstevel@tonic-gate 			mde_perror(ep, "");
8720Sstevel@tonic-gate 			md_exit(sp, 1);
8730Sstevel@tonic-gate 		}
8740Sstevel@tonic-gate 
8750Sstevel@tonic-gate 		/* start walking through all possible disksets */
8760Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
8770Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
8780Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
8790Sstevel@tonic-gate 					/* No set for this setno - continue */
8800Sstevel@tonic-gate 					mdclrerror(ep);
8810Sstevel@tonic-gate 					continue;
8820Sstevel@tonic-gate 				} else {
8830Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
8840Sstevel@tonic-gate 					    "get set %d information"), setno);
8850Sstevel@tonic-gate 					md_exit(sp, 1);
8860Sstevel@tonic-gate 				}
8870Sstevel@tonic-gate 			}
8880Sstevel@tonic-gate 
8890Sstevel@tonic-gate 			/* only check multi-node disksets */
8900Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
8910Sstevel@tonic-gate 				mdclrerror(ep);
8920Sstevel@tonic-gate 				continue;
8930Sstevel@tonic-gate 			}
8940Sstevel@tonic-gate 
8950Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
8960Sstevel@tonic-gate 			    "messages for set %s: %s"), sp->setname,
8970Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
8980Sstevel@tonic-gate 
8990Sstevel@tonic-gate 			/*
9000Sstevel@tonic-gate 			 * Mddb parse messages are sent amongst the nodes
9010Sstevel@tonic-gate 			 * in a diskset whenever the locator block or
9020Sstevel@tonic-gate 			 * locator names structure has been changed.
9030Sstevel@tonic-gate 			 * A locator block change could occur as a result
9040Sstevel@tonic-gate 			 * of a disk failure during the reconfig cycle,
9050Sstevel@tonic-gate 			 * so block the mddb parse messages while the
9060Sstevel@tonic-gate 			 * rpc.mdcommd is suspended during the reconfig cycle.
9070Sstevel@tonic-gate 			 */
9080Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
9090Sstevel@tonic-gate 				(void) memset(&mbp, 0, sizeof (mbp));
9100Sstevel@tonic-gate 				mbp.c_setno = setno;
9110Sstevel@tonic-gate 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
9120Sstevel@tonic-gate 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
9130Sstevel@tonic-gate 				    &mbp.c_mde, NULL)) {
9140Sstevel@tonic-gate 					mdstealerror(ep, &mbp.c_mde);
9150Sstevel@tonic-gate 					mde_perror(ep, gettext("Could not "
9160Sstevel@tonic-gate 					    "block set %s"), sp->setname);
9170Sstevel@tonic-gate 					md_exit(sp, 1);
9180Sstevel@tonic-gate 				}
9190Sstevel@tonic-gate 			}
9200Sstevel@tonic-gate 
9210Sstevel@tonic-gate 			/* suspend commd and spin waiting for drain */
9220Sstevel@tonic-gate 			while ((ret_val = mdmn_suspend(setno,
9230Sstevel@tonic-gate 			    MD_COMM_ALL_CLASSES)) ==
9240Sstevel@tonic-gate 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
9250Sstevel@tonic-gate 				sleep(1);
9260Sstevel@tonic-gate 			}
9270Sstevel@tonic-gate 
9280Sstevel@tonic-gate 			if (ret_val) {
9290Sstevel@tonic-gate 				md_eprintf(gettext("Could not suspend "
9300Sstevel@tonic-gate 				    "rpc.mdcommd for set %s\n"), sp->setname);
9310Sstevel@tonic-gate 				md_exit(sp, 1);
9320Sstevel@tonic-gate 			}
9330Sstevel@tonic-gate 
9340Sstevel@tonic-gate 			/*
9350Sstevel@tonic-gate 			 * Set start step flag for set. This is set to indicate
93646Sskamm 			 * that this node entered the reconfig cycle through
93746Sskamm 			 * the start step.  This is used during the reconfig
93846Sskamm 			 * cycle to determine whether the node had entered
93946Sskamm 			 * through the start step or the return step.
9400Sstevel@tonic-gate 			 */
9410Sstevel@tonic-gate 			(void) memset(&sf, 0, sizeof (sf));
9420Sstevel@tonic-gate 			sf.sf_setno = sp->setno;
9430Sstevel@tonic-gate 			sf.sf_setflags = MD_SET_MN_START_RC;
9440Sstevel@tonic-gate 			sf.sf_flags = MDDB_NM_SET;
9450Sstevel@tonic-gate 			/* Use magic to help protect ioctl against attack. */
9460Sstevel@tonic-gate 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
9470Sstevel@tonic-gate 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
9480Sstevel@tonic-gate 			    &sf.sf_mde, NULL)) {
9490Sstevel@tonic-gate 				mdstealerror(ep, &sf.sf_mde);
9500Sstevel@tonic-gate 				mde_perror(ep, gettext("Could not set "
9510Sstevel@tonic-gate 				    "start_step flag for set %s"), sp->setname);
9520Sstevel@tonic-gate 				md_exit(sp, 1);
9530Sstevel@tonic-gate 			}
9540Sstevel@tonic-gate 
9550Sstevel@tonic-gate 		}
9560Sstevel@tonic-gate 
9570Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
9580Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
9590Sstevel@tonic-gate 
9600Sstevel@tonic-gate 		break;
9610Sstevel@tonic-gate 
9620Sstevel@tonic-gate 	case MC_STOP:
9630Sstevel@tonic-gate 		/*
9640Sstevel@tonic-gate 		 * Stop Step
9650Sstevel@tonic-gate 		 *
9660Sstevel@tonic-gate 		 * - ???
9670Sstevel@tonic-gate 		 */
9680Sstevel@tonic-gate 
9690Sstevel@tonic-gate 		/* don't expect any more arguments to follow the step name */
9700Sstevel@tonic-gate 		if (argc != 0)
9710Sstevel@tonic-gate 			usage(sp, 1);
9720Sstevel@tonic-gate 
9730Sstevel@tonic-gate 		break;
9740Sstevel@tonic-gate 
9750Sstevel@tonic-gate 	case MC_ABORT:
9760Sstevel@tonic-gate 		/*
9770Sstevel@tonic-gate 		 * Abort Step
9780Sstevel@tonic-gate 		 *
9790Sstevel@tonic-gate 		 * - Abort rpc.mdcommd
9800Sstevel@tonic-gate 		 */
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate 		/* don't expect any more arguments to follow the step name */
9830Sstevel@tonic-gate 		if (argc != 0)
9840Sstevel@tonic-gate 			usage(sp, 1);
9850Sstevel@tonic-gate 
9860Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
9870Sstevel@tonic-gate 		    meta_print_hrtime(0));
9880Sstevel@tonic-gate 
9890Sstevel@tonic-gate 		/*
9900Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
9910Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
9920Sstevel@tonic-gate 		 * the local set cannot be started.
9930Sstevel@tonic-gate 		 */
9940Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
9950Sstevel@tonic-gate 			md_exit(local_sp, 0);
9960Sstevel@tonic-gate 		}
9970Sstevel@tonic-gate 
9980Sstevel@tonic-gate 		/*
9990Sstevel@tonic-gate 		 * abort the rpc.mdcommd.  The abort is only issued on this node
10000Sstevel@tonic-gate 		 * meaning that the abort reconfig step is called on this
10010Sstevel@tonic-gate 		 * node before a panic while the rest of the cluster will
10020Sstevel@tonic-gate 		 * undergo a reconfig cycle.
10030Sstevel@tonic-gate 		 * There is no time relation between this node running a
10040Sstevel@tonic-gate 		 * reconfig abort and the rest of the cluster
10050Sstevel@tonic-gate 		 * running a reconfig cycle meaning that this node may
10060Sstevel@tonic-gate 		 * panic before, during or after the cluster has run
10070Sstevel@tonic-gate 		 * a reconfig cycle.
10080Sstevel@tonic-gate 		 */
10090Sstevel@tonic-gate 		mdmn_abort();
10100Sstevel@tonic-gate 
10110Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
10120Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
10130Sstevel@tonic-gate 
10140Sstevel@tonic-gate 		break;
10150Sstevel@tonic-gate 
10160Sstevel@tonic-gate 	case MC_RETURN:
10170Sstevel@tonic-gate 		/*
10180Sstevel@tonic-gate 		 * Return Step
10190Sstevel@tonic-gate 		 *
10200Sstevel@tonic-gate 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
10210Sstevel@tonic-gate 		 *   and release local set lock.  Grabbing the local set
10220Sstevel@tonic-gate 		 *   lock allows any active metaset/metadb commands to
10230Sstevel@tonic-gate 		 *   terminate gracefully and will keep a metaset/metadb
10240Sstevel@tonic-gate 		 *   command from starting until the DRAIN ALL is issued.
10250Sstevel@tonic-gate 		 *   The metaset/metadb commands can issue
10260Sstevel@tonic-gate 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
10270Sstevel@tonic-gate 		 *   so the return step must not issue the DRAIN ALL command
10280Sstevel@tonic-gate 		 *   until metaset/metadb have finished or metaset may issue
10290Sstevel@tonic-gate 		 *   a RESUME ALL after this return reconfig step has issued
10300Sstevel@tonic-gate 		 *   the DRAIN ALL command.
10310Sstevel@tonic-gate 		 *   After this reconfig step has issued the DRAIN_ALL and
10320Sstevel@tonic-gate 		 *   released the local set lock, metaset/metadb will fail
10330Sstevel@tonic-gate 		 *   when attempting to contact the rpc.mdcommd and will
10340Sstevel@tonic-gate 		 *   terminate without making any configuration changes.
10350Sstevel@tonic-gate 		 *   The DRAIN ALL command will keep all other meta* commands
10360Sstevel@tonic-gate 		 *   from running during the reconfig cycle (these commands
10370Sstevel@tonic-gate 		 *   will wait until the rpc.mdcommd is resumed) since the
10380Sstevel@tonic-gate 		 *   reconfig cycle may be changing the diskset configuration.
10390Sstevel@tonic-gate 		 */
10400Sstevel@tonic-gate 
10410Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
10420Sstevel@tonic-gate 		if (argc < 1)
10430Sstevel@tonic-gate 			usage(sp, 1);
10440Sstevel@tonic-gate 
10450Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
10460Sstevel@tonic-gate 		    meta_print_hrtime(0));
10470Sstevel@tonic-gate 
10480Sstevel@tonic-gate 		/*
10490Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
10500Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
10510Sstevel@tonic-gate 		 * the local set cannot be started.
10520Sstevel@tonic-gate 		 */
10530Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
10540Sstevel@tonic-gate 			md_exit(local_sp, 0);
10550Sstevel@tonic-gate 		}
10560Sstevel@tonic-gate 
10570Sstevel@tonic-gate 		/*
10580Sstevel@tonic-gate 		 * Suspend any mirror resyncs that are in progress. This
10590Sstevel@tonic-gate 		 * stops unnecessary timeouts.
10600Sstevel@tonic-gate 		 */
10610Sstevel@tonic-gate 		meta_mirror_resync_block_all();
10620Sstevel@tonic-gate 
10630Sstevel@tonic-gate 		if (meta_lock(local_sp, TRUE, ep) != 0) {
10640Sstevel@tonic-gate 			mde_perror(ep, "");
10650Sstevel@tonic-gate 			md_exit(local_sp, 1);
10660Sstevel@tonic-gate 		}
10670Sstevel@tonic-gate 
10680Sstevel@tonic-gate 		/*
10690Sstevel@tonic-gate 		 * All metaset and metadb commands on this node have now
10700Sstevel@tonic-gate 		 * terminated gracefully.  Now, issue a drain all to
10710Sstevel@tonic-gate 		 * the rpc.mdcommd.  Any meta command issued after the
10720Sstevel@tonic-gate 		 * drain all will either spin sending the command to the
10730Sstevel@tonic-gate 		 * master until after the reconfig cycle has finished OR
10740Sstevel@tonic-gate 		 * will terminate gracefully (metaset/metadb).
10750Sstevel@tonic-gate 		 */
10760Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
10770Sstevel@tonic-gate 			mde_perror(ep, "");
10780Sstevel@tonic-gate 			md_exit(sp, 1);
10790Sstevel@tonic-gate 		}
10800Sstevel@tonic-gate 
10810Sstevel@tonic-gate 		/* start walking through all possible disksets */
10820Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
10830Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
10840Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
10850Sstevel@tonic-gate 					/* No set for this setno - continue */
10860Sstevel@tonic-gate 					mdclrerror(ep);
10870Sstevel@tonic-gate 					continue;
10880Sstevel@tonic-gate 				} else {
10890Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
10900Sstevel@tonic-gate 					    "get set %d information"), setno);
10910Sstevel@tonic-gate 					md_exit(sp, 1);
10920Sstevel@tonic-gate 				}
10930Sstevel@tonic-gate 			}
10940Sstevel@tonic-gate 
10950Sstevel@tonic-gate 			/* only check multi-node disksets */
10960Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
10970Sstevel@tonic-gate 				mdclrerror(ep);
10980Sstevel@tonic-gate 				continue;
10990Sstevel@tonic-gate 			}
11000Sstevel@tonic-gate 
11010Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
11020Sstevel@tonic-gate 			    "messages for set %s: %s"), sp->setname,
11030Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
11040Sstevel@tonic-gate 
11050Sstevel@tonic-gate 			/*
11060Sstevel@tonic-gate 			 * Mddb parse messages are sent amongst the nodes
11070Sstevel@tonic-gate 			 * in a diskset whenever the locator block or
11080Sstevel@tonic-gate 			 * locator names structure has been changed.
11090Sstevel@tonic-gate 			 * A locator block change could occur as a result
11100Sstevel@tonic-gate 			 * of a disk failure during the reconfig cycle,
11110Sstevel@tonic-gate 			 * so block the mddb parse messages while the
11120Sstevel@tonic-gate 			 * rpc.mdcommd is suspended during the reconfig cycle.
11130Sstevel@tonic-gate 			 */
11140Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
11150Sstevel@tonic-gate 				(void) memset(&mbp, 0, sizeof (mbp));
11160Sstevel@tonic-gate 				mbp.c_setno = setno;
11170Sstevel@tonic-gate 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
11180Sstevel@tonic-gate 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
11190Sstevel@tonic-gate 				    &mbp.c_mde, NULL)) {
11200Sstevel@tonic-gate 					mdstealerror(ep, &mbp.c_mde);
11210Sstevel@tonic-gate 					mde_perror(ep, gettext("Could not "
11220Sstevel@tonic-gate 					    "block set %s"), sp->setname);
11230Sstevel@tonic-gate 					md_exit(sp, 1);
11240Sstevel@tonic-gate 				}
11250Sstevel@tonic-gate 			}
11260Sstevel@tonic-gate 
11270Sstevel@tonic-gate 			/* suspend commd and spin waiting for drain */
11280Sstevel@tonic-gate 			while ((ret_val = mdmn_suspend(setno,
11290Sstevel@tonic-gate 			    MD_COMM_ALL_CLASSES)) ==
11300Sstevel@tonic-gate 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
11310Sstevel@tonic-gate 				sleep(1);
11320Sstevel@tonic-gate 			}
11330Sstevel@tonic-gate 
11340Sstevel@tonic-gate 			if (ret_val) {
11350Sstevel@tonic-gate 				md_eprintf(gettext("Could not suspend "
11360Sstevel@tonic-gate 				    "rpc.mdcommd for set %s\n"), sp->setname);
11370Sstevel@tonic-gate 				md_exit(sp, 1);
11380Sstevel@tonic-gate 			}
11390Sstevel@tonic-gate 		}
11400Sstevel@tonic-gate 		/*
11410Sstevel@tonic-gate 		 * Resume all I/Os for this node for all MN sets in
11420Sstevel@tonic-gate 		 * case master node had suspended I/Os but panic'd
11430Sstevel@tonic-gate 		 * before resuming I/Os.  In case of failure, exit
11440Sstevel@tonic-gate 		 * with a 1 since unable to resume I/Os on this node.
11450Sstevel@tonic-gate 		 */
11460Sstevel@tonic-gate 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
11470Sstevel@tonic-gate 			mde_perror(ep, gettext(
11480Sstevel@tonic-gate 			    "Unable to resume I/O on node %s for all sets"),
11490Sstevel@tonic-gate 			    mynode());
11500Sstevel@tonic-gate 			md_exit(sp, 1);
11510Sstevel@tonic-gate 		}
11520Sstevel@tonic-gate 
11530Sstevel@tonic-gate 
11540Sstevel@tonic-gate 		/*
11550Sstevel@tonic-gate 		 * Can now unlock local set lock.  New metaset/metadb
11560Sstevel@tonic-gate 		 * commands are now held off using drain all.
11570Sstevel@tonic-gate 		 */
11580Sstevel@tonic-gate 		(void) meta_unlock(local_sp, ep);
11590Sstevel@tonic-gate 
11600Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
11610Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
11620Sstevel@tonic-gate 
11630Sstevel@tonic-gate 		break;
11640Sstevel@tonic-gate 
11650Sstevel@tonic-gate 	case MC_STEP1:
11660Sstevel@tonic-gate 		/*
11670Sstevel@tonic-gate 		 * Step 1
11680Sstevel@tonic-gate 		 *
11690Sstevel@tonic-gate 		 * - Populate nodelist file if we are on clustering
11700Sstevel@tonic-gate 		 *   and pick a master node for each MN diskset.
11710Sstevel@tonic-gate 		 */
11720Sstevel@tonic-gate 
11730Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
11740Sstevel@tonic-gate 		if (argc < 1)
11750Sstevel@tonic-gate 			usage(sp, 1);
11760Sstevel@tonic-gate 
11770Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
11780Sstevel@tonic-gate 		    meta_print_hrtime(0));
11790Sstevel@tonic-gate 
11800Sstevel@tonic-gate 		/* Always write nodelist file even if no local set exists */
11810Sstevel@tonic-gate 		if (clust == SDSSC_OKAY) {
11820Sstevel@tonic-gate 			/* skip to the nodelist args */
11830Sstevel@tonic-gate 			if (meta_write_nodelist(argc, argv, ep) != 0) {
11840Sstevel@tonic-gate 				mde_perror(ep, gettext(
11850Sstevel@tonic-gate 				    "Could not populate nodelist file"));
11860Sstevel@tonic-gate 				md_exit(sp, 1);
11870Sstevel@tonic-gate 			}
11880Sstevel@tonic-gate 		}
11890Sstevel@tonic-gate 
11900Sstevel@tonic-gate 		/*
11910Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
11920Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
11930Sstevel@tonic-gate 		 * the local set cannot be started.
11940Sstevel@tonic-gate 		 */
11950Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
11960Sstevel@tonic-gate 			md_exit(local_sp, 0);
11970Sstevel@tonic-gate 		}
11980Sstevel@tonic-gate 
11990Sstevel@tonic-gate 		/*
12000Sstevel@tonic-gate 		 * At this point, all meta* commands are blocked across
12010Sstevel@tonic-gate 		 * all disksets since the master rpc.mdcommd has drained or
12020Sstevel@tonic-gate 		 * the master node has died.
12030Sstevel@tonic-gate 		 * If a metaset or metadb command had been in progress
12040Sstevel@tonic-gate 		 * at the start of the reconfig cycle, this command has
12050Sstevel@tonic-gate 		 * either completed or it has been terminated due to
12060Sstevel@tonic-gate 		 * the death of the master node.
12070Sstevel@tonic-gate 		 *
12080Sstevel@tonic-gate 		 * This means that it is now ok to remove any
12090Sstevel@tonic-gate 		 * outstanding clnt_locks associated with multinode
12100Sstevel@tonic-gate 		 * disksets on this node due to a node panic during
12110Sstevel@tonic-gate 		 * a metaset operation.  This allows the routines that
12120Sstevel@tonic-gate 		 * choose the master to use rpc.metad to determine the
12130Sstevel@tonic-gate 		 * master of the diskset.
12140Sstevel@tonic-gate 		 */
12150Sstevel@tonic-gate 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
12160Sstevel@tonic-gate 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
12170Sstevel@tonic-gate 			    "clear locks failed %s"),
12180Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
12190Sstevel@tonic-gate 			md_exit(local_sp, 1);
12200Sstevel@tonic-gate 		}
12210Sstevel@tonic-gate 
12220Sstevel@tonic-gate 		/*
12230Sstevel@tonic-gate 		 * Call reconfig_choose_master to choose a master for
12240Sstevel@tonic-gate 		 * each MN diskset, update the nodelist for each diskset
12250Sstevel@tonic-gate 		 * given the member information and send a reinit message
12260Sstevel@tonic-gate 		 * to rpc.mdcommd to reload the nodelist.
12270Sstevel@tonic-gate 		 */
12280Sstevel@tonic-gate 		rval = meta_reconfig_choose_master(ep);
12290Sstevel@tonic-gate 		if (rval == 205) {
12300Sstevel@tonic-gate 			/*
12310Sstevel@tonic-gate 			 * NOTE: Should issue call to reboot remote host that
12320Sstevel@tonic-gate 			 * is causing the RPC failure.  Clustering to
12330Sstevel@tonic-gate 			 * provide interface in the future.  This should
12340Sstevel@tonic-gate 			 * stop a never-ending set of 205 reconfig cycles.
12350Sstevel@tonic-gate 			 * Remote host causing failure is stored in
12360Sstevel@tonic-gate 			 * ep->host if ep is an RPC error.
12370Sstevel@tonic-gate 			 * if (mdanyrpcerror(ep))
12380Sstevel@tonic-gate 			 * 	reboot (ep->host);
12390Sstevel@tonic-gate 			 */
12400Sstevel@tonic-gate 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
12410Sstevel@tonic-gate 			    "choose master failure of 205 %s"),
12420Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
12430Sstevel@tonic-gate 			md_exit(local_sp, 205);
12440Sstevel@tonic-gate 		} else if (rval != 0) {
12450Sstevel@tonic-gate 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
12460Sstevel@tonic-gate 			    "choose master failure %s"),
12470Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
12480Sstevel@tonic-gate 			md_exit(local_sp, 1);
12490Sstevel@tonic-gate 		}
12500Sstevel@tonic-gate 
12510Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
12520Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
12530Sstevel@tonic-gate 
12540Sstevel@tonic-gate 		md_exit(local_sp, rval);
12550Sstevel@tonic-gate 		break;
12560Sstevel@tonic-gate 
12570Sstevel@tonic-gate 	case MC_STEP2:
12580Sstevel@tonic-gate 		/*
12590Sstevel@tonic-gate 		 * Step 2
12600Sstevel@tonic-gate 		 *
12610Sstevel@tonic-gate 		 * In Step 2, each node walks the list of disksets.  If a
12620Sstevel@tonic-gate 		 * node is a master of a MN diskset, it synchronizes
12630Sstevel@tonic-gate 		 * the local set USER records for that diskset.
12640Sstevel@tonic-gate 		 *
12650Sstevel@tonic-gate 		 * If disks exist in the diskset and there is a joined
12660Sstevel@tonic-gate 		 * (owner) node in the diskset, the master will also:
12670Sstevel@tonic-gate 		 *	- synchronize the diskset mddbs to the master
12680Sstevel@tonic-gate 		 *	- play the change log
12690Sstevel@tonic-gate 		 *
12700Sstevel@tonic-gate 		 * The master node will now attempt to join any unjoined
12710Sstevel@tonic-gate 		 * nodes that are currently members in the membership list.
12720Sstevel@tonic-gate 		 */
12730Sstevel@tonic-gate 
12740Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
12750Sstevel@tonic-gate 		if (argc < 1)
12760Sstevel@tonic-gate 			usage(sp, 1);
12770Sstevel@tonic-gate 
12780Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
12790Sstevel@tonic-gate 		    meta_print_hrtime(0));
12800Sstevel@tonic-gate 
12810Sstevel@tonic-gate 		/*
12820Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
12830Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
12840Sstevel@tonic-gate 		 * the local set cannot be started.
12850Sstevel@tonic-gate 		 */
12860Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
12870Sstevel@tonic-gate 			md_exit(local_sp, 0);
12880Sstevel@tonic-gate 		}
12890Sstevel@tonic-gate 
12900Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
12910Sstevel@tonic-gate 			mde_perror(ep, "");
12920Sstevel@tonic-gate 			md_exit(local_sp, 1);
12930Sstevel@tonic-gate 		}
12940Sstevel@tonic-gate 
12950Sstevel@tonic-gate 		/* start walking through all possible disksets */
12960Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
12970Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
12980Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
12990Sstevel@tonic-gate 					/* No set for this setno - continue */
13000Sstevel@tonic-gate 					mdclrerror(ep);
13010Sstevel@tonic-gate 					continue;
13020Sstevel@tonic-gate 				} else if (mdanyrpcerror(ep)) {
13030Sstevel@tonic-gate 					/* Fail on RPC failure to self */
13040Sstevel@tonic-gate 					mde_perror(ep, gettext(
13050Sstevel@tonic-gate 					    "Unable to get information for "
13060Sstevel@tonic-gate 					    "set number %d"), setno);
13070Sstevel@tonic-gate 					md_exit(local_sp, 1);
13080Sstevel@tonic-gate 				} else {
13090Sstevel@tonic-gate 					mde_perror(ep, gettext(
13100Sstevel@tonic-gate 					    "Unable to get information for "
13110Sstevel@tonic-gate 					    "set number %d"), setno);
13120Sstevel@tonic-gate 					mdclrerror(ep);
13130Sstevel@tonic-gate 					continue;
13140Sstevel@tonic-gate 				}
13150Sstevel@tonic-gate 			}
13160Sstevel@tonic-gate 
13170Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
13180Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
13190Sstevel@tonic-gate 					/* Fail on RPC failure to self */
13200Sstevel@tonic-gate 					mde_perror(ep, gettext(
13210Sstevel@tonic-gate 					    "Unable to get information for "
13220Sstevel@tonic-gate 					    "set number %d"), setno);
13230Sstevel@tonic-gate 					md_exit(local_sp, 1);
13240Sstevel@tonic-gate 				}
13250Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
13260Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
13270Sstevel@tonic-gate 				mdclrerror(ep);
13280Sstevel@tonic-gate 				continue;
13290Sstevel@tonic-gate 			}
13300Sstevel@tonic-gate 
13310Sstevel@tonic-gate 			/* Only check MN disksets */
13320Sstevel@tonic-gate 			if (!(MD_MNSET_DESC(sd))) {
13330Sstevel@tonic-gate 				continue;
13340Sstevel@tonic-gate 			}
13350Sstevel@tonic-gate 
13360Sstevel@tonic-gate 			/* All actions in step 2 are driven by master */
13370Sstevel@tonic-gate 			if (!(sd->sd_mn_am_i_master)) {
13380Sstevel@tonic-gate 				continue;
13390Sstevel@tonic-gate 			}
13400Sstevel@tonic-gate 
13410Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
13420Sstevel@tonic-gate 			    "synchronization for set %s: %s"), sp->setname,
13430Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
13440Sstevel@tonic-gate 
13450Sstevel@tonic-gate 			/*
13460Sstevel@tonic-gate 			 * Synchronize the USER records in the local mddbs
13470Sstevel@tonic-gate 			 * for hosts that are members.  The USER records
13480Sstevel@tonic-gate 			 * contain set, drive and host information.
13490Sstevel@tonic-gate 			 */
13500Sstevel@tonic-gate 			rval = meta_mnsync_user_records(sp, ep);
13510Sstevel@tonic-gate 			if (rval != 0) {
13520Sstevel@tonic-gate 				mde_perror(ep, gettext(
13530Sstevel@tonic-gate 				    "Synchronization of user records "
13540Sstevel@tonic-gate 				    "in set %s failed\n"), sp->setname);
13550Sstevel@tonic-gate 				if (rval == 205) {
13560Sstevel@tonic-gate 					/*
13570Sstevel@tonic-gate 					 * NOTE: Should issue call to reboot
13580Sstevel@tonic-gate 					 * remote host that is causing the RPC
13590Sstevel@tonic-gate 					 * failure.  Clustering to provide
13600Sstevel@tonic-gate 					 * interface in the future.  This
13610Sstevel@tonic-gate 					 * should stop a never-ending set of
13620Sstevel@tonic-gate 					 * 205 reconfig cycles.
13630Sstevel@tonic-gate 					 * Remote host causing failure is
13640Sstevel@tonic-gate 					 * stored in ep->host if ep is an
13650Sstevel@tonic-gate 					 * RPC error.
13660Sstevel@tonic-gate 					 * if (mdanyrpcerror(ep))
13670Sstevel@tonic-gate 					 * 	reboot (ep->host);
13680Sstevel@tonic-gate 					 */
13690Sstevel@tonic-gate 					md_exit(local_sp, 205);
13700Sstevel@tonic-gate 				} else {
13710Sstevel@tonic-gate 					md_exit(local_sp, 1);
13720Sstevel@tonic-gate 				}
13730Sstevel@tonic-gate 			}
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 			/* Reget sd since sync_user_recs may have flushed it */
13760Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
13770Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
13780Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
13790Sstevel@tonic-gate 				md_exit(local_sp, 1);
13800Sstevel@tonic-gate 			}
13810Sstevel@tonic-gate 
13820Sstevel@tonic-gate 			dd = metaget_drivedesc(sp,
13830Sstevel@tonic-gate 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
13840Sstevel@tonic-gate 			if (! mdisok(ep)) {
13850Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
13860Sstevel@tonic-gate 				    "%s drive information"), sp->setname);
13870Sstevel@tonic-gate 				md_exit(local_sp, 1);
13880Sstevel@tonic-gate 			}
13890Sstevel@tonic-gate 
13900Sstevel@tonic-gate 			/*
13910Sstevel@tonic-gate 			 * No drives in set, continue to next set.
13920Sstevel@tonic-gate 			 */
13930Sstevel@tonic-gate 			if (dd == NULL) {
13940Sstevel@tonic-gate 				/* Done with this set */
13950Sstevel@tonic-gate 				continue;
13960Sstevel@tonic-gate 			}
13970Sstevel@tonic-gate 
13980Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
13990Sstevel@tonic-gate 			    "records completed for set %s: %s"), sp->setname,
14000Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
14010Sstevel@tonic-gate 
14020Sstevel@tonic-gate 			/*
14030Sstevel@tonic-gate 			 * Synchronize the diskset mddbs for hosts
14040Sstevel@tonic-gate 			 * that are members.  This may involve
14050Sstevel@tonic-gate 			 * playing the changelog and writing out
14060Sstevel@tonic-gate 			 * to the diskset mddbs.
14070Sstevel@tonic-gate 			 */
14080Sstevel@tonic-gate 			rval = meta_mnsync_diskset_mddbs(sp, ep);
14090Sstevel@tonic-gate 			if (rval != 0) {
14100Sstevel@tonic-gate 				mde_perror(ep, gettext(
14110Sstevel@tonic-gate 				    "Synchronization of diskset mddbs "
14120Sstevel@tonic-gate 				    "in set %s failed\n"), sp->setname);
14130Sstevel@tonic-gate 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
14140Sstevel@tonic-gate 				    "mddb synchronization failed for "
14150Sstevel@tonic-gate 				    "set %s: %s"), sp->setname,
14160Sstevel@tonic-gate 				    meta_print_hrtime(gethrtime() -
14170Sstevel@tonic-gate 				    start_time));
14180Sstevel@tonic-gate 				if (rval == 205) {
14190Sstevel@tonic-gate 					/*
14200Sstevel@tonic-gate 					 * NOTE: Should issue call to reboot
14210Sstevel@tonic-gate 					 * remote host that is causing the RPC
14220Sstevel@tonic-gate 					 * failure.  Clustering to provide
14230Sstevel@tonic-gate 					 * interface in the future.  This
14240Sstevel@tonic-gate 					 * should stop a never-ending set of
14250Sstevel@tonic-gate 					 * 205 reconfig cycles.
14260Sstevel@tonic-gate 					 * Remote host causing failure is
14270Sstevel@tonic-gate 					 * stored in ep->host if ep is an
14280Sstevel@tonic-gate 					 * RPC error.
14290Sstevel@tonic-gate 					 * if (mdanyrpcerror(ep))
14300Sstevel@tonic-gate 					 * 	reboot (ep->host);
14310Sstevel@tonic-gate 					 */
14320Sstevel@tonic-gate 					md_exit(local_sp, 205);
14330Sstevel@tonic-gate 				} else if (rval == 1) {
14340Sstevel@tonic-gate 					continue;
14350Sstevel@tonic-gate 				} else {
14360Sstevel@tonic-gate 					md_exit(local_sp, 1);
14370Sstevel@tonic-gate 				}
14380Sstevel@tonic-gate 			}
14390Sstevel@tonic-gate 
14400Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
14410Sstevel@tonic-gate 			    "synchronization completed for set %s: %s"),
14420Sstevel@tonic-gate 			    sp->setname,
14430Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
14440Sstevel@tonic-gate 
14450Sstevel@tonic-gate 			/* Join the starting nodes to the diskset */
14460Sstevel@tonic-gate 			rval = meta_mnjoin_all(sp, ep);
14470Sstevel@tonic-gate 			if (rval != 0) {
14480Sstevel@tonic-gate 				mde_perror(ep, gettext(
14490Sstevel@tonic-gate 				    "Join of non-owner (starting) nodes "
14500Sstevel@tonic-gate 				    "in set %s failed\n"), sp->setname);
14510Sstevel@tonic-gate 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
14520Sstevel@tonic-gate 				    "nodes joined for set %s: %s"),
14530Sstevel@tonic-gate 				    sp->setname,
14540Sstevel@tonic-gate 				    meta_print_hrtime(gethrtime() -
14550Sstevel@tonic-gate 				    start_time));
14560Sstevel@tonic-gate 				if (rval == 205) {
14570Sstevel@tonic-gate 					/*
14580Sstevel@tonic-gate 					 * NOTE: Should issue call to reboot
14590Sstevel@tonic-gate 					 * remote host that is causing the RPC
14600Sstevel@tonic-gate 					 * failure.  Clustering to provide
14610Sstevel@tonic-gate 					 * interface in the future.  This
14620Sstevel@tonic-gate 					 * should stop a never-ending set of
14630Sstevel@tonic-gate 					 * 205 reconfig cycles.
14640Sstevel@tonic-gate 					 * Remote host causing failure is
14650Sstevel@tonic-gate 					 * stored in ep->host if ep is an
14660Sstevel@tonic-gate 					 * RPC error.
14670Sstevel@tonic-gate 					 * if (mdanyrpcerror(ep))
14680Sstevel@tonic-gate 					 * 	reboot (ep->host);
14690Sstevel@tonic-gate 					 */
14700Sstevel@tonic-gate 					md_exit(local_sp, 205);
14710Sstevel@tonic-gate 				} else {
14720Sstevel@tonic-gate 					md_exit(local_sp, 1);
14730Sstevel@tonic-gate 				}
14740Sstevel@tonic-gate 			}
14750Sstevel@tonic-gate 
14760Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
14770Sstevel@tonic-gate 			    "joined for set %s: %s"), sp->setname,
14780Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
14790Sstevel@tonic-gate 
14800Sstevel@tonic-gate 		}
14810Sstevel@tonic-gate 
14820Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
14830Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
14840Sstevel@tonic-gate 
14850Sstevel@tonic-gate 		break;
14860Sstevel@tonic-gate 
14870Sstevel@tonic-gate 	case MC_STEP3:
14880Sstevel@tonic-gate 		/*
14890Sstevel@tonic-gate 		 * Step 3
14900Sstevel@tonic-gate 		 *
14910Sstevel@tonic-gate 		 * For all multinode sets do,
14920Sstevel@tonic-gate 		 * - Reinitialise rpc.mdcommd
14930Sstevel@tonic-gate 		 * - Reset mirror owners to null if the current owner is
14940Sstevel@tonic-gate 		 *   no longer in the membership list
14950Sstevel@tonic-gate 		 */
14960Sstevel@tonic-gate 
14970Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
14980Sstevel@tonic-gate 		if (argc < 1)
14990Sstevel@tonic-gate 			usage(sp, 1);
15000Sstevel@tonic-gate 
15010Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
15020Sstevel@tonic-gate 		    meta_print_hrtime(0));
15030Sstevel@tonic-gate 
15040Sstevel@tonic-gate 		/*
15050Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
15060Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
15070Sstevel@tonic-gate 		 * the local set cannot be started.
15080Sstevel@tonic-gate 		 */
15090Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
15100Sstevel@tonic-gate 			md_exit(local_sp, 0);
15110Sstevel@tonic-gate 		}
15120Sstevel@tonic-gate 
15130Sstevel@tonic-gate 		/*
15140Sstevel@tonic-gate 		 * walk through all sets on this node which could include:
15150Sstevel@tonic-gate 		 *	- MN disksets
15160Sstevel@tonic-gate 		 *	- traditional disksets
15170Sstevel@tonic-gate 		 *	- non-existent disksets
15180Sstevel@tonic-gate 		 * reinit rpc.mdcommd and reset mirror owners for all MN sets
15190Sstevel@tonic-gate 		 */
15200Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
15210Sstevel@tonic-gate 			mde_perror(ep, "");
15220Sstevel@tonic-gate 			md_exit(local_sp, 1);
15230Sstevel@tonic-gate 		}
15240Sstevel@tonic-gate 
15250Sstevel@tonic-gate 		/* start walking through all possible disksets */
15260Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
15270Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
15280Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
15290Sstevel@tonic-gate 					/* No set for this setno - continue */
15300Sstevel@tonic-gate 					mdclrerror(ep);
15310Sstevel@tonic-gate 					continue;
15320Sstevel@tonic-gate 				} else {
15330Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
15340Sstevel@tonic-gate 					    "get set %d information"), setno);
15350Sstevel@tonic-gate 					md_exit(local_sp, 1);
15360Sstevel@tonic-gate 				}
15370Sstevel@tonic-gate 			}
15380Sstevel@tonic-gate 
15390Sstevel@tonic-gate 			/* only check multi-node disksets */
15400Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
15410Sstevel@tonic-gate 				mdclrerror(ep);
15420Sstevel@tonic-gate 				continue;
15430Sstevel@tonic-gate 			}
15440Sstevel@tonic-gate 
15450Sstevel@tonic-gate 			if (meta_lock(sp, TRUE, ep) != 0) {
15460Sstevel@tonic-gate 				mde_perror(ep, "");
15470Sstevel@tonic-gate 				md_exit(local_sp, 1);
15480Sstevel@tonic-gate 			}
15490Sstevel@tonic-gate 
15500Sstevel@tonic-gate 			/* If this node isn't joined to set, do nothing */
15510Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
15520Sstevel@tonic-gate 				if (!mdisok(ep)) {
15530Sstevel@tonic-gate 					mde_perror(ep, gettext("Could "
15540Sstevel@tonic-gate 					    "not get set %s ownership"),
15550Sstevel@tonic-gate 					    sp->setname);
15560Sstevel@tonic-gate 					md_exit(sp, 1);
15570Sstevel@tonic-gate 				}
15580Sstevel@tonic-gate 				mdclrerror(ep);
15590Sstevel@tonic-gate 				meta_unlock(sp, ep);
15600Sstevel@tonic-gate 				continue;
15610Sstevel@tonic-gate 			}
15620Sstevel@tonic-gate 
15630Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
15640Sstevel@tonic-gate 			    "re-initialising rpc.mdcommd and resetting mirror "
15650Sstevel@tonic-gate 			    "owners for set %s: %s"), sp->setname,
15660Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
15670Sstevel@tonic-gate 
15680Sstevel@tonic-gate 			/* reinitialise rpc.mdcommd with new nodelist */
15690Sstevel@tonic-gate 			if (mdmn_reinit_set(setno)) {
15700Sstevel@tonic-gate 				md_eprintf(gettext(
15710Sstevel@tonic-gate 				    "Could not re-initialise rpc.mdcommd for "
15720Sstevel@tonic-gate 				    "set %s\n"), sp->setname);
15730Sstevel@tonic-gate 				md_exit(sp, 1);
15740Sstevel@tonic-gate 			}
15750Sstevel@tonic-gate 
15760Sstevel@tonic-gate 			(void) memset(&cfg, 0, sizeof (cfg));
15770Sstevel@tonic-gate 			cfg.c_id = 0;
15780Sstevel@tonic-gate 			cfg.c_setno = sp->setno;
15790Sstevel@tonic-gate 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
15800Sstevel@tonic-gate 			    NULL) != 0) {
15810Sstevel@tonic-gate 				mdstealerror(ep, &cfg.c_mde);
15820Sstevel@tonic-gate 				mde_perror(ep, gettext("Could "
15830Sstevel@tonic-gate 				    "not get set %s information"),
15840Sstevel@tonic-gate 				    sp->setname);
15850Sstevel@tonic-gate 				md_exit(sp, 1);
15860Sstevel@tonic-gate 			}
15870Sstevel@tonic-gate 
15880Sstevel@tonic-gate 			/* Don't do anything else if set is stale */
15890Sstevel@tonic-gate 			if (cfg.c_flags & MDDB_C_STALE) {
15900Sstevel@tonic-gate 				meta_unlock(sp, ep);
15910Sstevel@tonic-gate 				mdclrerror(ep);
15920Sstevel@tonic-gate 				continue;
15930Sstevel@tonic-gate 			}
15940Sstevel@tonic-gate 
15950Sstevel@tonic-gate 			/* reset mirror owners */
15960Sstevel@tonic-gate 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
15970Sstevel@tonic-gate 				md_exit(sp, 1);
15980Sstevel@tonic-gate 			}
15990Sstevel@tonic-gate 
16000Sstevel@tonic-gate 			meta_unlock(sp, ep);
16010Sstevel@tonic-gate 
16020Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
16030Sstevel@tonic-gate 			    "re-initialised and mirror owners reset for "
16040Sstevel@tonic-gate 			    "set %s: %s"), sp->setname,
16050Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
16060Sstevel@tonic-gate 		}
16070Sstevel@tonic-gate 
16080Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
16090Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
16100Sstevel@tonic-gate 
16110Sstevel@tonic-gate 		break;
16120Sstevel@tonic-gate 
16130Sstevel@tonic-gate 	case MC_STEP4:
16140Sstevel@tonic-gate 		/*
16150Sstevel@tonic-gate 		 * Step 4
16160Sstevel@tonic-gate 		 *
16170Sstevel@tonic-gate 		 * For all multinode sets do:
16180Sstevel@tonic-gate 		 * - Resume the rpc.mdcommd messages.  Must resume all
16190Sstevel@tonic-gate 		 *	sets before issuing I/O to any set since an error
16200Sstevel@tonic-gate 		 * 	encountered in a commd suspended set could be
16210Sstevel@tonic-gate 		 *	blocked waiting for commd in another set to resume.
16220Sstevel@tonic-gate 		 *	(This happens since the daemon queues service
16230Sstevel@tonic-gate 		 *	all sets).  An open of a soft partition causes
16240Sstevel@tonic-gate 		 *	a read of the watermarks during the open.
16250Sstevel@tonic-gate 		 * - If set is non-writable (not an owner or STALE), then
16260Sstevel@tonic-gate 		 *	continue to next set.
16270Sstevel@tonic-gate 		 *
16280Sstevel@tonic-gate 		 * For all multinode sets do,
16290Sstevel@tonic-gate 		 * - Reset ABR states for all mirrors, ie clear ABR if not
16300Sstevel@tonic-gate 		 *	open on any node.
16310Sstevel@tonic-gate 		 * - Reset ABR states for all soft partitions, ie clear ABR if
16320Sstevel@tonic-gate 		 *	not open on any node.
16330Sstevel@tonic-gate 		 * - For all slave nodes that have entered through the start
16340Sstevel@tonic-gate 		 *	step, update the ABR state to that of the master and
16350Sstevel@tonic-gate 		 *	get the submirror state from the master
16360Sstevel@tonic-gate 		 * - meta_lock set
16370Sstevel@tonic-gate 		 * - Resync all mirrors
16380Sstevel@tonic-gate 		 * - unlock meta_lock for this set.
16390Sstevel@tonic-gate 		 * - Choose a new owner for any orphaned resyncs
16400Sstevel@tonic-gate 		 *
16410Sstevel@tonic-gate 		 * There is one potential issue here when concurrently
16420Sstevel@tonic-gate 		 * resetting and updating the ABR state. If the master has ABR
16430Sstevel@tonic-gate 		 * set, but should no longer have because the only node that
16440Sstevel@tonic-gate 		 * had the metadevice open and had ABR set has panicked, the
16450Sstevel@tonic-gate 		 * master will send a message to all nodes to clear the ABR
16460Sstevel@tonic-gate 		 * state. Meanwhile any node that has come through the
16470Sstevel@tonic-gate 		 * start step will get tstate from the master and will update
16480Sstevel@tonic-gate 		 * ABR if it was set in tstate. So, we appear to have a problem
16490Sstevel@tonic-gate 		 * if the following sequence occurs:-
16500Sstevel@tonic-gate 		 * - The slave gets tstate with ABR set
16510Sstevel@tonic-gate 		 * - The master sends a message to clear ABR
16520Sstevel@tonic-gate 		 * - The slave updates ABR with the value it got from tstate.
16530Sstevel@tonic-gate 		 * We now have the master with ABR clear and the slave with ABR
16540Sstevel@tonic-gate 		 * set. Fortunately, having set ABR, the slave will close the
16550Sstevel@tonic-gate 		 * metadevice after setting ABR and as there are no nodes with
16560Sstevel@tonic-gate 		 * the device open, the close will send a message to clear ABR
16570Sstevel@tonic-gate 		 * on all nodes. So, the nodes will all have ABR unset.
16580Sstevel@tonic-gate 		 */
16590Sstevel@tonic-gate 
16600Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
16610Sstevel@tonic-gate 		if (argc < 1)
16620Sstevel@tonic-gate 			usage(sp, 1);
16630Sstevel@tonic-gate 
16640Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
16650Sstevel@tonic-gate 		    meta_print_hrtime(0));
16660Sstevel@tonic-gate 
16670Sstevel@tonic-gate 		/*
16680Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
16690Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
16700Sstevel@tonic-gate 		 * the local set cannot be started.
16710Sstevel@tonic-gate 		 */
16720Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
16730Sstevel@tonic-gate 			md_exit(local_sp, 0);
16740Sstevel@tonic-gate 		}
16750Sstevel@tonic-gate 
16760Sstevel@tonic-gate 		/*
16770Sstevel@tonic-gate 		 * walk through all sets on this node which could include:
16780Sstevel@tonic-gate 		 *	- MN disksets
16790Sstevel@tonic-gate 		 *	- traditional disksets
16800Sstevel@tonic-gate 		 *	- non-existent disksets
16810Sstevel@tonic-gate 		 * start mirror resync for all MN sets
16820Sstevel@tonic-gate 		 */
16830Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
16840Sstevel@tonic-gate 			mde_perror(ep, "");
16850Sstevel@tonic-gate 			md_exit(local_sp, 1);
16860Sstevel@tonic-gate 		}
16870Sstevel@tonic-gate 
16880Sstevel@tonic-gate 		/* Clear set_info structure */
16890Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
16900Sstevel@tonic-gate 			set_info[setno] = 0;
16910Sstevel@tonic-gate 		}
16920Sstevel@tonic-gate 
16930Sstevel@tonic-gate 		/* start walking through all possible disksets */
16940Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
16950Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
16960Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
16970Sstevel@tonic-gate 					/* No set for this setno - continue */
16980Sstevel@tonic-gate 					mdclrerror(ep);
16990Sstevel@tonic-gate 					continue;
17000Sstevel@tonic-gate 				} else {
17010Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
17020Sstevel@tonic-gate 					    "get set %d information"), setno);
17030Sstevel@tonic-gate 					md_exit(local_sp, 1);
17040Sstevel@tonic-gate 				}
17050Sstevel@tonic-gate 			}
17060Sstevel@tonic-gate 
17070Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
17080Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
17090Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
17100Sstevel@tonic-gate 				mdclrerror(ep);
17110Sstevel@tonic-gate 				continue;
17120Sstevel@tonic-gate 			}
17130Sstevel@tonic-gate 
17140Sstevel@tonic-gate 			/* only check multi-node disksets */
17150Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
17160Sstevel@tonic-gate 				mdclrerror(ep);
17170Sstevel@tonic-gate 				continue;
17180Sstevel@tonic-gate 			}
17190Sstevel@tonic-gate 
17200Sstevel@tonic-gate 			set_info[setno] |= SET_INFO_MN;
17210Sstevel@tonic-gate 
17220Sstevel@tonic-gate 			/*
17230Sstevel@tonic-gate 			 * If not an owner (all mddbs failed) or stale
17240Sstevel@tonic-gate 			 * (< 50% mddbs operational), then set is
17250Sstevel@tonic-gate 			 * non-writable so just resume commd and
17260Sstevel@tonic-gate 			 * unblock mddb messages.
17270Sstevel@tonic-gate 			 */
17280Sstevel@tonic-gate 			mdclrerror(ep);
17290Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
17300Sstevel@tonic-gate 				set_info[setno] |= SET_INFO_NO_WR;
17310Sstevel@tonic-gate 			}
17320Sstevel@tonic-gate 			if (!mdisok(ep)) {
17330Sstevel@tonic-gate 				mde_perror(ep, gettext("Could "
17340Sstevel@tonic-gate 				    "not get set %s ownership"),
17350Sstevel@tonic-gate 				    sp->setname);
17360Sstevel@tonic-gate 				md_exit(local_sp, 1);
17370Sstevel@tonic-gate 			}
17380Sstevel@tonic-gate 			/* Set is owned - is it stale? */
17390Sstevel@tonic-gate 			if (!set_info[setno] & SET_INFO_NO_WR) {
17400Sstevel@tonic-gate 				(void) memset(&cfg, 0, sizeof (cfg));
17410Sstevel@tonic-gate 				cfg.c_id = 0;
17420Sstevel@tonic-gate 				cfg.c_setno = sp->setno;
17430Sstevel@tonic-gate 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
17440Sstevel@tonic-gate 				    NULL) != 0) {
17450Sstevel@tonic-gate 					mdstealerror(ep, &cfg.c_mde);
17460Sstevel@tonic-gate 					mde_perror(ep, gettext("Could "
17470Sstevel@tonic-gate 					    "not get set %s information"),
17480Sstevel@tonic-gate 					    sp->setname);
17490Sstevel@tonic-gate 					md_exit(local_sp, 1);
17500Sstevel@tonic-gate 				}
17510Sstevel@tonic-gate 				if (cfg.c_flags & MDDB_C_STALE) {
17520Sstevel@tonic-gate 					set_info[setno] |= SET_INFO_NO_WR;
17530Sstevel@tonic-gate 				}
17540Sstevel@tonic-gate 			}
17550Sstevel@tonic-gate 
17560Sstevel@tonic-gate 			/* resume rpc.mdcommd */
17570Sstevel@tonic-gate 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) {
17580Sstevel@tonic-gate 				md_eprintf(gettext("Unable to resume "
17590Sstevel@tonic-gate 				    "rpc.mdcommd for set %s\n"), sp->setname);
17600Sstevel@tonic-gate 				md_exit(local_sp, 1);
17610Sstevel@tonic-gate 			}
17620Sstevel@tonic-gate 			meta_ping_mnset(setno);
17630Sstevel@tonic-gate 
17640Sstevel@tonic-gate 			/* Unblock mddb parse messages */
17650Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
17660Sstevel@tonic-gate 				(void) memset(&mbp, 0, sizeof (mbp));
17670Sstevel@tonic-gate 				mbp.c_setno = setno;
17680Sstevel@tonic-gate 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
17690Sstevel@tonic-gate 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
17700Sstevel@tonic-gate 				    &mbp.c_mde, NULL)) {
17710Sstevel@tonic-gate 					mdstealerror(ep, &mbp.c_mde);
17720Sstevel@tonic-gate 					mde_perror(ep, gettext("Could not "
17730Sstevel@tonic-gate 					    "unblock set %s"), sp->setname);
17740Sstevel@tonic-gate 					md_exit(local_sp, 1);
17750Sstevel@tonic-gate 				}
17760Sstevel@tonic-gate 			}
17770Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
17780Sstevel@tonic-gate 			    "resumed and messages unblocked for set %s: %s"),
17790Sstevel@tonic-gate 			    sp->setname,
17800Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
17810Sstevel@tonic-gate 		}
17820Sstevel@tonic-gate 
17830Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
17840Sstevel@tonic-gate 			int			start_step;
17850Sstevel@tonic-gate 
17860Sstevel@tonic-gate 			/* Skip traditional disksets. */
17870Sstevel@tonic-gate 			if ((set_info[setno] & SET_INFO_MN) == 0)
17880Sstevel@tonic-gate 				continue;
17890Sstevel@tonic-gate 
17900Sstevel@tonic-gate 			/*
17910Sstevel@tonic-gate 			 * If already determined that this set is
17920Sstevel@tonic-gate 			 * a non-writable set, then just continue
17930Sstevel@tonic-gate 			 * to next set since there's nothing else
17940Sstevel@tonic-gate 			 * to do for a non-writable set.
17950Sstevel@tonic-gate 			 */
17960Sstevel@tonic-gate 			if (set_info[setno] & SET_INFO_NO_WR)
17970Sstevel@tonic-gate 				continue;
17980Sstevel@tonic-gate 
17990Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
18000Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
18010Sstevel@tonic-gate 					/* No set for this setno - continue */
18020Sstevel@tonic-gate 					mdclrerror(ep);
18030Sstevel@tonic-gate 					continue;
18040Sstevel@tonic-gate 				} else {
18050Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
18060Sstevel@tonic-gate 					    "get set %d information"), setno);
18070Sstevel@tonic-gate 					md_exit(local_sp, 1);
18080Sstevel@tonic-gate 				}
18090Sstevel@tonic-gate 			}
18100Sstevel@tonic-gate 
18110Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
18120Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
18130Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
18140Sstevel@tonic-gate 				mdclrerror(ep);
18150Sstevel@tonic-gate 				continue;
18160Sstevel@tonic-gate 			}
18170Sstevel@tonic-gate 
18180Sstevel@tonic-gate 			/* See if this node came through the start step */
18190Sstevel@tonic-gate 			(void) memset(&sf, 0, sizeof (sf));
18200Sstevel@tonic-gate 			sf.sf_setno = sp->setno;
18210Sstevel@tonic-gate 			sf.sf_flags = MDDB_NM_GET;
18220Sstevel@tonic-gate 			/* Use magic to help protect ioctl against attack. */
18230Sstevel@tonic-gate 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
18240Sstevel@tonic-gate 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
18250Sstevel@tonic-gate 			    &sf.sf_mde, NULL)) {
18260Sstevel@tonic-gate 				mdstealerror(ep, &sf.sf_mde);
18270Sstevel@tonic-gate 				mde_perror(ep, gettext("Could not get "
18280Sstevel@tonic-gate 				    "start_step flag for set %s"), sp->setname);
18290Sstevel@tonic-gate 				md_exit(local_sp, 1);
18300Sstevel@tonic-gate 			}
18310Sstevel@tonic-gate 			start_step =
18320Sstevel@tonic-gate 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
18330Sstevel@tonic-gate 
18340Sstevel@tonic-gate 			/*
18350Sstevel@tonic-gate 			 * We can now reset the start_step flag for the set
18360Sstevel@tonic-gate 			 * if it was already set.
18370Sstevel@tonic-gate 			 */
18380Sstevel@tonic-gate 			if (start_step) {
18390Sstevel@tonic-gate 				(void) memset(&sf, 0, sizeof (sf));
18400Sstevel@tonic-gate 					sf.sf_setno = sp->setno;
18410Sstevel@tonic-gate 				sf.sf_setflags = MD_SET_MN_START_RC;
18420Sstevel@tonic-gate 				sf.sf_flags = MDDB_NM_RESET;
18430Sstevel@tonic-gate 				/*
18440Sstevel@tonic-gate 				 * Use magic to help protect ioctl
18450Sstevel@tonic-gate 				 * against attack.
18460Sstevel@tonic-gate 				 */
18470Sstevel@tonic-gate 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
18480Sstevel@tonic-gate 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
18490Sstevel@tonic-gate 				    &sf.sf_mde, NULL)) {
18500Sstevel@tonic-gate 					mdstealerror(ep, &sf.sf_mde);
18510Sstevel@tonic-gate 					mde_perror(ep,
18520Sstevel@tonic-gate 					    gettext("Could not reset "
18530Sstevel@tonic-gate 					    "start_step flag for set %s"),
18540Sstevel@tonic-gate 					    sp->setname);
18550Sstevel@tonic-gate 				}
18560Sstevel@tonic-gate 			}
18570Sstevel@tonic-gate 
18580Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
18590Sstevel@tonic-gate 			    "ABR state and restarting io's for "
18600Sstevel@tonic-gate 			    "set %s: %s"), sp->setname,
18610Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
18620Sstevel@tonic-gate 
18630Sstevel@tonic-gate 
18640Sstevel@tonic-gate 			/*
18650Sstevel@tonic-gate 			 * If we are not the master and we have come through
18660Sstevel@tonic-gate 			 * the start step, we must update the ABR states
18670Sstevel@tonic-gate 			 * for mirrors and soft partitions. Also the submirror
18680Sstevel@tonic-gate 			 * states need to be synchronised so that we see the
18690Sstevel@tonic-gate 			 * same status as other previously joined members.
18700Sstevel@tonic-gate 			 * This _must_ be done before starting the resync.
18710Sstevel@tonic-gate 			 */
18720Sstevel@tonic-gate 			if (!(sd->sd_mn_am_i_master) && start_step) {
18730Sstevel@tonic-gate 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
18740Sstevel@tonic-gate 				    ep) == -1) {
18750Sstevel@tonic-gate 					md_exit(local_sp, 1);
18760Sstevel@tonic-gate 				}
18770Sstevel@tonic-gate 				if (reset_state(UPDATE_ABR, sp, MD_SP,
18780Sstevel@tonic-gate 				    ep) == -1) {
18790Sstevel@tonic-gate 					md_exit(local_sp, 1);
18800Sstevel@tonic-gate 				}
18810Sstevel@tonic-gate 				/*
18820Sstevel@tonic-gate 				 * Mark the fact that we've got the mirror
18830Sstevel@tonic-gate 				 * state. This allows the resync thread to
18840Sstevel@tonic-gate 				 * determine if _it_ needs to issue this. This
18850Sstevel@tonic-gate 				 * can happen if a node is added to a set after
18860Sstevel@tonic-gate 				 * a reconfig cycle has completed.
18870Sstevel@tonic-gate 				 */
18880Sstevel@tonic-gate 				(void) memset(&sf, 0, sizeof (sf));
18890Sstevel@tonic-gate 					sf.sf_setno = sp->setno;
18900Sstevel@tonic-gate 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
18910Sstevel@tonic-gate 				sf.sf_flags = MDDB_NM_SET;
18920Sstevel@tonic-gate 				/*
18930Sstevel@tonic-gate 				 * Use magic to help protect ioctl
18940Sstevel@tonic-gate 				 * against attack.
18950Sstevel@tonic-gate 				 */
18960Sstevel@tonic-gate 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
18970Sstevel@tonic-gate 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
18980Sstevel@tonic-gate 				    &sf.sf_mde, NULL)) {
18990Sstevel@tonic-gate 					mdstealerror(ep, &sf.sf_mde);
19000Sstevel@tonic-gate 					mde_perror(ep,
19010Sstevel@tonic-gate 					    gettext("Could not set "
19020Sstevel@tonic-gate 					    "submirror state flag for set %s"),
19030Sstevel@tonic-gate 					    sp->setname);
19040Sstevel@tonic-gate 				}
19050Sstevel@tonic-gate 			}
19060Sstevel@tonic-gate 
19070Sstevel@tonic-gate 			/*
19080Sstevel@tonic-gate 			 * All remaining actions are only performed by the
19090Sstevel@tonic-gate 			 * master
19100Sstevel@tonic-gate 			 */
19110Sstevel@tonic-gate 			if (!(sd->sd_mn_am_i_master)) {
19120Sstevel@tonic-gate 				if (meta_lock(sp, TRUE, ep) != 0) {
19130Sstevel@tonic-gate 					mde_perror(ep, "");
19140Sstevel@tonic-gate 					md_exit(local_sp, 1);
19150Sstevel@tonic-gate 				}
19160Sstevel@tonic-gate 				meta_mirror_resync_unblock(sp);
19170Sstevel@tonic-gate 				meta_unlock(sp, ep);
19180Sstevel@tonic-gate 				continue;
19190Sstevel@tonic-gate 			}
19200Sstevel@tonic-gate 
19210Sstevel@tonic-gate 			/*
19220Sstevel@tonic-gate 			 * If the master came through the start step, this
19230Sstevel@tonic-gate 			 * implies that all of the nodes must have done the
19240Sstevel@tonic-gate 			 * same and hence there can be no applications
19250Sstevel@tonic-gate 			 * running. Hence no need to reset ABR
19260Sstevel@tonic-gate 			 */
19270Sstevel@tonic-gate 			if (!start_step) {
19280Sstevel@tonic-gate 				/* Reset ABR state for mirrors */
19290Sstevel@tonic-gate 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
19300Sstevel@tonic-gate 				    ep) == -1) {
19310Sstevel@tonic-gate 					md_exit(local_sp, 1);
19320Sstevel@tonic-gate 				}
19330Sstevel@tonic-gate 				/* ...and now the same for soft partitions */
19340Sstevel@tonic-gate 				if (reset_state(RESET_ABR, sp, MD_SP,
19350Sstevel@tonic-gate 				    ep) == -1) {
19360Sstevel@tonic-gate 					md_exit(local_sp, 1);
19370Sstevel@tonic-gate 				}
19380Sstevel@tonic-gate 			}
19390Sstevel@tonic-gate 
19400Sstevel@tonic-gate 			/*
19410Sstevel@tonic-gate 			 * choose owners for orphaned resyncs and reset
19420Sstevel@tonic-gate 			 * non-orphaned resyncs so that an owner node that
19430Sstevel@tonic-gate 			 * reboots will restart the resync if needed.
19440Sstevel@tonic-gate 			 */
19450Sstevel@tonic-gate 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
19460Sstevel@tonic-gate 				md_exit(local_sp, 1);
19470Sstevel@tonic-gate 
19480Sstevel@tonic-gate 			/*
19490Sstevel@tonic-gate 			 * Must unlock set lock before meta_mirror_resync_all
19500Sstevel@tonic-gate 			 * sends a message to run the metasync command
19510Sstevel@tonic-gate 			 * which also grabs the meta_lock.
19520Sstevel@tonic-gate 			 */
19530Sstevel@tonic-gate 			if (meta_lock(sp, TRUE, ep) != 0) {
19540Sstevel@tonic-gate 				mde_perror(ep, "");
19550Sstevel@tonic-gate 				md_exit(local_sp, 1);
19560Sstevel@tonic-gate 			}
19570Sstevel@tonic-gate 			meta_mirror_resync_unblock(sp);
19580Sstevel@tonic-gate 			meta_unlock(sp, ep);
19590Sstevel@tonic-gate 
19600Sstevel@tonic-gate 			/* resync all mirrors in set */
19610Sstevel@tonic-gate 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
19620Sstevel@tonic-gate 				mde_perror(ep, gettext("Mirror resyncs "
19630Sstevel@tonic-gate 				    "failed for set %s"), sp->setname);
19640Sstevel@tonic-gate 				md_exit(local_sp, 1);
19650Sstevel@tonic-gate 			}
19660Sstevel@tonic-gate 
19670Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
19680Sstevel@tonic-gate 			    "for set %s: %s"), sp->setname,
19690Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
19700Sstevel@tonic-gate 		}
19710Sstevel@tonic-gate 
19720Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
19730Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
19740Sstevel@tonic-gate 
19750Sstevel@tonic-gate 		break;
19760Sstevel@tonic-gate 
19770Sstevel@tonic-gate 	default:
19780Sstevel@tonic-gate 		usage(sp, 1);
19790Sstevel@tonic-gate 		break;
19800Sstevel@tonic-gate 	}
19810Sstevel@tonic-gate 
19820Sstevel@tonic-gate 	md_exit(sp, 0);
19830Sstevel@tonic-gate 	/* NOTREACHED */
19840Sstevel@tonic-gate 	return (0);
19850Sstevel@tonic-gate }
1986