1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate 
23*0Sstevel@tonic-gate /*
24*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25*0Sstevel@tonic-gate  * Use is subject to license terms.
26*0Sstevel@tonic-gate  */
27*0Sstevel@tonic-gate 
28*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
29*0Sstevel@tonic-gate 
30*0Sstevel@tonic-gate #include <meta.h>
31*0Sstevel@tonic-gate #include <sdssc.h>
32*0Sstevel@tonic-gate #include <signal.h>
33*0Sstevel@tonic-gate #include <syslog.h>
34*0Sstevel@tonic-gate #include <sys/types.h>
35*0Sstevel@tonic-gate #include <sys/wait.h>
36*0Sstevel@tonic-gate #include <sys/lvm/md_mirror.h>
37*0Sstevel@tonic-gate #include <metad.h>
38*0Sstevel@tonic-gate 
39*0Sstevel@tonic-gate #define	MY_VERSION		"1.0"	/* the highest supported version */
40*0Sstevel@tonic-gate #define	MAX_DEBUG_LEVEL		5	/* maximum verbosity level */
41*0Sstevel@tonic-gate 
42*0Sstevel@tonic-gate #define	RESET_OWNER		0x0001
43*0Sstevel@tonic-gate #define	CHOOSE_OWNER		0x0002
44*0Sstevel@tonic-gate #define	RESET_ABR		0x0004
45*0Sstevel@tonic-gate #define	UPDATE_ABR		0x0008
46*0Sstevel@tonic-gate #define	GET_MIRROR_STATE	0x0010
47*0Sstevel@tonic-gate 
48*0Sstevel@tonic-gate #define	SET_INFO_NO_WR	0x0002
49*0Sstevel@tonic-gate #define	SET_INFO_MN	0x0004
50*0Sstevel@tonic-gate 
/*
 * Reconfiguration steps understood by metaclust.
 *
 * MC_UNK (0) is the "no step known" value that the stepnum global starts
 * out with; the remaining steps are numbered consecutively.
 */
typedef enum stpnum {
	MC_UNK		= 0,
	MC_START	= 1,
	MC_STOP		= 2,
	MC_ABORT	= 3,
	MC_RETURN	= 4,
	MC_STEP1	= 5,
	MC_STEP2	= 6,
	MC_STEP3	= 7,
	MC_STEP4	= 8
} stepnum_t;
65*0Sstevel@tonic-gate 
66*0Sstevel@tonic-gate /*
67*0Sstevel@tonic-gate  * Structure for step_name -> step_number mapping
68*0Sstevel@tonic-gate  */
69*0Sstevel@tonic-gate struct step_t {
70*0Sstevel@tonic-gate 	char		*step_nam;
71*0Sstevel@tonic-gate 	stepnum_t	step_num;
72*0Sstevel@tonic-gate };
73*0Sstevel@tonic-gate 
74*0Sstevel@tonic-gate /*
75*0Sstevel@tonic-gate  * Step name to step number mapping table
76*0Sstevel@tonic-gate  * This table MUST be sorted alphabetically in ascending order of step name
77*0Sstevel@tonic-gate  */
78*0Sstevel@tonic-gate static struct step_t step_table[] = {
79*0Sstevel@tonic-gate 	{ "abort",	MC_ABORT },
80*0Sstevel@tonic-gate 	{ "return",	MC_RETURN },
81*0Sstevel@tonic-gate 	{ "start",	MC_START },
82*0Sstevel@tonic-gate 	{ "step1",	MC_STEP1 },
83*0Sstevel@tonic-gate 	{ "step2",	MC_STEP2 },
84*0Sstevel@tonic-gate 	{ "step3",	MC_STEP3 },
85*0Sstevel@tonic-gate 	{ "step4",	MC_STEP4 },
86*0Sstevel@tonic-gate 	{ "stop",	MC_STOP }
87*0Sstevel@tonic-gate };
88*0Sstevel@tonic-gate 
89*0Sstevel@tonic-gate /*
90*0Sstevel@tonic-gate  * If support for a different version is added, the new version number should
91*0Sstevel@tonic-gate  * be appended to the version_table below. This list will be searched to
92*0Sstevel@tonic-gate  * determine if a version requested via the -V option is supported or not.
93*0Sstevel@tonic-gate  */
94*0Sstevel@tonic-gate static char *version_table[] = {
95*0Sstevel@tonic-gate 	MY_VERSION
96*0Sstevel@tonic-gate };
97*0Sstevel@tonic-gate 
98*0Sstevel@tonic-gate uint_t	timeout = 0;			/* disable timeout by default */
99*0Sstevel@tonic-gate char	*version = MY_VERSION;		/* use latest version by default */
100*0Sstevel@tonic-gate int	stepnum = MC_UNK;		/* reconfiguration step number */
101*0Sstevel@tonic-gate pid_t	c_pid;				/* child process id */
102*0Sstevel@tonic-gate 
103*0Sstevel@tonic-gate /*
104*0Sstevel@tonic-gate  * Binary search comparison routine
105*0Sstevel@tonic-gate  */
106*0Sstevel@tonic-gate static int
107*0Sstevel@tonic-gate mc_compare(const void *stp1, const void *stp2)
108*0Sstevel@tonic-gate {
109*0Sstevel@tonic-gate 	return (strcmp((const char *)stp1,
110*0Sstevel@tonic-gate 	    ((const struct step_t *)stp2)->step_nam));
111*0Sstevel@tonic-gate }
112*0Sstevel@tonic-gate 
113*0Sstevel@tonic-gate /*
114*0Sstevel@tonic-gate  * Timeout expiry alarm signal handler
115*0Sstevel@tonic-gate  */
116*0Sstevel@tonic-gate /*ARGSUSED*/
117*0Sstevel@tonic-gate static void
118*0Sstevel@tonic-gate sigalarmhandler(int sig)
119*0Sstevel@tonic-gate {
120*0Sstevel@tonic-gate 	int	i, n, ret, stat_loc = 0;
121*0Sstevel@tonic-gate 
122*0Sstevel@tonic-gate 	n = sizeof (step_table) / sizeof (step_table[0]);
123*0Sstevel@tonic-gate 	for (i = 0; i < n; i++) {
124*0Sstevel@tonic-gate 		if (stepnum == step_table[i].step_num)
125*0Sstevel@tonic-gate 			break;
126*0Sstevel@tonic-gate 	}
127*0Sstevel@tonic-gate 
128*0Sstevel@tonic-gate 	assert(i != n);
129*0Sstevel@tonic-gate 
130*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG1, gettext("Timeout expired in %s: %s"),
131*0Sstevel@tonic-gate 	    step_table[i].step_nam,
132*0Sstevel@tonic-gate 	    meta_print_hrtime(gethrtime() - start_time));
133*0Sstevel@tonic-gate 
134*0Sstevel@tonic-gate 	if ((ret = kill(c_pid, SIGKILL)) == 0) {
135*0Sstevel@tonic-gate 		/*
136*0Sstevel@tonic-gate 		 * The child will wait forever until the status is retrieved
137*0Sstevel@tonic-gate 		 * so get it now. Keep retrying if the call is interrupted.
138*0Sstevel@tonic-gate 		 *
139*0Sstevel@tonic-gate 		 * The possible results are,
140*0Sstevel@tonic-gate 		 *
141*0Sstevel@tonic-gate 		 *	- child killed successfully
142*0Sstevel@tonic-gate 		 *	- signal sent but child not killed
143*0Sstevel@tonic-gate 		 *	- waitpid failed/interrupted
144*0Sstevel@tonic-gate 		 */
145*0Sstevel@tonic-gate 		sleep(2);
146*0Sstevel@tonic-gate 		while ((ret = waitpid(c_pid, &stat_loc, WNOHANG)) < 0) {
147*0Sstevel@tonic-gate 			if (errno != EINTR) {
148*0Sstevel@tonic-gate 				break;
149*0Sstevel@tonic-gate 			}
150*0Sstevel@tonic-gate 		}
151*0Sstevel@tonic-gate 		if ((ret == c_pid) || (errno == ECHILD)) {
152*0Sstevel@tonic-gate 			ret = 0;
153*0Sstevel@tonic-gate 		} else {
154*0Sstevel@tonic-gate 			ret = 1;
155*0Sstevel@tonic-gate 		}
156*0Sstevel@tonic-gate 	} else if (errno == ESRCH) {
157*0Sstevel@tonic-gate 		/*
158*0Sstevel@tonic-gate 		 * If the kill did not catch the child then it means the child
159*0Sstevel@tonic-gate 		 * exited immediately after the timeout occured.
160*0Sstevel@tonic-gate 		 */
161*0Sstevel@tonic-gate 		ret = 0;
162*0Sstevel@tonic-gate 	}
163*0Sstevel@tonic-gate 
164*0Sstevel@tonic-gate 	/*
165*0Sstevel@tonic-gate 	 * make sure not to exit with 205 for any steps other than step1-step4.
166*0Sstevel@tonic-gate 	 * Suncluster reconfiguration can't handle it otherwise.
167*0Sstevel@tonic-gate 	 */
168*0Sstevel@tonic-gate 	switch (stepnum) {
169*0Sstevel@tonic-gate 	case MC_STEP1:
170*0Sstevel@tonic-gate 	case MC_STEP2:
171*0Sstevel@tonic-gate 	case MC_STEP3:
172*0Sstevel@tonic-gate 	case MC_STEP4:
173*0Sstevel@tonic-gate 		/*
174*0Sstevel@tonic-gate 		 * If the child was killed successfully return 205 for a
175*0Sstevel@tonic-gate 		 * new reconfig cycle otherwise send 1 to panic the node.
176*0Sstevel@tonic-gate 		 */
177*0Sstevel@tonic-gate 		if (ret != 0) {
178*0Sstevel@tonic-gate 			md_eprintf(gettext("Could not kill child\n"));
179*0Sstevel@tonic-gate 			exit(1);
180*0Sstevel@tonic-gate 		} else {
181*0Sstevel@tonic-gate 			exit(205);
182*0Sstevel@tonic-gate 		}
183*0Sstevel@tonic-gate 		break;
184*0Sstevel@tonic-gate 	case MC_START:
185*0Sstevel@tonic-gate 	case MC_STOP:
186*0Sstevel@tonic-gate 	case MC_ABORT:
187*0Sstevel@tonic-gate 	case MC_RETURN:
188*0Sstevel@tonic-gate 	default:
189*0Sstevel@tonic-gate 		exit(1);
190*0Sstevel@tonic-gate 		break;
191*0Sstevel@tonic-gate 	}
192*0Sstevel@tonic-gate }
193*0Sstevel@tonic-gate 
194*0Sstevel@tonic-gate /*
195*0Sstevel@tonic-gate  * Attempt to load local set.
196*0Sstevel@tonic-gate  * Returns:
197*0Sstevel@tonic-gate  *	pointer to mdsetname_t for local set (local_sp) is successful.
198*0Sstevel@tonic-gate  *	0 if failure
199*0Sstevel@tonic-gate  *		if there are no local set mddbs, no error message is printed.
200*0Sstevel@tonic-gate  *		Otherwise, error message is printed so that user
201*0Sstevel@tonic-gate  *		can determine why the local set didn't start.
202*0Sstevel@tonic-gate  */
203*0Sstevel@tonic-gate mdsetname_t *
204*0Sstevel@tonic-gate load_local_set(md_error_t *ep)
205*0Sstevel@tonic-gate {
206*0Sstevel@tonic-gate 	mdsetname_t	*local_sp = NULL;
207*0Sstevel@tonic-gate 
208*0Sstevel@tonic-gate 	/* Does local set exist? If not, give no error */
209*0Sstevel@tonic-gate 	if ((local_sp = metasetname(MD_LOCAL_NAME, ep)) == NULL) {
210*0Sstevel@tonic-gate 		return (0);
211*0Sstevel@tonic-gate 	}
212*0Sstevel@tonic-gate 
213*0Sstevel@tonic-gate 	/*
214*0Sstevel@tonic-gate 	 * snarf local set
215*0Sstevel@tonic-gate 	 * If fails with MDE_DB_NODB, then just return 1 printing
216*0Sstevel@tonic-gate 	 * no failure.
217*0Sstevel@tonic-gate 	 * Otherwise, print error message, and return 1.
218*0Sstevel@tonic-gate 	 */
219*0Sstevel@tonic-gate 	if (meta_setup_db_locations(ep) != 0) {
220*0Sstevel@tonic-gate 		if (!(mdismddberror(ep, MDE_DB_NODB)))
221*0Sstevel@tonic-gate 			mde_perror(ep, "");
222*0Sstevel@tonic-gate 		return (0);
223*0Sstevel@tonic-gate 	}
224*0Sstevel@tonic-gate 
225*0Sstevel@tonic-gate 	/* local set loaded successfully */
226*0Sstevel@tonic-gate 	return (local_sp);
227*0Sstevel@tonic-gate }
228*0Sstevel@tonic-gate 
229*0Sstevel@tonic-gate /*
230*0Sstevel@tonic-gate  * Purpose:	Compose a full path name for a metadevice
231*0Sstevel@tonic-gate  *
232*0Sstevel@tonic-gate  * On entry:	sp	- setname pointer
233*0Sstevel@tonic-gate  *		mnum	- minor number of metadevice
234*0Sstevel@tonic-gate  *		pathname - pointer to array to return path string
235*0Sstevel@tonic-gate  *		pathlen	- max length of pathname array
236*0Sstevel@tonic-gate  */
237*0Sstevel@tonic-gate static int
238*0Sstevel@tonic-gate compose_path(mdsetname_t *sp, int mnum, char *pathname, int pathlen)
239*0Sstevel@tonic-gate {
240*0Sstevel@tonic-gate 	int	rtn;
241*0Sstevel@tonic-gate 
242*0Sstevel@tonic-gate 	if (MD_MIN2SET(mnum) != sp->setno) {
243*0Sstevel@tonic-gate 		md_eprintf(gettext("minor number 0x%x invalid for set %d\n"),
244*0Sstevel@tonic-gate 		    mnum, sp->setno);
245*0Sstevel@tonic-gate 		return (-1);
246*0Sstevel@tonic-gate 	}
247*0Sstevel@tonic-gate 	rtn = snprintf(pathname, pathlen, "/dev/md/%s/rdsk/d%u",
248*0Sstevel@tonic-gate 	    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
249*0Sstevel@tonic-gate 
250*0Sstevel@tonic-gate 	if ((pathname[0] == '\0') || (rtn >= pathlen)) {
251*0Sstevel@tonic-gate 		md_eprintf(gettext(
252*0Sstevel@tonic-gate 		    "Could not create path for device %s/d%u\n"),
253*0Sstevel@tonic-gate 		    sp->setname, (unsigned)MD_MIN2UNIT(mnum));
254*0Sstevel@tonic-gate 		return (-1);
255*0Sstevel@tonic-gate 	}
256*0Sstevel@tonic-gate 	return (0);
257*0Sstevel@tonic-gate }
258*0Sstevel@tonic-gate 
259*0Sstevel@tonic-gate /*
260*0Sstevel@tonic-gate  * Purpose:	Walk through all the devices specified for the given set
261*0Sstevel@tonic-gate  *		and do the action specified in mode
262*0Sstevel@tonic-gate  */
263*0Sstevel@tonic-gate static int
264*0Sstevel@tonic-gate reset_state(uint_t mode, mdsetname_t *sp, char *drivername, md_error_t *ep)
265*0Sstevel@tonic-gate {
266*0Sstevel@tonic-gate 	mdnamelist_t			*devnlp = NULL;
267*0Sstevel@tonic-gate 	mdnamelist_t			*p;
268*0Sstevel@tonic-gate 	mdname_t			*devnp = NULL;
269*0Sstevel@tonic-gate 	md_set_mmown_params_t		ownpar_p;
270*0Sstevel@tonic-gate 	md_set_mmown_params_t		*ownpar = &ownpar_p;
271*0Sstevel@tonic-gate 	md_unit_t			*mm;
272*0Sstevel@tonic-gate 	int				mirror_dev = 0;
273*0Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl;
274*0Sstevel@tonic-gate 	int				cnt;
275*0Sstevel@tonic-gate 	int				has_parent;
276*0Sstevel@tonic-gate 	md_mn_get_mir_state_t		mir_state_p;
277*0Sstevel@tonic-gate 	md_mn_get_mir_state_t		*mir_state = &mir_state_p;
278*0Sstevel@tonic-gate 
279*0Sstevel@tonic-gate 	/*
280*0Sstevel@tonic-gate 	 * if we are choosing or resetting the owners then make sure
281*0Sstevel@tonic-gate 	 * we are only doing it for mirror devices
282*0Sstevel@tonic-gate 	 */
283*0Sstevel@tonic-gate 	mirror_dev = (strcmp(MD_MIRROR, drivername) == 0);
284*0Sstevel@tonic-gate 	if ((mode & (RESET_OWNER | CHOOSE_OWNER)) && !mirror_dev) {
285*0Sstevel@tonic-gate 		return (-1);
286*0Sstevel@tonic-gate 	}
287*0Sstevel@tonic-gate 
288*0Sstevel@tonic-gate 	/* get a list of all the metadevices for current set */
289*0Sstevel@tonic-gate 	if (mirror_dev && meta_get_mirror_names(sp, &devnlp, 0, ep) < 0) {
290*0Sstevel@tonic-gate 		mde_perror(ep, gettext("Could not get mirrors for set %s"),
291*0Sstevel@tonic-gate 		    sp->setname);
292*0Sstevel@tonic-gate 		return (-1);
293*0Sstevel@tonic-gate 	} else if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) {
294*0Sstevel@tonic-gate 		mde_perror(ep, gettext(
295*0Sstevel@tonic-gate 		    "Could not get soft partitions for set %s"), sp->setname);
296*0Sstevel@tonic-gate 		return (-1);
297*0Sstevel@tonic-gate 	}
298*0Sstevel@tonic-gate 
299*0Sstevel@tonic-gate 	/* If resetting the owner, get the known membership list */
300*0Sstevel@tonic-gate 	if (mode & RESET_OWNER) {
301*0Sstevel@tonic-gate 		if (meta_read_nodelist(&cnt, &nl, ep)) {
302*0Sstevel@tonic-gate 			mde_perror(ep, "Could not get nodelist");
303*0Sstevel@tonic-gate 			return (-1);
304*0Sstevel@tonic-gate 		}
305*0Sstevel@tonic-gate 	}
306*0Sstevel@tonic-gate 
307*0Sstevel@tonic-gate 	/* for each metadevice */
308*0Sstevel@tonic-gate 	for (p = devnlp; (p != NULL); p = p->next) {
309*0Sstevel@tonic-gate 		devnp = p->namep;
310*0Sstevel@tonic-gate 
311*0Sstevel@tonic-gate 		/*
312*0Sstevel@tonic-gate 		 * Get the current setting for mirror ABR state and all of the
313*0Sstevel@tonic-gate 		 * submirror state and flags from the master node. We only
314*0Sstevel@tonic-gate 		 * perform this when going through a 'start' cycle.
315*0Sstevel@tonic-gate 		 */
316*0Sstevel@tonic-gate 		if ((mode & GET_MIRROR_STATE) && mirror_dev) {
317*0Sstevel@tonic-gate 			char	*miscname;
318*0Sstevel@tonic-gate 
319*0Sstevel@tonic-gate 			/*
320*0Sstevel@tonic-gate 			 * Ensure that we ignore soft-parts that are returned
321*0Sstevel@tonic-gate 			 * from the meta_get_mirror_names() call
322*0Sstevel@tonic-gate 			 */
323*0Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
324*0Sstevel@tonic-gate 				goto out;
325*0Sstevel@tonic-gate 			if (strcmp(miscname, MD_MIRROR) != 0)
326*0Sstevel@tonic-gate 				continue;
327*0Sstevel@tonic-gate 
328*0Sstevel@tonic-gate 			mir_state->mnum = meta_getminor(devnp->dev);
329*0Sstevel@tonic-gate 			MD_SETDRIVERNAME(mir_state, MD_MIRROR, sp->setno);
330*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Getting mirror state"
331*0Sstevel@tonic-gate 			    " for %s/d%u: %s"), sp->setname,
332*0Sstevel@tonic-gate 			    (unsigned)MD_MIN2UNIT(mir_state->mnum),
333*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
334*0Sstevel@tonic-gate 
335*0Sstevel@tonic-gate 			if (metaioctl(MD_MN_GET_MIRROR_STATE, mir_state, ep,
336*0Sstevel@tonic-gate 			    "MD_MN_GET_MIRROR_STATE") != 0) {
337*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get "
338*0Sstevel@tonic-gate 				    "mirror state for %s/d%u"), sp->setname,
339*0Sstevel@tonic-gate 				    (unsigned)MD_MIN2UNIT(mir_state->mnum));
340*0Sstevel@tonic-gate 				goto out;
341*0Sstevel@tonic-gate 			} else {
342*0Sstevel@tonic-gate 				continue;
343*0Sstevel@tonic-gate 			}
344*0Sstevel@tonic-gate 		}
345*0Sstevel@tonic-gate 
346*0Sstevel@tonic-gate 		/* check if this is a top level metadevice */
347*0Sstevel@tonic-gate 		if ((mm = meta_get_mdunit(sp, devnp, ep)) == NULL)
348*0Sstevel@tonic-gate 			goto out;
349*0Sstevel@tonic-gate 		if (MD_HAS_PARENT(MD_PARENT(mm))) {
350*0Sstevel@tonic-gate 			has_parent = 1;
351*0Sstevel@tonic-gate 		} else {
352*0Sstevel@tonic-gate 			has_parent = 0;
353*0Sstevel@tonic-gate 		}
354*0Sstevel@tonic-gate 		Free(mm);
355*0Sstevel@tonic-gate 
356*0Sstevel@tonic-gate 		if (mode & (RESET_OWNER | CHOOSE_OWNER)) {
357*0Sstevel@tonic-gate 			char	*miscname;
358*0Sstevel@tonic-gate 
359*0Sstevel@tonic-gate 			/*
360*0Sstevel@tonic-gate 			 * we can only do these for mirrors so make sure we
361*0Sstevel@tonic-gate 			 * really have a mirror device and not a softpartition
362*0Sstevel@tonic-gate 			 * imitating one. meta_get_mirror_names seems to think
363*0Sstevel@tonic-gate 			 * softparts on top of a mirror are mirrors!
364*0Sstevel@tonic-gate 			 */
365*0Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
366*0Sstevel@tonic-gate 				goto out;
367*0Sstevel@tonic-gate 			if (strcmp(miscname, MD_MIRROR) != 0)
368*0Sstevel@tonic-gate 				continue;
369*0Sstevel@tonic-gate 
370*0Sstevel@tonic-gate 			(void) memset(ownpar, 0, sizeof (*ownpar));
371*0Sstevel@tonic-gate 			ownpar->d.mnum = meta_getminor(devnp->dev);
372*0Sstevel@tonic-gate 			MD_SETDRIVERNAME(ownpar, MD_MIRROR, sp->setno);
373*0Sstevel@tonic-gate 
374*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Setting owner "
375*0Sstevel@tonic-gate 			    "for %s/d%u: %s"), sp->setname,
376*0Sstevel@tonic-gate 			    (unsigned)MD_MIN2UNIT(ownpar->d.mnum),
377*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
378*0Sstevel@tonic-gate 
379*0Sstevel@tonic-gate 			/* get the current owner id */
380*0Sstevel@tonic-gate 			if (metaioctl(MD_MN_GET_MM_OWNER, ownpar, ep,
381*0Sstevel@tonic-gate 			    "MD_MN_GET_MM_OWNER") != 0) {
382*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get "
383*0Sstevel@tonic-gate 				    "mirror owner for %s/d%u"), sp->setname,
384*0Sstevel@tonic-gate 				    (unsigned)MD_MIN2UNIT(ownpar->d.mnum));
385*0Sstevel@tonic-gate 				goto out;
386*0Sstevel@tonic-gate 			}
387*0Sstevel@tonic-gate 		}
388*0Sstevel@tonic-gate 
389*0Sstevel@tonic-gate 		if (mode & RESET_OWNER) {
390*0Sstevel@tonic-gate 			if (ownpar->d.owner == MD_MN_MIRROR_UNOWNED) {
391*0Sstevel@tonic-gate 				mdclrerror(ep);
392*0Sstevel@tonic-gate 				continue;
393*0Sstevel@tonic-gate 			}
394*0Sstevel@tonic-gate 
395*0Sstevel@tonic-gate 			/*
396*0Sstevel@tonic-gate 			 * reset owner only if the current owner is
397*0Sstevel@tonic-gate 			 * not in the membership list
398*0Sstevel@tonic-gate 			 * Also kill the resync thread so that when the resync
399*0Sstevel@tonic-gate 			 * is started, it will perform an optimized resync
400*0Sstevel@tonic-gate 			 * for any resync regions that were dirty when the
401*0Sstevel@tonic-gate 			 * current owner left the membership.
402*0Sstevel@tonic-gate 			 */
403*0Sstevel@tonic-gate 			if (meta_is_member(NULL, ownpar->d.owner, nl) != 1) {
404*0Sstevel@tonic-gate 				if (meta_mn_change_owner(&ownpar,
405*0Sstevel@tonic-gate 				    sp->setno, ownpar->d.mnum,
406*0Sstevel@tonic-gate 				    MD_MN_MIRROR_UNOWNED,
407*0Sstevel@tonic-gate 				    MD_MN_MM_ALLOW_CHANGE) == -1) {
408*0Sstevel@tonic-gate 					md_eprintf(gettext(
409*0Sstevel@tonic-gate 					    "Unable to reset mirror owner "
410*0Sstevel@tonic-gate 					    "for %s/d%u\n"), sp->setname,
411*0Sstevel@tonic-gate 					    (unsigned)MD_MIN2UNIT(
412*0Sstevel@tonic-gate 					    ownpar->d.mnum));
413*0Sstevel@tonic-gate 					goto out;
414*0Sstevel@tonic-gate 				}
415*0Sstevel@tonic-gate 				if (meta_mirror_resync(sp, devnp, 0, ep,
416*0Sstevel@tonic-gate 				    MD_RESYNC_KILL_NO_WAIT) != 0) {
417*0Sstevel@tonic-gate 					md_eprintf(gettext(
418*0Sstevel@tonic-gate 					    "Unable to kill resync for"
419*0Sstevel@tonic-gate 					    " %s/d%u\n"), sp->setname,
420*0Sstevel@tonic-gate 					    (unsigned)MD_MIN2UNIT(
421*0Sstevel@tonic-gate 					    ownpar->d.mnum));
422*0Sstevel@tonic-gate 					goto out;
423*0Sstevel@tonic-gate 				}
424*0Sstevel@tonic-gate 			}
425*0Sstevel@tonic-gate 		}
426*0Sstevel@tonic-gate 
427*0Sstevel@tonic-gate 		if (mode & CHOOSE_OWNER) {
428*0Sstevel@tonic-gate 			/*
429*0Sstevel@tonic-gate 			 * only orphaned resyncs will have no owner.
430*0Sstevel@tonic-gate 			 * if that is the case choose a new owner. Otherwise
431*0Sstevel@tonic-gate 			 * re-establish the existing owner. This covers the
432*0Sstevel@tonic-gate 			 * case where a node that owned the mirror
433*0Sstevel@tonic-gate 			 * reboots/panics and comes back into the cluster before
434*0Sstevel@tonic-gate 			 * the reconfig cycle has completed. In this case the
435*0Sstevel@tonic-gate 			 * other cluster nodes will have the mirror owner marked
436*0Sstevel@tonic-gate 			 * as the rebooted node while it has the owner marked
437*0Sstevel@tonic-gate 			 * as 'None'. We have to reestablish the ownership so
438*0Sstevel@tonic-gate 			 * that the subsequent resync can continue.
439*0Sstevel@tonic-gate 			 */
440*0Sstevel@tonic-gate 			if (meta_mn_change_owner(&ownpar, sp->setno,
441*0Sstevel@tonic-gate 			    ownpar->d.mnum, ownpar->d.owner,
442*0Sstevel@tonic-gate 			    MD_MN_MM_CHOOSE_OWNER) == -1) {
443*0Sstevel@tonic-gate 				md_eprintf(gettext("Unable to choose "
444*0Sstevel@tonic-gate 				    "mirror owner for %s/d%u\n"), sp->setname,
445*0Sstevel@tonic-gate 				    (unsigned)MD_MIN2UNIT(ownpar->d.mnum));
446*0Sstevel@tonic-gate 				goto out;
447*0Sstevel@tonic-gate 			}
448*0Sstevel@tonic-gate 		}
449*0Sstevel@tonic-gate 
450*0Sstevel@tonic-gate 		/*
451*0Sstevel@tonic-gate 		 * For RESET_ABR and UPDATE_ABR - only handle top
452*0Sstevel@tonic-gate 		 * level metadevices.
453*0Sstevel@tonic-gate 		 */
454*0Sstevel@tonic-gate 		if (has_parent)
455*0Sstevel@tonic-gate 			continue;
456*0Sstevel@tonic-gate 
457*0Sstevel@tonic-gate 		if (mode & RESET_ABR) {
458*0Sstevel@tonic-gate 			/*
459*0Sstevel@tonic-gate 			 * Reset the ABR (application based recovery)
460*0Sstevel@tonic-gate 			 * value on all nodes. We are dealing with
461*0Sstevel@tonic-gate 			 * the possibility that we have ABR set but the
462*0Sstevel@tonic-gate 			 * only node that had the device open with ABR has
463*0Sstevel@tonic-gate 			 * left the cluster. We simply open and close the
464*0Sstevel@tonic-gate 			 * device and if this is the last close in the
465*0Sstevel@tonic-gate 			 * cluster, ABR will be cleared on all nodes.
466*0Sstevel@tonic-gate 			 */
467*0Sstevel@tonic-gate 			char		*miscname;
468*0Sstevel@tonic-gate 			char		name[MD_MAX_CTDLEN];
469*0Sstevel@tonic-gate 			int		mnum, fd;
470*0Sstevel@tonic-gate 
471*0Sstevel@tonic-gate 			name[0] = '\0';
472*0Sstevel@tonic-gate 			mnum = meta_getminor(devnp->dev);
473*0Sstevel@tonic-gate 
474*0Sstevel@tonic-gate 			/*
475*0Sstevel@tonic-gate 			 * Ensure that we don't include soft-parts in the
476*0Sstevel@tonic-gate 			 * mirror-only call to RESET_ABR. meta_get_mirror_names
477*0Sstevel@tonic-gate 			 * returns a bogus list that includes all soft-parts
478*0Sstevel@tonic-gate 			 * built on mirrors.
479*0Sstevel@tonic-gate 			 */
480*0Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
481*0Sstevel@tonic-gate 				goto out;
482*0Sstevel@tonic-gate 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
483*0Sstevel@tonic-gate 				continue;
484*0Sstevel@tonic-gate 
485*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Re-setting ABR state "
486*0Sstevel@tonic-gate 			    "for %s/d%u: %s"), sp->setname,
487*0Sstevel@tonic-gate 			    (unsigned)MD_MIN2UNIT(mnum),
488*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
489*0Sstevel@tonic-gate 
490*0Sstevel@tonic-gate 			/* compose the absolute device path and open it */
491*0Sstevel@tonic-gate 			if (compose_path(sp, mnum, &name[0],
492*0Sstevel@tonic-gate 			    sizeof (name)) != 0)
493*0Sstevel@tonic-gate 				goto out;
494*0Sstevel@tonic-gate 			if ((fd = open(name, O_RDWR, 0)) < 0) {
495*0Sstevel@tonic-gate 				md_perror(gettext("Could not open device %s"),
496*0Sstevel@tonic-gate 				    name);
497*0Sstevel@tonic-gate 				continue;
498*0Sstevel@tonic-gate 			}
499*0Sstevel@tonic-gate 
500*0Sstevel@tonic-gate 			(void) close(fd);
501*0Sstevel@tonic-gate 		}
502*0Sstevel@tonic-gate 
503*0Sstevel@tonic-gate 		if (mode & UPDATE_ABR) {
504*0Sstevel@tonic-gate 			/*
505*0Sstevel@tonic-gate 			 * Update the ABR value on this node. We obtain the
506*0Sstevel@tonic-gate 			 * current ABR state from the master node.
507*0Sstevel@tonic-gate 			 */
508*0Sstevel@tonic-gate 
509*0Sstevel@tonic-gate 			char		*miscname;
510*0Sstevel@tonic-gate 			char		name[MD_MAX_CTDLEN];
511*0Sstevel@tonic-gate 			int		mnum, fd;
512*0Sstevel@tonic-gate 			volcap_t	vc;
513*0Sstevel@tonic-gate 			uint_t		tstate;
514*0Sstevel@tonic-gate 
515*0Sstevel@tonic-gate 			name[0] = '\0';
516*0Sstevel@tonic-gate 			mnum = meta_getminor(devnp->dev);
517*0Sstevel@tonic-gate 
518*0Sstevel@tonic-gate 			/*
519*0Sstevel@tonic-gate 			 * Ensure that we don't include soft-parts in the
520*0Sstevel@tonic-gate 			 * mirror-only call to UPDATE_ABR. meta_get_mirror_names
521*0Sstevel@tonic-gate 			 * returns a bogus list that includes all soft-parts
522*0Sstevel@tonic-gate 			 * built on mirrors.
523*0Sstevel@tonic-gate 			 */
524*0Sstevel@tonic-gate 			if ((miscname = metagetmiscname(devnp, ep)) == NULL)
525*0Sstevel@tonic-gate 				goto out;
526*0Sstevel@tonic-gate 			if (mirror_dev && (strcmp(miscname, MD_MIRROR) != 0))
527*0Sstevel@tonic-gate 				continue;
528*0Sstevel@tonic-gate 
529*0Sstevel@tonic-gate 			/* Get tstate from Master */
530*0Sstevel@tonic-gate 			if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep)
531*0Sstevel@tonic-gate 			    != 0)
532*0Sstevel@tonic-gate 				continue;
533*0Sstevel@tonic-gate 			/* If not set on the master, nothing to do */
534*0Sstevel@tonic-gate 			if (!(tstate & MD_ABR_CAP))
535*0Sstevel@tonic-gate 				continue;
536*0Sstevel@tonic-gate 
537*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG4, gettext("Updating ABR state "
538*0Sstevel@tonic-gate 			    "for %s/d%u: %s"), sp->setname,
539*0Sstevel@tonic-gate 			    (unsigned)MD_MIN2UNIT(mnum),
540*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
541*0Sstevel@tonic-gate 
542*0Sstevel@tonic-gate 			/* compose the absolute device path and open it */
543*0Sstevel@tonic-gate 			if (compose_path(sp, mnum, &name[0],
544*0Sstevel@tonic-gate 			    sizeof (name)) != 0)
545*0Sstevel@tonic-gate 				goto out;
546*0Sstevel@tonic-gate 			if ((fd = open(name, O_RDWR, 0)) < 0) {
547*0Sstevel@tonic-gate 				md_perror(gettext("Could not open device %s"),
548*0Sstevel@tonic-gate 				    name);
549*0Sstevel@tonic-gate 				continue;
550*0Sstevel@tonic-gate 			}
551*0Sstevel@tonic-gate 
552*0Sstevel@tonic-gate 			/* set ABR state */
553*0Sstevel@tonic-gate 			vc.vc_info = 0;
554*0Sstevel@tonic-gate 			vc.vc_set = 0;
555*0Sstevel@tonic-gate 			if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) {
556*0Sstevel@tonic-gate 				/*
557*0Sstevel@tonic-gate 				 * Ignore if device does not support this
558*0Sstevel@tonic-gate 				 * ioctl
559*0Sstevel@tonic-gate 				 */
560*0Sstevel@tonic-gate 				if ((errno != ENOTTY) && (errno != ENOTSUP)) {
561*0Sstevel@tonic-gate 					md_perror(gettext("Could not get "
562*0Sstevel@tonic-gate 					    "ABR/DMR state for device %s"),
563*0Sstevel@tonic-gate 					    name);
564*0Sstevel@tonic-gate 				}
565*0Sstevel@tonic-gate 				(void) close(fd);
566*0Sstevel@tonic-gate 				continue;
567*0Sstevel@tonic-gate 			}
568*0Sstevel@tonic-gate 			if (!(vc.vc_info & (DKV_ABR_CAP | DKV_DMR_CAP))) {
569*0Sstevel@tonic-gate 				(void) close(fd);
570*0Sstevel@tonic-gate 				continue;
571*0Sstevel@tonic-gate 			}
572*0Sstevel@tonic-gate 
573*0Sstevel@tonic-gate 			vc.vc_set = DKV_ABR_CAP;
574*0Sstevel@tonic-gate 			if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) {
575*0Sstevel@tonic-gate 				md_perror(gettext(
576*0Sstevel@tonic-gate 				    "Could not set ABR state for "
577*0Sstevel@tonic-gate 				    "device %s"), name);
578*0Sstevel@tonic-gate 				(void) close(fd);
579*0Sstevel@tonic-gate 				goto out;
580*0Sstevel@tonic-gate 			} else {
581*0Sstevel@tonic-gate 				md_eprintf(gettext(
582*0Sstevel@tonic-gate 				    "Setting ABR state on device %s\n"), name);
583*0Sstevel@tonic-gate 			}
584*0Sstevel@tonic-gate 
585*0Sstevel@tonic-gate 			(void) close(fd);
586*0Sstevel@tonic-gate 		}
587*0Sstevel@tonic-gate 	}
588*0Sstevel@tonic-gate 
589*0Sstevel@tonic-gate 	/* cleanup */
590*0Sstevel@tonic-gate 	if (mode & RESET_OWNER) {
591*0Sstevel@tonic-gate 		meta_free_nodelist(nl);
592*0Sstevel@tonic-gate 	}
593*0Sstevel@tonic-gate 	metafreenamelist(devnlp);
594*0Sstevel@tonic-gate 	return (0);
595*0Sstevel@tonic-gate 
596*0Sstevel@tonic-gate out:
597*0Sstevel@tonic-gate 	/* cleanup */
598*0Sstevel@tonic-gate 	if (mode & RESET_OWNER) {
599*0Sstevel@tonic-gate 		meta_free_nodelist(nl);
600*0Sstevel@tonic-gate 	}
601*0Sstevel@tonic-gate 	metafreenamelist(devnlp);
602*0Sstevel@tonic-gate 	return (-1);
603*0Sstevel@tonic-gate }
604*0Sstevel@tonic-gate 
605*0Sstevel@tonic-gate /*
606*0Sstevel@tonic-gate  * Print usage message
607*0Sstevel@tonic-gate  */
608*0Sstevel@tonic-gate static void
609*0Sstevel@tonic-gate usage(mdsetname_t *sp, int eval)
610*0Sstevel@tonic-gate {
611*0Sstevel@tonic-gate 	(void) fprintf(stderr, gettext("usage:"
612*0Sstevel@tonic-gate 	    "\t%s [-V version] [-t timeout] [-d level] start localnodeid\n"
613*0Sstevel@tonic-gate 	    "\t%s [-V version] [-t timeout] [-d level] step nodelist...\n"
614*0Sstevel@tonic-gate 	    "\t%s [-V version] [-t timeout] [-d level] abort | stop\n"
615*0Sstevel@tonic-gate 	    "\t%s [-V | -? | -h]\n"),
616*0Sstevel@tonic-gate 	    myname, myname, myname, myname);
617*0Sstevel@tonic-gate 	if (!eval) {
618*0Sstevel@tonic-gate 		fprintf(stderr, gettext("\n"
619*0Sstevel@tonic-gate 		    "\tValid debug (-d) levels are 1-%d for increasing "
620*0Sstevel@tonic-gate 		    "verbosity.\n\tDefault is -d 3.\n\n"
621*0Sstevel@tonic-gate 		    "\tValid step values are: return | step1 | step2 | "
622*0Sstevel@tonic-gate 		    "step3 | step4\n\n"
623*0Sstevel@tonic-gate 		    "\tNodelist is a space-separated list of node id's\n\n"),
624*0Sstevel@tonic-gate 		    MAX_DEBUG_LEVEL);
625*0Sstevel@tonic-gate 	}
626*0Sstevel@tonic-gate 	md_exit(sp, eval);
627*0Sstevel@tonic-gate }
628*0Sstevel@tonic-gate 
629*0Sstevel@tonic-gate /*
630*0Sstevel@tonic-gate  * Input:	Input takes a config step name followed by a list of
631*0Sstevel@tonic-gate  *		possible node id's.
632*0Sstevel@tonic-gate  *
633*0Sstevel@tonic-gate  * Returns:	  0 - Success
634*0Sstevel@tonic-gate  *		  1 - Fail
635*0Sstevel@tonic-gate  *			Node will be removed from cluster membership
636*0Sstevel@tonic-gate  *			by forcing node to panic.
637*0Sstevel@tonic-gate  *		205 - Unsuccessful. Start another reconfig cycle.
638*0Sstevel@tonic-gate  *			Problem was encountered that could be fixed by
639*0Sstevel@tonic-gate  *			running another reconfig cycle.
640*0Sstevel@tonic-gate  *			Problem could be a result of a failure to read
641*0Sstevel@tonic-gate  *			the nodelist file or that all work could not be
642*0Sstevel@tonic-gate  *			accomplished in a reconfig step in the amount of
643*0Sstevel@tonic-gate  *			time given so another reconfig cycle is needed in
644*0Sstevel@tonic-gate  *			order to finish the current step.
645*0Sstevel@tonic-gate  */
646*0Sstevel@tonic-gate int
647*0Sstevel@tonic-gate main(int argc, char **argv)
648*0Sstevel@tonic-gate {
649*0Sstevel@tonic-gate 	mdsetname_t		*sp = NULL;
650*0Sstevel@tonic-gate 	md_error_t		status = mdnullerror;
651*0Sstevel@tonic-gate 	md_error_t		*ep = &status;
652*0Sstevel@tonic-gate 	set_t			max_sets, setno;
653*0Sstevel@tonic-gate 	int			c, clust = 0;
654*0Sstevel@tonic-gate 	struct sigaction	nsa, osa;
655*0Sstevel@tonic-gate 	struct step_t		*step_ptr;
656*0Sstevel@tonic-gate 	mdsetname_t		*local_sp = NULL;
657*0Sstevel@tonic-gate 	md_drive_desc		*dd;
658*0Sstevel@tonic-gate 	int			rval = 0;
659*0Sstevel@tonic-gate 	md_set_desc		*sd;
660*0Sstevel@tonic-gate 	mddb_block_parm_t	mbp;
661*0Sstevel@tonic-gate 	uint_t			debug = 3; /* log upto MC_LOG3 by default */
662*0Sstevel@tonic-gate 	int			version_table_size;
663*0Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
664*0Sstevel@tonic-gate 	int			ret_val;
665*0Sstevel@tonic-gate 	mddb_config_t		cfg;
666*0Sstevel@tonic-gate 	int			set_info[MD_MAXSETS];
667*0Sstevel@tonic-gate 
668*0Sstevel@tonic-gate 	/*
669*0Sstevel@tonic-gate 	 * Get the locale set up before calling any other routines
670*0Sstevel@tonic-gate 	 * with messages to output.  Just in case we're not in a build
671*0Sstevel@tonic-gate 	 * environment, make sure that TEXT_DOMAIN gets set to
672*0Sstevel@tonic-gate 	 * something.
673*0Sstevel@tonic-gate 	 */
674*0Sstevel@tonic-gate #if !defined(TEXT_DOMAIN)
675*0Sstevel@tonic-gate #define	TEXT_DOMAIN "SYS_TEST"
676*0Sstevel@tonic-gate #endif
677*0Sstevel@tonic-gate 	(void) setlocale(LC_ALL, "");
678*0Sstevel@tonic-gate 	(void) textdomain(TEXT_DOMAIN);
679*0Sstevel@tonic-gate 
680*0Sstevel@tonic-gate 	if ((clust = sdssc_bind_library()) == SDSSC_ERROR) {
681*0Sstevel@tonic-gate 		md_eprintf(gettext("Interface error with libsds_sc.so\n"));
682*0Sstevel@tonic-gate 		exit(1);
683*0Sstevel@tonic-gate 	}
684*0Sstevel@tonic-gate 
685*0Sstevel@tonic-gate 	if (md_init(argc, argv, 1, 1, ep) != 0 || meta_check_root(ep) != 0) {
686*0Sstevel@tonic-gate 		mde_perror(ep, "");
687*0Sstevel@tonic-gate 		md_exit(sp, 1);
688*0Sstevel@tonic-gate 	}
689*0Sstevel@tonic-gate 
690*0Sstevel@tonic-gate 	/*
691*0Sstevel@tonic-gate 	 * open log and enable libmeta logging. Do it here explicitly
692*0Sstevel@tonic-gate 	 * rather than letting md_init() do it because we are not really
693*0Sstevel@tonic-gate 	 * a daemon and that is what md_init() opens the log as.
694*0Sstevel@tonic-gate 	 */
695*0Sstevel@tonic-gate 	openlog("metaclust", LOG_CONS, LOG_USER);
696*0Sstevel@tonic-gate 
697*0Sstevel@tonic-gate 	version_table_size = sizeof (version_table) / sizeof (version_table[0]);
698*0Sstevel@tonic-gate 
699*0Sstevel@tonic-gate 	optind = 1;
700*0Sstevel@tonic-gate 	opterr = 0;
701*0Sstevel@tonic-gate 	while ((c = getopt(argc, argv, "hd:V:t:?")) != -1) {
702*0Sstevel@tonic-gate 		switch (c) {
703*0Sstevel@tonic-gate 		case 'h':
704*0Sstevel@tonic-gate 			usage(sp, 0);
705*0Sstevel@tonic-gate 			break;
706*0Sstevel@tonic-gate 
707*0Sstevel@tonic-gate 		case 'd':
708*0Sstevel@tonic-gate 			if (sscanf(optarg, "%u", &debug) != 1) {
709*0Sstevel@tonic-gate 				md_eprintf(gettext("Invalid debug level\n"));
710*0Sstevel@tonic-gate 				md_exit(sp, 1);
711*0Sstevel@tonic-gate 			} else if ((debug < 1) || (debug > MAX_DEBUG_LEVEL)) {
712*0Sstevel@tonic-gate 				debug = min(max(debug, 1), MAX_DEBUG_LEVEL);
713*0Sstevel@tonic-gate 				md_eprintf(gettext("Debug level must be "
714*0Sstevel@tonic-gate 				    "between 1 and %d inclusive.\n"),
715*0Sstevel@tonic-gate 				    MAX_DEBUG_LEVEL);
716*0Sstevel@tonic-gate 				md_eprintf(gettext("Debug level set to %d.\n"),
717*0Sstevel@tonic-gate 				    debug);
718*0Sstevel@tonic-gate 			}
719*0Sstevel@tonic-gate 			break;
720*0Sstevel@tonic-gate 
721*0Sstevel@tonic-gate 		case 'V':
722*0Sstevel@tonic-gate 			version = Strdup(optarg);
723*0Sstevel@tonic-gate 			break;
724*0Sstevel@tonic-gate 
725*0Sstevel@tonic-gate 		case 't':
726*0Sstevel@tonic-gate 			if (sscanf(optarg, "%u", &timeout) != 1) {
727*0Sstevel@tonic-gate 				md_eprintf(gettext("Invalid timeout value\n"));
728*0Sstevel@tonic-gate 				md_exit(sp, 1);
729*0Sstevel@tonic-gate 			}
730*0Sstevel@tonic-gate 			break;
731*0Sstevel@tonic-gate 
732*0Sstevel@tonic-gate 		case '?':
733*0Sstevel@tonic-gate 			if (optopt == '?') {
734*0Sstevel@tonic-gate 				usage(sp, 0);
735*0Sstevel@tonic-gate 			} else if (optopt == 'V') {
736*0Sstevel@tonic-gate 				int	i;
737*0Sstevel@tonic-gate 
738*0Sstevel@tonic-gate 				fprintf(stdout, gettext(
739*0Sstevel@tonic-gate 				    "%s: Versions Supported:"), myname);
740*0Sstevel@tonic-gate 				for (i = 0; i < version_table_size; i++) {
741*0Sstevel@tonic-gate 					fprintf(stdout, " %s",
742*0Sstevel@tonic-gate 					    version_table[i]);
743*0Sstevel@tonic-gate 				}
744*0Sstevel@tonic-gate 				fprintf(stdout, "\n");
745*0Sstevel@tonic-gate 				md_exit(sp, 0);
746*0Sstevel@tonic-gate 			}
747*0Sstevel@tonic-gate 			/*FALLTHROUGH*/
748*0Sstevel@tonic-gate 
749*0Sstevel@tonic-gate 		default:
750*0Sstevel@tonic-gate 			usage(sp, 1);
751*0Sstevel@tonic-gate 			break;
752*0Sstevel@tonic-gate 		}
753*0Sstevel@tonic-gate 	}
754*0Sstevel@tonic-gate 
755*0Sstevel@tonic-gate 	/* initialise the debug level and start time */
756*0Sstevel@tonic-gate 	setup_mc_log(debug);
757*0Sstevel@tonic-gate 
758*0Sstevel@tonic-gate 	/*
759*0Sstevel@tonic-gate 	 * check that the version specified (if any) is supported.
760*0Sstevel@tonic-gate 	 */
761*0Sstevel@tonic-gate 	if (version != NULL) {
762*0Sstevel@tonic-gate 		int	i, found = 0;
763*0Sstevel@tonic-gate 
764*0Sstevel@tonic-gate 		for (i = 0; i < version_table_size; i++) {
765*0Sstevel@tonic-gate 			if (strcmp(version, version_table[i]) == 0) {
766*0Sstevel@tonic-gate 				found = 1;
767*0Sstevel@tonic-gate 				break;
768*0Sstevel@tonic-gate 			}
769*0Sstevel@tonic-gate 		}
770*0Sstevel@tonic-gate 		if (!found) {
771*0Sstevel@tonic-gate 			md_eprintf(gettext("Version %s not supported\n"),
772*0Sstevel@tonic-gate 			    version);
773*0Sstevel@tonic-gate 			md_exit(sp, 1);
774*0Sstevel@tonic-gate 		}
775*0Sstevel@tonic-gate 	}
776*0Sstevel@tonic-gate 
777*0Sstevel@tonic-gate 	argc -= optind;
778*0Sstevel@tonic-gate 	argv += optind;
779*0Sstevel@tonic-gate 
780*0Sstevel@tonic-gate 	/* parse arguments */
781*0Sstevel@tonic-gate 	if (argc <= 0) {
782*0Sstevel@tonic-gate 		usage(sp, 1);
783*0Sstevel@tonic-gate 	}
784*0Sstevel@tonic-gate 
785*0Sstevel@tonic-gate 	/* convert the step name to the corresponding number */
786*0Sstevel@tonic-gate 	step_ptr = bsearch(argv[0], step_table, (sizeof (step_table) /
787*0Sstevel@tonic-gate 	    sizeof (step_table[0])), sizeof (step_table[0]), mc_compare);
788*0Sstevel@tonic-gate 	if (step_ptr != NULL) {
789*0Sstevel@tonic-gate 		stepnum = step_ptr->step_num;
790*0Sstevel@tonic-gate 	}
791*0Sstevel@tonic-gate 
792*0Sstevel@tonic-gate 	--argc;
793*0Sstevel@tonic-gate 	++argv;
794*0Sstevel@tonic-gate 
795*0Sstevel@tonic-gate 	/* set timeout alarm signal, a value of 0 will disable timeout */
796*0Sstevel@tonic-gate 	if (timeout > 0) {
797*0Sstevel@tonic-gate 		int	stat_loc = 0;
798*0Sstevel@tonic-gate 
799*0Sstevel@tonic-gate 		c_pid = fork();
800*0Sstevel@tonic-gate 
801*0Sstevel@tonic-gate 		if (c_pid == (pid_t)-1) {
802*0Sstevel@tonic-gate 			md_perror(gettext("Unable to fork"));
803*0Sstevel@tonic-gate 			md_exit(sp, 1);
804*0Sstevel@tonic-gate 		} else if (c_pid) {
805*0Sstevel@tonic-gate 			/* parent */
806*0Sstevel@tonic-gate 			nsa.sa_flags = 0;
807*0Sstevel@tonic-gate 			if (sigfillset(&nsa.sa_mask) < 0) {
808*0Sstevel@tonic-gate 				md_perror(gettext("Unable to set signal mask"));
809*0Sstevel@tonic-gate 				md_exit(sp, 1);
810*0Sstevel@tonic-gate 			}
811*0Sstevel@tonic-gate 
812*0Sstevel@tonic-gate 			nsa.sa_handler = sigalarmhandler;
813*0Sstevel@tonic-gate 			if (sigaction(SIGALRM, &nsa, &osa) == -1) {
814*0Sstevel@tonic-gate 				md_perror(gettext("Unable to set alarm "
815*0Sstevel@tonic-gate 				    "handler"));
816*0Sstevel@tonic-gate 				md_exit(sp, 1);
817*0Sstevel@tonic-gate 			}
818*0Sstevel@tonic-gate 
819*0Sstevel@tonic-gate 			(void) alarm(timeout);
820*0Sstevel@tonic-gate 
821*0Sstevel@tonic-gate 			/*
822*0Sstevel@tonic-gate 			 * wait for child to exit or timeout to expire.
823*0Sstevel@tonic-gate 			 * keep retrying if the call is interrupted
824*0Sstevel@tonic-gate 			 */
825*0Sstevel@tonic-gate 			while ((ret_val = waitpid(c_pid, &stat_loc, 0)) < 0) {
826*0Sstevel@tonic-gate 				if (errno != EINTR) {
827*0Sstevel@tonic-gate 					break;
828*0Sstevel@tonic-gate 				}
829*0Sstevel@tonic-gate 			}
830*0Sstevel@tonic-gate 			if (ret_val == c_pid) {
831*0Sstevel@tonic-gate 				/* exit with the childs exit value */
832*0Sstevel@tonic-gate 				exit(WEXITSTATUS(stat_loc));
833*0Sstevel@tonic-gate 			} else if (errno == ECHILD) {
834*0Sstevel@tonic-gate 				md_exit(sp, 0);
835*0Sstevel@tonic-gate 			} else {
836*0Sstevel@tonic-gate 				perror(myname);
837*0Sstevel@tonic-gate 				md_exit(sp, 1);
838*0Sstevel@tonic-gate 			}
839*0Sstevel@tonic-gate 		}
840*0Sstevel@tonic-gate 	}
841*0Sstevel@tonic-gate 
842*0Sstevel@tonic-gate 	/*
843*0Sstevel@tonic-gate 	 * If a timeout value is given, everything from this point onwards is
844*0Sstevel@tonic-gate 	 * executed in the child process.
845*0Sstevel@tonic-gate 	 */
846*0Sstevel@tonic-gate 
847*0Sstevel@tonic-gate 	switch (stepnum) {
848*0Sstevel@tonic-gate 	case MC_START:
849*0Sstevel@tonic-gate 		/*
850*0Sstevel@tonic-gate 		 * Start Step
851*0Sstevel@tonic-gate 		 *
852*0Sstevel@tonic-gate 		 * - Suspend all rpc.mdcommd messages
853*0Sstevel@tonic-gate 		 */
854*0Sstevel@tonic-gate 
855*0Sstevel@tonic-gate 		/* expect the local node id to be given only */
856*0Sstevel@tonic-gate 		if (argc != 1)
857*0Sstevel@tonic-gate 			usage(sp, 1);
858*0Sstevel@tonic-gate 
859*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Start step: %s"),
860*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
861*0Sstevel@tonic-gate 
862*0Sstevel@tonic-gate 		/*
863*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
864*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
865*0Sstevel@tonic-gate 		 * the local set cannot be started.
866*0Sstevel@tonic-gate 		 */
867*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
868*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
869*0Sstevel@tonic-gate 		}
870*0Sstevel@tonic-gate 
871*0Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
872*0Sstevel@tonic-gate 			mde_perror(ep, "");
873*0Sstevel@tonic-gate 			md_exit(sp, 1);
874*0Sstevel@tonic-gate 		}
875*0Sstevel@tonic-gate 
876*0Sstevel@tonic-gate 		/* start walking through all possible disksets */
877*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
878*0Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
879*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
880*0Sstevel@tonic-gate 					/* No set for this setno - continue */
881*0Sstevel@tonic-gate 					mdclrerror(ep);
882*0Sstevel@tonic-gate 					continue;
883*0Sstevel@tonic-gate 				} else {
884*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
885*0Sstevel@tonic-gate 					    "get set %d information"), setno);
886*0Sstevel@tonic-gate 					md_exit(sp, 1);
887*0Sstevel@tonic-gate 				}
888*0Sstevel@tonic-gate 			}
889*0Sstevel@tonic-gate 
890*0Sstevel@tonic-gate 			/* only check multi-node disksets */
891*0Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
892*0Sstevel@tonic-gate 				mdclrerror(ep);
893*0Sstevel@tonic-gate 				continue;
894*0Sstevel@tonic-gate 			}
895*0Sstevel@tonic-gate 
896*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Start - block parse "
897*0Sstevel@tonic-gate 			    "messages for set %s: %s"), sp->setname,
898*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
899*0Sstevel@tonic-gate 
900*0Sstevel@tonic-gate 			/*
901*0Sstevel@tonic-gate 			 * Mddb parse messages are sent amongst the nodes
902*0Sstevel@tonic-gate 			 * in a diskset whenever the locator block or
903*0Sstevel@tonic-gate 			 * locator names structure has been changed.
904*0Sstevel@tonic-gate 			 * A locator block change could occur as a result
905*0Sstevel@tonic-gate 			 * of a disk failure during the reconfig cycle,
906*0Sstevel@tonic-gate 			 * so block the mddb parse messages while the
907*0Sstevel@tonic-gate 			 * rpc.mdcommd is suspended during the reconfig cycle.
908*0Sstevel@tonic-gate 			 */
909*0Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
910*0Sstevel@tonic-gate 				(void) memset(&mbp, 0, sizeof (mbp));
911*0Sstevel@tonic-gate 				mbp.c_setno = setno;
912*0Sstevel@tonic-gate 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
913*0Sstevel@tonic-gate 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
914*0Sstevel@tonic-gate 				    &mbp.c_mde, NULL)) {
915*0Sstevel@tonic-gate 					mdstealerror(ep, &mbp.c_mde);
916*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Could not "
917*0Sstevel@tonic-gate 					    "block set %s"), sp->setname);
918*0Sstevel@tonic-gate 					md_exit(sp, 1);
919*0Sstevel@tonic-gate 				}
920*0Sstevel@tonic-gate 			}
921*0Sstevel@tonic-gate 
922*0Sstevel@tonic-gate 			/* suspend commd and spin waiting for drain */
923*0Sstevel@tonic-gate 			while ((ret_val = mdmn_suspend(setno,
924*0Sstevel@tonic-gate 			    MD_COMM_ALL_CLASSES)) ==
925*0Sstevel@tonic-gate 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
926*0Sstevel@tonic-gate 				sleep(1);
927*0Sstevel@tonic-gate 			}
928*0Sstevel@tonic-gate 
929*0Sstevel@tonic-gate 			if (ret_val) {
930*0Sstevel@tonic-gate 				md_eprintf(gettext("Could not suspend "
931*0Sstevel@tonic-gate 				    "rpc.mdcommd for set %s\n"), sp->setname);
932*0Sstevel@tonic-gate 				md_exit(sp, 1);
933*0Sstevel@tonic-gate 			}
934*0Sstevel@tonic-gate 
935*0Sstevel@tonic-gate 			/*
936*0Sstevel@tonic-gate 			 * Set start step flag for set. This is set to indicate
937*0Sstevel@tonic-gate 			 * that the reconfig cycle entered through the start
938*0Sstevel@tonic-gate 			 * step and is used in reconfig step 4 to determine
939*0Sstevel@tonic-gate 			 * whether the node had entered through the start
940*0Sstevel@tonic-gate 			 * step or the return step.
941*0Sstevel@tonic-gate 			 */
942*0Sstevel@tonic-gate 			(void) memset(&sf, 0, sizeof (sf));
943*0Sstevel@tonic-gate 			sf.sf_setno = sp->setno;
944*0Sstevel@tonic-gate 			sf.sf_setflags = MD_SET_MN_START_RC;
945*0Sstevel@tonic-gate 			sf.sf_flags = MDDB_NM_SET;
946*0Sstevel@tonic-gate 			/* Use magic to help protect ioctl against attack. */
947*0Sstevel@tonic-gate 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
948*0Sstevel@tonic-gate 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
949*0Sstevel@tonic-gate 			    &sf.sf_mde, NULL)) {
950*0Sstevel@tonic-gate 				mdstealerror(ep, &sf.sf_mde);
951*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Could not set "
952*0Sstevel@tonic-gate 				    "start_step flag for set %s"), sp->setname);
953*0Sstevel@tonic-gate 				md_exit(sp, 1);
954*0Sstevel@tonic-gate 			}
955*0Sstevel@tonic-gate 
956*0Sstevel@tonic-gate 		}
957*0Sstevel@tonic-gate 
958*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Start step completed: %s"),
959*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
960*0Sstevel@tonic-gate 
961*0Sstevel@tonic-gate 		break;
962*0Sstevel@tonic-gate 
963*0Sstevel@tonic-gate 	case MC_STOP:
964*0Sstevel@tonic-gate 		/*
965*0Sstevel@tonic-gate 		 * Stop Step
966*0Sstevel@tonic-gate 		 *
967*0Sstevel@tonic-gate 		 * - ???
968*0Sstevel@tonic-gate 		 */
969*0Sstevel@tonic-gate 
970*0Sstevel@tonic-gate 		/* don't expect any more arguments to follow the step name */
971*0Sstevel@tonic-gate 		if (argc != 0)
972*0Sstevel@tonic-gate 			usage(sp, 1);
973*0Sstevel@tonic-gate 
974*0Sstevel@tonic-gate 		break;
975*0Sstevel@tonic-gate 
976*0Sstevel@tonic-gate 	case MC_ABORT:
977*0Sstevel@tonic-gate 		/*
978*0Sstevel@tonic-gate 		 * Abort Step
979*0Sstevel@tonic-gate 		 *
980*0Sstevel@tonic-gate 		 * - Abort rpc.mdcommd
981*0Sstevel@tonic-gate 		 */
982*0Sstevel@tonic-gate 
983*0Sstevel@tonic-gate 		/* don't expect any more arguments to follow the step name */
984*0Sstevel@tonic-gate 		if (argc != 0)
985*0Sstevel@tonic-gate 			usage(sp, 1);
986*0Sstevel@tonic-gate 
987*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Abort step: %s"),
988*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
989*0Sstevel@tonic-gate 
990*0Sstevel@tonic-gate 		/*
991*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
992*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
993*0Sstevel@tonic-gate 		 * the local set cannot be started.
994*0Sstevel@tonic-gate 		 */
995*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
996*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
997*0Sstevel@tonic-gate 		}
998*0Sstevel@tonic-gate 
999*0Sstevel@tonic-gate 		/*
1000*0Sstevel@tonic-gate 		 * abort the rpc.mdcommd.  The abort is only issued on this node
1001*0Sstevel@tonic-gate 		 * meaning that the abort reconfig step is called on this
1002*0Sstevel@tonic-gate 		 * node before a panic while the rest of the cluster will
1003*0Sstevel@tonic-gate 		 * undergo a reconfig cycle.
1004*0Sstevel@tonic-gate 		 * There is no time relation between this node running a
1005*0Sstevel@tonic-gate 		 * reconfig abort and the rest of the cluster
1006*0Sstevel@tonic-gate 		 * running a reconfig cycle meaning that this node may
1007*0Sstevel@tonic-gate 		 * panic before, during or after the cluster has run
1008*0Sstevel@tonic-gate 		 * a reconfig cycle.
1009*0Sstevel@tonic-gate 		 */
1010*0Sstevel@tonic-gate 		mdmn_abort();
1011*0Sstevel@tonic-gate 
1012*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Abort step completed: %s"),
1013*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
1014*0Sstevel@tonic-gate 
1015*0Sstevel@tonic-gate 		break;
1016*0Sstevel@tonic-gate 
1017*0Sstevel@tonic-gate 	case MC_RETURN:
1018*0Sstevel@tonic-gate 		/*
1019*0Sstevel@tonic-gate 		 * Return Step
1020*0Sstevel@tonic-gate 		 *
1021*0Sstevel@tonic-gate 		 * - Grab local set lock, issue rpc.mdcommd DRAIN ALL
1022*0Sstevel@tonic-gate 		 *   and release local set lock.  Grabbing the local set
1023*0Sstevel@tonic-gate 		 *   lock allows any active metaset/metadb commands to
1024*0Sstevel@tonic-gate 		 *   terminate gracefully and will keep a metaset/metadb
1025*0Sstevel@tonic-gate 		 *   command from starting until the DRAIN ALL is issued.
1026*0Sstevel@tonic-gate 		 *   The metaset/metadb commands can issue
1027*0Sstevel@tonic-gate 		 *   DRAIN ALL/RESUME ALL commands to rpc.mdcommd,
1028*0Sstevel@tonic-gate 		 *   so the return step must not issue the DRAIN ALL command
1029*0Sstevel@tonic-gate 		 *   until metaset/metadb have finished or metaset may issue
1030*0Sstevel@tonic-gate 		 *   a RESUME ALL after this return reconfig step has issued
1031*0Sstevel@tonic-gate 		 *   the DRAIN ALL command.
1032*0Sstevel@tonic-gate 		 *   After this reconfig step has issued the DRAIN_ALL and
1033*0Sstevel@tonic-gate 		 *   released the local set lock, metaset/metadb will fail
1034*0Sstevel@tonic-gate 		 *   when attempting to contact the rpc.mdcommd and will
1035*0Sstevel@tonic-gate 		 *   terminate without making any configuration changes.
1036*0Sstevel@tonic-gate 		 *   The DRAIN ALL command will keep all other meta* commands
1037*0Sstevel@tonic-gate 		 *   from running during the reconfig cycle (these commands
1038*0Sstevel@tonic-gate 		 *   will wait until the rpc.mdcommd is resumed) since the
1039*0Sstevel@tonic-gate 		 *   reconfig cycle may be changing the diskset configuration.
1040*0Sstevel@tonic-gate 		 */
1041*0Sstevel@tonic-gate 
1042*0Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
1043*0Sstevel@tonic-gate 		if (argc < 1)
1044*0Sstevel@tonic-gate 			usage(sp, 1);
1045*0Sstevel@tonic-gate 
1046*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Return step: %s"),
1047*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
1048*0Sstevel@tonic-gate 
1049*0Sstevel@tonic-gate 		/*
1050*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
1051*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
1052*0Sstevel@tonic-gate 		 * the local set cannot be started.
1053*0Sstevel@tonic-gate 		 */
1054*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
1055*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
1056*0Sstevel@tonic-gate 		}
1057*0Sstevel@tonic-gate 
1058*0Sstevel@tonic-gate 		/*
1059*0Sstevel@tonic-gate 		 * Suspend any mirror resyncs that are in progress. This
1060*0Sstevel@tonic-gate 		 * stops unnecessary timeouts.
1061*0Sstevel@tonic-gate 		 */
1062*0Sstevel@tonic-gate 		meta_mirror_resync_block_all();
1063*0Sstevel@tonic-gate 
1064*0Sstevel@tonic-gate 		if (meta_lock(local_sp, TRUE, ep) != 0) {
1065*0Sstevel@tonic-gate 			mde_perror(ep, "");
1066*0Sstevel@tonic-gate 			md_exit(local_sp, 1);
1067*0Sstevel@tonic-gate 		}
1068*0Sstevel@tonic-gate 
1069*0Sstevel@tonic-gate 		/*
1070*0Sstevel@tonic-gate 		 * All metaset and metadb commands on this node have now
1071*0Sstevel@tonic-gate 		 * terminated gracefully.  Now, issue a drain all to
1072*0Sstevel@tonic-gate 		 * the rpc.mdcommd.  Any meta command issued after the
1073*0Sstevel@tonic-gate 		 * drain all will either spin sending the command to the
1074*0Sstevel@tonic-gate 		 * master until after the reconfig cycle has finished OR
1075*0Sstevel@tonic-gate 		 * will terminate gracefully (metaset/metadb).
1076*0Sstevel@tonic-gate 		 */
1077*0Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
1078*0Sstevel@tonic-gate 			mde_perror(ep, "");
1079*0Sstevel@tonic-gate 			md_exit(sp, 1);
1080*0Sstevel@tonic-gate 		}
1081*0Sstevel@tonic-gate 
1082*0Sstevel@tonic-gate 		/* start walking through all possible disksets */
1083*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
1084*0Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1085*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
1086*0Sstevel@tonic-gate 					/* No set for this setno - continue */
1087*0Sstevel@tonic-gate 					mdclrerror(ep);
1088*0Sstevel@tonic-gate 					continue;
1089*0Sstevel@tonic-gate 				} else {
1090*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
1091*0Sstevel@tonic-gate 					    "get set %d information"), setno);
1092*0Sstevel@tonic-gate 					md_exit(sp, 1);
1093*0Sstevel@tonic-gate 				}
1094*0Sstevel@tonic-gate 			}
1095*0Sstevel@tonic-gate 
1096*0Sstevel@tonic-gate 			/* only check multi-node disksets */
1097*0Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
1098*0Sstevel@tonic-gate 				mdclrerror(ep);
1099*0Sstevel@tonic-gate 				continue;
1100*0Sstevel@tonic-gate 			}
1101*0Sstevel@tonic-gate 
1102*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Return - block parse "
1103*0Sstevel@tonic-gate 			    "messages for set %s: %s"), sp->setname,
1104*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1105*0Sstevel@tonic-gate 
1106*0Sstevel@tonic-gate 			/*
1107*0Sstevel@tonic-gate 			 * Mddb parse messages are sent amongst the nodes
1108*0Sstevel@tonic-gate 			 * in a diskset whenever the locator block or
1109*0Sstevel@tonic-gate 			 * locator names structure has been changed.
1110*0Sstevel@tonic-gate 			 * A locator block change could occur as a result
1111*0Sstevel@tonic-gate 			 * of a disk failure during the reconfig cycle,
1112*0Sstevel@tonic-gate 			 * so block the mddb parse messages while the
1113*0Sstevel@tonic-gate 			 * rpc.mdcommd is suspended during the reconfig cycle.
1114*0Sstevel@tonic-gate 			 */
1115*0Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1116*0Sstevel@tonic-gate 				(void) memset(&mbp, 0, sizeof (mbp));
1117*0Sstevel@tonic-gate 				mbp.c_setno = setno;
1118*0Sstevel@tonic-gate 				mbp.c_blk_flags = MDDB_BLOCK_PARSE;
1119*0Sstevel@tonic-gate 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1120*0Sstevel@tonic-gate 				    &mbp.c_mde, NULL)) {
1121*0Sstevel@tonic-gate 					mdstealerror(ep, &mbp.c_mde);
1122*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Could not "
1123*0Sstevel@tonic-gate 					    "block set %s"), sp->setname);
1124*0Sstevel@tonic-gate 					md_exit(sp, 1);
1125*0Sstevel@tonic-gate 				}
1126*0Sstevel@tonic-gate 			}
1127*0Sstevel@tonic-gate 
1128*0Sstevel@tonic-gate 			/* suspend commd and spin waiting for drain */
1129*0Sstevel@tonic-gate 			while ((ret_val = mdmn_suspend(setno,
1130*0Sstevel@tonic-gate 			    MD_COMM_ALL_CLASSES)) ==
1131*0Sstevel@tonic-gate 			    MDE_DS_COMMDCTL_SUSPEND_NYD) {
1132*0Sstevel@tonic-gate 				sleep(1);
1133*0Sstevel@tonic-gate 			}
1134*0Sstevel@tonic-gate 
1135*0Sstevel@tonic-gate 			if (ret_val) {
1136*0Sstevel@tonic-gate 				md_eprintf(gettext("Could not suspend "
1137*0Sstevel@tonic-gate 				    "rpc.mdcommd for set %s\n"), sp->setname);
1138*0Sstevel@tonic-gate 				md_exit(sp, 1);
1139*0Sstevel@tonic-gate 			}
1140*0Sstevel@tonic-gate 		}
1141*0Sstevel@tonic-gate 		/*
1142*0Sstevel@tonic-gate 		 * Resume all I/Os for this node for all MN sets in
1143*0Sstevel@tonic-gate 		 * case master node had suspended I/Os but panic'd
1144*0Sstevel@tonic-gate 		 * before resuming I/Os.  In case of failure, exit
1145*0Sstevel@tonic-gate 		 * with a 1 since unable to resume I/Os on this node.
1146*0Sstevel@tonic-gate 		 */
1147*0Sstevel@tonic-gate 		if (clnt_mn_susp_res_io(mynode(), 0, MN_RES_IO, ep)) {
1148*0Sstevel@tonic-gate 			mde_perror(ep, gettext(
1149*0Sstevel@tonic-gate 			    "Unable to resume I/O on node %s for all sets"),
1150*0Sstevel@tonic-gate 			    mynode());
1151*0Sstevel@tonic-gate 			md_exit(sp, 1);
1152*0Sstevel@tonic-gate 		}
1153*0Sstevel@tonic-gate 
1154*0Sstevel@tonic-gate 
1155*0Sstevel@tonic-gate 		/*
1156*0Sstevel@tonic-gate 		 * Can now unlock local set lock.  New metaset/metadb
1157*0Sstevel@tonic-gate 		 * commands are now held off using drain all.
1158*0Sstevel@tonic-gate 		 */
1159*0Sstevel@tonic-gate 		(void) meta_unlock(local_sp, ep);
1160*0Sstevel@tonic-gate 
1161*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Return step completed: %s"),
1162*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
1163*0Sstevel@tonic-gate 
1164*0Sstevel@tonic-gate 		break;
1165*0Sstevel@tonic-gate 
1166*0Sstevel@tonic-gate 	case MC_STEP1:
1167*0Sstevel@tonic-gate 		/*
1168*0Sstevel@tonic-gate 		 * Step 1
1169*0Sstevel@tonic-gate 		 *
1170*0Sstevel@tonic-gate 		 * - Populate nodelist file if we are on clustering
1171*0Sstevel@tonic-gate 		 *   and pick a master node for each MN diskset.
1172*0Sstevel@tonic-gate 		 */
1173*0Sstevel@tonic-gate 
1174*0Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
1175*0Sstevel@tonic-gate 		if (argc < 1)
1176*0Sstevel@tonic-gate 			usage(sp, 1);
1177*0Sstevel@tonic-gate 
1178*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step1: %s"),
1179*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
1180*0Sstevel@tonic-gate 
1181*0Sstevel@tonic-gate 		/* Always write nodelist file even if no local set exists */
1182*0Sstevel@tonic-gate 		if (clust == SDSSC_OKAY) {
1183*0Sstevel@tonic-gate 			/* skip to the nodelist args */
1184*0Sstevel@tonic-gate 			if (meta_write_nodelist(argc, argv, ep) != 0) {
1185*0Sstevel@tonic-gate 				mde_perror(ep, gettext(
1186*0Sstevel@tonic-gate 				    "Could not populate nodelist file"));
1187*0Sstevel@tonic-gate 				md_exit(sp, 1);
1188*0Sstevel@tonic-gate 			}
1189*0Sstevel@tonic-gate 		}
1190*0Sstevel@tonic-gate 
1191*0Sstevel@tonic-gate 		/*
1192*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
1193*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
1194*0Sstevel@tonic-gate 		 * the local set cannot be started.
1195*0Sstevel@tonic-gate 		 */
1196*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
1197*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
1198*0Sstevel@tonic-gate 		}
1199*0Sstevel@tonic-gate 
1200*0Sstevel@tonic-gate 		/*
1201*0Sstevel@tonic-gate 		 * At this point, all meta* commands are blocked across
1202*0Sstevel@tonic-gate 		 * all disksets since the master rpc.mdcommd has drained or
1203*0Sstevel@tonic-gate 		 * the master node has died.
1204*0Sstevel@tonic-gate 		 * If a metaset or metadb command had been in progress
1205*0Sstevel@tonic-gate 		 * at the start of the reconfig cycle, this command has
1206*0Sstevel@tonic-gate 		 * either completed or it has been terminated due to
1207*0Sstevel@tonic-gate 		 * the death of the master node.
1208*0Sstevel@tonic-gate 		 *
1209*0Sstevel@tonic-gate 		 * This means that it is now ok to remove any
1210*0Sstevel@tonic-gate 		 * outstanding clnt_locks associated with multinode
1211*0Sstevel@tonic-gate 		 * disksets on this node due to a node panic during
1212*0Sstevel@tonic-gate 		 * a metaset operation.  This allows the routines that
1213*0Sstevel@tonic-gate 		 * choose the master to use rpc.metad to determine the
1214*0Sstevel@tonic-gate 		 * master of the diskset.
1215*0Sstevel@tonic-gate 		 */
1216*0Sstevel@tonic-gate 		if (clnt_clr_mnsetlock(mynode(), ep) != 0) {
1217*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1218*0Sstevel@tonic-gate 			    "clear locks failed %s"),
1219*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1220*0Sstevel@tonic-gate 			md_exit(local_sp, 1);
1221*0Sstevel@tonic-gate 		}
1222*0Sstevel@tonic-gate 
1223*0Sstevel@tonic-gate 		/*
1224*0Sstevel@tonic-gate 		 * Call reconfig_choose_master to choose a master for
1225*0Sstevel@tonic-gate 		 * each MN diskset, update the nodelist for each diskset
1226*0Sstevel@tonic-gate 		 * given the member information and send a reinit message
1227*0Sstevel@tonic-gate 		 * to rpc.mdcommd to reload the nodelist.
1228*0Sstevel@tonic-gate 		 */
1229*0Sstevel@tonic-gate 		rval = meta_reconfig_choose_master(ep);
1230*0Sstevel@tonic-gate 		if (rval == 205) {
1231*0Sstevel@tonic-gate 			/*
1232*0Sstevel@tonic-gate 			 * NOTE: Should issue call to reboot remote host that
1233*0Sstevel@tonic-gate 			 * is causing the RPC failure.  Clustering to
1234*0Sstevel@tonic-gate 			 * provide interface in the future.  This should
1235*0Sstevel@tonic-gate 			 * stop a never-ending set of 205 reconfig cycles.
1236*0Sstevel@tonic-gate 			 * Remote host causing failure is stored in
1237*0Sstevel@tonic-gate 			 * ep->host if ep is an RPC error.
1238*0Sstevel@tonic-gate 			 * if (mdanyrpcerror(ep))
1239*0Sstevel@tonic-gate 			 * 	reboot (ep->host);
1240*0Sstevel@tonic-gate 			 */
1241*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG2, gettext("Step1 aborted:"
1242*0Sstevel@tonic-gate 			    "choose master failure of 205 %s"),
1243*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1244*0Sstevel@tonic-gate 			md_exit(local_sp, 205);
1245*0Sstevel@tonic-gate 		} else if (rval != 0) {
1246*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG2, gettext("Step1 failure: "
1247*0Sstevel@tonic-gate 			    "choose master failure %s"),
1248*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1249*0Sstevel@tonic-gate 			md_exit(local_sp, 1);
1250*0Sstevel@tonic-gate 		}
1251*0Sstevel@tonic-gate 
1252*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step1 completed: %s"),
1253*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
1254*0Sstevel@tonic-gate 
1255*0Sstevel@tonic-gate 		md_exit(local_sp, rval);
1256*0Sstevel@tonic-gate 		break;
1257*0Sstevel@tonic-gate 
1258*0Sstevel@tonic-gate 	case MC_STEP2:
1259*0Sstevel@tonic-gate 		/*
1260*0Sstevel@tonic-gate 		 * Step 2
1261*0Sstevel@tonic-gate 		 *
1262*0Sstevel@tonic-gate 		 * In Step 2, each node walks the list of disksets.  If a
1263*0Sstevel@tonic-gate 		 * node is a master of a MN diskset, it synchronizes
1264*0Sstevel@tonic-gate 		 * the local set USER records for that diskset.
1265*0Sstevel@tonic-gate 		 *
1266*0Sstevel@tonic-gate 		 * If disks exist in the diskset and there is a joined
1267*0Sstevel@tonic-gate 		 * (owner) node in the diskset, the master will also:
1268*0Sstevel@tonic-gate 		 *	- synchronize the diskset mddbs to the master
1269*0Sstevel@tonic-gate 		 *	- play the change log
1270*0Sstevel@tonic-gate 		 *
1271*0Sstevel@tonic-gate 		 * The master node will now attempt to join any unjoined
1272*0Sstevel@tonic-gate 		 * nodes that are currently members in the membership list.
1273*0Sstevel@tonic-gate 		 */
1274*0Sstevel@tonic-gate 
1275*0Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
1276*0Sstevel@tonic-gate 		if (argc < 1)
1277*0Sstevel@tonic-gate 			usage(sp, 1);
1278*0Sstevel@tonic-gate 
1279*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step2: %s"),
1280*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
1281*0Sstevel@tonic-gate 
1282*0Sstevel@tonic-gate 		/*
1283*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
1284*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
1285*0Sstevel@tonic-gate 		 * the local set cannot be started.
1286*0Sstevel@tonic-gate 		 */
1287*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
1288*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
1289*0Sstevel@tonic-gate 		}
1290*0Sstevel@tonic-gate 
1291*0Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
1292*0Sstevel@tonic-gate 			mde_perror(ep, "");
1293*0Sstevel@tonic-gate 			md_exit(local_sp, 1);
1294*0Sstevel@tonic-gate 		}
1295*0Sstevel@tonic-gate 
1296*0Sstevel@tonic-gate 		/* start walking through all possible disksets */
1297*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
1298*0Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1299*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
1300*0Sstevel@tonic-gate 					/* No set for this setno - continue */
1301*0Sstevel@tonic-gate 					mdclrerror(ep);
1302*0Sstevel@tonic-gate 					continue;
1303*0Sstevel@tonic-gate 				} else if (mdanyrpcerror(ep)) {
1304*0Sstevel@tonic-gate 					/* Fail on RPC failure to self */
1305*0Sstevel@tonic-gate 					mde_perror(ep, gettext(
1306*0Sstevel@tonic-gate 					    "Unable to get information for "
1307*0Sstevel@tonic-gate 					    "set number %d"), setno);
1308*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1309*0Sstevel@tonic-gate 				} else {
1310*0Sstevel@tonic-gate 					mde_perror(ep, gettext(
1311*0Sstevel@tonic-gate 					    "Unable to get information for "
1312*0Sstevel@tonic-gate 					    "set number %d"), setno);
1313*0Sstevel@tonic-gate 					mdclrerror(ep);
1314*0Sstevel@tonic-gate 					continue;
1315*0Sstevel@tonic-gate 				}
1316*0Sstevel@tonic-gate 			}
1317*0Sstevel@tonic-gate 
1318*0Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1319*0Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
1320*0Sstevel@tonic-gate 					/* Fail on RPC failure to self */
1321*0Sstevel@tonic-gate 					mde_perror(ep, gettext(
1322*0Sstevel@tonic-gate 					    "Unable to get information for "
1323*0Sstevel@tonic-gate 					    "set number %d"), setno);
1324*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1325*0Sstevel@tonic-gate 				}
1326*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
1327*0Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
1328*0Sstevel@tonic-gate 				mdclrerror(ep);
1329*0Sstevel@tonic-gate 				continue;
1330*0Sstevel@tonic-gate 			}
1331*0Sstevel@tonic-gate 
1332*0Sstevel@tonic-gate 			/* Only check MN disksets */
1333*0Sstevel@tonic-gate 			if (!(MD_MNSET_DESC(sd))) {
1334*0Sstevel@tonic-gate 				continue;
1335*0Sstevel@tonic-gate 			}
1336*0Sstevel@tonic-gate 
1337*0Sstevel@tonic-gate 			/* All actions in step 2 are driven by master */
1338*0Sstevel@tonic-gate 			if (!(sd->sd_mn_am_i_master)) {
1339*0Sstevel@tonic-gate 				continue;
1340*0Sstevel@tonic-gate 			}
1341*0Sstevel@tonic-gate 
1342*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - begin record "
1343*0Sstevel@tonic-gate 			    "synchronization for set %s: %s"), sp->setname,
1344*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1345*0Sstevel@tonic-gate 
1346*0Sstevel@tonic-gate 			/*
1347*0Sstevel@tonic-gate 			 * Synchronize the USER records in the local mddbs
1348*0Sstevel@tonic-gate 			 * for hosts that are members.  The USER records
1349*0Sstevel@tonic-gate 			 * contain set, drive and host information.
1350*0Sstevel@tonic-gate 			 */
1351*0Sstevel@tonic-gate 			rval = meta_mnsync_user_records(sp, ep);
1352*0Sstevel@tonic-gate 			if (rval != 0) {
1353*0Sstevel@tonic-gate 				mde_perror(ep, gettext(
1354*0Sstevel@tonic-gate 				    "Synchronization of user records "
1355*0Sstevel@tonic-gate 				    "in set %s failed\n"), sp->setname);
1356*0Sstevel@tonic-gate 				if (rval == 205) {
1357*0Sstevel@tonic-gate 					/*
1358*0Sstevel@tonic-gate 					 * NOTE: Should issue call to reboot
1359*0Sstevel@tonic-gate 					 * remote host that is causing the RPC
1360*0Sstevel@tonic-gate 					 * failure.  Clustering to provide
1361*0Sstevel@tonic-gate 					 * interface in the future.  This
1362*0Sstevel@tonic-gate 					 * should stop a never-ending set of
1363*0Sstevel@tonic-gate 					 * 205 reconfig cycles.
1364*0Sstevel@tonic-gate 					 * Remote host causing failure is
1365*0Sstevel@tonic-gate 					 * stored in ep->host if ep is an
1366*0Sstevel@tonic-gate 					 * RPC error.
1367*0Sstevel@tonic-gate 					 * if (mdanyrpcerror(ep))
1368*0Sstevel@tonic-gate 					 * 	reboot (ep->host);
1369*0Sstevel@tonic-gate 					 */
1370*0Sstevel@tonic-gate 					md_exit(local_sp, 205);
1371*0Sstevel@tonic-gate 				} else {
1372*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1373*0Sstevel@tonic-gate 				}
1374*0Sstevel@tonic-gate 			}
1375*0Sstevel@tonic-gate 
1376*0Sstevel@tonic-gate 			/* Reget sd since sync_user_recs may have flushed it */
1377*0Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1378*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
1379*0Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
1380*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1381*0Sstevel@tonic-gate 			}
1382*0Sstevel@tonic-gate 
1383*0Sstevel@tonic-gate 			dd = metaget_drivedesc(sp,
1384*0Sstevel@tonic-gate 			    (MD_BASICNAME_OK | PRINT_FAST), ep);
1385*0Sstevel@tonic-gate 			if (! mdisok(ep)) {
1386*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
1387*0Sstevel@tonic-gate 				    "%s drive information"), sp->setname);
1388*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1389*0Sstevel@tonic-gate 			}
1390*0Sstevel@tonic-gate 
1391*0Sstevel@tonic-gate 			/*
1392*0Sstevel@tonic-gate 			 * No drives in set, continue to next set.
1393*0Sstevel@tonic-gate 			 */
1394*0Sstevel@tonic-gate 			if (dd == NULL) {
1395*0Sstevel@tonic-gate 				/* Done with this set */
1396*0Sstevel@tonic-gate 				continue;
1397*0Sstevel@tonic-gate 			}
1398*0Sstevel@tonic-gate 
1399*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - local set user "
1400*0Sstevel@tonic-gate 			    "records completed for set %s: %s"), sp->setname,
1401*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1402*0Sstevel@tonic-gate 
1403*0Sstevel@tonic-gate 			/*
1404*0Sstevel@tonic-gate 			 * Synchronize the diskset mddbs for hosts
1405*0Sstevel@tonic-gate 			 * that are members.  This may involve
1406*0Sstevel@tonic-gate 			 * playing the changelog and writing out
1407*0Sstevel@tonic-gate 			 * to the diskset mddbs.
1408*0Sstevel@tonic-gate 			 */
1409*0Sstevel@tonic-gate 			rval = meta_mnsync_diskset_mddbs(sp, ep);
1410*0Sstevel@tonic-gate 			if (rval != 0) {
1411*0Sstevel@tonic-gate 				mde_perror(ep, gettext(
1412*0Sstevel@tonic-gate 				    "Synchronization of diskset mddbs "
1413*0Sstevel@tonic-gate 				    "in set %s failed\n"), sp->setname);
1414*0Sstevel@tonic-gate 				meta_mc_log(MC_LOG3, gettext("Step2 - diskset "
1415*0Sstevel@tonic-gate 				    "mddb synchronization failed for "
1416*0Sstevel@tonic-gate 				    "set %s: %s"), sp->setname,
1417*0Sstevel@tonic-gate 				    meta_print_hrtime(gethrtime() -
1418*0Sstevel@tonic-gate 				    start_time));
1419*0Sstevel@tonic-gate 				if (rval == 205) {
1420*0Sstevel@tonic-gate 					/*
1421*0Sstevel@tonic-gate 					 * NOTE: Should issue call to reboot
1422*0Sstevel@tonic-gate 					 * remote host that is causing the RPC
1423*0Sstevel@tonic-gate 					 * failure.  Clustering to provide
1424*0Sstevel@tonic-gate 					 * interface in the future.  This
1425*0Sstevel@tonic-gate 					 * should stop a never-ending set of
1426*0Sstevel@tonic-gate 					 * 205 reconfig cycles.
1427*0Sstevel@tonic-gate 					 * Remote host causing failure is
1428*0Sstevel@tonic-gate 					 * stored in ep->host if ep is an
1429*0Sstevel@tonic-gate 					 * RPC error.
1430*0Sstevel@tonic-gate 					 * if (mdanyrpcerror(ep))
1431*0Sstevel@tonic-gate 					 * 	reboot (ep->host);
1432*0Sstevel@tonic-gate 					 */
1433*0Sstevel@tonic-gate 					md_exit(local_sp, 205);
1434*0Sstevel@tonic-gate 				} else if (rval == 1) {
1435*0Sstevel@tonic-gate 					continue;
1436*0Sstevel@tonic-gate 				} else {
1437*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1438*0Sstevel@tonic-gate 				}
1439*0Sstevel@tonic-gate 			}
1440*0Sstevel@tonic-gate 
1441*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - diskset mddb "
1442*0Sstevel@tonic-gate 			    "synchronization completed for set %s: %s"),
1443*0Sstevel@tonic-gate 			    sp->setname,
1444*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1445*0Sstevel@tonic-gate 
1446*0Sstevel@tonic-gate 			/* Join the starting nodes to the diskset */
1447*0Sstevel@tonic-gate 			rval = meta_mnjoin_all(sp, ep);
1448*0Sstevel@tonic-gate 			if (rval != 0) {
1449*0Sstevel@tonic-gate 				mde_perror(ep, gettext(
1450*0Sstevel@tonic-gate 				    "Join of non-owner (starting) nodes "
1451*0Sstevel@tonic-gate 				    "in set %s failed\n"), sp->setname);
1452*0Sstevel@tonic-gate 				meta_mc_log(MC_LOG3, gettext("Step2 - non owner"
1453*0Sstevel@tonic-gate 				    "nodes joined for set %s: %s"),
1454*0Sstevel@tonic-gate 				    sp->setname,
1455*0Sstevel@tonic-gate 				    meta_print_hrtime(gethrtime() -
1456*0Sstevel@tonic-gate 				    start_time));
1457*0Sstevel@tonic-gate 				if (rval == 205) {
1458*0Sstevel@tonic-gate 					/*
1459*0Sstevel@tonic-gate 					 * NOTE: Should issue call to reboot
1460*0Sstevel@tonic-gate 					 * remote host that is causing the RPC
1461*0Sstevel@tonic-gate 					 * failure.  Clustering to provide
1462*0Sstevel@tonic-gate 					 * interface in the future.  This
1463*0Sstevel@tonic-gate 					 * should stop a never-ending set of
1464*0Sstevel@tonic-gate 					 * 205 reconfig cycles.
1465*0Sstevel@tonic-gate 					 * Remote host causing failure is
1466*0Sstevel@tonic-gate 					 * stored in ep->host if ep is an
1467*0Sstevel@tonic-gate 					 * RPC error.
1468*0Sstevel@tonic-gate 					 * if (mdanyrpcerror(ep))
1469*0Sstevel@tonic-gate 					 * 	reboot (ep->host);
1470*0Sstevel@tonic-gate 					 */
1471*0Sstevel@tonic-gate 					md_exit(local_sp, 205);
1472*0Sstevel@tonic-gate 				} else {
1473*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1474*0Sstevel@tonic-gate 				}
1475*0Sstevel@tonic-gate 			}
1476*0Sstevel@tonic-gate 
1477*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step2 - non owner nodes "
1478*0Sstevel@tonic-gate 			    "joined for set %s: %s"), sp->setname,
1479*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1480*0Sstevel@tonic-gate 
1481*0Sstevel@tonic-gate 		}
1482*0Sstevel@tonic-gate 
1483*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step2 completed: %s"),
1484*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
1485*0Sstevel@tonic-gate 
1486*0Sstevel@tonic-gate 		break;
1487*0Sstevel@tonic-gate 
1488*0Sstevel@tonic-gate 	case MC_STEP3:
1489*0Sstevel@tonic-gate 		/*
1490*0Sstevel@tonic-gate 		 * Step 3
1491*0Sstevel@tonic-gate 		 *
1492*0Sstevel@tonic-gate 		 * For all multinode sets do,
1493*0Sstevel@tonic-gate 		 * - Reinitialise rpc.mdcommd
1494*0Sstevel@tonic-gate 		 * - Reset mirror owners to null if the current owner is
1495*0Sstevel@tonic-gate 		 *   no longer in the membership list
1496*0Sstevel@tonic-gate 		 */
1497*0Sstevel@tonic-gate 
1498*0Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
1499*0Sstevel@tonic-gate 		if (argc < 1)
1500*0Sstevel@tonic-gate 			usage(sp, 1);
1501*0Sstevel@tonic-gate 
1502*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step3: %s"),
1503*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
1504*0Sstevel@tonic-gate 
1505*0Sstevel@tonic-gate 		/*
1506*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
1507*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
1508*0Sstevel@tonic-gate 		 * the local set cannot be started.
1509*0Sstevel@tonic-gate 		 */
1510*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
1511*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
1512*0Sstevel@tonic-gate 		}
1513*0Sstevel@tonic-gate 
1514*0Sstevel@tonic-gate 		/*
1515*0Sstevel@tonic-gate 		 * walk through all sets on this node which could include:
1516*0Sstevel@tonic-gate 		 *	- MN disksets
1517*0Sstevel@tonic-gate 		 *	- traditional disksets
1518*0Sstevel@tonic-gate 		 *	- non-existent disksets
1519*0Sstevel@tonic-gate 		 * start mirror resync for all MN sets
1520*0Sstevel@tonic-gate 		 */
1521*0Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
1522*0Sstevel@tonic-gate 			mde_perror(ep, "");
1523*0Sstevel@tonic-gate 			md_exit(local_sp, 1);
1524*0Sstevel@tonic-gate 		}
1525*0Sstevel@tonic-gate 
1526*0Sstevel@tonic-gate 		/* start walking through all possible disksets */
1527*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
1528*0Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1529*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
1530*0Sstevel@tonic-gate 					/* No set for this setno - continue */
1531*0Sstevel@tonic-gate 					mdclrerror(ep);
1532*0Sstevel@tonic-gate 					continue;
1533*0Sstevel@tonic-gate 				} else {
1534*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
1535*0Sstevel@tonic-gate 					    "get set %d information"), setno);
1536*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1537*0Sstevel@tonic-gate 				}
1538*0Sstevel@tonic-gate 			}
1539*0Sstevel@tonic-gate 
1540*0Sstevel@tonic-gate 			/* only check multi-node disksets */
1541*0Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
1542*0Sstevel@tonic-gate 				mdclrerror(ep);
1543*0Sstevel@tonic-gate 				continue;
1544*0Sstevel@tonic-gate 			}
1545*0Sstevel@tonic-gate 
1546*0Sstevel@tonic-gate 			if (meta_lock(sp, TRUE, ep) != 0) {
1547*0Sstevel@tonic-gate 				mde_perror(ep, "");
1548*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1549*0Sstevel@tonic-gate 			}
1550*0Sstevel@tonic-gate 
1551*0Sstevel@tonic-gate 			/* If this node isn't joined to set, do nothing */
1552*0Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1553*0Sstevel@tonic-gate 				if (!mdisok(ep)) {
1554*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Could "
1555*0Sstevel@tonic-gate 					    "not get set %s ownership"),
1556*0Sstevel@tonic-gate 					    sp->setname);
1557*0Sstevel@tonic-gate 					md_exit(sp, 1);
1558*0Sstevel@tonic-gate 				}
1559*0Sstevel@tonic-gate 				mdclrerror(ep);
1560*0Sstevel@tonic-gate 				meta_unlock(sp, ep);
1561*0Sstevel@tonic-gate 				continue;
1562*0Sstevel@tonic-gate 			}
1563*0Sstevel@tonic-gate 
1564*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step3 - begin "
1565*0Sstevel@tonic-gate 			    "re-initialising rpc.mdcommd and resetting mirror "
1566*0Sstevel@tonic-gate 			    "owners for set %s: %s"), sp->setname,
1567*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1568*0Sstevel@tonic-gate 
1569*0Sstevel@tonic-gate 			/* reinitialise rpc.mdcommd with new nodelist */
1570*0Sstevel@tonic-gate 			if (mdmn_reinit_set(setno)) {
1571*0Sstevel@tonic-gate 				md_eprintf(gettext(
1572*0Sstevel@tonic-gate 				    "Could not re-initialise rpc.mdcommd for "
1573*0Sstevel@tonic-gate 				    "set %s\n"), sp->setname);
1574*0Sstevel@tonic-gate 				md_exit(sp, 1);
1575*0Sstevel@tonic-gate 			}
1576*0Sstevel@tonic-gate 
1577*0Sstevel@tonic-gate 			(void) memset(&cfg, 0, sizeof (cfg));
1578*0Sstevel@tonic-gate 			cfg.c_id = 0;
1579*0Sstevel@tonic-gate 			cfg.c_setno = sp->setno;
1580*0Sstevel@tonic-gate 			if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1581*0Sstevel@tonic-gate 			    NULL) != 0) {
1582*0Sstevel@tonic-gate 				mdstealerror(ep, &cfg.c_mde);
1583*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Could "
1584*0Sstevel@tonic-gate 				    "not get set %s information"),
1585*0Sstevel@tonic-gate 				    sp->setname);
1586*0Sstevel@tonic-gate 				md_exit(sp, 1);
1587*0Sstevel@tonic-gate 			}
1588*0Sstevel@tonic-gate 
1589*0Sstevel@tonic-gate 			/* Don't do anything else if set is stale */
1590*0Sstevel@tonic-gate 			if (cfg.c_flags & MDDB_C_STALE) {
1591*0Sstevel@tonic-gate 				meta_unlock(sp, ep);
1592*0Sstevel@tonic-gate 				mdclrerror(ep);
1593*0Sstevel@tonic-gate 				continue;
1594*0Sstevel@tonic-gate 			}
1595*0Sstevel@tonic-gate 
1596*0Sstevel@tonic-gate 			/* reset mirror owners */
1597*0Sstevel@tonic-gate 			if (reset_state(RESET_OWNER, sp, MD_MIRROR, ep) == -1) {
1598*0Sstevel@tonic-gate 				md_exit(sp, 1);
1599*0Sstevel@tonic-gate 			}
1600*0Sstevel@tonic-gate 
1601*0Sstevel@tonic-gate 			meta_unlock(sp, ep);
1602*0Sstevel@tonic-gate 
1603*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step3 - rpc.mdcommd "
1604*0Sstevel@tonic-gate 			    "re-initialised and mirror owners reset for "
1605*0Sstevel@tonic-gate 			    "set %s: %s"), sp->setname,
1606*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1607*0Sstevel@tonic-gate 		}
1608*0Sstevel@tonic-gate 
1609*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step3 completed: %s"),
1610*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
1611*0Sstevel@tonic-gate 
1612*0Sstevel@tonic-gate 		break;
1613*0Sstevel@tonic-gate 
1614*0Sstevel@tonic-gate 	case MC_STEP4:
1615*0Sstevel@tonic-gate 		/*
1616*0Sstevel@tonic-gate 		 * Step 4
1617*0Sstevel@tonic-gate 		 *
1618*0Sstevel@tonic-gate 		 * For all multinode sets do:
1619*0Sstevel@tonic-gate 		 * - Resume the rpc.mdcommd messages.  Must resume all
1620*0Sstevel@tonic-gate 		 *	sets before issuing I/O to any set since an error
1621*0Sstevel@tonic-gate 		 * 	encountered in a commd suspended set could be
1622*0Sstevel@tonic-gate 		 *	blocked waiting for commd in another set to resume.
1623*0Sstevel@tonic-gate 		 *	(This happens since the daemon queues service
1624*0Sstevel@tonic-gate 		 *	all sets).  An open of a soft partition causes
1625*0Sstevel@tonic-gate 		 *	a read of the watermarks during the open.
1626*0Sstevel@tonic-gate 		 * - If set is non-writable (not an owner or STALE), then
1627*0Sstevel@tonic-gate 		 *	continue to next set.
1628*0Sstevel@tonic-gate 		 *
1629*0Sstevel@tonic-gate 		 * For all multinode sets do,
1630*0Sstevel@tonic-gate 		 * - Reset ABR states for all mirrors, ie clear ABR if not
1631*0Sstevel@tonic-gate 		 *	open on any node.
1632*0Sstevel@tonic-gate 		 * - Reset ABR states for all soft partitions, ie clear ABR if
1633*0Sstevel@tonic-gate 		 *	not open on any node.
1634*0Sstevel@tonic-gate 		 * - For all slave nodes that have entered through the start
1635*0Sstevel@tonic-gate 		 *	step, update the ABR state to that of the master and
1636*0Sstevel@tonic-gate 		 *	get the submirror state from the master
1637*0Sstevel@tonic-gate 		 * - meta_lock set
1638*0Sstevel@tonic-gate 		 * - Resync all mirrors
1639*0Sstevel@tonic-gate 		 * - unlock meta_lock for this set.
1640*0Sstevel@tonic-gate 		 * - Choose a new owner for any orphaned resyncs
1641*0Sstevel@tonic-gate 		 *
1642*0Sstevel@tonic-gate 		 * There is one potential issue here when concurrently
1643*0Sstevel@tonic-gate 		 * resetting and updating the ABR state. If the master has ABR
1644*0Sstevel@tonic-gate 		 * set, but should no longer have because the only node that
1645*0Sstevel@tonic-gate 		 * had the metadevice open and had ABR set has panicked, the
1646*0Sstevel@tonic-gate 		 * master will send a message to all nodes to clear the ABR
1647*0Sstevel@tonic-gate 		 * state. Meanwhile any node that has come through the
1648*0Sstevel@tonic-gate 		 * start step will get tstate from the master and will update
1649*0Sstevel@tonic-gate 		 * ABR if it was set in tstate. So, we appear to have a problem
1650*0Sstevel@tonic-gate 		 * if the following sequence occurs:-
1651*0Sstevel@tonic-gate 		 * - The slave gets tstate with ABR set
1652*0Sstevel@tonic-gate 		 * - The master sends a message to clear ABR
1653*0Sstevel@tonic-gate 		 * - The slave updates ABR with the value it got from tstate.
1654*0Sstevel@tonic-gate 		 * We now have the master with ABR clear and the slave with ABR
1655*0Sstevel@tonic-gate 		 * set. Fortunately, having set ABR, the slave will close the
1656*0Sstevel@tonic-gate 		 * metadevice after setting ABR and as there are no nodes with
1657*0Sstevel@tonic-gate 		 * the device open, the close will send a message to clear ABR
1658*0Sstevel@tonic-gate 		 * on all nodes. So, the nodes will all have ABR unset.
1659*0Sstevel@tonic-gate 		 */
1660*0Sstevel@tonic-gate 
1661*0Sstevel@tonic-gate 		/* expect the nodelist to follow the step name */
1662*0Sstevel@tonic-gate 		if (argc < 1)
1663*0Sstevel@tonic-gate 			usage(sp, 1);
1664*0Sstevel@tonic-gate 
1665*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Starting Step4: %s"),
1666*0Sstevel@tonic-gate 		    meta_print_hrtime(0));
1667*0Sstevel@tonic-gate 
1668*0Sstevel@tonic-gate 		/*
1669*0Sstevel@tonic-gate 		 * Does local set exist? If not, exit with 0
1670*0Sstevel@tonic-gate 		 * since there's no reason to have this node panic if
1671*0Sstevel@tonic-gate 		 * the local set cannot be started.
1672*0Sstevel@tonic-gate 		 */
1673*0Sstevel@tonic-gate 		if ((local_sp = load_local_set(ep)) == NULL) {
1674*0Sstevel@tonic-gate 			md_exit(local_sp, 0);
1675*0Sstevel@tonic-gate 		}
1676*0Sstevel@tonic-gate 
1677*0Sstevel@tonic-gate 		/*
1678*0Sstevel@tonic-gate 		 * walk through all sets on this node which could include:
1679*0Sstevel@tonic-gate 		 *	- MN disksets
1680*0Sstevel@tonic-gate 		 *	- traditional disksets
1681*0Sstevel@tonic-gate 		 *	- non-existent disksets
1682*0Sstevel@tonic-gate 		 * start mirror resync for all MN sets
1683*0Sstevel@tonic-gate 		 */
1684*0Sstevel@tonic-gate 		if ((max_sets = get_max_sets(ep)) == 0) {
1685*0Sstevel@tonic-gate 			mde_perror(ep, "");
1686*0Sstevel@tonic-gate 			md_exit(local_sp, 1);
1687*0Sstevel@tonic-gate 		}
1688*0Sstevel@tonic-gate 
1689*0Sstevel@tonic-gate 		/* Clear set_info structure */
1690*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
1691*0Sstevel@tonic-gate 			set_info[setno] = 0;
1692*0Sstevel@tonic-gate 		}
1693*0Sstevel@tonic-gate 
1694*0Sstevel@tonic-gate 		/* start walking through all possible disksets */
1695*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
1696*0Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1697*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
1698*0Sstevel@tonic-gate 					/* No set for this setno - continue */
1699*0Sstevel@tonic-gate 					mdclrerror(ep);
1700*0Sstevel@tonic-gate 					continue;
1701*0Sstevel@tonic-gate 				} else {
1702*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
1703*0Sstevel@tonic-gate 					    "get set %d information"), setno);
1704*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1705*0Sstevel@tonic-gate 				}
1706*0Sstevel@tonic-gate 			}
1707*0Sstevel@tonic-gate 
1708*0Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1709*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
1710*0Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
1711*0Sstevel@tonic-gate 				mdclrerror(ep);
1712*0Sstevel@tonic-gate 				continue;
1713*0Sstevel@tonic-gate 			}
1714*0Sstevel@tonic-gate 
1715*0Sstevel@tonic-gate 			/* only check multi-node disksets */
1716*0Sstevel@tonic-gate 			if (!meta_is_mn_set(sp, ep)) {
1717*0Sstevel@tonic-gate 				mdclrerror(ep);
1718*0Sstevel@tonic-gate 				continue;
1719*0Sstevel@tonic-gate 			}
1720*0Sstevel@tonic-gate 
1721*0Sstevel@tonic-gate 			set_info[setno] |= SET_INFO_MN;
1722*0Sstevel@tonic-gate 
1723*0Sstevel@tonic-gate 			/*
1724*0Sstevel@tonic-gate 			 * If not an owner (all mddbs failed) or stale
1725*0Sstevel@tonic-gate 			 * (< 50% mddbs operational), then set is
1726*0Sstevel@tonic-gate 			 * non-writable so just resume commd and
1727*0Sstevel@tonic-gate 			 * unblock mddb messages.
1728*0Sstevel@tonic-gate 			 */
1729*0Sstevel@tonic-gate 			mdclrerror(ep);
1730*0Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) != MD_SETOWNER_YES) {
1731*0Sstevel@tonic-gate 				set_info[setno] |= SET_INFO_NO_WR;
1732*0Sstevel@tonic-gate 			}
1733*0Sstevel@tonic-gate 			if (!mdisok(ep)) {
1734*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Could "
1735*0Sstevel@tonic-gate 				    "not get set %s ownership"),
1736*0Sstevel@tonic-gate 				    sp->setname);
1737*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1738*0Sstevel@tonic-gate 			}
1739*0Sstevel@tonic-gate 			/* Set is owned - is it stale? */
1740*0Sstevel@tonic-gate 			if (!set_info[setno] & SET_INFO_NO_WR) {
1741*0Sstevel@tonic-gate 				(void) memset(&cfg, 0, sizeof (cfg));
1742*0Sstevel@tonic-gate 				cfg.c_id = 0;
1743*0Sstevel@tonic-gate 				cfg.c_setno = sp->setno;
1744*0Sstevel@tonic-gate 				if (metaioctl(MD_DB_GETDEV, &cfg, &cfg.c_mde,
1745*0Sstevel@tonic-gate 				    NULL) != 0) {
1746*0Sstevel@tonic-gate 					mdstealerror(ep, &cfg.c_mde);
1747*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Could "
1748*0Sstevel@tonic-gate 					    "not get set %s information"),
1749*0Sstevel@tonic-gate 					    sp->setname);
1750*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1751*0Sstevel@tonic-gate 				}
1752*0Sstevel@tonic-gate 				if (cfg.c_flags & MDDB_C_STALE) {
1753*0Sstevel@tonic-gate 					set_info[setno] |= SET_INFO_NO_WR;
1754*0Sstevel@tonic-gate 				}
1755*0Sstevel@tonic-gate 			}
1756*0Sstevel@tonic-gate 
1757*0Sstevel@tonic-gate 			/* resume rpc.mdcommd */
1758*0Sstevel@tonic-gate 			if (mdmn_resume(setno, MD_COMM_ALL_CLASSES, 0)) {
1759*0Sstevel@tonic-gate 				md_eprintf(gettext("Unable to resume "
1760*0Sstevel@tonic-gate 				    "rpc.mdcommd for set %s\n"), sp->setname);
1761*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1762*0Sstevel@tonic-gate 			}
1763*0Sstevel@tonic-gate 			meta_ping_mnset(setno);
1764*0Sstevel@tonic-gate 
1765*0Sstevel@tonic-gate 			/* Unblock mddb parse messages */
1766*0Sstevel@tonic-gate 			if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
1767*0Sstevel@tonic-gate 				(void) memset(&mbp, 0, sizeof (mbp));
1768*0Sstevel@tonic-gate 				mbp.c_setno = setno;
1769*0Sstevel@tonic-gate 				mbp.c_blk_flags = MDDB_UNBLOCK_PARSE;
1770*0Sstevel@tonic-gate 				if (metaioctl(MD_MN_MDDB_BLOCK, &mbp,
1771*0Sstevel@tonic-gate 				    &mbp.c_mde, NULL)) {
1772*0Sstevel@tonic-gate 					mdstealerror(ep, &mbp.c_mde);
1773*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Could not "
1774*0Sstevel@tonic-gate 					    "unblock set %s"), sp->setname);
1775*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1776*0Sstevel@tonic-gate 				}
1777*0Sstevel@tonic-gate 			}
1778*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step4 - rpc.mdcommd "
1779*0Sstevel@tonic-gate 			    "resumed and messages unblocked for set %s: %s"),
1780*0Sstevel@tonic-gate 			    sp->setname,
1781*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1782*0Sstevel@tonic-gate 		}
1783*0Sstevel@tonic-gate 
1784*0Sstevel@tonic-gate 		for (setno = 1; setno < max_sets; setno++) {
1785*0Sstevel@tonic-gate 			int			start_step;
1786*0Sstevel@tonic-gate 
1787*0Sstevel@tonic-gate 			/* Skip traditional disksets. */
1788*0Sstevel@tonic-gate 			if ((set_info[setno] & SET_INFO_MN) == 0)
1789*0Sstevel@tonic-gate 				continue;
1790*0Sstevel@tonic-gate 
1791*0Sstevel@tonic-gate 			/*
1792*0Sstevel@tonic-gate 			 * If already determined that this set is
1793*0Sstevel@tonic-gate 			 * a non-writable set, then just continue
1794*0Sstevel@tonic-gate 			 * to next set since there's nothing else
1795*0Sstevel@tonic-gate 			 * to do for a non-writable set.
1796*0Sstevel@tonic-gate 			 */
1797*0Sstevel@tonic-gate 			if (set_info[setno] & SET_INFO_NO_WR)
1798*0Sstevel@tonic-gate 				continue;
1799*0Sstevel@tonic-gate 
1800*0Sstevel@tonic-gate 			if ((sp = metasetnosetname(setno, ep)) == NULL) {
1801*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
1802*0Sstevel@tonic-gate 					/* No set for this setno - continue */
1803*0Sstevel@tonic-gate 					mdclrerror(ep);
1804*0Sstevel@tonic-gate 					continue;
1805*0Sstevel@tonic-gate 				} else {
1806*0Sstevel@tonic-gate 					mde_perror(ep, gettext("Unable to "
1807*0Sstevel@tonic-gate 					    "get set %d information"), setno);
1808*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1809*0Sstevel@tonic-gate 				}
1810*0Sstevel@tonic-gate 			}
1811*0Sstevel@tonic-gate 
1812*0Sstevel@tonic-gate 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1813*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Unable to get set "
1814*0Sstevel@tonic-gate 				    "%s desc information"), sp->setname);
1815*0Sstevel@tonic-gate 				mdclrerror(ep);
1816*0Sstevel@tonic-gate 				continue;
1817*0Sstevel@tonic-gate 			}
1818*0Sstevel@tonic-gate 
1819*0Sstevel@tonic-gate 			/* See if this node came through the start step */
1820*0Sstevel@tonic-gate 			(void) memset(&sf, 0, sizeof (sf));
1821*0Sstevel@tonic-gate 			sf.sf_setno = sp->setno;
1822*0Sstevel@tonic-gate 			sf.sf_flags = MDDB_NM_GET;
1823*0Sstevel@tonic-gate 			/* Use magic to help protect ioctl against attack. */
1824*0Sstevel@tonic-gate 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1825*0Sstevel@tonic-gate 			if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1826*0Sstevel@tonic-gate 			    &sf.sf_mde, NULL)) {
1827*0Sstevel@tonic-gate 				mdstealerror(ep, &sf.sf_mde);
1828*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Could not get "
1829*0Sstevel@tonic-gate 				    "start_step flag for set %s"), sp->setname);
1830*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1831*0Sstevel@tonic-gate 			}
1832*0Sstevel@tonic-gate 			start_step =
1833*0Sstevel@tonic-gate 			    (sf.sf_setflags & MD_SET_MN_START_RC)? 1: 0;
1834*0Sstevel@tonic-gate 
1835*0Sstevel@tonic-gate 			/*
1836*0Sstevel@tonic-gate 			 * We can now reset the start_step flag for the set
1837*0Sstevel@tonic-gate 			 * if it was already set.
1838*0Sstevel@tonic-gate 			 */
1839*0Sstevel@tonic-gate 			if (start_step) {
1840*0Sstevel@tonic-gate 				(void) memset(&sf, 0, sizeof (sf));
1841*0Sstevel@tonic-gate 					sf.sf_setno = sp->setno;
1842*0Sstevel@tonic-gate 				sf.sf_setflags = MD_SET_MN_START_RC;
1843*0Sstevel@tonic-gate 				sf.sf_flags = MDDB_NM_RESET;
1844*0Sstevel@tonic-gate 				/*
1845*0Sstevel@tonic-gate 				 * Use magic to help protect ioctl
1846*0Sstevel@tonic-gate 				 * against attack.
1847*0Sstevel@tonic-gate 				 */
1848*0Sstevel@tonic-gate 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1849*0Sstevel@tonic-gate 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1850*0Sstevel@tonic-gate 				    &sf.sf_mde, NULL)) {
1851*0Sstevel@tonic-gate 					mdstealerror(ep, &sf.sf_mde);
1852*0Sstevel@tonic-gate 					mde_perror(ep,
1853*0Sstevel@tonic-gate 					    gettext("Could not reset "
1854*0Sstevel@tonic-gate 					    "start_step flag for set %s"),
1855*0Sstevel@tonic-gate 					    sp->setname);
1856*0Sstevel@tonic-gate 				}
1857*0Sstevel@tonic-gate 			}
1858*0Sstevel@tonic-gate 
1859*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step4 - begin setting "
1860*0Sstevel@tonic-gate 			    "ABR state and restarting io's for "
1861*0Sstevel@tonic-gate 			    "set %s: %s"), sp->setname,
1862*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1863*0Sstevel@tonic-gate 
1864*0Sstevel@tonic-gate 
1865*0Sstevel@tonic-gate 			/*
1866*0Sstevel@tonic-gate 			 * If we are not the master and we have come through
1867*0Sstevel@tonic-gate 			 * the start step, we must update the ABR states
1868*0Sstevel@tonic-gate 			 * for mirrors and soft partitions. Also the submirror
1869*0Sstevel@tonic-gate 			 * states need to be synchronised so that we see the
1870*0Sstevel@tonic-gate 			 * same status as other previously joined members.
1871*0Sstevel@tonic-gate 			 * This _must_ be done before starting the resync.
1872*0Sstevel@tonic-gate 			 */
1873*0Sstevel@tonic-gate 			if (!(sd->sd_mn_am_i_master) && start_step) {
1874*0Sstevel@tonic-gate 				if (reset_state(GET_MIRROR_STATE, sp, MD_MIRROR,
1875*0Sstevel@tonic-gate 				    ep) == -1) {
1876*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1877*0Sstevel@tonic-gate 				}
1878*0Sstevel@tonic-gate 				if (reset_state(UPDATE_ABR, sp, MD_SP,
1879*0Sstevel@tonic-gate 				    ep) == -1) {
1880*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1881*0Sstevel@tonic-gate 				}
1882*0Sstevel@tonic-gate 				/*
1883*0Sstevel@tonic-gate 				 * Mark the fact that we've got the mirror
1884*0Sstevel@tonic-gate 				 * state. This allows the resync thread to
1885*0Sstevel@tonic-gate 				 * determine if _it_ needs to issue this. This
1886*0Sstevel@tonic-gate 				 * can happen if a node is added to a set after
1887*0Sstevel@tonic-gate 				 * a reconfig cycle has completed.
1888*0Sstevel@tonic-gate 				 */
1889*0Sstevel@tonic-gate 				(void) memset(&sf, 0, sizeof (sf));
1890*0Sstevel@tonic-gate 					sf.sf_setno = sp->setno;
1891*0Sstevel@tonic-gate 				sf.sf_setflags = MD_SET_MN_MIR_STATE_RC;
1892*0Sstevel@tonic-gate 				sf.sf_flags = MDDB_NM_SET;
1893*0Sstevel@tonic-gate 				/*
1894*0Sstevel@tonic-gate 				 * Use magic to help protect ioctl
1895*0Sstevel@tonic-gate 				 * against attack.
1896*0Sstevel@tonic-gate 				 */
1897*0Sstevel@tonic-gate 				sf.sf_magic = MDDB_SETFLAGS_MAGIC;
1898*0Sstevel@tonic-gate 				if (metaioctl(MD_MN_SET_SETFLAGS, &sf,
1899*0Sstevel@tonic-gate 				    &sf.sf_mde, NULL)) {
1900*0Sstevel@tonic-gate 					mdstealerror(ep, &sf.sf_mde);
1901*0Sstevel@tonic-gate 					mde_perror(ep,
1902*0Sstevel@tonic-gate 					    gettext("Could not set "
1903*0Sstevel@tonic-gate 					    "submirror state flag for set %s"),
1904*0Sstevel@tonic-gate 					    sp->setname);
1905*0Sstevel@tonic-gate 				}
1906*0Sstevel@tonic-gate 			}
1907*0Sstevel@tonic-gate 
1908*0Sstevel@tonic-gate 			/*
1909*0Sstevel@tonic-gate 			 * All remaining actions are only performed by the
1910*0Sstevel@tonic-gate 			 * master
1911*0Sstevel@tonic-gate 			 */
1912*0Sstevel@tonic-gate 			if (!(sd->sd_mn_am_i_master)) {
1913*0Sstevel@tonic-gate 				if (meta_lock(sp, TRUE, ep) != 0) {
1914*0Sstevel@tonic-gate 					mde_perror(ep, "");
1915*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1916*0Sstevel@tonic-gate 				}
1917*0Sstevel@tonic-gate 				meta_mirror_resync_unblock(sp);
1918*0Sstevel@tonic-gate 				meta_unlock(sp, ep);
1919*0Sstevel@tonic-gate 				continue;
1920*0Sstevel@tonic-gate 			}
1921*0Sstevel@tonic-gate 
1922*0Sstevel@tonic-gate 			/*
1923*0Sstevel@tonic-gate 			 * If the master came through the start step, this
1924*0Sstevel@tonic-gate 			 * implies that all of the nodes must have done the
1925*0Sstevel@tonic-gate 			 * same and hence there can be no applications
1926*0Sstevel@tonic-gate 			 * running. Hence no need to reset ABR
1927*0Sstevel@tonic-gate 			 */
1928*0Sstevel@tonic-gate 			if (!start_step) {
1929*0Sstevel@tonic-gate 				/* Reset ABR state for mirrors */
1930*0Sstevel@tonic-gate 				if (reset_state(RESET_ABR, sp, MD_MIRROR,
1931*0Sstevel@tonic-gate 				    ep) == -1) {
1932*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1933*0Sstevel@tonic-gate 				}
1934*0Sstevel@tonic-gate 				/* ...and now the same for soft partitions */
1935*0Sstevel@tonic-gate 				if (reset_state(RESET_ABR, sp, MD_SP,
1936*0Sstevel@tonic-gate 				    ep) == -1) {
1937*0Sstevel@tonic-gate 					md_exit(local_sp, 1);
1938*0Sstevel@tonic-gate 				}
1939*0Sstevel@tonic-gate 			}
1940*0Sstevel@tonic-gate 
1941*0Sstevel@tonic-gate 			/*
1942*0Sstevel@tonic-gate 			 * choose owners for orphaned resyncs and reset
1943*0Sstevel@tonic-gate 			 * non-orphaned resyncs so that an owner node that
1944*0Sstevel@tonic-gate 			 * reboots will restart the resync if needed.
1945*0Sstevel@tonic-gate 			 */
1946*0Sstevel@tonic-gate 			if (reset_state(CHOOSE_OWNER, sp, MD_MIRROR, ep) == -1)
1947*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1948*0Sstevel@tonic-gate 
1949*0Sstevel@tonic-gate 			/*
1950*0Sstevel@tonic-gate 			 * Must unlock set lock before meta_mirror_resync_all
1951*0Sstevel@tonic-gate 			 * sends a message to run the metasync command
1952*0Sstevel@tonic-gate 			 * which also grabs the meta_lock.
1953*0Sstevel@tonic-gate 			 */
1954*0Sstevel@tonic-gate 			if (meta_lock(sp, TRUE, ep) != 0) {
1955*0Sstevel@tonic-gate 				mde_perror(ep, "");
1956*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1957*0Sstevel@tonic-gate 			}
1958*0Sstevel@tonic-gate 			meta_mirror_resync_unblock(sp);
1959*0Sstevel@tonic-gate 			meta_unlock(sp, ep);
1960*0Sstevel@tonic-gate 
1961*0Sstevel@tonic-gate 			/* resync all mirrors in set */
1962*0Sstevel@tonic-gate 			if (meta_mirror_resync_all(sp, 0, ep) != 0) {
1963*0Sstevel@tonic-gate 				mde_perror(ep, gettext("Mirror resyncs "
1964*0Sstevel@tonic-gate 				    "failed for set %s"), sp->setname);
1965*0Sstevel@tonic-gate 				md_exit(local_sp, 1);
1966*0Sstevel@tonic-gate 			}
1967*0Sstevel@tonic-gate 
1968*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG3, gettext("Step4 - io's restarted "
1969*0Sstevel@tonic-gate 			    "for set %s: %s"), sp->setname,
1970*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
1971*0Sstevel@tonic-gate 		}
1972*0Sstevel@tonic-gate 
1973*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG2, gettext("Step4 completed: %s"),
1974*0Sstevel@tonic-gate 		    meta_print_hrtime(gethrtime() - start_time));
1975*0Sstevel@tonic-gate 
1976*0Sstevel@tonic-gate 		break;
1977*0Sstevel@tonic-gate 
1978*0Sstevel@tonic-gate 	default:
1979*0Sstevel@tonic-gate 		usage(sp, 1);
1980*0Sstevel@tonic-gate 		break;
1981*0Sstevel@tonic-gate 	}
1982*0Sstevel@tonic-gate 
1983*0Sstevel@tonic-gate 	md_exit(sp, 0);
1984*0Sstevel@tonic-gate 	/* NOTREACHED */
1985*0Sstevel@tonic-gate 	return (0);
1986*0Sstevel@tonic-gate }
1987