1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate /*
30*0Sstevel@tonic-gate  * Just in case we're not in a build environment, make sure that
31*0Sstevel@tonic-gate  * TEXT_DOMAIN gets set to something.
32*0Sstevel@tonic-gate  */
33*0Sstevel@tonic-gate #if !defined(TEXT_DOMAIN)
34*0Sstevel@tonic-gate #define	TEXT_DOMAIN "SYS_TEST"
35*0Sstevel@tonic-gate #endif
36*0Sstevel@tonic-gate 
37*0Sstevel@tonic-gate /*
38*0Sstevel@tonic-gate  * Metadevice diskset interfaces
39*0Sstevel@tonic-gate  */
40*0Sstevel@tonic-gate 
41*0Sstevel@tonic-gate #include "meta_set_prv.h"
42*0Sstevel@tonic-gate #include <meta.h>
43*0Sstevel@tonic-gate #include <metad.h>
44*0Sstevel@tonic-gate #include <mdmn_changelog.h>
45*0Sstevel@tonic-gate #include <sys/lvm/md_crc.h>
46*0Sstevel@tonic-gate #include <sys/utsname.h>
47*0Sstevel@tonic-gate #include <sdssc.h>
48*0Sstevel@tonic-gate 
49*0Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
50*0Sstevel@tonic-gate #include <sys/sysevent/svm.h>
51*0Sstevel@tonic-gate extern	char	*blkname(char *);
52*0Sstevel@tonic-gate 
53*0Sstevel@tonic-gate static md_drive_desc *
54*0Sstevel@tonic-gate dr2drivedesc(
55*0Sstevel@tonic-gate 	mdsetname_t	*sp,
56*0Sstevel@tonic-gate 	side_t		sideno,
57*0Sstevel@tonic-gate 	int		flags,
58*0Sstevel@tonic-gate 	md_error_t	*ep
59*0Sstevel@tonic-gate )
60*0Sstevel@tonic-gate {
61*0Sstevel@tonic-gate 	md_set_record	*sr;
62*0Sstevel@tonic-gate 	md_drive_record	*dr;
63*0Sstevel@tonic-gate 	mddrivename_t	*dnp;
64*0Sstevel@tonic-gate 	md_drive_desc	*dd_head = NULL;
65*0Sstevel@tonic-gate 	md_set_desc	*sd;
66*0Sstevel@tonic-gate 
67*0Sstevel@tonic-gate 	if (flags & MD_BYPASS_DAEMON) {
68*0Sstevel@tonic-gate 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
69*0Sstevel@tonic-gate 			return (NULL);
70*0Sstevel@tonic-gate 		sd = metaget_setdesc(sp, ep);
71*0Sstevel@tonic-gate 		sideno = getnodeside(mynode(), sd);
72*0Sstevel@tonic-gate 		sp = metafakesetname(sp->setno, sr->sr_setname);
73*0Sstevel@tonic-gate 	} else {
74*0Sstevel@tonic-gate 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
75*0Sstevel@tonic-gate 			return (NULL);
76*0Sstevel@tonic-gate 	}
77*0Sstevel@tonic-gate 
78*0Sstevel@tonic-gate 	assert(sideno != MD_SIDEWILD);
79*0Sstevel@tonic-gate 
80*0Sstevel@tonic-gate 	/*
81*0Sstevel@tonic-gate 	 * WARNING:
82*0Sstevel@tonic-gate 	 * The act of getting the dnp from the namespace means that we
83*0Sstevel@tonic-gate 	 * will get the devid of the disk as recorded in the namespace.
84*0Sstevel@tonic-gate 	 * This devid has the potential to be stale if the disk is being
85*0Sstevel@tonic-gate 	 * replaced via a rebind, this means that any code that relies
86*0Sstevel@tonic-gate 	 * on any of the dnp information should take the appropriate action
87*0Sstevel@tonic-gate 	 * to preserve that information. For example in the rebind code the
88*0Sstevel@tonic-gate 	 * devid of the new disk is saved off and then copied back in once
89*0Sstevel@tonic-gate 	 * the code that has called this function has completed.
90*0Sstevel@tonic-gate 	 */
91*0Sstevel@tonic-gate 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
92*0Sstevel@tonic-gate 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
93*0Sstevel@tonic-gate 		    flags, ep)) == NULL) {
94*0Sstevel@tonic-gate 			if (!(flags & MD_BYPASS_DAEMON))
95*0Sstevel@tonic-gate 				free_sr(sr);
96*0Sstevel@tonic-gate 			metafreedrivedesc(&dd_head);
97*0Sstevel@tonic-gate 			return (NULL);
98*0Sstevel@tonic-gate 		}
99*0Sstevel@tonic-gate 
100*0Sstevel@tonic-gate 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
101*0Sstevel@tonic-gate 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
102*0Sstevel@tonic-gate 	}
103*0Sstevel@tonic-gate 
104*0Sstevel@tonic-gate 	if (!(flags & MD_BYPASS_DAEMON)) {
105*0Sstevel@tonic-gate 		free_sr(sr);
106*0Sstevel@tonic-gate 	}
107*0Sstevel@tonic-gate 	return (dd_head);
108*0Sstevel@tonic-gate }
109*0Sstevel@tonic-gate 
110*0Sstevel@tonic-gate static int
111*0Sstevel@tonic-gate get_sidenmlist(
112*0Sstevel@tonic-gate 	mdsetname_t	*sp,
113*0Sstevel@tonic-gate 	mddrivename_t	*dnp,
114*0Sstevel@tonic-gate 	md_error_t	*ep
115*0Sstevel@tonic-gate )
116*0Sstevel@tonic-gate {
117*0Sstevel@tonic-gate 	md_set_desc	*sd;
118*0Sstevel@tonic-gate 	mdsidenames_t	*sn, **sn_next;
119*0Sstevel@tonic-gate 	int		i;
120*0Sstevel@tonic-gate 
121*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
122*0Sstevel@tonic-gate 		return (-1);
123*0Sstevel@tonic-gate 
124*0Sstevel@tonic-gate 	metaflushsidenames(dnp);
125*0Sstevel@tonic-gate 	sn_next = &dnp->side_names;
126*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
127*0Sstevel@tonic-gate 		/*
128*0Sstevel@tonic-gate 		 * Only get sidenames for this node since
129*0Sstevel@tonic-gate 		 * that is the only side information stored in
130*0Sstevel@tonic-gate 		 * the local mddb for a multi-node diskset.
131*0Sstevel@tonic-gate 		 */
132*0Sstevel@tonic-gate 		if (sd->sd_mn_mynode) {
133*0Sstevel@tonic-gate 			sn = Zalloc(sizeof (*sn));
134*0Sstevel@tonic-gate 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
135*0Sstevel@tonic-gate 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
136*0Sstevel@tonic-gate 			    sn->sideno, dnp->side_names_key, &sn->dname,
137*0Sstevel@tonic-gate 			    &sn->mnum, NULL, ep)) == NULL) {
138*0Sstevel@tonic-gate 				if (sn->dname != NULL)
139*0Sstevel@tonic-gate 					Free(sn->dname);
140*0Sstevel@tonic-gate 				Free(sn);
141*0Sstevel@tonic-gate 				return (-1);
142*0Sstevel@tonic-gate 			}
143*0Sstevel@tonic-gate 
144*0Sstevel@tonic-gate 			/* Add to the end of the linked list */
145*0Sstevel@tonic-gate 			assert(*sn_next == NULL);
146*0Sstevel@tonic-gate 			*sn_next = sn;
147*0Sstevel@tonic-gate 			sn_next = &sn->next;
148*0Sstevel@tonic-gate 		}
149*0Sstevel@tonic-gate 	} else {
150*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
151*0Sstevel@tonic-gate 			/* Skip empty slots */
152*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
153*0Sstevel@tonic-gate 				continue;
154*0Sstevel@tonic-gate 
155*0Sstevel@tonic-gate 			sn = Zalloc(sizeof (*sn));
156*0Sstevel@tonic-gate 			sn->sideno = i;
157*0Sstevel@tonic-gate 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
158*0Sstevel@tonic-gate 			    i+SKEW, dnp->side_names_key, &sn->dname,
159*0Sstevel@tonic-gate 			    &sn->mnum, NULL, ep)) == NULL) {
160*0Sstevel@tonic-gate 				/*
161*0Sstevel@tonic-gate 				 * It is possible that during the add of a
162*0Sstevel@tonic-gate 				 * host to have a 'missing' side as the side
163*0Sstevel@tonic-gate 				 * for this disk will be added later. So ignore
164*0Sstevel@tonic-gate 				 * the error. The 'missing' side will be added
165*0Sstevel@tonic-gate 				 * once the addhosts process has completed.
166*0Sstevel@tonic-gate 				 */
167*0Sstevel@tonic-gate 				if (mdissyserror(ep, ENOENT)) {
168*0Sstevel@tonic-gate 					mdclrerror(ep);
169*0Sstevel@tonic-gate 					Free(sn);
170*0Sstevel@tonic-gate 					continue;
171*0Sstevel@tonic-gate 				}
172*0Sstevel@tonic-gate 
173*0Sstevel@tonic-gate 				if (sn->dname != NULL)
174*0Sstevel@tonic-gate 					Free(sn->dname);
175*0Sstevel@tonic-gate 				Free(sn);
176*0Sstevel@tonic-gate 				return (-1);
177*0Sstevel@tonic-gate 			}
178*0Sstevel@tonic-gate 
179*0Sstevel@tonic-gate 			/* Add to the end of the linked list */
180*0Sstevel@tonic-gate 			assert(*sn_next == NULL);
181*0Sstevel@tonic-gate 			*sn_next = sn;
182*0Sstevel@tonic-gate 			sn_next = &sn->next;
183*0Sstevel@tonic-gate 		}
184*0Sstevel@tonic-gate 	}
185*0Sstevel@tonic-gate 
186*0Sstevel@tonic-gate 	return (0);
187*0Sstevel@tonic-gate }
188*0Sstevel@tonic-gate 
189*0Sstevel@tonic-gate static md_drive_desc *
190*0Sstevel@tonic-gate rl_to_dd(
191*0Sstevel@tonic-gate 	mdsetname_t		*sp,
192*0Sstevel@tonic-gate 	md_replicalist_t	*rlp,
193*0Sstevel@tonic-gate 	md_error_t		*ep
194*0Sstevel@tonic-gate )
195*0Sstevel@tonic-gate {
196*0Sstevel@tonic-gate 	md_replicalist_t	*rl;
197*0Sstevel@tonic-gate 	md_replica_t		*r;
198*0Sstevel@tonic-gate 	md_drive_desc		*dd = NULL;
199*0Sstevel@tonic-gate 	md_drive_desc		*d;
200*0Sstevel@tonic-gate 	int			found;
201*0Sstevel@tonic-gate 	md_set_desc		*sd;
202*0Sstevel@tonic-gate 	daddr_t			nblks = 0;
203*0Sstevel@tonic-gate 
204*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
205*0Sstevel@tonic-gate 		return (NULL);
206*0Sstevel@tonic-gate 
207*0Sstevel@tonic-gate 	/* find the smallest existing replica */
208*0Sstevel@tonic-gate 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
209*0Sstevel@tonic-gate 		r = rl->rl_repp;
210*0Sstevel@tonic-gate 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
211*0Sstevel@tonic-gate 	}
212*0Sstevel@tonic-gate 
213*0Sstevel@tonic-gate 	if (nblks <= 0)
214*0Sstevel@tonic-gate 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
215*0Sstevel@tonic-gate 
216*0Sstevel@tonic-gate 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
217*0Sstevel@tonic-gate 		r = rl->rl_repp;
218*0Sstevel@tonic-gate 
219*0Sstevel@tonic-gate 		found = 0;
220*0Sstevel@tonic-gate 		for (d = dd; d != NULL; d = d->dd_next) {
221*0Sstevel@tonic-gate 			if (strcmp(r->r_namep->drivenamep->cname,
222*0Sstevel@tonic-gate 			    d->dd_dnp->cname) == 0) {
223*0Sstevel@tonic-gate 				found = 1;
224*0Sstevel@tonic-gate 				dd->dd_dbcnt++;
225*0Sstevel@tonic-gate 				break;
226*0Sstevel@tonic-gate 			}
227*0Sstevel@tonic-gate 		}
228*0Sstevel@tonic-gate 
229*0Sstevel@tonic-gate 		if (! found)
230*0Sstevel@tonic-gate 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
231*0Sstevel@tonic-gate 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
232*0Sstevel@tonic-gate 	}
233*0Sstevel@tonic-gate 
234*0Sstevel@tonic-gate 	return (dd);
235*0Sstevel@tonic-gate }
236*0Sstevel@tonic-gate 
237*0Sstevel@tonic-gate /*
238*0Sstevel@tonic-gate  * Exported Entry Points
239*0Sstevel@tonic-gate  */
240*0Sstevel@tonic-gate 
241*0Sstevel@tonic-gate set_t
242*0Sstevel@tonic-gate get_max_sets(md_error_t *ep)
243*0Sstevel@tonic-gate {
244*0Sstevel@tonic-gate 
245*0Sstevel@tonic-gate 	static set_t		max_sets = 0;
246*0Sstevel@tonic-gate 
247*0Sstevel@tonic-gate 	if (max_sets == 0)
248*0Sstevel@tonic-gate 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
249*0Sstevel@tonic-gate 			return (0);
250*0Sstevel@tonic-gate 
251*0Sstevel@tonic-gate 	return (max_sets);
252*0Sstevel@tonic-gate }
253*0Sstevel@tonic-gate 
254*0Sstevel@tonic-gate int
255*0Sstevel@tonic-gate get_max_meds(md_error_t *ep)
256*0Sstevel@tonic-gate {
257*0Sstevel@tonic-gate 	static int		max_meds = 0;
258*0Sstevel@tonic-gate 
259*0Sstevel@tonic-gate 	if (max_meds == 0)
260*0Sstevel@tonic-gate 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
261*0Sstevel@tonic-gate 			return (0);
262*0Sstevel@tonic-gate 
263*0Sstevel@tonic-gate 	return (max_meds);
264*0Sstevel@tonic-gate }
265*0Sstevel@tonic-gate 
266*0Sstevel@tonic-gate side_t
267*0Sstevel@tonic-gate getmyside(mdsetname_t *sp, md_error_t *ep)
268*0Sstevel@tonic-gate {
269*0Sstevel@tonic-gate 	md_set_desc		*sd;
270*0Sstevel@tonic-gate 	char 			*node = NULL;
271*0Sstevel@tonic-gate 	side_t			sideno;
272*0Sstevel@tonic-gate 
273*0Sstevel@tonic-gate 	if (sp->setno == 0)
274*0Sstevel@tonic-gate 		return (0);
275*0Sstevel@tonic-gate 
276*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
277*0Sstevel@tonic-gate 		return (MD_SIDEWILD);
278*0Sstevel@tonic-gate 
279*0Sstevel@tonic-gate 	node = mynode();
280*0Sstevel@tonic-gate 
281*0Sstevel@tonic-gate 	assert(node != NULL);
282*0Sstevel@tonic-gate 
283*0Sstevel@tonic-gate 	sideno = getnodeside(node, sd);
284*0Sstevel@tonic-gate 
285*0Sstevel@tonic-gate 	if (sideno != MD_SIDEWILD)
286*0Sstevel@tonic-gate 		return (sideno);
287*0Sstevel@tonic-gate 
288*0Sstevel@tonic-gate 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
289*0Sstevel@tonic-gate }
290*0Sstevel@tonic-gate 
291*0Sstevel@tonic-gate /*
292*0Sstevel@tonic-gate  * get set info from name
293*0Sstevel@tonic-gate  */
294*0Sstevel@tonic-gate md_set_record *
295*0Sstevel@tonic-gate getsetbyname(char *setname, md_error_t *ep)
296*0Sstevel@tonic-gate {
297*0Sstevel@tonic-gate 	md_set_record		*sr = NULL;
298*0Sstevel@tonic-gate 	md_mnset_record		*mnsr = NULL;
299*0Sstevel@tonic-gate 	char			*p;
300*0Sstevel@tonic-gate 	size_t			len;
301*0Sstevel@tonic-gate 
302*0Sstevel@tonic-gate 	/* get set info from daemon */
303*0Sstevel@tonic-gate 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
304*0Sstevel@tonic-gate 		return (NULL);
305*0Sstevel@tonic-gate 	if (sr != NULL) {
306*0Sstevel@tonic-gate 		/*
307*0Sstevel@tonic-gate 		 * Returned record could be for a multi-node set or a
308*0Sstevel@tonic-gate 		 * non-multi-node set.
309*0Sstevel@tonic-gate 		 */
310*0Sstevel@tonic-gate 		if (MD_MNSET_REC(sr)) {
311*0Sstevel@tonic-gate 			/*
312*0Sstevel@tonic-gate 			 * Record is for a multi-node set.  Reissue call
313*0Sstevel@tonic-gate 			 * to get mnset information.  Need to free
314*0Sstevel@tonic-gate 			 * record as if a non-multi-node set record since
315*0Sstevel@tonic-gate 			 * that is what clnt_getset gave us.  If in
316*0Sstevel@tonic-gate 			 * the daemon, don't free since this is a pointer
317*0Sstevel@tonic-gate 			 * into the setrecords array.
318*0Sstevel@tonic-gate 			 */
319*0Sstevel@tonic-gate 			if (! md_in_daemon) {
320*0Sstevel@tonic-gate 				sr->sr_flags &= ~MD_SR_MN;
321*0Sstevel@tonic-gate 				free_sr(sr);
322*0Sstevel@tonic-gate 			}
323*0Sstevel@tonic-gate 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
324*0Sstevel@tonic-gate 			    ep) == -1)
325*0Sstevel@tonic-gate 				return (NULL);
326*0Sstevel@tonic-gate 			if (mnsr != NULL)
327*0Sstevel@tonic-gate 				return ((struct md_set_record *)mnsr);
328*0Sstevel@tonic-gate 		} else {
329*0Sstevel@tonic-gate 			return (sr);
330*0Sstevel@tonic-gate 		}
331*0Sstevel@tonic-gate 	}
332*0Sstevel@tonic-gate 
333*0Sstevel@tonic-gate 	/* no such set */
334*0Sstevel@tonic-gate 	len = strlen(setname) + 30;
335*0Sstevel@tonic-gate 	p = Malloc(len);
336*0Sstevel@tonic-gate 	(void) snprintf(p, len, "setname \"%s\"", setname);
337*0Sstevel@tonic-gate 	(void) mderror(ep, MDE_NO_SET, p);
338*0Sstevel@tonic-gate 	Free(p);
339*0Sstevel@tonic-gate 	return (NULL);
340*0Sstevel@tonic-gate }
341*0Sstevel@tonic-gate 
342*0Sstevel@tonic-gate /*
343*0Sstevel@tonic-gate  * get set info from number
344*0Sstevel@tonic-gate  */
345*0Sstevel@tonic-gate md_set_record *
346*0Sstevel@tonic-gate getsetbynum(set_t setno, md_error_t *ep)
347*0Sstevel@tonic-gate {
348*0Sstevel@tonic-gate 	md_set_record		*sr;
349*0Sstevel@tonic-gate 	md_mnset_record		*mnsr = NULL;
350*0Sstevel@tonic-gate 	char			buf[100];
351*0Sstevel@tonic-gate 
352*0Sstevel@tonic-gate 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
353*0Sstevel@tonic-gate 		return (NULL);
354*0Sstevel@tonic-gate 
355*0Sstevel@tonic-gate 	if (sr != NULL) {
356*0Sstevel@tonic-gate 		/*
357*0Sstevel@tonic-gate 		 * Record is for a multi-node set.  Reissue call
358*0Sstevel@tonic-gate 		 * to get mnset information.  Need to free
359*0Sstevel@tonic-gate 		 * record as if a non-multi-node set record since
360*0Sstevel@tonic-gate 		 * that is what clnt_getset gave us.  If in
361*0Sstevel@tonic-gate 		 * the daemon, don't free since this is a pointer
362*0Sstevel@tonic-gate 		 * into the setrecords array.
363*0Sstevel@tonic-gate 		 */
364*0Sstevel@tonic-gate 		if (MD_MNSET_REC(sr)) {
365*0Sstevel@tonic-gate 			/*
366*0Sstevel@tonic-gate 			 * Record is for a multi-node set.  Reissue call
367*0Sstevel@tonic-gate 			 * to get mnset information.
368*0Sstevel@tonic-gate 			 */
369*0Sstevel@tonic-gate 			if (! md_in_daemon) {
370*0Sstevel@tonic-gate 				sr->sr_flags &= ~MD_SR_MN;
371*0Sstevel@tonic-gate 				free_sr(sr);
372*0Sstevel@tonic-gate 			}
373*0Sstevel@tonic-gate 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
374*0Sstevel@tonic-gate 			    ep) == -1)
375*0Sstevel@tonic-gate 				return (NULL);
376*0Sstevel@tonic-gate 			if (mnsr != NULL)
377*0Sstevel@tonic-gate 				return ((struct md_set_record *)mnsr);
378*0Sstevel@tonic-gate 		} else {
379*0Sstevel@tonic-gate 			return (sr);
380*0Sstevel@tonic-gate 		}
381*0Sstevel@tonic-gate 	}
382*0Sstevel@tonic-gate 
383*0Sstevel@tonic-gate 	(void) sprintf(buf, "setno %u", setno);
384*0Sstevel@tonic-gate 	(void) mderror(ep, MDE_NO_SET, buf);
385*0Sstevel@tonic-gate 	return (NULL);
386*0Sstevel@tonic-gate }
387*0Sstevel@tonic-gate 
388*0Sstevel@tonic-gate int
389*0Sstevel@tonic-gate meta_check_drive_inuse(
390*0Sstevel@tonic-gate 	mdsetname_t	*sp,
391*0Sstevel@tonic-gate 	mddrivename_t	*dnp,
392*0Sstevel@tonic-gate 	int		check_db,
393*0Sstevel@tonic-gate 	md_error_t	*ep
394*0Sstevel@tonic-gate )
395*0Sstevel@tonic-gate {
396*0Sstevel@tonic-gate 	mdnamelist_t	*nlp = NULL;
397*0Sstevel@tonic-gate 	mdnamelist_t	*p;
398*0Sstevel@tonic-gate 	int		rval = 0;
399*0Sstevel@tonic-gate 
400*0Sstevel@tonic-gate 	/* get all underlying partitions */
401*0Sstevel@tonic-gate 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
402*0Sstevel@tonic-gate 		return (-1);
403*0Sstevel@tonic-gate 
404*0Sstevel@tonic-gate 	/* search for drive */
405*0Sstevel@tonic-gate 	for (p = nlp; (p != NULL); p = p->next) {
406*0Sstevel@tonic-gate 		mdname_t	*np = p->namep;
407*0Sstevel@tonic-gate 
408*0Sstevel@tonic-gate 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
409*0Sstevel@tonic-gate 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
410*0Sstevel@tonic-gate 			    NULL, dnp->cname, sp->setname));
411*0Sstevel@tonic-gate 			break;
412*0Sstevel@tonic-gate 		}
413*0Sstevel@tonic-gate 	}
414*0Sstevel@tonic-gate 
415*0Sstevel@tonic-gate 	/* cleanup, return success */
416*0Sstevel@tonic-gate 	metafreenamelist(nlp);
417*0Sstevel@tonic-gate 	return (rval);
418*0Sstevel@tonic-gate }
419*0Sstevel@tonic-gate 
420*0Sstevel@tonic-gate /*
421*0Sstevel@tonic-gate  * simple check for ownership
422*0Sstevel@tonic-gate  */
423*0Sstevel@tonic-gate int
424*0Sstevel@tonic-gate meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
425*0Sstevel@tonic-gate {
426*0Sstevel@tonic-gate 	int			ownset;
427*0Sstevel@tonic-gate 	md_set_desc		*sd;
428*0Sstevel@tonic-gate 	md_drive_desc		*dd;
429*0Sstevel@tonic-gate 	md_replicalist_t	*rlp = NULL;
430*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
431*0Sstevel@tonic-gate 
432*0Sstevel@tonic-gate 	if (metaislocalset(sp))
433*0Sstevel@tonic-gate 		return (0);
434*0Sstevel@tonic-gate 
435*0Sstevel@tonic-gate 	ownset = own_set(sp, NULL, TRUE, ep);
436*0Sstevel@tonic-gate 	if (! mdisok(ep))
437*0Sstevel@tonic-gate 		return (-1);
438*0Sstevel@tonic-gate 
439*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
440*0Sstevel@tonic-gate 		return (-1);
441*0Sstevel@tonic-gate 
442*0Sstevel@tonic-gate 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
443*0Sstevel@tonic-gate 	if (! mdisok(ep))
444*0Sstevel@tonic-gate 		return (-1);
445*0Sstevel@tonic-gate 
446*0Sstevel@tonic-gate 	/* If we have no drive descriptors, check for no ownership */
447*0Sstevel@tonic-gate 	if (dd == NULL) {
448*0Sstevel@tonic-gate 		if (ownset == MD_SETOWNER_NONE)
449*0Sstevel@tonic-gate 			return (0);
450*0Sstevel@tonic-gate 
451*0Sstevel@tonic-gate 		/* If ownership somehow has come to exist, we must clean up */
452*0Sstevel@tonic-gate 
453*0Sstevel@tonic-gate 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
454*0Sstevel@tonic-gate 		    &xep) < 0)
455*0Sstevel@tonic-gate 			mdclrerror(&xep);
456*0Sstevel@tonic-gate 
457*0Sstevel@tonic-gate 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
458*0Sstevel@tonic-gate 			if (! mdisok(&xep))
459*0Sstevel@tonic-gate 				mdclrerror(&xep);
460*0Sstevel@tonic-gate 
461*0Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
462*0Sstevel@tonic-gate 			if (rel_own_bydd(sp, dd, TRUE, &xep))
463*0Sstevel@tonic-gate 				mdclrerror(&xep);
464*0Sstevel@tonic-gate 		}
465*0Sstevel@tonic-gate 
466*0Sstevel@tonic-gate 		if (halt_set(sp, &xep))
467*0Sstevel@tonic-gate 			mdclrerror(&xep);
468*0Sstevel@tonic-gate 
469*0Sstevel@tonic-gate 		metafreereplicalist(rlp);
470*0Sstevel@tonic-gate 
471*0Sstevel@tonic-gate 		metafreedrivedesc(&dd);
472*0Sstevel@tonic-gate 
473*0Sstevel@tonic-gate 		return (0);
474*0Sstevel@tonic-gate 	}
475*0Sstevel@tonic-gate 
476*0Sstevel@tonic-gate 	metafreedrivedesc(&sd->sd_drvs);
477*0Sstevel@tonic-gate 
478*0Sstevel@tonic-gate 	if (ownset == MD_SETOWNER_YES)
479*0Sstevel@tonic-gate 		return (0);
480*0Sstevel@tonic-gate 
481*0Sstevel@tonic-gate 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
482*0Sstevel@tonic-gate 	    sp->setname));
483*0Sstevel@tonic-gate }
484*0Sstevel@tonic-gate 
485*0Sstevel@tonic-gate /*
486*0Sstevel@tonic-gate  * simple check for ownership
487*0Sstevel@tonic-gate  */
488*0Sstevel@tonic-gate int
489*0Sstevel@tonic-gate meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
490*0Sstevel@tonic-gate {
491*0Sstevel@tonic-gate 	md_set_desc	*sd;
492*0Sstevel@tonic-gate 	md_drive_desc	*dd;
493*0Sstevel@tonic-gate 	int		bool;
494*0Sstevel@tonic-gate 
495*0Sstevel@tonic-gate 	if (metaislocalset(sp))
496*0Sstevel@tonic-gate 		return (0);
497*0Sstevel@tonic-gate 
498*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
499*0Sstevel@tonic-gate 		return (-1);
500*0Sstevel@tonic-gate 
501*0Sstevel@tonic-gate 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
502*0Sstevel@tonic-gate 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
503*0Sstevel@tonic-gate 		    hostname, NULL, sp->setname));
504*0Sstevel@tonic-gate 
505*0Sstevel@tonic-gate 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
506*0Sstevel@tonic-gate 	if (! mdisok(ep))
507*0Sstevel@tonic-gate 		return (-1);
508*0Sstevel@tonic-gate 
509*0Sstevel@tonic-gate 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
510*0Sstevel@tonic-gate 		return (-1);
511*0Sstevel@tonic-gate 
512*0Sstevel@tonic-gate 	if (dd == NULL)
513*0Sstevel@tonic-gate 		return (0);
514*0Sstevel@tonic-gate 
515*0Sstevel@tonic-gate 	metafreedrivedesc(&sd->sd_drvs);
516*0Sstevel@tonic-gate 
517*0Sstevel@tonic-gate 	if (bool == TRUE)
518*0Sstevel@tonic-gate 		return (0);
519*0Sstevel@tonic-gate 
520*0Sstevel@tonic-gate 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
521*0Sstevel@tonic-gate 	    sp->setname));
522*0Sstevel@tonic-gate }
523*0Sstevel@tonic-gate 
524*0Sstevel@tonic-gate /*
525*0Sstevel@tonic-gate  * Function that determines if a node is in the multinode diskset
526*0Sstevel@tonic-gate  * membership list.  Calling node passes in node to be checked and
527*0Sstevel@tonic-gate  * the nodelist as returned from meta_read_nodelist.  This routine
528*0Sstevel@tonic-gate  * anticipates being called many times using the same diskset membership
529*0Sstevel@tonic-gate  * list which is why the alloc and free of the diskset membership list
530*0Sstevel@tonic-gate  * is left to the calling routine.
531*0Sstevel@tonic-gate  * Returns:
532*0Sstevel@tonic-gate  *	1 - if a member
533*0Sstevel@tonic-gate  *	0 - not a member
534*0Sstevel@tonic-gate  */
535*0Sstevel@tonic-gate int
536*0Sstevel@tonic-gate meta_is_member(
537*0Sstevel@tonic-gate 	char				*node_name,
538*0Sstevel@tonic-gate 	md_mn_nodeid_t			node_id,
539*0Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl
540*0Sstevel@tonic-gate )
541*0Sstevel@tonic-gate {
542*0Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl2;
543*0Sstevel@tonic-gate 	int				flag_check_name;
544*0Sstevel@tonic-gate 
545*0Sstevel@tonic-gate 	if (node_id != 0)
546*0Sstevel@tonic-gate 		flag_check_name = 0;
547*0Sstevel@tonic-gate 	else if (node_name != NULL)
548*0Sstevel@tonic-gate 		flag_check_name = 1;
549*0Sstevel@tonic-gate 	else
550*0Sstevel@tonic-gate 		return (0);
551*0Sstevel@tonic-gate 
552*0Sstevel@tonic-gate 	nl2 = nl;
553*0Sstevel@tonic-gate 	while (nl2) {
554*0Sstevel@tonic-gate 		if (flag_check_name) {
555*0Sstevel@tonic-gate 			/* Compare given name against name in member list */
556*0Sstevel@tonic-gate 			if (strcmp(nl2->msl_node_name, node_name) == 0)
557*0Sstevel@tonic-gate 				break;
558*0Sstevel@tonic-gate 		} else {
559*0Sstevel@tonic-gate 			/* Compare given nodeid against nodeid in member list */
560*0Sstevel@tonic-gate 			if (nl2->msl_node_id == node_id)
561*0Sstevel@tonic-gate 				break;
562*0Sstevel@tonic-gate 		}
563*0Sstevel@tonic-gate 		nl2 = nl2->next;
564*0Sstevel@tonic-gate 	}
565*0Sstevel@tonic-gate 	/* No match found in member list */
566*0Sstevel@tonic-gate 	if (nl2 == NULL) {
567*0Sstevel@tonic-gate 		return (0);
568*0Sstevel@tonic-gate 	}
569*0Sstevel@tonic-gate 	/* Return 1 if node is in member list */
570*0Sstevel@tonic-gate 	return (1);
571*0Sstevel@tonic-gate }
572*0Sstevel@tonic-gate 
573*0Sstevel@tonic-gate /*
574*0Sstevel@tonic-gate  * meta_getnext_devinfo should go to the host that
575*0Sstevel@tonic-gate  * has the device, to return the device name, driver name, minor num.
576*0Sstevel@tonic-gate  * We can take the big cheat for now, since it is a requirement
577*0Sstevel@tonic-gate  * that the device names and device numbers are the same, and
578*0Sstevel@tonic-gate  * just get the info locally.
579*0Sstevel@tonic-gate  *
580*0Sstevel@tonic-gate  * This routine is very similar to meta_getnextside_devinfo except
581*0Sstevel@tonic-gate  * that the specific side to be used is being passed in.
582*0Sstevel@tonic-gate  *
583*0Sstevel@tonic-gate  * Exit status:
584*0Sstevel@tonic-gate  *	 0 - No more side info to return
585*0Sstevel@tonic-gate  *	 1 - More side info's to return
586*0Sstevel@tonic-gate  *	-1 - An error has been detected
587*0Sstevel@tonic-gate  */
588*0Sstevel@tonic-gate /*ARGSUSED*/
589*0Sstevel@tonic-gate int
590*0Sstevel@tonic-gate meta_getside_devinfo(
591*0Sstevel@tonic-gate 	mdsetname_t	*sp,		/* for this set */
592*0Sstevel@tonic-gate 	char		*bname,		/* local block name (myside) */
593*0Sstevel@tonic-gate 	side_t		sideno,		/* sideno */
594*0Sstevel@tonic-gate 	char		**ret_bname,	/* block device name of returned side */
595*0Sstevel@tonic-gate 	char		**ret_dname,	/* driver name of returned side */
596*0Sstevel@tonic-gate 	minor_t		*ret_mnum,	/* minor number of returned side */
597*0Sstevel@tonic-gate 	md_error_t	*ep
598*0Sstevel@tonic-gate )
599*0Sstevel@tonic-gate {
600*0Sstevel@tonic-gate 	mdname_t	*np;
601*0Sstevel@tonic-gate 
602*0Sstevel@tonic-gate 	if (ret_bname != NULL)
603*0Sstevel@tonic-gate 		*ret_bname = NULL;
604*0Sstevel@tonic-gate 	if (ret_dname != NULL)
605*0Sstevel@tonic-gate 		*ret_dname = NULL;
606*0Sstevel@tonic-gate 	if (ret_mnum != NULL)
607*0Sstevel@tonic-gate 		*ret_mnum = NODEV32;
608*0Sstevel@tonic-gate 
609*0Sstevel@tonic-gate 
610*0Sstevel@tonic-gate 	if ((np = metaname(&sp, bname, ep)) == NULL)
611*0Sstevel@tonic-gate 		return (-1);
612*0Sstevel@tonic-gate 
613*0Sstevel@tonic-gate /*
614*0Sstevel@tonic-gate  * NOTE (future) - There will be more work here once devids are integrated
615*0Sstevel@tonic-gate  * into disksets.  Then the side should be used to find the correct
616*0Sstevel@tonic-gate  * host and the b/d names should be gotten from that host.
617*0Sstevel@tonic-gate  */
618*0Sstevel@tonic-gate 
619*0Sstevel@tonic-gate 	/*
620*0Sstevel@tonic-gate 	 * Return the side info.
621*0Sstevel@tonic-gate 	 */
622*0Sstevel@tonic-gate 	if (ret_bname != NULL)
623*0Sstevel@tonic-gate 		*ret_bname = Strdup(np->bname);
624*0Sstevel@tonic-gate 
625*0Sstevel@tonic-gate 	if (ret_dname != NULL) {
626*0Sstevel@tonic-gate 		mdcinfo_t	*cinfo;
627*0Sstevel@tonic-gate 
628*0Sstevel@tonic-gate 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
629*0Sstevel@tonic-gate 			return (-1);
630*0Sstevel@tonic-gate 
631*0Sstevel@tonic-gate 		*ret_dname = Strdup(cinfo->dname);
632*0Sstevel@tonic-gate 	}
633*0Sstevel@tonic-gate 
634*0Sstevel@tonic-gate 	if (ret_mnum != NULL)
635*0Sstevel@tonic-gate 		*ret_mnum = meta_getminor(np->dev);
636*0Sstevel@tonic-gate 
637*0Sstevel@tonic-gate 	return (1);
638*0Sstevel@tonic-gate }
639*0Sstevel@tonic-gate 
640*0Sstevel@tonic-gate /*
641*0Sstevel@tonic-gate  * Get the information on the device from the remote node using the devid
642*0Sstevel@tonic-gate  * of the disk.
643*0Sstevel@tonic-gate  *
644*0Sstevel@tonic-gate  * Exit status:
645*0Sstevel@tonic-gate  *	 0 - No more side info to return
646*0Sstevel@tonic-gate  *	 1 - More side info's to return
647*0Sstevel@tonic-gate  *	-1 - An error has been detected
648*0Sstevel@tonic-gate  */
649*0Sstevel@tonic-gate int
650*0Sstevel@tonic-gate meta_getnextside_devinfo(
651*0Sstevel@tonic-gate 	mdsetname_t	*sp,		/* for this set */
652*0Sstevel@tonic-gate 	char		*bname,		/* local block name (myside) */
653*0Sstevel@tonic-gate 	side_t		*sideno,	/* previous sideno & returned sideno */
654*0Sstevel@tonic-gate 	char		**ret_bname,	/* block device name of returned side */
655*0Sstevel@tonic-gate 	char		**ret_dname,	/* driver name of returned side */
656*0Sstevel@tonic-gate 	minor_t		*ret_mnum,	/* minor number of returned side */
657*0Sstevel@tonic-gate 	md_error_t	*ep
658*0Sstevel@tonic-gate )
659*0Sstevel@tonic-gate {
660*0Sstevel@tonic-gate 	md_set_desc	*sd;
661*0Sstevel@tonic-gate 	int		i;
662*0Sstevel@tonic-gate 	mdname_t	*np;
663*0Sstevel@tonic-gate 	mddrivename_t	*dnp;
664*0Sstevel@tonic-gate 	char		*devidstr = NULL;
665*0Sstevel@tonic-gate 	int		devidstrlen;
666*0Sstevel@tonic-gate 	md_dev64_t	retdev = NODEV64;
667*0Sstevel@tonic-gate 	char		*ret_devname = NULL;
668*0Sstevel@tonic-gate 	char		*ret_blkdevname = NULL;
669*0Sstevel@tonic-gate 	char		*ret_driver = NULL;
670*0Sstevel@tonic-gate 	char		*nodename;
671*0Sstevel@tonic-gate 	int		fd;
672*0Sstevel@tonic-gate 	int		ret = -1;
673*0Sstevel@tonic-gate 	char		*minor_name = NULL;
674*0Sstevel@tonic-gate 	md_mnnode_desc	*nd;
675*0Sstevel@tonic-gate 
676*0Sstevel@tonic-gate 
677*0Sstevel@tonic-gate 	if (ret_bname != NULL)
678*0Sstevel@tonic-gate 		*ret_bname = NULL;
679*0Sstevel@tonic-gate 	if (ret_dname != NULL)
680*0Sstevel@tonic-gate 		*ret_dname = NULL;
681*0Sstevel@tonic-gate 	if (ret_mnum != NULL)
682*0Sstevel@tonic-gate 		*ret_mnum = NODEV32;
683*0Sstevel@tonic-gate 
684*0Sstevel@tonic-gate 	if (metaislocalset(sp)) {
685*0Sstevel@tonic-gate 		/* no more sides - we are done */
686*0Sstevel@tonic-gate 		if (*sideno != MD_SIDEWILD)
687*0Sstevel@tonic-gate 			return (0);
688*0Sstevel@tonic-gate 
689*0Sstevel@tonic-gate 		/* First time through -  set up return sideno */
690*0Sstevel@tonic-gate 		*sideno = 0;
691*0Sstevel@tonic-gate 	} else {
692*0Sstevel@tonic-gate 
693*0Sstevel@tonic-gate 		/*
694*0Sstevel@tonic-gate 		 * Find the next sideno, starting after the one given.
695*0Sstevel@tonic-gate 		 */
696*0Sstevel@tonic-gate 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
697*0Sstevel@tonic-gate 			return (-1);
698*0Sstevel@tonic-gate 
699*0Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
700*0Sstevel@tonic-gate 			nd = sd->sd_nodelist;
701*0Sstevel@tonic-gate 			if ((*sideno == MD_SIDEWILD) &&
702*0Sstevel@tonic-gate 			    (nd != (struct md_mnnode_desc *)NULL)) {
703*0Sstevel@tonic-gate 				*sideno = nd->nd_nodeid;
704*0Sstevel@tonic-gate 			} else {
705*0Sstevel@tonic-gate 				while (nd) {
706*0Sstevel@tonic-gate 					/*
707*0Sstevel@tonic-gate 					 * Found given sideno, now find
708*0Sstevel@tonic-gate 					 * next sideno, if there is one.
709*0Sstevel@tonic-gate 					 */
710*0Sstevel@tonic-gate 					if ((*sideno == nd->nd_nodeid) &&
711*0Sstevel@tonic-gate 					    (nd->nd_next !=
712*0Sstevel@tonic-gate 					    (struct md_mnnode_desc *)NULL)) {
713*0Sstevel@tonic-gate 						*sideno =
714*0Sstevel@tonic-gate 						    nd->nd_next->nd_nodeid;
715*0Sstevel@tonic-gate 						break;
716*0Sstevel@tonic-gate 					}
717*0Sstevel@tonic-gate 					nd = nd->nd_next;
718*0Sstevel@tonic-gate 				}
719*0Sstevel@tonic-gate 				if (nd == NULL) {
720*0Sstevel@tonic-gate 					return (0);
721*0Sstevel@tonic-gate 				}
722*0Sstevel@tonic-gate 			}
723*0Sstevel@tonic-gate 			if (*sideno == MD_SIDEWILD)
724*0Sstevel@tonic-gate 				return (0);
725*0Sstevel@tonic-gate 		} else {
726*0Sstevel@tonic-gate 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
727*0Sstevel@tonic-gate 				/* Find next full slot */
728*0Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] != '\0')
729*0Sstevel@tonic-gate 					break;
730*0Sstevel@tonic-gate 
731*0Sstevel@tonic-gate 			/* No more sides - we are done */
732*0Sstevel@tonic-gate 			if (i == MD_MAXSIDES)
733*0Sstevel@tonic-gate 				return (0);
734*0Sstevel@tonic-gate 
735*0Sstevel@tonic-gate 			/* Set up the return sideno */
736*0Sstevel@tonic-gate 			*sideno = i;
737*0Sstevel@tonic-gate 			nodename = (char *)sd->sd_nodes[i];
738*0Sstevel@tonic-gate 		}
739*0Sstevel@tonic-gate 	}
740*0Sstevel@tonic-gate 
741*0Sstevel@tonic-gate 	/*
742*0Sstevel@tonic-gate 	 * Need to pass the node the devid of the disk and get it to
743*0Sstevel@tonic-gate 	 * send back the details of the disk from that side.
744*0Sstevel@tonic-gate 	 */
745*0Sstevel@tonic-gate 	if ((np = metaname(&sp, bname, ep)) == NULL)
746*0Sstevel@tonic-gate 		return (-1);
747*0Sstevel@tonic-gate 
748*0Sstevel@tonic-gate 	dnp = np->drivenamep;
749*0Sstevel@tonic-gate 
750*0Sstevel@tonic-gate 	/*
751*0Sstevel@tonic-gate 	 * By default, set up the parameters so that they are copied out.
752*0Sstevel@tonic-gate 	 */
753*0Sstevel@tonic-gate 	if (ret_bname != NULL)
754*0Sstevel@tonic-gate 		*ret_bname = Strdup(np->bname);
755*0Sstevel@tonic-gate 
756*0Sstevel@tonic-gate 	if (ret_dname != NULL) {
757*0Sstevel@tonic-gate 		mdcinfo_t	*cinfo;
758*0Sstevel@tonic-gate 
759*0Sstevel@tonic-gate 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
760*0Sstevel@tonic-gate 			return (-1);
761*0Sstevel@tonic-gate 
762*0Sstevel@tonic-gate 		*ret_dname = Strdup(cinfo->dname);
763*0Sstevel@tonic-gate 	}
764*0Sstevel@tonic-gate 
765*0Sstevel@tonic-gate 	if (ret_mnum != NULL)
766*0Sstevel@tonic-gate 		*ret_mnum = meta_getminor(np->dev);
767*0Sstevel@tonic-gate 
768*0Sstevel@tonic-gate 	/*
769*0Sstevel@tonic-gate 	 * Try some optimization. If this is the local set or the device
770*0Sstevel@tonic-gate 	 * is a metadevice then just copy the information. If the device
771*0Sstevel@tonic-gate 	 * does not have a devid (due to not having a minor name) then
772*0Sstevel@tonic-gate 	 * fall back to the pre-devid behaviour of copying the information
773*0Sstevel@tonic-gate 	 * on the device: this is okay because the sanity checks before this
774*0Sstevel@tonic-gate 	 * call would have found any issues with the device. If it's a
775*0Sstevel@tonic-gate 	 * multi-node diskset also just return ie. copy.
776*0Sstevel@tonic-gate 	 */
777*0Sstevel@tonic-gate 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
778*0Sstevel@tonic-gate 	    (MD_MNSET_DESC(sd)))
779*0Sstevel@tonic-gate 		return (1);
780*0Sstevel@tonic-gate 
781*0Sstevel@tonic-gate 	if (np->minor_name == (char *)NULL) {
782*0Sstevel@tonic-gate 		/*
783*0Sstevel@tonic-gate 		 * Have to get the minor name then. The slice should exist
784*0Sstevel@tonic-gate 		 * on the disk because it will have already been repartitioned
785*0Sstevel@tonic-gate 		 * up prior to getting to this point.
786*0Sstevel@tonic-gate 		 */
787*0Sstevel@tonic-gate 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
788*0Sstevel@tonic-gate 			(void) mdsyserror(ep, errno, np->bname);
789*0Sstevel@tonic-gate 			return (-1);
790*0Sstevel@tonic-gate 		}
791*0Sstevel@tonic-gate 		(void) devid_get_minor_name(fd, &minor_name);
792*0Sstevel@tonic-gate 		np->minor_name = Strdup(minor_name);
793*0Sstevel@tonic-gate 		devid_str_free(minor_name);
794*0Sstevel@tonic-gate 		(void) close(fd);
795*0Sstevel@tonic-gate 	}
796*0Sstevel@tonic-gate 
797*0Sstevel@tonic-gate 	/* allocate extra space for "/" and NULL hence +2 */
798*0Sstevel@tonic-gate 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
799*0Sstevel@tonic-gate 	devidstr = (char *)Malloc(devidstrlen);
800*0Sstevel@tonic-gate 
801*0Sstevel@tonic-gate 	/*
802*0Sstevel@tonic-gate 	 * As a minor name is supplied then the ret_devname will be
803*0Sstevel@tonic-gate 	 * appropriate to that minor_name and in this case it will be
804*0Sstevel@tonic-gate 	 * a block device ie /dev/dsk.
805*0Sstevel@tonic-gate 	 */
806*0Sstevel@tonic-gate 	(void) snprintf(devidstr, devidstrlen,
807*0Sstevel@tonic-gate 		"%s/%s", dnp->devid, np->minor_name);
808*0Sstevel@tonic-gate 
809*0Sstevel@tonic-gate 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
810*0Sstevel@tonic-gate 	    np->bname, &ret_devname, &ret_driver, ep);
811*0Sstevel@tonic-gate 
812*0Sstevel@tonic-gate 	Free(devidstr);
813*0Sstevel@tonic-gate 
814*0Sstevel@tonic-gate 	/*
815*0Sstevel@tonic-gate 	 * If the other side is not running device id in disksets,
816*0Sstevel@tonic-gate 	 * 'ret' is set to ENOTSUP in which case we fallback to
817*0Sstevel@tonic-gate 	 * the existing behaviour
818*0Sstevel@tonic-gate 	 */
819*0Sstevel@tonic-gate 	if (ret == ENOTSUP)
820*0Sstevel@tonic-gate 		return (1);
821*0Sstevel@tonic-gate 	else if (ret == -1)
822*0Sstevel@tonic-gate 		return (-1);
823*0Sstevel@tonic-gate 
824*0Sstevel@tonic-gate 	/*
825*0Sstevel@tonic-gate 	 * ret_devname comes from the rpc call and is a
826*0Sstevel@tonic-gate 	 * raw device name. We need to make this into a
827*0Sstevel@tonic-gate 	 * block device via blkname for further processing.
828*0Sstevel@tonic-gate 	 * Unfortunately, when our device id isn't found in
829*0Sstevel@tonic-gate 	 * the system, the rpc call will return a " " in
830*0Sstevel@tonic-gate 	 * ret_devname in which case we need to fill that in
831*0Sstevel@tonic-gate 	 * as ret_blkname because blkname of " " returns NULL.
832*0Sstevel@tonic-gate 	 */
833*0Sstevel@tonic-gate 	if (ret_bname != NULL && ret_devname != NULL) {
834*0Sstevel@tonic-gate 		ret_blkdevname = blkname(ret_devname);
835*0Sstevel@tonic-gate 		if (ret_blkdevname == NULL)
836*0Sstevel@tonic-gate 			*ret_bname = Strdup(ret_devname);
837*0Sstevel@tonic-gate 		else
838*0Sstevel@tonic-gate 			*ret_bname = Strdup(ret_blkdevname);
839*0Sstevel@tonic-gate 	}
840*0Sstevel@tonic-gate 
841*0Sstevel@tonic-gate 	if (ret_dname != NULL && ret_driver != NULL)
842*0Sstevel@tonic-gate 		*ret_dname = Strdup(ret_driver);
843*0Sstevel@tonic-gate 
844*0Sstevel@tonic-gate 	if (ret_mnum != NULL)
845*0Sstevel@tonic-gate 		*ret_mnum = meta_getminor(retdev);
846*0Sstevel@tonic-gate 
847*0Sstevel@tonic-gate 	return (1);
848*0Sstevel@tonic-gate }
849*0Sstevel@tonic-gate 
850*0Sstevel@tonic-gate int
851*0Sstevel@tonic-gate meta_is_drive_in_anyset(
852*0Sstevel@tonic-gate 	mddrivename_t	*dnp,
853*0Sstevel@tonic-gate 	mdsetname_t	**spp,
854*0Sstevel@tonic-gate 	int		bypass_daemon,
855*0Sstevel@tonic-gate 	md_error_t 	*ep
856*0Sstevel@tonic-gate )
857*0Sstevel@tonic-gate {
858*0Sstevel@tonic-gate 	set_t		setno;
859*0Sstevel@tonic-gate 	mdsetname_t	*this_sp;
860*0Sstevel@tonic-gate 	int		is_it;
861*0Sstevel@tonic-gate 	set_t		max_sets;
862*0Sstevel@tonic-gate 
863*0Sstevel@tonic-gate 	if ((max_sets = get_max_sets(ep)) == 0)
864*0Sstevel@tonic-gate 		return (-1);
865*0Sstevel@tonic-gate 
866*0Sstevel@tonic-gate 	assert(spp != NULL);
867*0Sstevel@tonic-gate 	*spp = NULL;
868*0Sstevel@tonic-gate 
869*0Sstevel@tonic-gate 	for (setno = 1; setno < max_sets; setno++) {
870*0Sstevel@tonic-gate 		if (!bypass_daemon) {
871*0Sstevel@tonic-gate 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
872*0Sstevel@tonic-gate 				if (mdismddberror(ep, MDE_DB_NODB)) {
873*0Sstevel@tonic-gate 					mdclrerror(ep);
874*0Sstevel@tonic-gate 					return (0);
875*0Sstevel@tonic-gate 				}
876*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
877*0Sstevel@tonic-gate 					mdclrerror(ep);
878*0Sstevel@tonic-gate 					continue;
879*0Sstevel@tonic-gate 				}
880*0Sstevel@tonic-gate 				return (-1);
881*0Sstevel@tonic-gate 			}
882*0Sstevel@tonic-gate 		} else
883*0Sstevel@tonic-gate 			this_sp = metafakesetname(setno, NULL);
884*0Sstevel@tonic-gate 
885*0Sstevel@tonic-gate 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
886*0Sstevel@tonic-gate 		    bypass_daemon, ep)) == -1) {
887*0Sstevel@tonic-gate 			if (mdiserror(ep, MDE_NO_SET)) {
888*0Sstevel@tonic-gate 				mdclrerror(ep);
889*0Sstevel@tonic-gate 				continue;
890*0Sstevel@tonic-gate 			}
891*0Sstevel@tonic-gate 			return (-1);
892*0Sstevel@tonic-gate 		}
893*0Sstevel@tonic-gate 		if (is_it) {
894*0Sstevel@tonic-gate 			*spp = this_sp;
895*0Sstevel@tonic-gate 			return (0);
896*0Sstevel@tonic-gate 		}
897*0Sstevel@tonic-gate 	}
898*0Sstevel@tonic-gate 	return (0);
899*0Sstevel@tonic-gate }
900*0Sstevel@tonic-gate 
901*0Sstevel@tonic-gate int
902*0Sstevel@tonic-gate meta_is_drive_in_thisset(
903*0Sstevel@tonic-gate 	mdsetname_t	*sp,
904*0Sstevel@tonic-gate 	mddrivename_t	*dnp,
905*0Sstevel@tonic-gate 	int		bypass_daemon,
906*0Sstevel@tonic-gate 	md_error_t	*ep
907*0Sstevel@tonic-gate )
908*0Sstevel@tonic-gate {
909*0Sstevel@tonic-gate 	md_drive_desc	*dd, *p;
910*0Sstevel@tonic-gate 
911*0Sstevel@tonic-gate 	if (bypass_daemon)
912*0Sstevel@tonic-gate 		dd = dr2drivedesc(sp, MD_SIDEWILD,
913*0Sstevel@tonic-gate 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
914*0Sstevel@tonic-gate 	else
915*0Sstevel@tonic-gate 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
916*0Sstevel@tonic-gate 
917*0Sstevel@tonic-gate 	if (dd == NULL) {
918*0Sstevel@tonic-gate 		if (! mdisok(ep))
919*0Sstevel@tonic-gate 			return (-1);
920*0Sstevel@tonic-gate 		return (0);
921*0Sstevel@tonic-gate 	}
922*0Sstevel@tonic-gate 
923*0Sstevel@tonic-gate 
924*0Sstevel@tonic-gate 	for (p = dd; p != NULL; p = p->dd_next)
925*0Sstevel@tonic-gate 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
926*0Sstevel@tonic-gate 			return (1);
927*0Sstevel@tonic-gate 	return (0);
928*0Sstevel@tonic-gate }
929*0Sstevel@tonic-gate 
930*0Sstevel@tonic-gate int
931*0Sstevel@tonic-gate meta_set_balance(
932*0Sstevel@tonic-gate 	mdsetname_t		*sp,
933*0Sstevel@tonic-gate 	md_error_t		*ep
934*0Sstevel@tonic-gate )
935*0Sstevel@tonic-gate {
936*0Sstevel@tonic-gate 	md_set_desc		*sd;
937*0Sstevel@tonic-gate 	md_drive_desc		*dd, *curdd;
938*0Sstevel@tonic-gate 	daddr_t			dbsize;
939*0Sstevel@tonic-gate 	daddr_t			nblks;
940*0Sstevel@tonic-gate 	int			i;
941*0Sstevel@tonic-gate 	int			rval = 0;
942*0Sstevel@tonic-gate 	sigset_t		oldsigs;
943*0Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
944*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
945*0Sstevel@tonic-gate 	md_mnnode_desc		*nd;
946*0Sstevel@tonic-gate 	int			suspend1_flag = 0;
947*0Sstevel@tonic-gate 
948*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
949*0Sstevel@tonic-gate 		return (-1);
950*0Sstevel@tonic-gate 
951*0Sstevel@tonic-gate 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
952*0Sstevel@tonic-gate 
953*0Sstevel@tonic-gate 	/* Make sure we own the set */
954*0Sstevel@tonic-gate 	if (meta_check_ownership(sp, ep) != 0)
955*0Sstevel@tonic-gate 		return (-1);
956*0Sstevel@tonic-gate 
957*0Sstevel@tonic-gate 	/* END CHECK CODE */
958*0Sstevel@tonic-gate 
959*0Sstevel@tonic-gate 	/*
960*0Sstevel@tonic-gate 	 * Get drive descriptors for the drives that are currently in the set.
961*0Sstevel@tonic-gate 	 */
962*0Sstevel@tonic-gate 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
963*0Sstevel@tonic-gate 
964*0Sstevel@tonic-gate 	if (! mdisok(ep))
965*0Sstevel@tonic-gate 		return (-1);
966*0Sstevel@tonic-gate 
967*0Sstevel@tonic-gate 	/* Find the minimum replica size in use is or use the default */
968*0Sstevel@tonic-gate 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
969*0Sstevel@tonic-gate 		mdclrerror(ep);
970*0Sstevel@tonic-gate 	else
971*0Sstevel@tonic-gate 		dbsize = nblks;	/* adjust replica size */
972*0Sstevel@tonic-gate 
973*0Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
974*0Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
975*0Sstevel@tonic-gate 		mdclrerror(&xep);
976*0Sstevel@tonic-gate 
977*0Sstevel@tonic-gate 	/*
978*0Sstevel@tonic-gate 	 * Lock the set on current set members.
979*0Sstevel@tonic-gate 	 * For MN diskset lock_set and SUSPEND are used to protect against
980*0Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
981*0Sstevel@tonic-gate 	 */
982*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
983*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
984*0Sstevel@tonic-gate 		while (nd) {
985*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
986*0Sstevel@tonic-gate 				nd = nd->nd_next;
987*0Sstevel@tonic-gate 				continue;
988*0Sstevel@tonic-gate 			}
989*0Sstevel@tonic-gate 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
990*0Sstevel@tonic-gate 				rval = -1;
991*0Sstevel@tonic-gate 				goto out;
992*0Sstevel@tonic-gate 			}
993*0Sstevel@tonic-gate 			nd = nd->nd_next;
994*0Sstevel@tonic-gate 		}
995*0Sstevel@tonic-gate 		/*
996*0Sstevel@tonic-gate 		 * Lock out other meta* commands by suspending
997*0Sstevel@tonic-gate 		 * class 1 messages across the diskset.
998*0Sstevel@tonic-gate 		 */
999*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1000*0Sstevel@tonic-gate 		while (nd) {
1001*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1002*0Sstevel@tonic-gate 				nd = nd->nd_next;
1003*0Sstevel@tonic-gate 				continue;
1004*0Sstevel@tonic-gate 			}
1005*0Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename,
1006*0Sstevel@tonic-gate 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1007*0Sstevel@tonic-gate 			    MD_MSCF_NO_FLAGS, ep)) {
1008*0Sstevel@tonic-gate 				rval = -1;
1009*0Sstevel@tonic-gate 				goto out;
1010*0Sstevel@tonic-gate 			}
1011*0Sstevel@tonic-gate 			suspend1_flag = 1;
1012*0Sstevel@tonic-gate 			nd = nd->nd_next;
1013*0Sstevel@tonic-gate 		}
1014*0Sstevel@tonic-gate 	} else {
1015*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1016*0Sstevel@tonic-gate 			/* Skip empty slots */
1017*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0') continue;
1018*0Sstevel@tonic-gate 
1019*0Sstevel@tonic-gate 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1020*0Sstevel@tonic-gate 				rval = -1;
1021*0Sstevel@tonic-gate 				goto out;
1022*0Sstevel@tonic-gate 			}
1023*0Sstevel@tonic-gate 		}
1024*0Sstevel@tonic-gate 	}
1025*0Sstevel@tonic-gate 
1026*0Sstevel@tonic-gate 	/* We are not adding or deleting any drives, just balancing */
1027*0Sstevel@tonic-gate 	dd = NULL;
1028*0Sstevel@tonic-gate 
1029*0Sstevel@tonic-gate 	/*
1030*0Sstevel@tonic-gate 	 * Balance the DB's according to the list of existing drives and the
1031*0Sstevel@tonic-gate 	 * list of added drives.
1032*0Sstevel@tonic-gate 	 */
1033*0Sstevel@tonic-gate 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
1034*0Sstevel@tonic-gate 		goto out;
1035*0Sstevel@tonic-gate 
1036*0Sstevel@tonic-gate out:
1037*0Sstevel@tonic-gate 	/*
1038*0Sstevel@tonic-gate 	 * Unlock diskset by resuming class 1 messages across the diskset.
1039*0Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
1040*0Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
1041*0Sstevel@tonic-gate 	 */
1042*0Sstevel@tonic-gate 	if (suspend1_flag) {
1043*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1044*0Sstevel@tonic-gate 		while (nd) {
1045*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1046*0Sstevel@tonic-gate 				nd = nd->nd_next;
1047*0Sstevel@tonic-gate 				continue;
1048*0Sstevel@tonic-gate 			}
1049*0Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1050*0Sstevel@tonic-gate 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1051*0Sstevel@tonic-gate 				/*
1052*0Sstevel@tonic-gate 				 * We are here because we failed to resume
1053*0Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
1054*0Sstevel@tonic-gate 				 * an error from the previous call
1055*0Sstevel@tonic-gate 				 * (meta_db_balance). If the previous call
1056*0Sstevel@tonic-gate 				 * did fail,  we capture that error and
1057*0Sstevel@tonic-gate 				 * generate a perror withthe string,
1058*0Sstevel@tonic-gate 				 * "Unable to resume...".
1059*0Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
1060*0Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
1061*0Sstevel@tonic-gate 				 * clobbered.
1062*0Sstevel@tonic-gate 				 */
1063*0Sstevel@tonic-gate 				if (rval == 0)
1064*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
1065*0Sstevel@tonic-gate 				else
1066*0Sstevel@tonic-gate 					mdclrerror(&xep);
1067*0Sstevel@tonic-gate 				rval = -1;
1068*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
1069*0Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd."));
1070*0Sstevel@tonic-gate 			}
1071*0Sstevel@tonic-gate 			nd = nd->nd_next;
1072*0Sstevel@tonic-gate 		}
1073*0Sstevel@tonic-gate 	}
1074*0Sstevel@tonic-gate 
1075*0Sstevel@tonic-gate 	/* Unlock the set */
1076*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1077*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1078*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1079*0Sstevel@tonic-gate 		while (nd) {
1080*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1081*0Sstevel@tonic-gate 				nd = nd->nd_next;
1082*0Sstevel@tonic-gate 				continue;
1083*0Sstevel@tonic-gate 			}
1084*0Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1085*0Sstevel@tonic-gate 				if (rval == 0)
1086*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
1087*0Sstevel@tonic-gate 				else
1088*0Sstevel@tonic-gate 					mdclrerror(&xep);
1089*0Sstevel@tonic-gate 				rval = -1;
1090*0Sstevel@tonic-gate 			}
1091*0Sstevel@tonic-gate 			nd = nd->nd_next;
1092*0Sstevel@tonic-gate 		}
1093*0Sstevel@tonic-gate 	} else {
1094*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1095*0Sstevel@tonic-gate 			/* Skip empty slots */
1096*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1097*0Sstevel@tonic-gate 				continue;
1098*0Sstevel@tonic-gate 
1099*0Sstevel@tonic-gate 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1100*0Sstevel@tonic-gate 				if (rval == 0)
1101*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
1102*0Sstevel@tonic-gate 				rval = -1;
1103*0Sstevel@tonic-gate 			}
1104*0Sstevel@tonic-gate 		}
1105*0Sstevel@tonic-gate 	}
1106*0Sstevel@tonic-gate 
1107*0Sstevel@tonic-gate 	/* release signals back to what they were on entry */
1108*0Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
1109*0Sstevel@tonic-gate 		mdclrerror(&xep);
1110*0Sstevel@tonic-gate 
1111*0Sstevel@tonic-gate 	cl_set_setkey(NULL);
1112*0Sstevel@tonic-gate 
1113*0Sstevel@tonic-gate 	metaflushsetname(sp);
1114*0Sstevel@tonic-gate 
1115*0Sstevel@tonic-gate 	return (rval);
1116*0Sstevel@tonic-gate }
1117*0Sstevel@tonic-gate 
1118*0Sstevel@tonic-gate int
1119*0Sstevel@tonic-gate meta_set_destroy(
1120*0Sstevel@tonic-gate 	mdsetname_t	*sp,
1121*0Sstevel@tonic-gate 	int		lock_set,
1122*0Sstevel@tonic-gate 	md_error_t	*ep
1123*0Sstevel@tonic-gate )
1124*0Sstevel@tonic-gate {
1125*0Sstevel@tonic-gate 	int		i;
1126*0Sstevel@tonic-gate 	med_rec_t	medr;
1127*0Sstevel@tonic-gate 	md_set_desc	*sd;
1128*0Sstevel@tonic-gate 	md_drive_desc	*dd, *p, *p1;
1129*0Sstevel@tonic-gate 	mddrivename_t	*dnp;
1130*0Sstevel@tonic-gate 	mdname_t	*np;
1131*0Sstevel@tonic-gate 	mdnamelist_t	*nlp = NULL;
1132*0Sstevel@tonic-gate 	int		num_users = 0;
1133*0Sstevel@tonic-gate 	int		has_set;
1134*0Sstevel@tonic-gate 	side_t		mysideno;
1135*0Sstevel@tonic-gate 	sigset_t	oldsigs;
1136*0Sstevel@tonic-gate 	md_error_t	xep = mdnullerror;
1137*0Sstevel@tonic-gate 	md_setkey_t	*cl_sk;
1138*0Sstevel@tonic-gate 	int		rval = 0;
1139*0Sstevel@tonic-gate 	int		delete_end = 1;
1140*0Sstevel@tonic-gate 
1141*0Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
1142*0Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, ep) < 0)
1143*0Sstevel@tonic-gate 		return (-1);
1144*0Sstevel@tonic-gate 
1145*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1146*0Sstevel@tonic-gate 		if (! mdisok(ep))
1147*0Sstevel@tonic-gate 			rval = -1;
1148*0Sstevel@tonic-gate 		goto out;
1149*0Sstevel@tonic-gate 	}
1150*0Sstevel@tonic-gate 
1151*0Sstevel@tonic-gate 	/*
1152*0Sstevel@tonic-gate 	 * meta_set_destroy should not be called for a MN diskset.
1153*0Sstevel@tonic-gate 	 * This routine destroys a set without communicating this information
1154*0Sstevel@tonic-gate 	 * to the other nodes which would lead to an inconsistency in
1155*0Sstevel@tonic-gate 	 * the MN diskset.
1156*0Sstevel@tonic-gate 	 */
1157*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1158*0Sstevel@tonic-gate 		rval = -1;
1159*0Sstevel@tonic-gate 		goto out;
1160*0Sstevel@tonic-gate 	}
1161*0Sstevel@tonic-gate 
1162*0Sstevel@tonic-gate 	/* Continue if a traditional diskset */
1163*0Sstevel@tonic-gate 
1164*0Sstevel@tonic-gate 	/*
1165*0Sstevel@tonic-gate 	 * Check to see who has the set.  If we are not the last user of the
1166*0Sstevel@tonic-gate 	 * set, we will not touch the replicas.
1167*0Sstevel@tonic-gate 	 */
1168*0Sstevel@tonic-gate 	for (i = 0; i < MD_MAXSIDES; i++) {
1169*0Sstevel@tonic-gate 		/* Skip empty slots */
1170*0Sstevel@tonic-gate 		if (sd->sd_nodes[i][0] == '\0')
1171*0Sstevel@tonic-gate 			continue;
1172*0Sstevel@tonic-gate 
1173*0Sstevel@tonic-gate 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
1174*0Sstevel@tonic-gate 		    ep);
1175*0Sstevel@tonic-gate 
1176*0Sstevel@tonic-gate 		if (has_set < 0) {
1177*0Sstevel@tonic-gate 			mdclrerror(ep);
1178*0Sstevel@tonic-gate 		} else
1179*0Sstevel@tonic-gate 			num_users++;
1180*0Sstevel@tonic-gate 	}
1181*0Sstevel@tonic-gate 
1182*0Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
1183*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
1184*0Sstevel@tonic-gate 			rval = -1;
1185*0Sstevel@tonic-gate 			goto out;
1186*0Sstevel@tonic-gate 		}
1187*0Sstevel@tonic-gate 	}
1188*0Sstevel@tonic-gate 
1189*0Sstevel@tonic-gate 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
1190*0Sstevel@tonic-gate 		rval = -1;
1191*0Sstevel@tonic-gate 		goto out;
1192*0Sstevel@tonic-gate 	}
1193*0Sstevel@tonic-gate 
1194*0Sstevel@tonic-gate 	if (lock_set == TRUE) {
1195*0Sstevel@tonic-gate 		/* Lock the set on our side */
1196*0Sstevel@tonic-gate 		if (clnt_lock_set(mynode(), sp, ep)) {
1197*0Sstevel@tonic-gate 			rval = -1;
1198*0Sstevel@tonic-gate 			goto out;
1199*0Sstevel@tonic-gate 		}
1200*0Sstevel@tonic-gate 	}
1201*0Sstevel@tonic-gate 
1202*0Sstevel@tonic-gate 	/*
1203*0Sstevel@tonic-gate 	 * A traditional diskset has no diskset stale information to send
1204*0Sstevel@tonic-gate 	 * since there can only be one owner node at a time.
1205*0Sstevel@tonic-gate 	 */
1206*0Sstevel@tonic-gate 	if (snarf_set(sp, FALSE, ep))
1207*0Sstevel@tonic-gate 		mdclrerror(ep);
1208*0Sstevel@tonic-gate 
1209*0Sstevel@tonic-gate 	if (dd != NULL) {
1210*0Sstevel@tonic-gate 		/*
1211*0Sstevel@tonic-gate 		 * Make sure that no drives are in use as parts of metadrives
1212*0Sstevel@tonic-gate 		 * or hot spare pools, this is one of the few error conditions
1213*0Sstevel@tonic-gate 		 * that will stop this routine, unless the environment has
1214*0Sstevel@tonic-gate 		 * META_DESTROY_SET_OK set, in which case, the operation will
1215*0Sstevel@tonic-gate 		 * proceed.
1216*0Sstevel@tonic-gate 		 */
1217*0Sstevel@tonic-gate 		if (getenv("META_DESTROY_SET_OK") == NULL) {
1218*0Sstevel@tonic-gate 			for (p = dd; p != NULL; p = p->dd_next) {
1219*0Sstevel@tonic-gate 				dnp = p->dd_dnp;
1220*0Sstevel@tonic-gate 
1221*0Sstevel@tonic-gate 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
1222*0Sstevel@tonic-gate 				if (i == -1) {
1223*0Sstevel@tonic-gate 					/* need xep - wire calls clear error */
1224*0Sstevel@tonic-gate 					i = metaget_setownership(sp, &xep);
1225*0Sstevel@tonic-gate 					if (i == -1) {
1226*0Sstevel@tonic-gate 						rval = -1;
1227*0Sstevel@tonic-gate 						goto out;
1228*0Sstevel@tonic-gate 					}
1229*0Sstevel@tonic-gate 
1230*0Sstevel@tonic-gate 					mysideno = getmyside(sp, &xep);
1231*0Sstevel@tonic-gate 
1232*0Sstevel@tonic-gate 					if (mysideno == MD_SIDEWILD) {
1233*0Sstevel@tonic-gate 						rval = -1;
1234*0Sstevel@tonic-gate 						goto out;
1235*0Sstevel@tonic-gate 					}
1236*0Sstevel@tonic-gate 
1237*0Sstevel@tonic-gate 					if (sd->sd_isown[mysideno] == FALSE)
1238*0Sstevel@tonic-gate 						if (halt_set(sp, &xep)) {
1239*0Sstevel@tonic-gate 							rval = -1;
1240*0Sstevel@tonic-gate 							goto out;
1241*0Sstevel@tonic-gate 						}
1242*0Sstevel@tonic-gate 
1243*0Sstevel@tonic-gate 					rval = -1;
1244*0Sstevel@tonic-gate 					goto out;
1245*0Sstevel@tonic-gate 				}
1246*0Sstevel@tonic-gate 			}
1247*0Sstevel@tonic-gate 		}
1248*0Sstevel@tonic-gate 
1249*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1250*0Sstevel@tonic-gate 			/* Skip empty slots */
1251*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1252*0Sstevel@tonic-gate 				continue;
1253*0Sstevel@tonic-gate 
1254*0Sstevel@tonic-gate 			/* Skip non local nodes */
1255*0Sstevel@tonic-gate 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1256*0Sstevel@tonic-gate 				continue;
1257*0Sstevel@tonic-gate 
1258*0Sstevel@tonic-gate 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
1259*0Sstevel@tonic-gate 				mdclrerror(ep);
1260*0Sstevel@tonic-gate 		}
1261*0Sstevel@tonic-gate 
1262*0Sstevel@tonic-gate 		/*
1263*0Sstevel@tonic-gate 		 * Go thru each drive and individually delete the replicas.
1264*0Sstevel@tonic-gate 		 * This way we can ignore individual errors.
1265*0Sstevel@tonic-gate 		 */
1266*0Sstevel@tonic-gate 		for (p = dd; p != NULL; p = p->dd_next) {
1267*0Sstevel@tonic-gate 			uint_t	rep_slice;
1268*0Sstevel@tonic-gate 
1269*0Sstevel@tonic-gate 			dnp = p->dd_dnp;
1270*0Sstevel@tonic-gate 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
1271*0Sstevel@tonic-gate 			    (((np = metaslicename(dnp, rep_slice, ep))
1272*0Sstevel@tonic-gate 				== NULL) &&
1273*0Sstevel@tonic-gate 				((np = metaslicename(dnp, MD_SLICE0, ep))
1274*0Sstevel@tonic-gate 				    == NULL))) {
1275*0Sstevel@tonic-gate 				rval = -1;
1276*0Sstevel@tonic-gate 				goto out;
1277*0Sstevel@tonic-gate 			}
1278*0Sstevel@tonic-gate 
1279*0Sstevel@tonic-gate 			if ((np = metaslicename(dnp,
1280*0Sstevel@tonic-gate 			    rep_slice, ep)) == NULL) {
1281*0Sstevel@tonic-gate 				if ((np = metaslicename(dnp,
1282*0Sstevel@tonic-gate 				    MD_SLICE0, ep)) == NULL) {
1283*0Sstevel@tonic-gate 					rval = -1;
1284*0Sstevel@tonic-gate 					goto out;
1285*0Sstevel@tonic-gate 				}
1286*0Sstevel@tonic-gate 				mdclrerror(ep);
1287*0Sstevel@tonic-gate 			}
1288*0Sstevel@tonic-gate 
1289*0Sstevel@tonic-gate 			/* Yes this is UGLY!!! */
1290*0Sstevel@tonic-gate 			p1 = p->dd_next;
1291*0Sstevel@tonic-gate 			p->dd_next = NULL;
1292*0Sstevel@tonic-gate 			if (rel_own_bydd(sp, p, FALSE, ep))
1293*0Sstevel@tonic-gate 				mdclrerror(ep);
1294*0Sstevel@tonic-gate 			p->dd_next = p1;
1295*0Sstevel@tonic-gate 
1296*0Sstevel@tonic-gate 			if (p->dd_dbcnt == 0)
1297*0Sstevel@tonic-gate 				continue;
1298*0Sstevel@tonic-gate 
1299*0Sstevel@tonic-gate 			/*
1300*0Sstevel@tonic-gate 			 * Skip the replica removal if we are not the last user
1301*0Sstevel@tonic-gate 			 */
1302*0Sstevel@tonic-gate 			if (num_users != 1)
1303*0Sstevel@tonic-gate 				continue;
1304*0Sstevel@tonic-gate 
1305*0Sstevel@tonic-gate 			nlp = NULL;
1306*0Sstevel@tonic-gate 			(void) metanamelist_append(&nlp, np);
1307*0Sstevel@tonic-gate 			if (meta_db_detach(sp, nlp,
1308*0Sstevel@tonic-gate 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
1309*0Sstevel@tonic-gate 				mdclrerror(ep);
1310*0Sstevel@tonic-gate 			metafreenamelist(nlp);
1311*0Sstevel@tonic-gate 		}
1312*0Sstevel@tonic-gate 	}
1313*0Sstevel@tonic-gate 
1314*0Sstevel@tonic-gate 	if (halt_set(sp, ep)) {
1315*0Sstevel@tonic-gate 		rval = -1;
1316*0Sstevel@tonic-gate 		goto out;
1317*0Sstevel@tonic-gate 	}
1318*0Sstevel@tonic-gate 
1319*0Sstevel@tonic-gate 	/* Setup the mediator record */
1320*0Sstevel@tonic-gate 	(void) memset(&medr, '\0', sizeof (med_rec_t));
1321*0Sstevel@tonic-gate 	medr.med_rec_mag = MED_REC_MAGIC;
1322*0Sstevel@tonic-gate 	medr.med_rec_rev = MED_REC_REV;
1323*0Sstevel@tonic-gate 	medr.med_rec_fl  = 0;
1324*0Sstevel@tonic-gate 	medr.med_rec_sn  = sp->setno;
1325*0Sstevel@tonic-gate 	(void) strcpy(medr.med_rec_snm, sp->setname);
1326*0Sstevel@tonic-gate 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
1327*0Sstevel@tonic-gate 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
1328*0Sstevel@tonic-gate 	medr.med_rec_foff = 0;
1329*0Sstevel@tonic-gate 
1330*0Sstevel@tonic-gate 	/*
1331*0Sstevel@tonic-gate 	 * If we are the last remaining user, then remove the mediator hosts
1332*0Sstevel@tonic-gate 	 */
1333*0Sstevel@tonic-gate 	if (num_users == 1) {
1334*0Sstevel@tonic-gate 		for (i = 0; i < MED_MAX_HOSTS; i++) {
1335*0Sstevel@tonic-gate 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
1336*0Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
1337*0Sstevel@tonic-gate 				    SVM_TAG_MEDIATOR, sp->setno, i);
1338*0Sstevel@tonic-gate 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
1339*0Sstevel@tonic-gate 			    sizeof (md_h_t));
1340*0Sstevel@tonic-gate 		}
1341*0Sstevel@tonic-gate 		medr.med_rec_meds.n_cnt = 0;
1342*0Sstevel@tonic-gate 	} else { 	/* Remove this host from the mediator node list. */
1343*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1344*0Sstevel@tonic-gate 			/* Skip empty slots */
1345*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1346*0Sstevel@tonic-gate 				continue;
1347*0Sstevel@tonic-gate 
1348*0Sstevel@tonic-gate 			/* Copy non local node */
1349*0Sstevel@tonic-gate 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
1350*0Sstevel@tonic-gate 				(void) strcpy(medr.med_rec_nodes[i],
1351*0Sstevel@tonic-gate 				    sd->sd_nodes[i]);
1352*0Sstevel@tonic-gate 				continue;
1353*0Sstevel@tonic-gate 			}
1354*0Sstevel@tonic-gate 
1355*0Sstevel@tonic-gate 			/* Clear local node */
1356*0Sstevel@tonic-gate 			(void) memset(&medr.med_rec_nodes[i], '\0',
1357*0Sstevel@tonic-gate 			    sizeof (md_node_nm_t));
1358*0Sstevel@tonic-gate 		}
1359*0Sstevel@tonic-gate 	}
1360*0Sstevel@tonic-gate 
1361*0Sstevel@tonic-gate 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
1362*0Sstevel@tonic-gate 
1363*0Sstevel@tonic-gate 	/*
1364*0Sstevel@tonic-gate 	 * If the client is part of a cluster put the DCS service
1365*0Sstevel@tonic-gate 	 * into a deleteing state.
1366*0Sstevel@tonic-gate 	 */
1367*0Sstevel@tonic-gate 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1368*0Sstevel@tonic-gate 		if (metad_isautotakebyname(sp->setname)) {
1369*0Sstevel@tonic-gate 			delete_end = 0;
1370*0Sstevel@tonic-gate 		} else {
1371*0Sstevel@tonic-gate 			mdclrerror(ep);
1372*0Sstevel@tonic-gate 			goto out;
1373*0Sstevel@tonic-gate 		}
1374*0Sstevel@tonic-gate 	}
1375*0Sstevel@tonic-gate 
1376*0Sstevel@tonic-gate 	/* Inform the mediator hosts of the new information */
1377*0Sstevel@tonic-gate 	for (i = 0; i < MED_MAX_HOSTS; i++) {
1378*0Sstevel@tonic-gate 		if (sd->sd_med.n_lst[i].a_cnt == 0)
1379*0Sstevel@tonic-gate 			continue;
1380*0Sstevel@tonic-gate 
1381*0Sstevel@tonic-gate 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
1382*0Sstevel@tonic-gate 			mdclrerror(ep);
1383*0Sstevel@tonic-gate 	}
1384*0Sstevel@tonic-gate 
1385*0Sstevel@tonic-gate 	/* Delete the set locally */
1386*0Sstevel@tonic-gate 	for (i = 0; i < MD_MAXSIDES; i++) {
1387*0Sstevel@tonic-gate 		/* Skip empty slots */
1388*0Sstevel@tonic-gate 		if (sd->sd_nodes[i][0] == '\0')
1389*0Sstevel@tonic-gate 			continue;
1390*0Sstevel@tonic-gate 
1391*0Sstevel@tonic-gate 		/* Skip non local nodes */
1392*0Sstevel@tonic-gate 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
1393*0Sstevel@tonic-gate 			continue;
1394*0Sstevel@tonic-gate 
1395*0Sstevel@tonic-gate 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
1396*0Sstevel@tonic-gate 			mdclrerror(ep);
1397*0Sstevel@tonic-gate 	}
1398*0Sstevel@tonic-gate 	if (delete_end &&
1399*0Sstevel@tonic-gate 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1400*0Sstevel@tonic-gate 		rval = -1;
1401*0Sstevel@tonic-gate 
1402*0Sstevel@tonic-gate out:
1403*0Sstevel@tonic-gate 	/* release signals back to what they were on entry */
1404*0Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
1405*0Sstevel@tonic-gate 		if (rval == 0)
1406*0Sstevel@tonic-gate 			(void) mdstealerror(ep, &xep);
1407*0Sstevel@tonic-gate 		rval = -1;
1408*0Sstevel@tonic-gate 	}
1409*0Sstevel@tonic-gate 
1410*0Sstevel@tonic-gate 	if (lock_set == TRUE) {
1411*0Sstevel@tonic-gate 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
1412*0Sstevel@tonic-gate 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1413*0Sstevel@tonic-gate 			if (rval == 0)
1414*0Sstevel@tonic-gate 				(void) mdstealerror(ep, &xep);
1415*0Sstevel@tonic-gate 			rval = -1;
1416*0Sstevel@tonic-gate 		}
1417*0Sstevel@tonic-gate 		cl_set_setkey(NULL);
1418*0Sstevel@tonic-gate 	}
1419*0Sstevel@tonic-gate 
1420*0Sstevel@tonic-gate 	metaflushsetname(sp);
1421*0Sstevel@tonic-gate 	return (rval);
1422*0Sstevel@tonic-gate }
1423*0Sstevel@tonic-gate 
1424*0Sstevel@tonic-gate int
1425*0Sstevel@tonic-gate meta_set_purge(
1426*0Sstevel@tonic-gate 	mdsetname_t	*sp,
1427*0Sstevel@tonic-gate 	int		bypass_cluster,
1428*0Sstevel@tonic-gate 	int		forceflg,
1429*0Sstevel@tonic-gate 	md_error_t	*ep
1430*0Sstevel@tonic-gate )
1431*0Sstevel@tonic-gate {
1432*0Sstevel@tonic-gate 	char		*thishost = mynode();
1433*0Sstevel@tonic-gate 	md_set_desc	*sd;
1434*0Sstevel@tonic-gate 	md_setkey_t	*cl_sk;
1435*0Sstevel@tonic-gate 	md_error_t	xep = mdnullerror;
1436*0Sstevel@tonic-gate 	int		rval = 0;
1437*0Sstevel@tonic-gate 	int		i, num_hosts = 0;
1438*0Sstevel@tonic-gate 	int		has_set = 0;
1439*0Sstevel@tonic-gate 	int		max_node = 0;
1440*0Sstevel@tonic-gate 	int		delete_end = 1;
1441*0Sstevel@tonic-gate 	md_mnnode_desc	*nd;
1442*0Sstevel@tonic-gate 
1443*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1444*0Sstevel@tonic-gate 		/* unable to find set description */
1445*0Sstevel@tonic-gate 		rval = 1;
1446*0Sstevel@tonic-gate 		return (rval);
1447*0Sstevel@tonic-gate 	}
1448*0Sstevel@tonic-gate 
1449*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1450*0Sstevel@tonic-gate 		/*
1451*0Sstevel@tonic-gate 		 * Get a count of the hosts in the set and also lock the set
1452*0Sstevel@tonic-gate 		 * on those hosts that know about it.
1453*0Sstevel@tonic-gate 		 */
1454*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1455*0Sstevel@tonic-gate 		while (nd) {
1456*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1457*0Sstevel@tonic-gate 				nd = nd->nd_next;
1458*0Sstevel@tonic-gate 				continue;
1459*0Sstevel@tonic-gate 			}
1460*0Sstevel@tonic-gate 			has_set = nodehasset(sp, nd->nd_nodename,
1461*0Sstevel@tonic-gate 				NHS_NST_EQ, ep);
1462*0Sstevel@tonic-gate 
1463*0Sstevel@tonic-gate 			/*
1464*0Sstevel@tonic-gate 			 * The host is not aware of this set (has_set < 0) or
1465*0Sstevel@tonic-gate 			 * the set does not match (has_set == 0). This check
1466*0Sstevel@tonic-gate 			 * prevents the code getting confused by an apparent
1467*0Sstevel@tonic-gate 			 * inconsistancy in the set's state, this is in the
1468*0Sstevel@tonic-gate 			 * purge code so something is broken in any case and
1469*0Sstevel@tonic-gate 			 * this is just trying to fix the brokeness.
1470*0Sstevel@tonic-gate 			 */
1471*0Sstevel@tonic-gate 			if (has_set <= 0) {
1472*0Sstevel@tonic-gate 				mdclrerror(ep);
1473*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_NOSET;
1474*0Sstevel@tonic-gate 			} else {
1475*0Sstevel@tonic-gate 				num_hosts++;
1476*0Sstevel@tonic-gate 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1477*0Sstevel@tonic-gate 					/*
1478*0Sstevel@tonic-gate 					 * If the force flag is set then
1479*0Sstevel@tonic-gate 					 * ignore any RPC failures because we
1480*0Sstevel@tonic-gate 					 * are only really interested with
1481*0Sstevel@tonic-gate 					 * the set on local node.
1482*0Sstevel@tonic-gate 					 */
1483*0Sstevel@tonic-gate 					if (forceflg && mdanyrpcerror(ep)) {
1484*0Sstevel@tonic-gate 						mdclrerror(ep);
1485*0Sstevel@tonic-gate 					} else {
1486*0Sstevel@tonic-gate 						/*
1487*0Sstevel@tonic-gate 						 * set max_node so that in the
1488*0Sstevel@tonic-gate 						 * unlock code nodes in the
1489*0Sstevel@tonic-gate 						 * set that have not been
1490*0Sstevel@tonic-gate 						 * locked are not unlocked.
1491*0Sstevel@tonic-gate 						 */
1492*0Sstevel@tonic-gate 						max_node = nd->nd_nodeid;
1493*0Sstevel@tonic-gate 						rval = 2;
1494*0Sstevel@tonic-gate 						goto out1;
1495*0Sstevel@tonic-gate 					}
1496*0Sstevel@tonic-gate 				}
1497*0Sstevel@tonic-gate 
1498*0Sstevel@tonic-gate 			}
1499*0Sstevel@tonic-gate 			nd = nd->nd_next;
1500*0Sstevel@tonic-gate 		}
1501*0Sstevel@tonic-gate 		max_node = 0;
1502*0Sstevel@tonic-gate 	} else {
1503*0Sstevel@tonic-gate 		/*
1504*0Sstevel@tonic-gate 		 * Get a count of the hosts in the set and also lock the set
1505*0Sstevel@tonic-gate 		 * on those hosts that know about it.
1506*0Sstevel@tonic-gate 		 */
1507*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1508*0Sstevel@tonic-gate 			/* Skip empty slots */
1509*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1510*0Sstevel@tonic-gate 				continue;
1511*0Sstevel@tonic-gate 
1512*0Sstevel@tonic-gate 			has_set = nodehasset(sp, sd->sd_nodes[i],
1513*0Sstevel@tonic-gate 				NHS_NST_EQ, ep);
1514*0Sstevel@tonic-gate 
1515*0Sstevel@tonic-gate 			/*
1516*0Sstevel@tonic-gate 			 * The host is not aware of this set (has_set < 0) or
1517*0Sstevel@tonic-gate 			 * the set does not match (has_set == 0). This check
1518*0Sstevel@tonic-gate 			 * prevents the code getting confused by an apparent
1519*0Sstevel@tonic-gate 			 * inconsistancy in the set's state, this is in the
1520*0Sstevel@tonic-gate 			 * purge code so something is broken in any case and
1521*0Sstevel@tonic-gate 			 * this is just trying to fix the brokeness.
1522*0Sstevel@tonic-gate 			 */
1523*0Sstevel@tonic-gate 			if (has_set <= 0) {
1524*0Sstevel@tonic-gate 				mdclrerror(ep);
1525*0Sstevel@tonic-gate 				/*
1526*0Sstevel@tonic-gate 				 * set the node to NULL to prevent further
1527*0Sstevel@tonic-gate 				 * requests to this unresponsive node.
1528*0Sstevel@tonic-gate 				 */
1529*0Sstevel@tonic-gate 				sd->sd_nodes[i][0] = '\0';
1530*0Sstevel@tonic-gate 			} else {
1531*0Sstevel@tonic-gate 				num_hosts++;
1532*0Sstevel@tonic-gate 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1533*0Sstevel@tonic-gate 					/*
1534*0Sstevel@tonic-gate 					 * If the force flag is set then
1535*0Sstevel@tonic-gate 					 * ignore any RPC failures because we
1536*0Sstevel@tonic-gate 					 * are only really interested with
1537*0Sstevel@tonic-gate 					 * the set on local node.
1538*0Sstevel@tonic-gate 					 */
1539*0Sstevel@tonic-gate 					if (forceflg && mdanyrpcerror(ep)) {
1540*0Sstevel@tonic-gate 						mdclrerror(ep);
1541*0Sstevel@tonic-gate 					} else {
1542*0Sstevel@tonic-gate 						rval = 2;
1543*0Sstevel@tonic-gate 						/*
1544*0Sstevel@tonic-gate 						 * set max_node so that in the
1545*0Sstevel@tonic-gate 						 * unlock code nodes in the
1546*0Sstevel@tonic-gate 						 * set that have not been
1547*0Sstevel@tonic-gate 						 * locked are not unlocked.
1548*0Sstevel@tonic-gate 						 */
1549*0Sstevel@tonic-gate 						max_node = i;
1550*0Sstevel@tonic-gate 						goto out1;
1551*0Sstevel@tonic-gate 					}
1552*0Sstevel@tonic-gate 				}
1553*0Sstevel@tonic-gate 			}
1554*0Sstevel@tonic-gate 		}
1555*0Sstevel@tonic-gate 		max_node = i;	/* now MD_MAXSIDES */
1556*0Sstevel@tonic-gate 	}
1557*0Sstevel@tonic-gate 	if (!bypass_cluster) {
1558*0Sstevel@tonic-gate 		/*
1559*0Sstevel@tonic-gate 		 * If there is only one host associated with the
1560*0Sstevel@tonic-gate 		 * set then remove the set from the cluster.
1561*0Sstevel@tonic-gate 		 */
1562*0Sstevel@tonic-gate 		if (num_hosts == 1) {
1563*0Sstevel@tonic-gate 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
1564*0Sstevel@tonic-gate 				if (metad_isautotakebyname(sp->setname)) {
1565*0Sstevel@tonic-gate 					delete_end = 0;
1566*0Sstevel@tonic-gate 				} else {
1567*0Sstevel@tonic-gate 					mdclrerror(ep);
1568*0Sstevel@tonic-gate 					rval = 3;
1569*0Sstevel@tonic-gate 					goto out1;
1570*0Sstevel@tonic-gate 				}
1571*0Sstevel@tonic-gate 			}
1572*0Sstevel@tonic-gate 		}
1573*0Sstevel@tonic-gate 	}
1574*0Sstevel@tonic-gate 
1575*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1576*0Sstevel@tonic-gate 		/*
1577*0Sstevel@tonic-gate 		 * Get a count of the hosts in the set and also lock the set
1578*0Sstevel@tonic-gate 		 * on those hosts that know about it.
1579*0Sstevel@tonic-gate 		 */
1580*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1581*0Sstevel@tonic-gate 		while (nd) {
1582*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1583*0Sstevel@tonic-gate 				nd = nd->nd_next;
1584*0Sstevel@tonic-gate 				continue;
1585*0Sstevel@tonic-gate 			}
1586*0Sstevel@tonic-gate 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
1587*0Sstevel@tonic-gate 				/*
1588*0Sstevel@tonic-gate 				 * Tell the remote node to remove this node
1589*0Sstevel@tonic-gate 				 */
1590*0Sstevel@tonic-gate 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
1591*0Sstevel@tonic-gate 					&thishost, ep) == -1) {
1592*0Sstevel@tonic-gate 					/*
1593*0Sstevel@tonic-gate 					 * If we fail to delete ourselves
1594*0Sstevel@tonic-gate 					 * from the remote host it does not
1595*0Sstevel@tonic-gate 					 * really matter because the set is
1596*0Sstevel@tonic-gate 					 * being "purged" from this node. The
1597*0Sstevel@tonic-gate 					 * set can be purged from the other
1598*0Sstevel@tonic-gate 					 * node at a later time.
1599*0Sstevel@tonic-gate 					 */
1600*0Sstevel@tonic-gate 					mdclrerror(ep);
1601*0Sstevel@tonic-gate 				}
1602*0Sstevel@tonic-gate 				nd = nd->nd_next;
1603*0Sstevel@tonic-gate 				continue;
1604*0Sstevel@tonic-gate 			}
1605*0Sstevel@tonic-gate 			/* remove the set from this host */
1606*0Sstevel@tonic-gate 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
1607*0Sstevel@tonic-gate 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1608*0Sstevel@tonic-gate 				if (!bypass_cluster && num_hosts == 1)
1609*0Sstevel@tonic-gate 					(void) sdssc_delete_end(sp->setname,
1610*0Sstevel@tonic-gate 					    SDSSC_CLEANUP);
1611*0Sstevel@tonic-gate 				mdclrerror(ep);
1612*0Sstevel@tonic-gate 				goto out1;
1613*0Sstevel@tonic-gate 			}
1614*0Sstevel@tonic-gate 			nd = nd->nd_next;
1615*0Sstevel@tonic-gate 		}
1616*0Sstevel@tonic-gate 	} else {
1617*0Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1618*0Sstevel@tonic-gate 			/* Skip empty slots */
1619*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1620*0Sstevel@tonic-gate 				continue;
1621*0Sstevel@tonic-gate 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
1622*0Sstevel@tonic-gate 				/*
1623*0Sstevel@tonic-gate 				 * Tell the remote node to remove this node
1624*0Sstevel@tonic-gate 				 */
1625*0Sstevel@tonic-gate 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
1626*0Sstevel@tonic-gate 				    &thishost, ep) == -1) {
1627*0Sstevel@tonic-gate 					/*
1628*0Sstevel@tonic-gate 					 * If we fail to delete ourselves
1629*0Sstevel@tonic-gate 					 * from the remote host it does not
1630*0Sstevel@tonic-gate 					 * really matter because the set is
1631*0Sstevel@tonic-gate 					 * being "purged" from this node. The
1632*0Sstevel@tonic-gate 					 * set can be purged from the other
1633*0Sstevel@tonic-gate 					 * node at a later time.
1634*0Sstevel@tonic-gate 					 */
1635*0Sstevel@tonic-gate 					mdclrerror(ep);
1636*0Sstevel@tonic-gate 				}
1637*0Sstevel@tonic-gate 				continue;
1638*0Sstevel@tonic-gate 			}
1639*0Sstevel@tonic-gate 
1640*0Sstevel@tonic-gate 			/* remove the set from this host */
1641*0Sstevel@tonic-gate 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
1642*0Sstevel@tonic-gate 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
1643*0Sstevel@tonic-gate 				if (!bypass_cluster && num_hosts == 1)
1644*0Sstevel@tonic-gate 					(void) sdssc_delete_end(sp->setname,
1645*0Sstevel@tonic-gate 					    SDSSC_CLEANUP);
1646*0Sstevel@tonic-gate 				mdclrerror(ep);
1647*0Sstevel@tonic-gate 				goto out1;
1648*0Sstevel@tonic-gate 			}
1649*0Sstevel@tonic-gate 		}
1650*0Sstevel@tonic-gate 	}
1651*0Sstevel@tonic-gate 
1652*0Sstevel@tonic-gate 	if (!bypass_cluster && num_hosts == 1) {
1653*0Sstevel@tonic-gate 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
1654*0Sstevel@tonic-gate 		    SDSSC_ERROR) {
1655*0Sstevel@tonic-gate 			rval = 4;
1656*0Sstevel@tonic-gate 		}
1657*0Sstevel@tonic-gate 	}
1658*0Sstevel@tonic-gate 
1659*0Sstevel@tonic-gate out1:
1660*0Sstevel@tonic-gate 
1661*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
1662*0Sstevel@tonic-gate 
1663*0Sstevel@tonic-gate 	/*
1664*0Sstevel@tonic-gate 	 * Remove the set lock on those nodes that had the set locked
1665*0Sstevel@tonic-gate 	 * max_node will either be MD_MAXSIDES or array index of the last
1666*0Sstevel@tonic-gate 	 * node contacted (or rather failed to contact) for traditional
1667*0Sstevel@tonic-gate 	 * diskset.  For a MN diskset, max_node is the node_id of the node
1668*0Sstevel@tonic-gate 	 * that failed the lock.
1669*0Sstevel@tonic-gate 	 */
1670*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1671*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1672*0Sstevel@tonic-gate 		while (nd) {
1673*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1674*0Sstevel@tonic-gate 				nd = nd->nd_next;
1675*0Sstevel@tonic-gate 				continue;
1676*0Sstevel@tonic-gate 			}
1677*0Sstevel@tonic-gate 			if (nd->nd_nodeid == max_node)
1678*0Sstevel@tonic-gate 				break;
1679*0Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
1680*0Sstevel@tonic-gate 				if (forceflg && mdanyrpcerror(&xep)) {
1681*0Sstevel@tonic-gate 					mdclrerror(&xep);
1682*0Sstevel@tonic-gate 					nd = nd->nd_next;
1683*0Sstevel@tonic-gate 					continue;
1684*0Sstevel@tonic-gate 				}
1685*0Sstevel@tonic-gate 				if (rval == 0)
1686*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
1687*0Sstevel@tonic-gate 				rval = 5;
1688*0Sstevel@tonic-gate 			}
1689*0Sstevel@tonic-gate 			nd = nd->nd_next;
1690*0Sstevel@tonic-gate 		}
1691*0Sstevel@tonic-gate 	} else {
1692*0Sstevel@tonic-gate 		for (i = 0; i < max_node; i++) {
1693*0Sstevel@tonic-gate 			/* Skip empty slots */
1694*0Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1695*0Sstevel@tonic-gate 				continue;
1696*0Sstevel@tonic-gate 
1697*0Sstevel@tonic-gate 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1698*0Sstevel@tonic-gate 				if (forceflg && mdanyrpcerror(&xep)) {
1699*0Sstevel@tonic-gate 					mdclrerror(&xep);
1700*0Sstevel@tonic-gate 					continue;
1701*0Sstevel@tonic-gate 				}
1702*0Sstevel@tonic-gate 				if (rval == 0)
1703*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
1704*0Sstevel@tonic-gate 				rval = 5;
1705*0Sstevel@tonic-gate 			}
1706*0Sstevel@tonic-gate 		}
1707*0Sstevel@tonic-gate 	}
1708*0Sstevel@tonic-gate 
1709*0Sstevel@tonic-gate 	cl_set_setkey(NULL);
1710*0Sstevel@tonic-gate 
1711*0Sstevel@tonic-gate 	return (rval);
1712*0Sstevel@tonic-gate }
1713*0Sstevel@tonic-gate 
1714*0Sstevel@tonic-gate int
1715*0Sstevel@tonic-gate meta_set_query(
1716*0Sstevel@tonic-gate 	mdsetname_t		*sp,
1717*0Sstevel@tonic-gate 	mddb_dtag_lst_t		**dtlpp,
1718*0Sstevel@tonic-gate 	md_error_t		*ep
1719*0Sstevel@tonic-gate )
1720*0Sstevel@tonic-gate {
1721*0Sstevel@tonic-gate 	mddb_dtag_get_parm_t	dtgp;
1722*0Sstevel@tonic-gate 
1723*0Sstevel@tonic-gate 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
1724*0Sstevel@tonic-gate 	dtgp.dtgp_setno = sp->setno;
1725*0Sstevel@tonic-gate 
1726*0Sstevel@tonic-gate 	/*CONSTCOND*/
1727*0Sstevel@tonic-gate 	while (1) {
1728*0Sstevel@tonic-gate 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
1729*0Sstevel@tonic-gate 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
1730*0Sstevel@tonic-gate 			    *dtlpp == NULL)
1731*0Sstevel@tonic-gate 				return (mdstealerror(ep, &dtgp.dtgp_mde));
1732*0Sstevel@tonic-gate 			else
1733*0Sstevel@tonic-gate 				break;
1734*0Sstevel@tonic-gate 
1735*0Sstevel@tonic-gate 		/*
1736*0Sstevel@tonic-gate 		 * Run to the end of the list
1737*0Sstevel@tonic-gate 		 */
1738*0Sstevel@tonic-gate 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
1739*0Sstevel@tonic-gate 			/* void */;
1740*0Sstevel@tonic-gate 
1741*0Sstevel@tonic-gate 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
1742*0Sstevel@tonic-gate 
1743*0Sstevel@tonic-gate 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
1744*0Sstevel@tonic-gate 		    sizeof (mddb_dtag_t));
1745*0Sstevel@tonic-gate 
1746*0Sstevel@tonic-gate 		dtgp.dtgp_dt.dt_id++;
1747*0Sstevel@tonic-gate 	}
1748*0Sstevel@tonic-gate 	return (0);
1749*0Sstevel@tonic-gate }
1750*0Sstevel@tonic-gate 
1751*0Sstevel@tonic-gate /*
1752*0Sstevel@tonic-gate  * return drivename get by key
1753*0Sstevel@tonic-gate  */
1754*0Sstevel@tonic-gate mddrivename_t *
1755*0Sstevel@tonic-gate metadrivename_withdrkey(
1756*0Sstevel@tonic-gate 	mdsetname_t	*sp,
1757*0Sstevel@tonic-gate 	side_t		sideno,
1758*0Sstevel@tonic-gate 	mdkey_t		key,
1759*0Sstevel@tonic-gate 	int		flags,
1760*0Sstevel@tonic-gate 	md_error_t	*ep
1761*0Sstevel@tonic-gate )
1762*0Sstevel@tonic-gate {
1763*0Sstevel@tonic-gate 	char		*nm;
1764*0Sstevel@tonic-gate 	mdname_t	*np;
1765*0Sstevel@tonic-gate 	mddrivename_t	*dnp;
1766*0Sstevel@tonic-gate 	ddi_devid_t	devidp;
1767*0Sstevel@tonic-gate 	md_set_desc	*sd;
1768*0Sstevel@tonic-gate 
1769*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
1770*0Sstevel@tonic-gate 		return (NULL);
1771*0Sstevel@tonic-gate 	}
1772*0Sstevel@tonic-gate 
1773*0Sstevel@tonic-gate 	/* get namespace info */
1774*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1775*0Sstevel@tonic-gate 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno,
1776*0Sstevel@tonic-gate 		    key, ep)) == NULL)
1777*0Sstevel@tonic-gate 			return (NULL);
1778*0Sstevel@tonic-gate 	} else {
1779*0Sstevel@tonic-gate 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW,
1780*0Sstevel@tonic-gate 		    key, ep)) == NULL)
1781*0Sstevel@tonic-gate 			return (NULL);
1782*0Sstevel@tonic-gate 	}
1783*0Sstevel@tonic-gate 
1784*0Sstevel@tonic-gate 	/* get device name */
1785*0Sstevel@tonic-gate 	if (flags & PRINT_FAST) {
1786*0Sstevel@tonic-gate 		if ((np = metaname_fast(&sp, nm, ep)) == NULL) {
1787*0Sstevel@tonic-gate 			Free(nm);
1788*0Sstevel@tonic-gate 			return (NULL);
1789*0Sstevel@tonic-gate 		}
1790*0Sstevel@tonic-gate 	} else {
1791*0Sstevel@tonic-gate 		if ((np = metaname(&sp, nm, ep)) == NULL) {
1792*0Sstevel@tonic-gate 			Free(nm);
1793*0Sstevel@tonic-gate 			return (NULL);
1794*0Sstevel@tonic-gate 		}
1795*0Sstevel@tonic-gate 	}
1796*0Sstevel@tonic-gate 	Free(nm);
1797*0Sstevel@tonic-gate 
1798*0Sstevel@tonic-gate 	/* make sure it's OK */
1799*0Sstevel@tonic-gate 	if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0))
1800*0Sstevel@tonic-gate 		return (NULL);
1801*0Sstevel@tonic-gate 
1802*0Sstevel@tonic-gate 	/* get drivename */
1803*0Sstevel@tonic-gate 	dnp = np->drivenamep;
1804*0Sstevel@tonic-gate 	dnp->side_names_key = key;
1805*0Sstevel@tonic-gate 
1806*0Sstevel@tonic-gate 	/*
1807*0Sstevel@tonic-gate 	 * Skip the following devid check if dnp is did device
1808*0Sstevel@tonic-gate 	 * The device id is disabled for did device due to the
1809*0Sstevel@tonic-gate 	 * lack of minor name support in the did driver. The following
1810*0Sstevel@tonic-gate 	 * devid code path can set and propagate the error and
1811*0Sstevel@tonic-gate 	 * eventually prevent did disks from being added to the
1812*0Sstevel@tonic-gate 	 * diskset under SunCluster systems
1813*0Sstevel@tonic-gate 	 */
1814*0Sstevel@tonic-gate 	if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) {
1815*0Sstevel@tonic-gate 		goto out;
1816*0Sstevel@tonic-gate 	}
1817*0Sstevel@tonic-gate 
1818*0Sstevel@tonic-gate 	/* Also, Skip the check if MN diskset, no devid's */
1819*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1820*0Sstevel@tonic-gate 		goto out;
1821*0Sstevel@tonic-gate 	}
1822*0Sstevel@tonic-gate 
1823*0Sstevel@tonic-gate 	/*
1824*0Sstevel@tonic-gate 	 * Get the devid associated with the key.
1825*0Sstevel@tonic-gate 	 *
1826*0Sstevel@tonic-gate 	 * If a devid was returned, it MUST be valid even in
1827*0Sstevel@tonic-gate 	 * the case where a device id has been "updated". The
1828*0Sstevel@tonic-gate 	 * "update" of the device id may have occured due to
1829*0Sstevel@tonic-gate 	 * a firmware upgrade.
1830*0Sstevel@tonic-gate 	 */
1831*0Sstevel@tonic-gate 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
1832*0Sstevel@tonic-gate 	    != NULL) {
1833*0Sstevel@tonic-gate 		dnp->devid = devid_str_encode(devidp, NULL);
1834*0Sstevel@tonic-gate 		free(devidp);
1835*0Sstevel@tonic-gate 	} else {
1836*0Sstevel@tonic-gate 		/*
1837*0Sstevel@tonic-gate 		 * It is okay if replica is not in devid mode
1838*0Sstevel@tonic-gate 		 */
1839*0Sstevel@tonic-gate 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
1840*0Sstevel@tonic-gate 			mdclrerror(ep);
1841*0Sstevel@tonic-gate 			goto out;
1842*0Sstevel@tonic-gate 		}
1843*0Sstevel@tonic-gate 
1844*0Sstevel@tonic-gate 		/*
1845*0Sstevel@tonic-gate 		 * devid is missing so this means that we have
1846*0Sstevel@tonic-gate 		 * just upgraded from a configuration where
1847*0Sstevel@tonic-gate 		 * devid's were not used so try to add in
1848*0Sstevel@tonic-gate 		 * the devid and requery.
1849*0Sstevel@tonic-gate 		 */
1850*0Sstevel@tonic-gate 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key,
1851*0Sstevel@tonic-gate 		    ep) < 0)
1852*0Sstevel@tonic-gate 			return (NULL);
1853*0Sstevel@tonic-gate 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
1854*0Sstevel@tonic-gate 		    sideno+SKEW, key, ep)) == NULL)
1855*0Sstevel@tonic-gate 			return (NULL);
1856*0Sstevel@tonic-gate 		dnp->devid = devid_str_encode(devidp, NULL);
1857*0Sstevel@tonic-gate 		devid_free(devidp);
1858*0Sstevel@tonic-gate 	}
1859*0Sstevel@tonic-gate 
1860*0Sstevel@tonic-gate out:
1861*0Sstevel@tonic-gate 	if (flags & MD_BYPASS_DAEMON)
1862*0Sstevel@tonic-gate 		return (dnp);
1863*0Sstevel@tonic-gate 
1864*0Sstevel@tonic-gate 	if (get_sidenmlist(sp, dnp, ep))
1865*0Sstevel@tonic-gate 		return (NULL);
1866*0Sstevel@tonic-gate 
1867*0Sstevel@tonic-gate 	/* return success */
1868*0Sstevel@tonic-gate 	return (dnp);
1869*0Sstevel@tonic-gate }
1870*0Sstevel@tonic-gate 
1871*0Sstevel@tonic-gate void
1872*0Sstevel@tonic-gate metafreedrivedesc(md_drive_desc **dd)
1873*0Sstevel@tonic-gate {
1874*0Sstevel@tonic-gate 	md_drive_desc	*p, *next = NULL;
1875*0Sstevel@tonic-gate 
1876*0Sstevel@tonic-gate 	for (p = *dd; p != NULL; p = next) {
1877*0Sstevel@tonic-gate 		next = p->dd_next;
1878*0Sstevel@tonic-gate 		Free(p);
1879*0Sstevel@tonic-gate 	}
1880*0Sstevel@tonic-gate 	*dd = NULL;
1881*0Sstevel@tonic-gate }
1882*0Sstevel@tonic-gate 
1883*0Sstevel@tonic-gate md_drive_desc *
1884*0Sstevel@tonic-gate metaget_drivedesc(
1885*0Sstevel@tonic-gate 	mdsetname_t	*sp,
1886*0Sstevel@tonic-gate 	int		flags,
1887*0Sstevel@tonic-gate 	md_error_t	*ep
1888*0Sstevel@tonic-gate )
1889*0Sstevel@tonic-gate {
1890*0Sstevel@tonic-gate 	side_t		sideno = MD_SIDEWILD;
1891*0Sstevel@tonic-gate 
1892*0Sstevel@tonic-gate 	assert(! (flags & MD_BYPASS_DAEMON));
1893*0Sstevel@tonic-gate 
1894*0Sstevel@tonic-gate 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1895*0Sstevel@tonic-gate 		return (NULL);
1896*0Sstevel@tonic-gate 
1897*0Sstevel@tonic-gate 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
1898*0Sstevel@tonic-gate }
1899*0Sstevel@tonic-gate 
1900*0Sstevel@tonic-gate md_drive_desc *
1901*0Sstevel@tonic-gate metaget_drivedesc_fromnamelist(
1902*0Sstevel@tonic-gate 	mdsetname_t	*sp,
1903*0Sstevel@tonic-gate 	mdnamelist_t	*nlp,
1904*0Sstevel@tonic-gate 	md_error_t	*ep
1905*0Sstevel@tonic-gate )
1906*0Sstevel@tonic-gate {
1907*0Sstevel@tonic-gate 	md_set_desc		*sd;
1908*0Sstevel@tonic-gate 	mdnamelist_t		*p;
1909*0Sstevel@tonic-gate 	md_drive_desc		*dd = NULL;
1910*0Sstevel@tonic-gate 
1911*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1912*0Sstevel@tonic-gate 		return (NULL);
1913*0Sstevel@tonic-gate 
1914*0Sstevel@tonic-gate 	for (p = nlp; p != NULL; p = p->next)
1915*0Sstevel@tonic-gate 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
1916*0Sstevel@tonic-gate 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
1917*0Sstevel@tonic-gate 
1918*0Sstevel@tonic-gate 	return (dd);
1919*0Sstevel@tonic-gate }
1920*0Sstevel@tonic-gate 
1921*0Sstevel@tonic-gate md_drive_desc *
1922*0Sstevel@tonic-gate metaget_drivedesc_sideno(
1923*0Sstevel@tonic-gate 	mdsetname_t *sp,
1924*0Sstevel@tonic-gate 	side_t sideno,
1925*0Sstevel@tonic-gate 	int flags,
1926*0Sstevel@tonic-gate 	md_error_t *ep
1927*0Sstevel@tonic-gate )
1928*0Sstevel@tonic-gate {
1929*0Sstevel@tonic-gate 	md_set_desc	*sd = NULL;
1930*0Sstevel@tonic-gate 
1931*0Sstevel@tonic-gate 	assert(! (flags & MD_BYPASS_DAEMON));
1932*0Sstevel@tonic-gate 
1933*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1934*0Sstevel@tonic-gate 		return (NULL);
1935*0Sstevel@tonic-gate 
1936*0Sstevel@tonic-gate 	if (sd->sd_drvs)
1937*0Sstevel@tonic-gate 		return (sd->sd_drvs);
1938*0Sstevel@tonic-gate 
1939*0Sstevel@tonic-gate 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
1940*0Sstevel@tonic-gate 		return (NULL);
1941*0Sstevel@tonic-gate 
1942*0Sstevel@tonic-gate 	return (sd->sd_drvs);
1943*0Sstevel@tonic-gate }
1944*0Sstevel@tonic-gate 
1945*0Sstevel@tonic-gate int
1946*0Sstevel@tonic-gate metaget_setownership(
1947*0Sstevel@tonic-gate 	mdsetname_t	*sp,
1948*0Sstevel@tonic-gate 	md_error_t	*ep
1949*0Sstevel@tonic-gate )
1950*0Sstevel@tonic-gate {
1951*0Sstevel@tonic-gate 	md_set_desc	*sd;
1952*0Sstevel@tonic-gate 	int		bool;
1953*0Sstevel@tonic-gate 	int		i;
1954*0Sstevel@tonic-gate 	md_mnnode_desc	*nd;
1955*0Sstevel@tonic-gate 
1956*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1957*0Sstevel@tonic-gate 		return (-1);
1958*0Sstevel@tonic-gate 
1959*0Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1960*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
1961*0Sstevel@tonic-gate 		while (nd) {
1962*0Sstevel@tonic-gate 			/* If node isn't alive, can't own diskset */
1963*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1964*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1965*0Sstevel@tonic-gate 				nd = nd->nd_next;
1966*0Sstevel@tonic-gate 				continue;
1967*0Sstevel@tonic-gate 			}
1968*0Sstevel@tonic-gate 			/*
1969*0Sstevel@tonic-gate 			 * If can't communicate with rpc.metad, then mark
1970*0Sstevel@tonic-gate 			 * this node as not an owner.  That node may
1971*0Sstevel@tonic-gate 			 * in fact, be an owner, but without rpc.metad running
1972*0Sstevel@tonic-gate 			 * that node can't do much.
1973*0Sstevel@tonic-gate 			 */
1974*0Sstevel@tonic-gate 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
1975*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1976*0Sstevel@tonic-gate 			} else if (bool == TRUE) {
1977*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_OWN;
1978*0Sstevel@tonic-gate 			} else {
1979*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
1980*0Sstevel@tonic-gate 			}
1981*0Sstevel@tonic-gate 			nd = nd->nd_next;
1982*0Sstevel@tonic-gate 		}
1983*0Sstevel@tonic-gate 		return (0);
1984*0Sstevel@tonic-gate 	}
1985*0Sstevel@tonic-gate 
1986*0Sstevel@tonic-gate 	/* Rest of code handles traditional disksets */
1987*0Sstevel@tonic-gate 
1988*0Sstevel@tonic-gate 	for (i = 0; i < MD_MAXSIDES; i++)
1989*0Sstevel@tonic-gate 		sd->sd_isown[i] = 0;
1990*0Sstevel@tonic-gate 
1991*0Sstevel@tonic-gate 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
1992*0Sstevel@tonic-gate 		return (-1);
1993*0Sstevel@tonic-gate 
1994*0Sstevel@tonic-gate 	if (bool == TRUE)
1995*0Sstevel@tonic-gate 		sd->sd_isown[getmyside(sp, ep)] = 1;
1996*0Sstevel@tonic-gate 
1997*0Sstevel@tonic-gate 	return (0);
1998*0Sstevel@tonic-gate }
1999*0Sstevel@tonic-gate 
2000*0Sstevel@tonic-gate char *
2001*0Sstevel@tonic-gate mynode(void)
2002*0Sstevel@tonic-gate {
2003*0Sstevel@tonic-gate 	static struct utsname	myuname;
2004*0Sstevel@tonic-gate 	static int		done = 0;
2005*0Sstevel@tonic-gate 
2006*0Sstevel@tonic-gate 	if (! done) {
2007*0Sstevel@tonic-gate 		if (uname(&myuname) == -1) {
2008*0Sstevel@tonic-gate 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
2009*0Sstevel@tonic-gate 			assert(0);
2010*0Sstevel@tonic-gate 		}
2011*0Sstevel@tonic-gate 		done = 1;
2012*0Sstevel@tonic-gate 	}
2013*0Sstevel@tonic-gate 	return (myuname.nodename);
2014*0Sstevel@tonic-gate }
2015*0Sstevel@tonic-gate 
2016*0Sstevel@tonic-gate int
2017*0Sstevel@tonic-gate strinlst(char *str, int cnt, char **lst)
2018*0Sstevel@tonic-gate {
2019*0Sstevel@tonic-gate 	int i;
2020*0Sstevel@tonic-gate 
2021*0Sstevel@tonic-gate 	for (i = 0; i < cnt; i++)
2022*0Sstevel@tonic-gate 		if (strcmp(lst[i], str) == 0)
2023*0Sstevel@tonic-gate 			return (TRUE);
2024*0Sstevel@tonic-gate 
2025*0Sstevel@tonic-gate 	return (FALSE);
2026*0Sstevel@tonic-gate }
2027*0Sstevel@tonic-gate 
2028*0Sstevel@tonic-gate /*
2029*0Sstevel@tonic-gate  * meta_get_reserved_names
2030*0Sstevel@tonic-gate  *  returns an mdnamelist_t of reserved slices
2031*0Sstevel@tonic-gate  *  reserved slices are those that are used but don't necessarily
2032*0Sstevel@tonic-gate  *  show up as metadevices (ex. reserved slice for db in sets, logs)
2033*0Sstevel@tonic-gate  */
2034*0Sstevel@tonic-gate 
2035*0Sstevel@tonic-gate /*ARGSUSED*/
2036*0Sstevel@tonic-gate int
2037*0Sstevel@tonic-gate meta_get_reserved_names(
2038*0Sstevel@tonic-gate 	mdsetname_t	*sp,
2039*0Sstevel@tonic-gate 	mdnamelist_t	**nlpp,
2040*0Sstevel@tonic-gate 	int		options,
2041*0Sstevel@tonic-gate 	md_error_t	*ep)
2042*0Sstevel@tonic-gate {
2043*0Sstevel@tonic-gate 	int		 count		= 0;
2044*0Sstevel@tonic-gate 	mdname_t	*np		= NULL;
2045*0Sstevel@tonic-gate 	mdnamelist_t	*transnlp	= NULL;
2046*0Sstevel@tonic-gate 	mdnamelist_t	**tailpp 	= nlpp;
2047*0Sstevel@tonic-gate 	mdnamelist_t	*nlp;
2048*0Sstevel@tonic-gate 	md_drive_desc	*dd, *di;
2049*0Sstevel@tonic-gate 
2050*0Sstevel@tonic-gate 	if (metaislocalset(sp))
2051*0Sstevel@tonic-gate 		goto out;
2052*0Sstevel@tonic-gate 
2053*0Sstevel@tonic-gate 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
2054*0Sstevel@tonic-gate 		count = -1;
2055*0Sstevel@tonic-gate 		goto out;
2056*0Sstevel@tonic-gate 	}
2057*0Sstevel@tonic-gate 
2058*0Sstevel@tonic-gate 	/* db in for sets on reserved slice */
2059*0Sstevel@tonic-gate 	for (di = dd; di && count >= 0; di = di->dd_next) {
2060*0Sstevel@tonic-gate 		uint_t	rep_slice;
2061*0Sstevel@tonic-gate 
2062*0Sstevel@tonic-gate 		/*
2063*0Sstevel@tonic-gate 		 * Add the name struct to the end of the
2064*0Sstevel@tonic-gate 		 * namelist but keep a pointer to the last
2065*0Sstevel@tonic-gate 		 * element so that we don't incur the overhead
2066*0Sstevel@tonic-gate 		 * of traversing the list each time
2067*0Sstevel@tonic-gate 		 */
2068*0Sstevel@tonic-gate 		if (di->dd_dnp &&
2069*0Sstevel@tonic-gate 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
2070*0Sstevel@tonic-gate 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
2071*0Sstevel@tonic-gate 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
2072*0Sstevel@tonic-gate 			count++;
2073*0Sstevel@tonic-gate 		else
2074*0Sstevel@tonic-gate 			count = -1;
2075*0Sstevel@tonic-gate 	}
2076*0Sstevel@tonic-gate 
2077*0Sstevel@tonic-gate 	/* now find logs */
2078*0Sstevel@tonic-gate 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
2079*0Sstevel@tonic-gate 		count = -1;
2080*0Sstevel@tonic-gate 		goto out;
2081*0Sstevel@tonic-gate 	}
2082*0Sstevel@tonic-gate 
2083*0Sstevel@tonic-gate 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
2084*0Sstevel@tonic-gate 		mdname_t	*transnp = nlp->namep;
2085*0Sstevel@tonic-gate 		md_trans_t	*transp;
2086*0Sstevel@tonic-gate 
2087*0Sstevel@tonic-gate 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
2088*0Sstevel@tonic-gate 			count = -1;
2089*0Sstevel@tonic-gate 			goto out;
2090*0Sstevel@tonic-gate 		}
2091*0Sstevel@tonic-gate 		if (transp->lognamep) {
2092*0Sstevel@tonic-gate 			/*
2093*0Sstevel@tonic-gate 			 * Add the name struct to the end of the
2094*0Sstevel@tonic-gate 			 * namelist but keep a pointer to the last
2095*0Sstevel@tonic-gate 			 * element so that we don't incur the overhead
2096*0Sstevel@tonic-gate 			 * of traversing the list each time
2097*0Sstevel@tonic-gate 			 */
2098*0Sstevel@tonic-gate 			tailpp = meta_namelist_append_wrapper(
2099*0Sstevel@tonic-gate 			    tailpp, transp->lognamep);
2100*0Sstevel@tonic-gate 		}
2101*0Sstevel@tonic-gate 	}
2102*0Sstevel@tonic-gate out:
2103*0Sstevel@tonic-gate 	metafreenamelist(transnlp);
2104*0Sstevel@tonic-gate 	return (count);
2105*0Sstevel@tonic-gate }
2106*0Sstevel@tonic-gate 
2107*0Sstevel@tonic-gate /*
2108*0Sstevel@tonic-gate  * Entry point to join a node to MultiNode diskset.
2109*0Sstevel@tonic-gate  *
2110*0Sstevel@tonic-gate  * Validate host in diskset.
2111*0Sstevel@tonic-gate  *	- Should be in membership list from API
2112*0Sstevel@tonic-gate  *	- Should not already be joined into diskset.
2113*0Sstevel@tonic-gate  *	- Set must have drives
2114*0Sstevel@tonic-gate  * Assume valid configuration is stored in the set/drive/node records
2115*0Sstevel@tonic-gate  * in the local mddb since no node or drive can be added to the MNset
2116*0Sstevel@tonic-gate  * unless all drives and nodes are available.  Reconfig steps will
2117*0Sstevel@tonic-gate  * resync all ALIVE nodes in case of panic in critical areas.
2118*0Sstevel@tonic-gate  *
2119*0Sstevel@tonic-gate  * Lock down the set.
2120*0Sstevel@tonic-gate  * Verify host is a member of this diskset.
2121*0Sstevel@tonic-gate  * If drives exist in the configuration, load the mddbs.
2122*0Sstevel@tonic-gate  * Set this node to active by notifying master if one exists.
2123*0Sstevel@tonic-gate  * If this is the first node active in the diskset, this node
2124*0Sstevel@tonic-gate  * 	becomes the master.
2125*0Sstevel@tonic-gate  * Unlock the set.
2126*0Sstevel@tonic-gate  *
2127*0Sstevel@tonic-gate  * Mirror Resync:
2128*0Sstevel@tonic-gate  * If this node is the last node to join the set and clustering
2129*0Sstevel@tonic-gate  * isn't running, then start the 'metasync -r' type resync
2130*0Sstevel@tonic-gate  * on all mirrors in this diskset.
2131*0Sstevel@tonic-gate  * If clustering is running, this resync operation will
2132*0Sstevel@tonic-gate  * be handled by the reconfig steps and should NOT
2133*0Sstevel@tonic-gate  * be handled during a join operation.
2134*0Sstevel@tonic-gate  *
2135*0Sstevel@tonic-gate  * There are multiple return values in order to assist
2136*0Sstevel@tonic-gate  * the join operation of all sets in the metaset command.
2137*0Sstevel@tonic-gate  *
2138*0Sstevel@tonic-gate  * Return values:
2139*0Sstevel@tonic-gate  *	0  - Node successfully joined to set.
2140*0Sstevel@tonic-gate  *	-1 - Join attempted but failed
2141*0Sstevel@tonic-gate  *		- any failure from libmeta calls
2142*0Sstevel@tonic-gate  *		- node not in the member list
2143*0Sstevel@tonic-gate  *	-2 - Join not attempted since
2144*0Sstevel@tonic-gate  *		- this set had no drives in set
2145*0Sstevel@tonic-gate  *		- this node already joined to set
2146*0Sstevel@tonic-gate  *		- set is not a multinode set
2147*0Sstevel@tonic-gate  *	-3 - Node joined to STALE set.
2148*0Sstevel@tonic-gate  */
2149*0Sstevel@tonic-gate extern int
2150*0Sstevel@tonic-gate meta_set_join(
2151*0Sstevel@tonic-gate 	mdsetname_t	*sp,
2152*0Sstevel@tonic-gate 	md_error_t	*ep
2153*0Sstevel@tonic-gate )
2154*0Sstevel@tonic-gate {
2155*0Sstevel@tonic-gate 	md_set_desc		*sd;
2156*0Sstevel@tonic-gate 	md_drive_desc		*dd;
2157*0Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2, my_nd;
2158*0Sstevel@tonic-gate 	int			rval = 0;
2159*0Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
2160*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
2161*0Sstevel@tonic-gate 	md_error_t		ep_snarf = mdnullerror;
2162*0Sstevel@tonic-gate 	int			master_flag = 0;
2163*0Sstevel@tonic-gate 	md_mnset_record		*mas_mnsr = NULL;
2164*0Sstevel@tonic-gate 	int			clear_nr_flags = 0;
2165*0Sstevel@tonic-gate 	md_mnnode_record	*nr;
2166*0Sstevel@tonic-gate 	int			stale_set = 0;
2167*0Sstevel@tonic-gate 	int			rb_flags = 0;
2168*0Sstevel@tonic-gate 	int			stale_bool = FALSE;
2169*0Sstevel@tonic-gate 	int			suspendall_flag = 0;
2170*0Sstevel@tonic-gate 	int			suspend1_flag = 0;
2171*0Sstevel@tonic-gate 	sigset_t		oldsigs;
2172*0Sstevel@tonic-gate 	int			send_reinit = 0;
2173*0Sstevel@tonic-gate 
2174*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2175*0Sstevel@tonic-gate 		return (-1);
2176*0Sstevel@tonic-gate 	}
2177*0Sstevel@tonic-gate 
2178*0Sstevel@tonic-gate 	/* Must be a multinode diskset */
2179*0Sstevel@tonic-gate 	if (!MD_MNSET_DESC(sd)) {
2180*0Sstevel@tonic-gate 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2181*0Sstevel@tonic-gate 		return (-2);
2182*0Sstevel@tonic-gate 	}
2183*0Sstevel@tonic-gate 
2184*0Sstevel@tonic-gate 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
2185*0Sstevel@tonic-gate 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
2186*0Sstevel@tonic-gate 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
2187*0Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_nodename, NULL,
2188*0Sstevel@tonic-gate 			sp->setname);
2189*0Sstevel@tonic-gate 		return (-1);
2190*0Sstevel@tonic-gate 	}
2191*0Sstevel@tonic-gate 
2192*0Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
2193*0Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2194*0Sstevel@tonic-gate 		mdclrerror(&xep);
2195*0Sstevel@tonic-gate 
2196*0Sstevel@tonic-gate 	/*
2197*0Sstevel@tonic-gate 	 * Lock the set on current set members.
2198*0Sstevel@tonic-gate 	 * For MN diskset lock_set and SUSPEND are used to protect against
2199*0Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
2200*0Sstevel@tonic-gate 	 */
2201*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2202*0Sstevel@tonic-gate 	while (nd) {
2203*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2204*0Sstevel@tonic-gate 			nd = nd->nd_next;
2205*0Sstevel@tonic-gate 			continue;
2206*0Sstevel@tonic-gate 		}
2207*0Sstevel@tonic-gate 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2208*0Sstevel@tonic-gate 			rval = -1;
2209*0Sstevel@tonic-gate 			goto out;
2210*0Sstevel@tonic-gate 		}
2211*0Sstevel@tonic-gate 		nd = nd->nd_next;
2212*0Sstevel@tonic-gate 	}
2213*0Sstevel@tonic-gate 
2214*0Sstevel@tonic-gate 	/*
2215*0Sstevel@tonic-gate 	 * Lock out other meta* commands by suspending
2216*0Sstevel@tonic-gate 	 * class 1 messages across the diskset.
2217*0Sstevel@tonic-gate 	 */
2218*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2219*0Sstevel@tonic-gate 	while (nd) {
2220*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2221*0Sstevel@tonic-gate 			nd = nd->nd_next;
2222*0Sstevel@tonic-gate 			continue;
2223*0Sstevel@tonic-gate 		}
2224*0Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2225*0Sstevel@tonic-gate 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2226*0Sstevel@tonic-gate 			rval = -1;
2227*0Sstevel@tonic-gate 			goto out;
2228*0Sstevel@tonic-gate 		}
2229*0Sstevel@tonic-gate 		suspend1_flag = 1;
2230*0Sstevel@tonic-gate 		nd = nd->nd_next;
2231*0Sstevel@tonic-gate 	}
2232*0Sstevel@tonic-gate 
2233*0Sstevel@tonic-gate 	/*
2234*0Sstevel@tonic-gate 	 * Verify that this host is a member (in the host list) of the set.
2235*0Sstevel@tonic-gate 	 */
2236*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2237*0Sstevel@tonic-gate 	while (nd) {
2238*0Sstevel@tonic-gate 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2239*0Sstevel@tonic-gate 			break;
2240*0Sstevel@tonic-gate 		}
2241*0Sstevel@tonic-gate 		nd = nd->nd_next;
2242*0Sstevel@tonic-gate 	}
2243*0Sstevel@tonic-gate 	if (!nd) {
2244*0Sstevel@tonic-gate 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2245*0Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_nodename, NULL,
2246*0Sstevel@tonic-gate 			sp->setname);
2247*0Sstevel@tonic-gate 		rval = -1;
2248*0Sstevel@tonic-gate 		goto out;
2249*0Sstevel@tonic-gate 	}
2250*0Sstevel@tonic-gate 
2251*0Sstevel@tonic-gate 	/*
2252*0Sstevel@tonic-gate 	 * Need to return failure if host is already 'joined'
2253*0Sstevel@tonic-gate 	 * into the set.  This is done so that if later the user
2254*0Sstevel@tonic-gate 	 * issues a command to join all sets and a failure is
2255*0Sstevel@tonic-gate 	 * encountered - that the resulting cleanup effort
2256*0Sstevel@tonic-gate 	 * (withdrawing from all sets that were joined
2257*0Sstevel@tonic-gate 	 * during that command) won't withdraw from this set.
2258*0Sstevel@tonic-gate 	 */
2259*0Sstevel@tonic-gate 	if (nd->nd_flags & MD_MN_NODE_OWN) {
2260*0Sstevel@tonic-gate 		rval = -2;
2261*0Sstevel@tonic-gate 		goto out2;
2262*0Sstevel@tonic-gate 	}
2263*0Sstevel@tonic-gate 
2264*0Sstevel@tonic-gate 	/*
2265*0Sstevel@tonic-gate 	 * Call metaget_setownership that calls each node in diskset and
2266*0Sstevel@tonic-gate 	 * marks in set descriptor if node is an owner of the set or not.
2267*0Sstevel@tonic-gate 	 * metaget_setownership checks to see if a node is an owner by
2268*0Sstevel@tonic-gate 	 * checking to see if that node's kernel has the mddb loaded.
2269*0Sstevel@tonic-gate 	 * If a node had panic'd during a reconfig or an
2270*0Sstevel@tonic-gate 	 * add/delete/join/withdraw operation, the other nodes' node
2271*0Sstevel@tonic-gate 	 * records may not reflect the current state of the diskset,
2272*0Sstevel@tonic-gate 	 * so calling metaget_setownership is the safest thing to do.
2273*0Sstevel@tonic-gate 	 */
2274*0Sstevel@tonic-gate 	if (metaget_setownership(sp, ep) == -1) {
2275*0Sstevel@tonic-gate 		rval = -1;
2276*0Sstevel@tonic-gate 		goto out;
2277*0Sstevel@tonic-gate 	}
2278*0Sstevel@tonic-gate 
2279*0Sstevel@tonic-gate 	/* If first active member of diskset, become the master. */
2280*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2281*0Sstevel@tonic-gate 	while (nd) {
2282*0Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_OWN)
2283*0Sstevel@tonic-gate 			break;
2284*0Sstevel@tonic-gate 		nd = nd->nd_next;
2285*0Sstevel@tonic-gate 	}
2286*0Sstevel@tonic-gate 	if (nd == NULL)
2287*0Sstevel@tonic-gate 		master_flag = 1;
2288*0Sstevel@tonic-gate 
2289*0Sstevel@tonic-gate 	/*
2290*0Sstevel@tonic-gate 	 * If not first active member of diskset, then get the
2291*0Sstevel@tonic-gate 	 * master information from a node that is already joined
2292*0Sstevel@tonic-gate 	 * and set the master information for this node.  Be sure
2293*0Sstevel@tonic-gate 	 * that this node (the already joined node) has its own
2294*0Sstevel@tonic-gate 	 * join flag set.  If not, then this diskset isn't currently
2295*0Sstevel@tonic-gate 	 * consistent and shouldn't allow a node to join.  This diskset
2296*0Sstevel@tonic-gate 	 * inconsistency should only occur when a node has panic'd in
2297*0Sstevel@tonic-gate 	 * the set while doing a metaset operation and the sysadmin is
2298*0Sstevel@tonic-gate 	 * attempting to join a node into the set.  This inconsistency
2299*0Sstevel@tonic-gate 	 * will be fixed during a reconfig cycle which should be occurring
2300*0Sstevel@tonic-gate 	 * soon since a node panic'd.
2301*0Sstevel@tonic-gate 	 *
2302*0Sstevel@tonic-gate 	 * If unable to get this information from an owning node, then
2303*0Sstevel@tonic-gate 	 * this diskset isn't currently consistent and shouldn't
2304*0Sstevel@tonic-gate 	 * allow a node to join.
2305*0Sstevel@tonic-gate 	 */
2306*0Sstevel@tonic-gate 	if (!master_flag) {
2307*0Sstevel@tonic-gate 		/* get master information from an owner (joined) node */
2308*0Sstevel@tonic-gate 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
2309*0Sstevel@tonic-gate 		    sp->setno, &mas_mnsr, ep) == -1) {
2310*0Sstevel@tonic-gate 			rval = -1;
2311*0Sstevel@tonic-gate 			goto out;
2312*0Sstevel@tonic-gate 		}
2313*0Sstevel@tonic-gate 
2314*0Sstevel@tonic-gate 		/* Verify that owner (joined) node has its own JOIN flag set */
2315*0Sstevel@tonic-gate 		nr = mas_mnsr->sr_nodechain;
2316*0Sstevel@tonic-gate 		while (nr) {
2317*0Sstevel@tonic-gate 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
2318*0Sstevel@tonic-gate 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
2319*0Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_NODENOSET,
2320*0Sstevel@tonic-gate 				    sp->setno, nd->nd_nodename, NULL,
2321*0Sstevel@tonic-gate 				    nd->nd_nodename);
2322*0Sstevel@tonic-gate 				free_sr((md_set_record *)mas_mnsr);
2323*0Sstevel@tonic-gate 				rval = -1;
2324*0Sstevel@tonic-gate 				goto out;
2325*0Sstevel@tonic-gate 			}
2326*0Sstevel@tonic-gate 			nr = nr->nr_next;
2327*0Sstevel@tonic-gate 		}
2328*0Sstevel@tonic-gate 
2329*0Sstevel@tonic-gate 		/*
2330*0Sstevel@tonic-gate 		 * Does master have set marked as STALE?
2331*0Sstevel@tonic-gate 		 * If so, need to pass this down to kernel when
2332*0Sstevel@tonic-gate 		 * this node snarfs the set.
2333*0Sstevel@tonic-gate 		 */
2334*0Sstevel@tonic-gate 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
2335*0Sstevel@tonic-gate 		    &stale_bool, ep) == -1) {
2336*0Sstevel@tonic-gate 			rval = -1;
2337*0Sstevel@tonic-gate 			goto out;
2338*0Sstevel@tonic-gate 		}
2339*0Sstevel@tonic-gate 
2340*0Sstevel@tonic-gate 		/* set master information in my rpc.metad's set record */
2341*0Sstevel@tonic-gate 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
2342*0Sstevel@tonic-gate 		    mas_mnsr->sr_master_nodeid, ep)) {
2343*0Sstevel@tonic-gate 			free_sr((md_set_record *)mas_mnsr);
2344*0Sstevel@tonic-gate 			rval = -1;
2345*0Sstevel@tonic-gate 			goto out;
2346*0Sstevel@tonic-gate 		}
2347*0Sstevel@tonic-gate 
2348*0Sstevel@tonic-gate 		/* set master information in my cached set desc */
2349*0Sstevel@tonic-gate 		(void) strcpy(sd->sd_mn_master_nodenm,
2350*0Sstevel@tonic-gate 		    mas_mnsr->sr_master_nodenm);
2351*0Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
2352*0Sstevel@tonic-gate 		nd2 = sd->sd_nodelist;
2353*0Sstevel@tonic-gate 		while (nd2) {
2354*0Sstevel@tonic-gate 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
2355*0Sstevel@tonic-gate 			sd->sd_mn_masternode = nd2;
2356*0Sstevel@tonic-gate 			break;
2357*0Sstevel@tonic-gate 		    }
2358*0Sstevel@tonic-gate 		    nd2 = nd2->nd_next;
2359*0Sstevel@tonic-gate 		}
2360*0Sstevel@tonic-gate 		free_sr((md_set_record *)mas_mnsr);
2361*0Sstevel@tonic-gate 
2362*0Sstevel@tonic-gate 		/*
2363*0Sstevel@tonic-gate 		 * Set the node flags in mynode's rpc.metad node records for
2364*0Sstevel@tonic-gate 		 * the nodes that are in the diskset.  Can use my sd
2365*0Sstevel@tonic-gate 		 * since earlier call to metaget_setownership set the
2366*0Sstevel@tonic-gate 		 * owner flags based on whether that node had snarfed
2367*0Sstevel@tonic-gate 		 * the MN diskset mddb.  Reconfig steps guarantee that
2368*0Sstevel@tonic-gate 		 * return of metaget_setownership will match the owning
2369*0Sstevel@tonic-gate 		 * node's owner list except in the case where a node
2370*0Sstevel@tonic-gate 		 * has just panic'd and in this case, a reconfig will
2371*0Sstevel@tonic-gate 		 * be starting immediately and the owner lists will
2372*0Sstevel@tonic-gate 		 * be sync'd up by the reconfig.
2373*0Sstevel@tonic-gate 		 *
2374*0Sstevel@tonic-gate 		 * Flag of SET means to take no action except to
2375*0Sstevel@tonic-gate 		 * set the node flags as given in the nodelist linked list.
2376*0Sstevel@tonic-gate 		 */
2377*0Sstevel@tonic-gate 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
2378*0Sstevel@tonic-gate 		    MD_NR_SET, NULL, ep)) {
2379*0Sstevel@tonic-gate 			rval = -1;
2380*0Sstevel@tonic-gate 			goto out;
2381*0Sstevel@tonic-gate 		}
2382*0Sstevel@tonic-gate 	}
2383*0Sstevel@tonic-gate 
2384*0Sstevel@tonic-gate 	/*
2385*0Sstevel@tonic-gate 	 * Read in the mddb if there are drives in the set.
2386*0Sstevel@tonic-gate 	 */
2387*0Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2388*0Sstevel@tonic-gate 	    ep)) == NULL) {
2389*0Sstevel@tonic-gate 		/* No drives in list */
2390*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
2391*0Sstevel@tonic-gate 			rval = -1;
2392*0Sstevel@tonic-gate 			goto out;
2393*0Sstevel@tonic-gate 		}
2394*0Sstevel@tonic-gate 		rval = -2;
2395*0Sstevel@tonic-gate 		goto out;
2396*0Sstevel@tonic-gate 	}
2397*0Sstevel@tonic-gate 
2398*0Sstevel@tonic-gate 	/*
2399*0Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2400*0Sstevel@tonic-gate 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2401*0Sstevel@tonic-gate 	 * then change the nodelist followed by a reinit and resume.
2402*0Sstevel@tonic-gate 	 */
2403*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2404*0Sstevel@tonic-gate 	while (nd) {
2405*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2406*0Sstevel@tonic-gate 			nd = nd->nd_next;
2407*0Sstevel@tonic-gate 			continue;
2408*0Sstevel@tonic-gate 		}
2409*0Sstevel@tonic-gate 
2410*0Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
2411*0Sstevel@tonic-gate 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2412*0Sstevel@tonic-gate 			rval = -1;
2413*0Sstevel@tonic-gate 			goto out;
2414*0Sstevel@tonic-gate 		}
2415*0Sstevel@tonic-gate 		suspendall_flag = 1;
2416*0Sstevel@tonic-gate 		nd = nd->nd_next;
2417*0Sstevel@tonic-gate 	}
2418*0Sstevel@tonic-gate 
2419*0Sstevel@tonic-gate 	/* Set master in my set record in rpc.metad */
2420*0Sstevel@tonic-gate 	if (master_flag) {
2421*0Sstevel@tonic-gate 		if (clnt_mnsetmaster(mynode(), sp,
2422*0Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodename,
2423*0Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
2424*0Sstevel@tonic-gate 			rval = -1;
2425*0Sstevel@tonic-gate 			goto out;
2426*0Sstevel@tonic-gate 		}
2427*0Sstevel@tonic-gate 	}
2428*0Sstevel@tonic-gate 	/* Causes mddbs to be loaded in kernel */
2429*0Sstevel@tonic-gate 	if (setup_db_bydd(sp, dd, 0, ep) == -1) {
2430*0Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2431*0Sstevel@tonic-gate 		    "Host not able to start diskset."));
2432*0Sstevel@tonic-gate 		rval = -1;
2433*0Sstevel@tonic-gate 		goto out;
2434*0Sstevel@tonic-gate 	}
2435*0Sstevel@tonic-gate 
2436*0Sstevel@tonic-gate 	if (! mdisok(ep)) {
2437*0Sstevel@tonic-gate 		rval = -1;
2438*0Sstevel@tonic-gate 		goto out;
2439*0Sstevel@tonic-gate 	}
2440*0Sstevel@tonic-gate 
2441*0Sstevel@tonic-gate 	/*
2442*0Sstevel@tonic-gate 	 * Set rollback flags to 1 so that halt_set is called if a failure
2443*0Sstevel@tonic-gate 	 * is seen after this point.  If snarf_set fails, still need to
2444*0Sstevel@tonic-gate 	 * call halt_set to cleanup the diskset.
2445*0Sstevel@tonic-gate 	 */
2446*0Sstevel@tonic-gate 	rb_flags = 1;
2447*0Sstevel@tonic-gate 
2448*0Sstevel@tonic-gate 	/* Starts the set */
2449*0Sstevel@tonic-gate 	if (snarf_set(sp, stale_bool, ep) != 0) {
2450*0Sstevel@tonic-gate 		if (mdismddberror(ep, MDE_DB_STALE)) {
2451*0Sstevel@tonic-gate 			/*
2452*0Sstevel@tonic-gate 			 * Don't fail join, STALE means that set has
2453*0Sstevel@tonic-gate 			 * < 50% mddbs.
2454*0Sstevel@tonic-gate 			 */
2455*0Sstevel@tonic-gate 			(void) mdstealerror(&ep_snarf, ep);
2456*0Sstevel@tonic-gate 			stale_set = 1;
2457*0Sstevel@tonic-gate 		} else if (mdisok(ep)) {
2458*0Sstevel@tonic-gate 			/* If snarf failed, but no error was set - set it */
2459*0Sstevel@tonic-gate 			(void) mdmddberror(ep, MDE_DB_NOTNOW, NODEV64,
2460*0Sstevel@tonic-gate 			    sp->setno, 0, NULL);
2461*0Sstevel@tonic-gate 				rval = -1;
2462*0Sstevel@tonic-gate 				goto out;
2463*0Sstevel@tonic-gate 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
2464*0Sstevel@tonic-gate 			/*
2465*0Sstevel@tonic-gate 			 * Don't fail join if ACCOK; ACCOK means that mediator
2466*0Sstevel@tonic-gate 			 * provided extra vote.
2467*0Sstevel@tonic-gate 			 */
2468*0Sstevel@tonic-gate 			rval = -1;
2469*0Sstevel@tonic-gate 			goto out;
2470*0Sstevel@tonic-gate 		}
2471*0Sstevel@tonic-gate 	}
2472*0Sstevel@tonic-gate 
2473*0Sstevel@tonic-gate 	/* Did set really get snarfed? */
2474*0Sstevel@tonic-gate 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
2475*0Sstevel@tonic-gate 		if (mdisok(ep)) {
2476*0Sstevel@tonic-gate 			/* If snarf failed, but no error was set - set it */
2477*0Sstevel@tonic-gate 			(void) mdmddberror(ep, MDE_DB_NOTNOW, NODEV64,
2478*0Sstevel@tonic-gate 				sp->setno, 0, NULL);
2479*0Sstevel@tonic-gate 		}
2480*0Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
2481*0Sstevel@tonic-gate 		    "Host not able to start diskset."));
2482*0Sstevel@tonic-gate 		rval = -1;
2483*0Sstevel@tonic-gate 		goto out;
2484*0Sstevel@tonic-gate 	}
2485*0Sstevel@tonic-gate 
2486*0Sstevel@tonic-gate 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2487*0Sstevel@tonic-gate 	send_reinit = 1;
2488*0Sstevel@tonic-gate 
2489*0Sstevel@tonic-gate 	/* If first node to enter set, setup master and clear change log */
2490*0Sstevel@tonic-gate 	if (master_flag) {
2491*0Sstevel@tonic-gate 		/* Set master in my locally cached set descriptor */
2492*0Sstevel@tonic-gate 		(void) strcpy(sd->sd_mn_master_nodenm,
2493*0Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodename);
2494*0Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
2495*0Sstevel@tonic-gate 		sd->sd_mn_am_i_master = 1;
2496*0Sstevel@tonic-gate 
2497*0Sstevel@tonic-gate 		/*
2498*0Sstevel@tonic-gate 		 * If first node to join set, then clear out change log
2499*0Sstevel@tonic-gate 		 * entries.  Change log entries are only needed when a
2500*0Sstevel@tonic-gate 		 * change of master is occurring in a diskset that has
2501*0Sstevel@tonic-gate 		 * multiple owners.   Since this node is the first owner
2502*0Sstevel@tonic-gate 		 * of the diskset, clear the entries.
2503*0Sstevel@tonic-gate 		 *
2504*0Sstevel@tonic-gate 		 * Only do this if we are in a single node non-SC3.x
2505*0Sstevel@tonic-gate 		 * situation.
2506*0Sstevel@tonic-gate 		 */
2507*0Sstevel@tonic-gate 		if (meta_mn_singlenode() &&
2508*0Sstevel@tonic-gate 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
2509*0Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
2510*0Sstevel@tonic-gate 			    "Unable to reset changelog."));
2511*0Sstevel@tonic-gate 			rval = -1;
2512*0Sstevel@tonic-gate 			goto out;
2513*0Sstevel@tonic-gate 		}
2514*0Sstevel@tonic-gate 	}
2515*0Sstevel@tonic-gate 
2516*0Sstevel@tonic-gate 	/* Set my locally cached flag */
2517*0Sstevel@tonic-gate 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
2518*0Sstevel@tonic-gate 
2519*0Sstevel@tonic-gate 	/*
2520*0Sstevel@tonic-gate 	 * Set this node's own flag on all joined nodes in the set
2521*0Sstevel@tonic-gate 	 * (including my node).
2522*0Sstevel@tonic-gate 	 */
2523*0Sstevel@tonic-gate 	clear_nr_flags = 1;
2524*0Sstevel@tonic-gate 
2525*0Sstevel@tonic-gate 	my_nd = *(sd->sd_mn_mynode);
2526*0Sstevel@tonic-gate 	my_nd.nd_next = NULL;
2527*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2528*0Sstevel@tonic-gate 	while (nd) {
2529*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2530*0Sstevel@tonic-gate 			nd = nd->nd_next;
2531*0Sstevel@tonic-gate 			continue;
2532*0Sstevel@tonic-gate 		}
2533*0Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2534*0Sstevel@tonic-gate 		    MD_NR_JOIN, NULL, ep)) {
2535*0Sstevel@tonic-gate 			rval = -1;
2536*0Sstevel@tonic-gate 			goto out;
2537*0Sstevel@tonic-gate 		}
2538*0Sstevel@tonic-gate 		nd = nd->nd_next;
2539*0Sstevel@tonic-gate 	}
2540*0Sstevel@tonic-gate 
2541*0Sstevel@tonic-gate out:
2542*0Sstevel@tonic-gate 	if (rval != NULL) {
2543*0Sstevel@tonic-gate 		/*
2544*0Sstevel@tonic-gate 		 * If rollback flag is 1, then node was joined to set.
2545*0Sstevel@tonic-gate 		 * Since an error occurred, withdraw node from set in
2546*0Sstevel@tonic-gate 		 * order to rollback to before command was run.
2547*0Sstevel@tonic-gate 		 * Need to preserve ep so that calling function can
2548*0Sstevel@tonic-gate 		 * get error information.
2549*0Sstevel@tonic-gate 		 */
2550*0Sstevel@tonic-gate 		if (rb_flags == 1) {
2551*0Sstevel@tonic-gate 			if (halt_set(sp, &xep)) {
2552*0Sstevel@tonic-gate 				mdclrerror(&xep);
2553*0Sstevel@tonic-gate 			}
2554*0Sstevel@tonic-gate 		}
2555*0Sstevel@tonic-gate 
2556*0Sstevel@tonic-gate 		/*
2557*0Sstevel@tonic-gate 		 * If error, reset master to INVALID.
2558*0Sstevel@tonic-gate 		 * Ignore error since (next) first node to successfully join
2559*0Sstevel@tonic-gate 		 * will set master on all nodes.
2560*0Sstevel@tonic-gate 		 */
2561*0Sstevel@tonic-gate 		(void) clnt_mnsetmaster(mynode(), sp, "",
2562*0Sstevel@tonic-gate 			MD_MN_INVALID_NID, &xep);
2563*0Sstevel@tonic-gate 		mdclrerror(&xep);
2564*0Sstevel@tonic-gate 		/* Reset master in my locally cached set descriptor */
2565*0Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
2566*0Sstevel@tonic-gate 		sd->sd_mn_am_i_master = 0;
2567*0Sstevel@tonic-gate 
2568*0Sstevel@tonic-gate 		/*
2569*0Sstevel@tonic-gate 		 * If nr flags set on other nodes, reset them.
2570*0Sstevel@tonic-gate 		 */
2571*0Sstevel@tonic-gate 		if (clear_nr_flags) {
2572*0Sstevel@tonic-gate 			nd = sd->sd_nodelist;
2573*0Sstevel@tonic-gate 			while (nd) {
2574*0Sstevel@tonic-gate 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2575*0Sstevel@tonic-gate 					nd = nd->nd_next;
2576*0Sstevel@tonic-gate 					continue;
2577*0Sstevel@tonic-gate 				}
2578*0Sstevel@tonic-gate 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
2579*0Sstevel@tonic-gate 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
2580*0Sstevel@tonic-gate 				mdclrerror(&xep);
2581*0Sstevel@tonic-gate 				nd = nd->nd_next;
2582*0Sstevel@tonic-gate 			}
2583*0Sstevel@tonic-gate 			/* Reset my locally cached flag */
2584*0Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
2585*0Sstevel@tonic-gate 		}
2586*0Sstevel@tonic-gate 	}
2587*0Sstevel@tonic-gate 
2588*0Sstevel@tonic-gate 	/*
2589*0Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2590*0Sstevel@tonic-gate 	 * Send reinit command to mdcommd which forces it to get
2591*0Sstevel@tonic-gate 	 * fresh set description.
2592*0Sstevel@tonic-gate 	 */
2593*0Sstevel@tonic-gate 	if (send_reinit) {
2594*0Sstevel@tonic-gate 		/* Send reinit */
2595*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2596*0Sstevel@tonic-gate 		while (nd) {
2597*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2598*0Sstevel@tonic-gate 				nd = nd->nd_next;
2599*0Sstevel@tonic-gate 				continue;
2600*0Sstevel@tonic-gate 			}
2601*0Sstevel@tonic-gate 
2602*0Sstevel@tonic-gate 			/* Class is ignored for REINIT */
2603*0Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2604*0Sstevel@tonic-gate 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2605*0Sstevel@tonic-gate 				/*
2606*0Sstevel@tonic-gate 				 * We are here because we failed to reinit
2607*0Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
2608*0Sstevel@tonic-gate 				 * an error from an earlier call.  If rval is
2609*0Sstevel@tonic-gate 				 * already non-zero, we keep that earlier error
2610*0Sstevel@tonic-gate 				 * in ep; either way we generate a perror with
2611*0Sstevel@tonic-gate 				 * the string, "Unable to reinit...".
2612*0Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
2613*0Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
2614*0Sstevel@tonic-gate 				 * clobbered.
2615*0Sstevel@tonic-gate 				 */
2616*0Sstevel@tonic-gate 				if (rval == 0)
2617*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
2618*0Sstevel@tonic-gate 				else
2619*0Sstevel@tonic-gate 					mdclrerror(&xep);
2620*0Sstevel@tonic-gate 				rval = -1;
2621*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2622*0Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd."));
2623*0Sstevel@tonic-gate 			}
2624*0Sstevel@tonic-gate 			nd = nd->nd_next;
2625*0Sstevel@tonic-gate 		}
2626*0Sstevel@tonic-gate 
2627*0Sstevel@tonic-gate 	}
2628*0Sstevel@tonic-gate 
2629*0Sstevel@tonic-gate out2:
2630*0Sstevel@tonic-gate 	/*
2631*0Sstevel@tonic-gate 	 * Unlock diskset by resuming messages across the diskset.
2632*0Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
2633*0Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
2634*0Sstevel@tonic-gate 	 */
2635*0Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
2636*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2637*0Sstevel@tonic-gate 		while (nd) {
2638*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2639*0Sstevel@tonic-gate 				nd = nd->nd_next;
2640*0Sstevel@tonic-gate 				continue;
2641*0Sstevel@tonic-gate 			}
2642*0Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2643*0Sstevel@tonic-gate 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2644*0Sstevel@tonic-gate 				/*
2645*0Sstevel@tonic-gate 				 * We are here because we failed to resume
2646*0Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
2647*0Sstevel@tonic-gate 				 * an error from the previous call
2648*0Sstevel@tonic-gate 				 * If the previous call did fail,  we capture
2649*0Sstevel@tonic-gate 				 * that error and generate a perror with
2650*0Sstevel@tonic-gate 				 * the string, "Unable to resume...".
2651*0Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
2652*0Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
2653*0Sstevel@tonic-gate 				 * clobbered.
2654*0Sstevel@tonic-gate 				 */
2655*0Sstevel@tonic-gate 				if (rval == 0)
2656*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
2657*0Sstevel@tonic-gate 				else
2658*0Sstevel@tonic-gate 					mdclrerror(&xep);
2659*0Sstevel@tonic-gate 				rval = -1;
2660*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
2661*0Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd."));
2662*0Sstevel@tonic-gate 			}
2663*0Sstevel@tonic-gate 			nd = nd->nd_next;
2664*0Sstevel@tonic-gate 		}
2665*0Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
2666*0Sstevel@tonic-gate 	}
2667*0Sstevel@tonic-gate 
2668*0Sstevel@tonic-gate 	/*
2669*0Sstevel@tonic-gate 	 * Unlock set.  This flushes the caches on the servers.
2670*0Sstevel@tonic-gate 	 */
2671*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
2672*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2673*0Sstevel@tonic-gate 	while (nd) {
2674*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2675*0Sstevel@tonic-gate 			nd = nd->nd_next;
2676*0Sstevel@tonic-gate 			continue;
2677*0Sstevel@tonic-gate 		}
2678*0Sstevel@tonic-gate 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2679*0Sstevel@tonic-gate 			if (rval == 0)
2680*0Sstevel@tonic-gate 				(void) mdstealerror(ep, &xep);
2681*0Sstevel@tonic-gate 			else
2682*0Sstevel@tonic-gate 				mdclrerror(&xep);
2683*0Sstevel@tonic-gate 			rval = -1;
2684*0Sstevel@tonic-gate 		}
2685*0Sstevel@tonic-gate 		nd = nd->nd_next;
2686*0Sstevel@tonic-gate 	}
2687*0Sstevel@tonic-gate 
2688*0Sstevel@tonic-gate 	/*
2689*0Sstevel@tonic-gate 	 * If this node is the last to join the diskset and clustering isn't
2690*0Sstevel@tonic-gate 	 * running, then resync the mirrors in the diskset. We have to wait
2691*0Sstevel@tonic-gate 	 * until all nodes are joined so that the status gets propagated to
2692*0Sstevel@tonic-gate 	 * all of the members of the set.
2693*0Sstevel@tonic-gate 	 * Ignore any error from the resync as the join function shouldn't fail
2694*0Sstevel@tonic-gate 	 * because the mirror resync had a problem.
2695*0Sstevel@tonic-gate 	 *
2696*0Sstevel@tonic-gate 	 * Don't start resync if set is stale.
2697*0Sstevel@tonic-gate 	 */
2698*0Sstevel@tonic-gate 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
2699*0Sstevel@tonic-gate 	    (stale_set != 1)) {
2700*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2701*0Sstevel@tonic-gate 		while (nd) {
2702*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
2703*0Sstevel@tonic-gate 				break;
2704*0Sstevel@tonic-gate 			nd = nd->nd_next;
2705*0Sstevel@tonic-gate 		}
2706*0Sstevel@tonic-gate 		/*
2707*0Sstevel@tonic-gate 		 * nd set to NULL means that we have no nodes in the set that
2708*0Sstevel@tonic-gate 		 * haven't joined. In this case we start the resync.
2709*0Sstevel@tonic-gate 		 */
2710*0Sstevel@tonic-gate 		if (nd == NULL) {
2711*0Sstevel@tonic-gate 			(void) meta_mirror_resync_all(sp, 0, &xep);
2712*0Sstevel@tonic-gate 			mdclrerror(&xep);
2713*0Sstevel@tonic-gate 		}
2714*0Sstevel@tonic-gate 	}
2715*0Sstevel@tonic-gate 
2716*0Sstevel@tonic-gate 	/* Update ABR state for all soft partitions */
2717*0Sstevel@tonic-gate 	(void) meta_sp_update_abr(sp, &xep);
2718*0Sstevel@tonic-gate 	mdclrerror(&xep);
2719*0Sstevel@tonic-gate 
2720*0Sstevel@tonic-gate 	/*
2721*0Sstevel@tonic-gate 	 * call metaflushsetnames to reset local cache for master and
2722*0Sstevel@tonic-gate 	 * node information.
2723*0Sstevel@tonic-gate 	 */
2724*0Sstevel@tonic-gate 	metaflushsetname(sp);
2725*0Sstevel@tonic-gate 
2726*0Sstevel@tonic-gate 	/* release signals back to what they were on entry */
2727*0Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
2728*0Sstevel@tonic-gate 		mdclrerror(&xep);
2729*0Sstevel@tonic-gate 
2730*0Sstevel@tonic-gate 	/*
2731*0Sstevel@tonic-gate 	 * If no error and stale_set is set, then set ep back
2732*0Sstevel@tonic-gate 	 * to ep from snarf_set call and return -3.  If another error
2733*0Sstevel@tonic-gate 	 * occurred and rval is not 0, then that error would have
2734*0Sstevel@tonic-gate 	 * caused the node to be withdrawn from the set and would
2735*0Sstevel@tonic-gate 	 * have set ep to that error information.
2736*0Sstevel@tonic-gate 	 */
2737*0Sstevel@tonic-gate 	if ((rval == 0) && (stale_set)) {
2738*0Sstevel@tonic-gate 		(void) mdstealerror(ep, &ep_snarf);
2739*0Sstevel@tonic-gate 		return (-3);
2740*0Sstevel@tonic-gate 	}
2741*0Sstevel@tonic-gate 
2742*0Sstevel@tonic-gate 	return (rval);
2743*0Sstevel@tonic-gate }
2744*0Sstevel@tonic-gate 
2745*0Sstevel@tonic-gate /*
2746*0Sstevel@tonic-gate  * Entry point to withdraw a node from MultiNode diskset.
2747*0Sstevel@tonic-gate  *
2748*0Sstevel@tonic-gate  * Validate host in diskset.
2749*0Sstevel@tonic-gate  *	- Should be joined into diskset.
2750*0Sstevel@tonic-gate  * Assume valid configuration is stored in the set/drive/node records
2751*0Sstevel@tonic-gate  * in the local mddb since no node or drive can be added to the MNset
2752*0Sstevel@tonic-gate  * unless all drives and nodes are available.  Reconfig steps will
2753*0Sstevel@tonic-gate  * resync all ALIVE nodes in case of panic in critical areas.
2754*0Sstevel@tonic-gate  *
2755*0Sstevel@tonic-gate  * Lock down the set.
2756*0Sstevel@tonic-gate  * Verify that drives exist in configuration.
2757*0Sstevel@tonic-gate  * Verify host is a member of this diskset.
2758*0Sstevel@tonic-gate  * Verify host is an owner of the diskset (host is joined to diskset).
2759*0Sstevel@tonic-gate  * Only allow withdrawal of master node if master node is the only joined
2760*0Sstevel@tonic-gate  * in the diskset.
2761*0Sstevel@tonic-gate  * Halt the diskset on this node.
2762*0Sstevel@tonic-gate  * Reset Master on this node.
2763*0Sstevel@tonic-gate  * Updated node flags that this node with withdrawn.
2764*0Sstevel@tonic-gate  * Unlock the set.
2765*0Sstevel@tonic-gate  *
2766*0Sstevel@tonic-gate  * Return values:
2767*0Sstevel@tonic-gate  *	0  - Node successfully withdrew from set.
2768*0Sstevel@tonic-gate  *	-1 - Withdrawal attempted but failed
2769*0Sstevel@tonic-gate  *		- any failure from libmeta calls
2770*0Sstevel@tonic-gate  *		- node not in the member list
2771*0Sstevel@tonic-gate  *	-2 - Withdrawal not attempted since
2772*0Sstevel@tonic-gate  *		- this set had no drives in set
2773*0Sstevel@tonic-gate  *		- this node not joined to set
2774*0Sstevel@tonic-gate  *		- set is not a multinode set
2775*0Sstevel@tonic-gate  */
2776*0Sstevel@tonic-gate extern int
2777*0Sstevel@tonic-gate meta_set_withdraw(
2778*0Sstevel@tonic-gate 	mdsetname_t	*sp,
2779*0Sstevel@tonic-gate 	md_error_t	*ep
2780*0Sstevel@tonic-gate )
2781*0Sstevel@tonic-gate {
2782*0Sstevel@tonic-gate 	md_set_desc		*sd;
2783*0Sstevel@tonic-gate 	md_drive_desc		*dd = 0;
2784*0Sstevel@tonic-gate 	md_mnnode_desc		*nd, my_nd;
2785*0Sstevel@tonic-gate 	int			rval = 0;
2786*0Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
2787*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
2788*0Sstevel@tonic-gate 	int			set_halted = 0;
2789*0Sstevel@tonic-gate 	int			suspendall_flag = 0;
2790*0Sstevel@tonic-gate 	int			suspend1_flag = 0;
2791*0Sstevel@tonic-gate 	bool_t			stale_bool = FALSE;
2792*0Sstevel@tonic-gate 	mddb_config_t		c;
2793*0Sstevel@tonic-gate 	int			node_id_list[1];
2794*0Sstevel@tonic-gate 	sigset_t		oldsigs;
2795*0Sstevel@tonic-gate 	int			send_reinit = 0;
2796*0Sstevel@tonic-gate 
2797*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2798*0Sstevel@tonic-gate 		return (-1);
2799*0Sstevel@tonic-gate 	}
2800*0Sstevel@tonic-gate 
2801*0Sstevel@tonic-gate 	/* Must be a multinode diskset */
2802*0Sstevel@tonic-gate 	if (!MD_MNSET_DESC(sd)) {
2803*0Sstevel@tonic-gate 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
2804*0Sstevel@tonic-gate 		return (-1);
2805*0Sstevel@tonic-gate 	}
2806*0Sstevel@tonic-gate 
2807*0Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
2808*0Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
2809*0Sstevel@tonic-gate 		mdclrerror(&xep);
2810*0Sstevel@tonic-gate 
2811*0Sstevel@tonic-gate 	/*
2812*0Sstevel@tonic-gate 	 * Lock the set on current set members.
2813*0Sstevel@tonic-gate 	 * For MN diskset lock_set and SUSPEND are used to protect against
2814*0Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
2815*0Sstevel@tonic-gate 	 */
2816*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2817*0Sstevel@tonic-gate 	while (nd) {
2818*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2819*0Sstevel@tonic-gate 			nd = nd->nd_next;
2820*0Sstevel@tonic-gate 			continue;
2821*0Sstevel@tonic-gate 		}
2822*0Sstevel@tonic-gate 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2823*0Sstevel@tonic-gate 			rval = -1;
2824*0Sstevel@tonic-gate 			goto out;
2825*0Sstevel@tonic-gate 		}
2826*0Sstevel@tonic-gate 		nd = nd->nd_next;
2827*0Sstevel@tonic-gate 	}
2828*0Sstevel@tonic-gate 	/*
2829*0Sstevel@tonic-gate 	 * Lock out other meta* commands by suspending
2830*0Sstevel@tonic-gate 	 * class 1 messages across the diskset.
2831*0Sstevel@tonic-gate 	 */
2832*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2833*0Sstevel@tonic-gate 	while (nd) {
2834*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2835*0Sstevel@tonic-gate 			nd = nd->nd_next;
2836*0Sstevel@tonic-gate 			continue;
2837*0Sstevel@tonic-gate 		}
2838*0Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2839*0Sstevel@tonic-gate 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
2840*0Sstevel@tonic-gate 			rval = -1;
2841*0Sstevel@tonic-gate 			goto out;
2842*0Sstevel@tonic-gate 		}
2843*0Sstevel@tonic-gate 		suspend1_flag = 1;
2844*0Sstevel@tonic-gate 		nd = nd->nd_next;
2845*0Sstevel@tonic-gate 	}
2846*0Sstevel@tonic-gate 
2847*0Sstevel@tonic-gate 	/* Get list of drives - needed in case of failure */
2848*0Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
2849*0Sstevel@tonic-gate 	    ep)) == NULL) {
2850*0Sstevel@tonic-gate 		/* Error getting drives in list */
2851*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
2852*0Sstevel@tonic-gate 			rval = -1;
2853*0Sstevel@tonic-gate 			goto out2;
2854*0Sstevel@tonic-gate 		}
2855*0Sstevel@tonic-gate 		/* no drives in list */
2856*0Sstevel@tonic-gate 		rval = -2;
2857*0Sstevel@tonic-gate 		goto out2;
2858*0Sstevel@tonic-gate 	}
2859*0Sstevel@tonic-gate 
2860*0Sstevel@tonic-gate 	/*
2861*0Sstevel@tonic-gate 	 * Verify that this host is a member (in the host list) of the set.
2862*0Sstevel@tonic-gate 	 */
2863*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2864*0Sstevel@tonic-gate 	while (nd) {
2865*0Sstevel@tonic-gate 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
2866*0Sstevel@tonic-gate 			break;
2867*0Sstevel@tonic-gate 		}
2868*0Sstevel@tonic-gate 		nd = nd->nd_next;
2869*0Sstevel@tonic-gate 	}
2870*0Sstevel@tonic-gate 	if (!nd) {
2871*0Sstevel@tonic-gate 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
2872*0Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_nodename, NULL,
2873*0Sstevel@tonic-gate 			sp->setname);
2874*0Sstevel@tonic-gate 		rval = -1;
2875*0Sstevel@tonic-gate 		goto out2;
2876*0Sstevel@tonic-gate 	}
2877*0Sstevel@tonic-gate 
2878*0Sstevel@tonic-gate 	/*
2879*0Sstevel@tonic-gate 	 * Call metaget_setownership that calls each node in diskset and
2880*0Sstevel@tonic-gate 	 * marks in set descriptor if node is an owner of the set or not.
2881*0Sstevel@tonic-gate 	 * metaget_setownership checks to see if a node is an owner by
2882*0Sstevel@tonic-gate 	 * checking to see if that node's kernel has the mddb loaded.
2883*0Sstevel@tonic-gate 	 * If a node had panic'd during a reconfig or an
2884*0Sstevel@tonic-gate 	 * add/delete/join/withdraw operation, the other nodes' node
2885*0Sstevel@tonic-gate 	 * records may not reflect the current state of the diskset,
2886*0Sstevel@tonic-gate 	 * so calling metaget_setownership is the safest thing to do.
2887*0Sstevel@tonic-gate 	 */
2888*0Sstevel@tonic-gate 	if (metaget_setownership(sp, ep) == -1) {
2889*0Sstevel@tonic-gate 		rval = -1;
2890*0Sstevel@tonic-gate 		goto out2;
2891*0Sstevel@tonic-gate 	}
2892*0Sstevel@tonic-gate 
2893*0Sstevel@tonic-gate 	/*
2894*0Sstevel@tonic-gate 	 * Verify that this node is joined
2895*0Sstevel@tonic-gate 	 * to diskset (i.e. is an owner of the diskset).
2896*0Sstevel@tonic-gate 	 */
2897*0Sstevel@tonic-gate 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
2898*0Sstevel@tonic-gate 		rval = -2;
2899*0Sstevel@tonic-gate 		goto out2;
2900*0Sstevel@tonic-gate 	}
2901*0Sstevel@tonic-gate 
2902*0Sstevel@tonic-gate 	/*
2903*0Sstevel@tonic-gate 	 * For a MN diskset, only withdraw master if it is
2904*0Sstevel@tonic-gate 	 * the only joined node.
2905*0Sstevel@tonic-gate 	 */
2906*0Sstevel@tonic-gate 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
2907*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2908*0Sstevel@tonic-gate 		while (nd) {
2909*0Sstevel@tonic-gate 			/* Skip my node since checking for other owners */
2910*0Sstevel@tonic-gate 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
2911*0Sstevel@tonic-gate 				nd = nd->nd_next;
2912*0Sstevel@tonic-gate 				continue;
2913*0Sstevel@tonic-gate 			}
2914*0Sstevel@tonic-gate 			/* If another owner node if found, error */
2915*0Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_OWN) {
2916*0Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
2917*0Sstevel@tonic-gate 					sp->setno,
2918*0Sstevel@tonic-gate 					sd->sd_mn_mynode->nd_nodename, NULL,
2919*0Sstevel@tonic-gate 					sp->setname);
2920*0Sstevel@tonic-gate 				rval = -1;
2921*0Sstevel@tonic-gate 				goto out2;
2922*0Sstevel@tonic-gate 			}
2923*0Sstevel@tonic-gate 			nd = nd->nd_next;
2924*0Sstevel@tonic-gate 		}
2925*0Sstevel@tonic-gate 	}
2926*0Sstevel@tonic-gate 
2927*0Sstevel@tonic-gate 	/*
2928*0Sstevel@tonic-gate 	 * Is current set STALE?
2929*0Sstevel@tonic-gate 	 */
2930*0Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
2931*0Sstevel@tonic-gate 	c.c_id = 0;
2932*0Sstevel@tonic-gate 	c.c_setno = sp->setno;
2933*0Sstevel@tonic-gate 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2934*0Sstevel@tonic-gate 		(void) mdstealerror(ep, &c.c_mde);
2935*0Sstevel@tonic-gate 		rval = -1;
2936*0Sstevel@tonic-gate 		goto out;
2937*0Sstevel@tonic-gate 	}
2938*0Sstevel@tonic-gate 	if (c.c_flags & MDDB_C_STALE) {
2939*0Sstevel@tonic-gate 		stale_bool = TRUE;
2940*0Sstevel@tonic-gate 	}
2941*0Sstevel@tonic-gate 
2942*0Sstevel@tonic-gate 	/*
2943*0Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
2944*0Sstevel@tonic-gate 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
2945*0Sstevel@tonic-gate 	 * then change the nodelist followed by a reinit and resume.
2946*0Sstevel@tonic-gate 	 */
2947*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2948*0Sstevel@tonic-gate 	while (nd) {
2949*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2950*0Sstevel@tonic-gate 			nd = nd->nd_next;
2951*0Sstevel@tonic-gate 			continue;
2952*0Sstevel@tonic-gate 		}
2953*0Sstevel@tonic-gate 
2954*0Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2955*0Sstevel@tonic-gate 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2956*0Sstevel@tonic-gate 			rval = -1;
2957*0Sstevel@tonic-gate 			goto out;
2958*0Sstevel@tonic-gate 		}
2959*0Sstevel@tonic-gate 		suspendall_flag = 1;
2960*0Sstevel@tonic-gate 		nd = nd->nd_next;
2961*0Sstevel@tonic-gate 	}
2962*0Sstevel@tonic-gate 
2963*0Sstevel@tonic-gate 	/*
2964*0Sstevel@tonic-gate 	 * Withdraw the set - halt set.
2965*0Sstevel@tonic-gate 	 * This will fail if any I/O is occuring to any metadevice which
2966*0Sstevel@tonic-gate 	 * includes a resync to a mirror metadevice.
2967*0Sstevel@tonic-gate 	 */
2968*0Sstevel@tonic-gate 	set_halted = 1;
2969*0Sstevel@tonic-gate 	if (halt_set(sp, ep)) {
2970*0Sstevel@tonic-gate 		/* Was set actually halted? */
2971*0Sstevel@tonic-gate 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
2972*0Sstevel@tonic-gate 			set_halted = 0;
2973*0Sstevel@tonic-gate 		}
2974*0Sstevel@tonic-gate 		rval = -1;
2975*0Sstevel@tonic-gate 		goto out;
2976*0Sstevel@tonic-gate 	}
2977*0Sstevel@tonic-gate 
2978*0Sstevel@tonic-gate 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
2979*0Sstevel@tonic-gate 	send_reinit = 1;
2980*0Sstevel@tonic-gate 
2981*0Sstevel@tonic-gate 	/* Reset master on withdrawn node */
2982*0Sstevel@tonic-gate 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
2983*0Sstevel@tonic-gate 	    MD_MN_INVALID_NID, ep)) {
2984*0Sstevel@tonic-gate 		rval = -1;
2985*0Sstevel@tonic-gate 		goto out;
2986*0Sstevel@tonic-gate 	}
2987*0Sstevel@tonic-gate 
2988*0Sstevel@tonic-gate 	/* Mark my node as withdrawn and send to other nodes */
2989*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
2990*0Sstevel@tonic-gate 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
2991*0Sstevel@tonic-gate 	my_nd.nd_next = NULL;
2992*0Sstevel@tonic-gate 	while (nd) {
2993*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2994*0Sstevel@tonic-gate 			nd = nd->nd_next;
2995*0Sstevel@tonic-gate 			continue;
2996*0Sstevel@tonic-gate 		}
2997*0Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
2998*0Sstevel@tonic-gate 		    MD_NR_WITHDRAW, NULL, ep)) {
2999*0Sstevel@tonic-gate 			rval = -1;
3000*0Sstevel@tonic-gate 			goto out;
3001*0Sstevel@tonic-gate 		}
3002*0Sstevel@tonic-gate 		nd = nd->nd_next;
3003*0Sstevel@tonic-gate 	}
3004*0Sstevel@tonic-gate 
3005*0Sstevel@tonic-gate 	/*
3006*0Sstevel@tonic-gate 	 * If withdrawn node is a mirror owner, reset mirror owner
3007*0Sstevel@tonic-gate 	 * to NULL.  If an error occurs, print a warning and continue.
3008*0Sstevel@tonic-gate 	 * Don't fail metaset because of mirror owner reset problem since
3009*0Sstevel@tonic-gate 	 * next node to grab mirror will resolve this issue.
3010*0Sstevel@tonic-gate 	 * Before next node grabs mirrors, metaset will show the withdrawn
3011*0Sstevel@tonic-gate 	 * node as owner which is why an attempt to reset the mirror owner
3012*0Sstevel@tonic-gate 	 * is made.
3013*0Sstevel@tonic-gate 	 */
3014*0Sstevel@tonic-gate 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
3015*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
3016*0Sstevel@tonic-gate 	while (nd) {
3017*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3018*0Sstevel@tonic-gate 			nd = nd->nd_next;
3019*0Sstevel@tonic-gate 			continue;
3020*0Sstevel@tonic-gate 		}
3021*0Sstevel@tonic-gate 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
3022*0Sstevel@tonic-gate 		    1, &node_id_list[0], &xep) == 01) {
3023*0Sstevel@tonic-gate 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
3024*0Sstevel@tonic-gate 			    "Unable to reset mirror owner on node %s"),
3025*0Sstevel@tonic-gate 			    nd->nd_nodename);
3026*0Sstevel@tonic-gate 			mdclrerror(&xep);
3027*0Sstevel@tonic-gate 		}
3028*0Sstevel@tonic-gate 		nd = nd->nd_next;
3029*0Sstevel@tonic-gate 	}
3030*0Sstevel@tonic-gate 
3031*0Sstevel@tonic-gate out:
3032*0Sstevel@tonic-gate 	if (rval == -1) {
3033*0Sstevel@tonic-gate 		/* Rejoin node - Mark node as joined and send to other nodes */
3034*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3035*0Sstevel@tonic-gate 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
3036*0Sstevel@tonic-gate 		my_nd.nd_next = NULL;
3037*0Sstevel@tonic-gate 		while (nd) {
3038*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3039*0Sstevel@tonic-gate 				nd = nd->nd_next;
3040*0Sstevel@tonic-gate 				continue;
3041*0Sstevel@tonic-gate 			}
3042*0Sstevel@tonic-gate 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
3043*0Sstevel@tonic-gate 			    MD_NR_JOIN, NULL, &xep)) {
3044*0Sstevel@tonic-gate 				mdclrerror(&xep);
3045*0Sstevel@tonic-gate 			}
3046*0Sstevel@tonic-gate 			nd = nd->nd_next;
3047*0Sstevel@tonic-gate 		}
3048*0Sstevel@tonic-gate 
3049*0Sstevel@tonic-gate 		/* Set master on withdrawn node */
3050*0Sstevel@tonic-gate 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
3051*0Sstevel@tonic-gate 		    sd->sd_mn_master_nodenm,
3052*0Sstevel@tonic-gate 		    sd->sd_mn_master_nodeid, &xep)) {
3053*0Sstevel@tonic-gate 			mdclrerror(&xep);
3054*0Sstevel@tonic-gate 		}
3055*0Sstevel@tonic-gate 
3056*0Sstevel@tonic-gate 		/* Join set if halt_set had succeeded */
3057*0Sstevel@tonic-gate 		if (set_halted) {
3058*0Sstevel@tonic-gate 			if (setup_db_bydd(sp, dd, 0, &xep) == -1) {
3059*0Sstevel@tonic-gate 				mdclrerror(&xep);
3060*0Sstevel@tonic-gate 			}
3061*0Sstevel@tonic-gate 			/* If set previously stale - make it so at re-join */
3062*0Sstevel@tonic-gate 			if (snarf_set(sp, stale_bool, &xep) != 0) {
3063*0Sstevel@tonic-gate 				mdclrerror(&xep);
3064*0Sstevel@tonic-gate 				(void) halt_set(sp, &xep);
3065*0Sstevel@tonic-gate 				mdclrerror(&xep);
3066*0Sstevel@tonic-gate 			}
3067*0Sstevel@tonic-gate 		}
3068*0Sstevel@tonic-gate 	}
3069*0Sstevel@tonic-gate 
3070*0Sstevel@tonic-gate 	/*
3071*0Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
3072*0Sstevel@tonic-gate 	 * Send reinit command to mdcommd which forces it to get
3073*0Sstevel@tonic-gate 	 * fresh set description.
3074*0Sstevel@tonic-gate 	 */
3075*0Sstevel@tonic-gate 	if (send_reinit) {
3076*0Sstevel@tonic-gate 		/* Send reinit */
3077*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3078*0Sstevel@tonic-gate 		while (nd) {
3079*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3080*0Sstevel@tonic-gate 				nd = nd->nd_next;
3081*0Sstevel@tonic-gate 				continue;
3082*0Sstevel@tonic-gate 			}
3083*0Sstevel@tonic-gate 
3084*0Sstevel@tonic-gate 			/* Class is ignored for REINIT */
3085*0Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3086*0Sstevel@tonic-gate 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3087*0Sstevel@tonic-gate 				/*
3088*0Sstevel@tonic-gate 				 * We are here because we failed to resume
3089*0Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
3090*0Sstevel@tonic-gate 				 * an error from the previous call.
3091*0Sstevel@tonic-gate 				 * If the previous call did fail,  we
3092*0Sstevel@tonic-gate 				 * capture that error and generate a perror
3093*0Sstevel@tonic-gate 				 * withthe string,  "Unable to resume...".
3094*0Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
3095*0Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
3096*0Sstevel@tonic-gate 				 * clobbered.
3097*0Sstevel@tonic-gate 				 */
3098*0Sstevel@tonic-gate 				if (rval == 0)
3099*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
3100*0Sstevel@tonic-gate 				else
3101*0Sstevel@tonic-gate 					mdclrerror(&xep);
3102*0Sstevel@tonic-gate 				rval = -1;
3103*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3104*0Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd."));
3105*0Sstevel@tonic-gate 			}
3106*0Sstevel@tonic-gate 			nd = nd->nd_next;
3107*0Sstevel@tonic-gate 		}
3108*0Sstevel@tonic-gate 	}
3109*0Sstevel@tonic-gate 
3110*0Sstevel@tonic-gate out2:
3111*0Sstevel@tonic-gate 	/*
3112*0Sstevel@tonic-gate 	 * Unlock diskset by resuming messages across the diskset.
3113*0Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
3114*0Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
3115*0Sstevel@tonic-gate 	 */
3116*0Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
3117*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3118*0Sstevel@tonic-gate 		while (nd) {
3119*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3120*0Sstevel@tonic-gate 				nd = nd->nd_next;
3121*0Sstevel@tonic-gate 				continue;
3122*0Sstevel@tonic-gate 			}
3123*0Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3124*0Sstevel@tonic-gate 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
3125*0Sstevel@tonic-gate 				/*
3126*0Sstevel@tonic-gate 				 * We are here because we failed to resume
3127*0Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
3128*0Sstevel@tonic-gate 				 * an error from the previous call
3129*0Sstevel@tonic-gate 				 * If the previous call did fail,  we capture
3130*0Sstevel@tonic-gate 				 * that error and generate a perror with
3131*0Sstevel@tonic-gate 				 * the string, "Unable to resume...".
3132*0Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
3133*0Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
3134*0Sstevel@tonic-gate 				 * clobbered.
3135*0Sstevel@tonic-gate 				 */
3136*0Sstevel@tonic-gate 				if (rval == 0)
3137*0Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
3138*0Sstevel@tonic-gate 				else
3139*0Sstevel@tonic-gate 					mdclrerror(&xep);
3140*0Sstevel@tonic-gate 				rval = -1;
3141*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
3142*0Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd."));
3143*0Sstevel@tonic-gate 			}
3144*0Sstevel@tonic-gate 			nd = nd->nd_next;
3145*0Sstevel@tonic-gate 		}
3146*0Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
3147*0Sstevel@tonic-gate 	}
3148*0Sstevel@tonic-gate 
3149*0Sstevel@tonic-gate 	/*
3150*0Sstevel@tonic-gate 	 * Unlock set.  This flushes the caches on the servers.
3151*0Sstevel@tonic-gate 	 */
3152*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3153*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
3154*0Sstevel@tonic-gate 	while (nd) {
3155*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3156*0Sstevel@tonic-gate 			nd = nd->nd_next;
3157*0Sstevel@tonic-gate 			continue;
3158*0Sstevel@tonic-gate 		}
3159*0Sstevel@tonic-gate 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
3160*0Sstevel@tonic-gate 			if (rval == 0)
3161*0Sstevel@tonic-gate 				(void) mdstealerror(ep, &xep);
3162*0Sstevel@tonic-gate 			else
3163*0Sstevel@tonic-gate 				mdclrerror(&xep);
3164*0Sstevel@tonic-gate 			rval = -1;
3165*0Sstevel@tonic-gate 		}
3166*0Sstevel@tonic-gate 		nd = nd->nd_next;
3167*0Sstevel@tonic-gate 	}
3168*0Sstevel@tonic-gate 
3169*0Sstevel@tonic-gate 	/*
3170*0Sstevel@tonic-gate 	 * call metaflushsetnames to reset local cache for master and
3171*0Sstevel@tonic-gate 	 * node information.
3172*0Sstevel@tonic-gate 	 */
3173*0Sstevel@tonic-gate 	metaflushsetname(sp);
3174*0Sstevel@tonic-gate 
3175*0Sstevel@tonic-gate 	/* release signals back to what they were on entry */
3176*0Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
3177*0Sstevel@tonic-gate 		mdclrerror(&xep);
3178*0Sstevel@tonic-gate 
3179*0Sstevel@tonic-gate 	return (rval);
3180*0Sstevel@tonic-gate 
3181*0Sstevel@tonic-gate }
3182*0Sstevel@tonic-gate 
3183*0Sstevel@tonic-gate /*
3184*0Sstevel@tonic-gate  * Update nodelist with cluster member information.
3185*0Sstevel@tonic-gate  * A node not in the member list will be marked
3186*0Sstevel@tonic-gate  * as not ALIVE and not OWN.
3187*0Sstevel@tonic-gate  * A node in the member list will be marked ALIVE, but
3188*0Sstevel@tonic-gate  * the OWN bit will not be changed.
3189*0Sstevel@tonic-gate  *
3190*0Sstevel@tonic-gate  * If mynode isn't in the membership list, fail causing
3191*0Sstevel@tonic-gate  * another reconfig cycle to be started since a non-member
3192*0Sstevel@tonic-gate  * node shouldn't be taking part in the reconfig cycle.
3193*0Sstevel@tonic-gate  *
3194*0Sstevel@tonic-gate  * Return values:
3195*0Sstevel@tonic-gate  *	0 - No problem.
3196*0Sstevel@tonic-gate  *	1 - Any failure including RPC failure to my node.
3197*0Sstevel@tonic-gate  */
3198*0Sstevel@tonic-gate int
3199*0Sstevel@tonic-gate meta_reconfig_update_nodelist(
3200*0Sstevel@tonic-gate 	mdsetname_t			*sp,
3201*0Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl,
3202*0Sstevel@tonic-gate 	md_set_desc			*sd,
3203*0Sstevel@tonic-gate 	md_error_t			*ep
3204*0Sstevel@tonic-gate )
3205*0Sstevel@tonic-gate {
3206*0Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl2;
3207*0Sstevel@tonic-gate 	md_mnnode_desc			*nd;
3208*0Sstevel@tonic-gate 	md_error_t			xep = mdnullerror;
3209*0Sstevel@tonic-gate 	int				rval = 0;
3210*0Sstevel@tonic-gate 
3211*0Sstevel@tonic-gate 	/*
3212*0Sstevel@tonic-gate 	 * Walk through nodelist, checking to see if each
3213*0Sstevel@tonic-gate 	 * node is in the member list.
3214*0Sstevel@tonic-gate 	 * If node is not a member, reset ALIVE and OWN node flag.
3215*0Sstevel@tonic-gate 	 * If node is a member, set ALIVE.
3216*0Sstevel@tonic-gate 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
3217*0Sstevel@tonic-gate 	 */
3218*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
3219*0Sstevel@tonic-gate 	while (nd) {
3220*0Sstevel@tonic-gate 		nl2 = nl;
3221*0Sstevel@tonic-gate 		while (nl2) {
3222*0Sstevel@tonic-gate 			/* If node is in member list, set ALIVE */
3223*0Sstevel@tonic-gate 			if (nl2->msl_node_id == nd->nd_nodeid) {
3224*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_ALIVE;
3225*0Sstevel@tonic-gate 				break;
3226*0Sstevel@tonic-gate 			} else {
3227*0Sstevel@tonic-gate 				nl2 = nl2->next;
3228*0Sstevel@tonic-gate 			}
3229*0Sstevel@tonic-gate 			/* node is not in member list, mark !ALIVE and !OWN */
3230*0Sstevel@tonic-gate 			if (nl2 == NULL) {
3231*0Sstevel@tonic-gate 				/* If node is mynode, then halt set if needed */
3232*0Sstevel@tonic-gate 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
3233*0Sstevel@tonic-gate 					/*
3234*0Sstevel@tonic-gate 					 * This shouldn't happen, but just
3235*0Sstevel@tonic-gate 					 * in case...  Any node not in the
3236*0Sstevel@tonic-gate 					 * membership list should be dead and
3237*0Sstevel@tonic-gate 					 * not running reconfig step1.
3238*0Sstevel@tonic-gate 					 */
3239*0Sstevel@tonic-gate 					if (nd->nd_flags & MD_MN_NODE_OWN) {
3240*0Sstevel@tonic-gate 						if (halt_set(sp, &xep)) {
3241*0Sstevel@tonic-gate 							mde_perror(&xep, "");
3242*0Sstevel@tonic-gate 							mdclrerror(&xep);
3243*0Sstevel@tonic-gate 						}
3244*0Sstevel@tonic-gate 					}
3245*0Sstevel@tonic-gate 					/*
3246*0Sstevel@tonic-gate 					 * Return failure since this node
3247*0Sstevel@tonic-gate 					 * (mynode) is not in the membership
3248*0Sstevel@tonic-gate 					 * list, but process the rest of the
3249*0Sstevel@tonic-gate 					 * nodelist first so that rpc.metad
3250*0Sstevel@tonic-gate 					 * can be updated with the latest
3251*0Sstevel@tonic-gate 					 * membership information.
3252*0Sstevel@tonic-gate 					 */
3253*0Sstevel@tonic-gate 					(void) mddserror(ep,
3254*0Sstevel@tonic-gate 					    MDE_DS_NOTINMEMBERLIST,
3255*0Sstevel@tonic-gate 					    sp->setno, nd->nd_nodename, NULL,
3256*0Sstevel@tonic-gate 					    sp->setname);
3257*0Sstevel@tonic-gate 					rval = 1;
3258*0Sstevel@tonic-gate 				}
3259*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
3260*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3261*0Sstevel@tonic-gate 			}
3262*0Sstevel@tonic-gate 		}
3263*0Sstevel@tonic-gate 		nd = nd->nd_next;
3264*0Sstevel@tonic-gate 	}
3265*0Sstevel@tonic-gate 
3266*0Sstevel@tonic-gate 	/* Send this information to rpc.metad */
3267*0Sstevel@tonic-gate 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
3268*0Sstevel@tonic-gate 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
3269*0Sstevel@tonic-gate 		/* Return failure if can't send node flags to rpc.metad */
3270*0Sstevel@tonic-gate 		if (rval == 0) {
3271*0Sstevel@tonic-gate 			(void) mdstealerror(ep, &xep);
3272*0Sstevel@tonic-gate 			rval = 1;
3273*0Sstevel@tonic-gate 		}
3274*0Sstevel@tonic-gate 	}
3275*0Sstevel@tonic-gate 	return (rval);
3276*0Sstevel@tonic-gate }
3277*0Sstevel@tonic-gate 
3278*0Sstevel@tonic-gate /*
3279*0Sstevel@tonic-gate  * Choose master determines the master for a diskset.
3280*0Sstevel@tonic-gate  * Each node determines the master on its own and
3281*0Sstevel@tonic-gate  * adds this information to its local rpc.metad nodelist
3282*0Sstevel@tonic-gate  * and also sends it to the kernel.
3283*0Sstevel@tonic-gate  *
3284*0Sstevel@tonic-gate  * Nodelist in set descriptor (sd) is sorted in
3285*0Sstevel@tonic-gate  * monotonically increasing sequence of nodeid.
3286*0Sstevel@tonic-gate  *
3287*0Sstevel@tonic-gate  * Return values:
3288*0Sstevel@tonic-gate  *	0 - No problem.
3289*0Sstevel@tonic-gate  *	205 - There was an RPC problem to another node.
3290*0Sstevel@tonic-gate  *	-1 - There was an error.  This could be an RPC error to my node.
3291*0Sstevel@tonic-gate  *		This is a catastrophic failure causing node to panic.
3292*0Sstevel@tonic-gate  */
3293*0Sstevel@tonic-gate int
3294*0Sstevel@tonic-gate meta_reconfig_choose_master_for_set(
3295*0Sstevel@tonic-gate 	mdsetname_t	*sp,
3296*0Sstevel@tonic-gate 	md_set_desc	*sd,
3297*0Sstevel@tonic-gate 	md_error_t	*ep
3298*0Sstevel@tonic-gate )
3299*0Sstevel@tonic-gate {
3300*0Sstevel@tonic-gate 	int			is_owner;
3301*0Sstevel@tonic-gate 	md_mnset_record		*mnsr = NULL;
3302*0Sstevel@tonic-gate 	int			lowest_alive_nodeid = 0;
3303*0Sstevel@tonic-gate 	uint_t			master_nodeid;
3304*0Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2;
3305*0Sstevel@tonic-gate 	md_mnnode_record	*nr;
3306*0Sstevel@tonic-gate 	md_drive_desc		*dd;
3307*0Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
3308*0Sstevel@tonic-gate 	int			rval = 0;
3309*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
3310*0Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
3311*0Sstevel@tonic-gate 
3312*0Sstevel@tonic-gate 	/*
3313*0Sstevel@tonic-gate 	 * Is current node joined to diskset?
3314*0Sstevel@tonic-gate 	 * Don't trust flags, really check to see if mddb is snarfed.
3315*0Sstevel@tonic-gate 	 */
3316*0Sstevel@tonic-gate 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
3317*0Sstevel@tonic-gate 		/*
3318*0Sstevel@tonic-gate 		 * If a node is joined to the diskset, this node checks
3319*0Sstevel@tonic-gate 		 * to see if the current master of the diskset is valid and
3320*0Sstevel@tonic-gate 		 * is still in the membership list (ALIVE) and is
3321*0Sstevel@tonic-gate 		 * still joined (OWN).  Need to verify if master is
3322*0Sstevel@tonic-gate 		 * really joined - don't trust the flags.  (Can trust
3323*0Sstevel@tonic-gate 		 * ALIVE since set during earlier part of reconfig cycle.)
3324*0Sstevel@tonic-gate 		 * If the current master is valid, still in the membership
3325*0Sstevel@tonic-gate 		 * list and joined, then master is not changed on this node.
3326*0Sstevel@tonic-gate 		 * Just return.
3327*0Sstevel@tonic-gate 		 *
3328*0Sstevel@tonic-gate 		 * Verify that nodeid is valid before accessing masternode.
3329*0Sstevel@tonic-gate 		 */
3330*0Sstevel@tonic-gate 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
3331*0Sstevel@tonic-gate 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
3332*0Sstevel@tonic-gate 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
3333*0Sstevel@tonic-gate 			    &is_owner, ep) == -1) {
3334*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
3335*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
3336*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3337*0Sstevel@tonic-gate 				    sd->sd_mn_master_nodeid)) {
3338*0Sstevel@tonic-gate 					return (205);
3339*0Sstevel@tonic-gate 				} else {
3340*0Sstevel@tonic-gate 					/* Any other failure */
3341*0Sstevel@tonic-gate 					return (-1);
3342*0Sstevel@tonic-gate 				}
3343*0Sstevel@tonic-gate 			} else {
3344*0Sstevel@tonic-gate 				if (is_owner == TRUE) {
3345*0Sstevel@tonic-gate 
3346*0Sstevel@tonic-gate 					meta_mc_log(MC_LOG5, dgettext(
3347*0Sstevel@tonic-gate 					    TEXT_DOMAIN, "Set %s previous "
3348*0Sstevel@tonic-gate 					    "master chosen %s (%d): %s"),
3349*0Sstevel@tonic-gate 					    sp->setname,
3350*0Sstevel@tonic-gate 					    sd->sd_mn_master_nodenm,
3351*0Sstevel@tonic-gate 					    sd->sd_mn_master_nodeid,
3352*0Sstevel@tonic-gate 					    meta_print_hrtime(gethrtime() -
3353*0Sstevel@tonic-gate 					    start_time));
3354*0Sstevel@tonic-gate 
3355*0Sstevel@tonic-gate 					/* Previous master is ok - done */
3356*0Sstevel@tonic-gate 					return (0);
3357*0Sstevel@tonic-gate 				}
3358*0Sstevel@tonic-gate 			}
3359*0Sstevel@tonic-gate 		}
3360*0Sstevel@tonic-gate 
3361*0Sstevel@tonic-gate 		/*
3362*0Sstevel@tonic-gate 		 * If current master is no longer in the membership list or
3363*0Sstevel@tonic-gate 		 * is no longer joined, then this node uses the following
3364*0Sstevel@tonic-gate 		 * algorithm:
3365*0Sstevel@tonic-gate 		 * - node calls RPC routine clnt_ownset to get latest
3366*0Sstevel@tonic-gate 		 *	information on which nodes are owners of diskset.
3367*0Sstevel@tonic-gate 		 * 	clnt_ownset checks on each node to see if its kernel
3368*0Sstevel@tonic-gate 		 *	has that diskset snarfed.
3369*0Sstevel@tonic-gate 		 */
3370*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3371*0Sstevel@tonic-gate 		while (nd) {
3372*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3373*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3374*0Sstevel@tonic-gate 				nd = nd->nd_next;
3375*0Sstevel@tonic-gate 				continue;
3376*0Sstevel@tonic-gate 			}
3377*0Sstevel@tonic-gate 
3378*0Sstevel@tonic-gate 			if (clnt_ownset(nd->nd_nodename, sp,
3379*0Sstevel@tonic-gate 			    &is_owner, ep) == -1) {
3380*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
3381*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
3382*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3383*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
3384*0Sstevel@tonic-gate 					return (205);
3385*0Sstevel@tonic-gate 				} else {
3386*0Sstevel@tonic-gate 					/* Any other failure */
3387*0Sstevel@tonic-gate 					return (-1);
3388*0Sstevel@tonic-gate 				}
3389*0Sstevel@tonic-gate 			}
3390*0Sstevel@tonic-gate 
3391*0Sstevel@tonic-gate 			/*
3392*0Sstevel@tonic-gate 			 * Set owner flag for each node based on whether
3393*0Sstevel@tonic-gate 			 * that node really has a diskset mddb snarfed in
3394*0Sstevel@tonic-gate 			 * or not.
3395*0Sstevel@tonic-gate 			 */
3396*0Sstevel@tonic-gate 			if (is_owner == TRUE)
3397*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_OWN;
3398*0Sstevel@tonic-gate 			else
3399*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3400*0Sstevel@tonic-gate 
3401*0Sstevel@tonic-gate 			nd = nd->nd_next;
3402*0Sstevel@tonic-gate 		}
3403*0Sstevel@tonic-gate 
3404*0Sstevel@tonic-gate 		/*
3405*0Sstevel@tonic-gate 		 * - node walks through nodelist looking for nodes that are
3406*0Sstevel@tonic-gate 		 *	owners of the diskset that are in the membership list.
3407*0Sstevel@tonic-gate 		 * - for each owner, node calls RPC routine clnt_getset to
3408*0Sstevel@tonic-gate 		 *	 see if that node has its node record set to OK.
3409*0Sstevel@tonic-gate 		 * - If so, master is chosen to be this owner node.
3410*0Sstevel@tonic-gate 		 */
3411*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3412*0Sstevel@tonic-gate 		while (nd) {
3413*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3414*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3415*0Sstevel@tonic-gate 				nd = nd->nd_next;
3416*0Sstevel@tonic-gate 				continue;
3417*0Sstevel@tonic-gate 			}
3418*0Sstevel@tonic-gate 
3419*0Sstevel@tonic-gate 			/* Don't consider a node that isn't an owner */
3420*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3421*0Sstevel@tonic-gate 				nd = nd->nd_next;
3422*0Sstevel@tonic-gate 				continue;
3423*0Sstevel@tonic-gate 			}
3424*0Sstevel@tonic-gate 
3425*0Sstevel@tonic-gate 			/* Does node have its own node record set to OK? */
3426*0Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3427*0Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
3428*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
3429*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
3430*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3431*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
3432*0Sstevel@tonic-gate 					return (205);
3433*0Sstevel@tonic-gate 				} else {
3434*0Sstevel@tonic-gate 					/* Any other failure */
3435*0Sstevel@tonic-gate 					return (-1);
3436*0Sstevel@tonic-gate 				}
3437*0Sstevel@tonic-gate 			}
3438*0Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
3439*0Sstevel@tonic-gate 			while (nr) {
3440*0Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
3441*0Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OK) {
3442*0Sstevel@tonic-gate 						/* Found a master */
3443*0Sstevel@tonic-gate 						free_sr(
3444*0Sstevel@tonic-gate 						    (md_set_record *)mnsr);
3445*0Sstevel@tonic-gate 						goto found_master;
3446*0Sstevel@tonic-gate 					}
3447*0Sstevel@tonic-gate 				}
3448*0Sstevel@tonic-gate 				nr = nr->nr_next;
3449*0Sstevel@tonic-gate 			}
3450*0Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
3451*0Sstevel@tonic-gate 			nd = nd->nd_next;
3452*0Sstevel@tonic-gate 		}
3453*0Sstevel@tonic-gate 
3454*0Sstevel@tonic-gate 		/*
3455*0Sstevel@tonic-gate 		 * - If no owner node has its own node record on its own node
3456*0Sstevel@tonic-gate 		 *	set to OK, then this node checks all of the non-owner
3457*0Sstevel@tonic-gate 		 * 	nodes that are in the membership list.
3458*0Sstevel@tonic-gate 		 * - for each non-owner, node calls RPC routine clnt_getset to
3459*0Sstevel@tonic-gate 		 *	 see if that node has its node record set to OK.
3460*0Sstevel@tonic-gate 		 * - If set doesn't exist, don't choose node for master.
3461*0Sstevel@tonic-gate 		 * - If so, master is chosen to be this non-owner node.
3462*0Sstevel@tonic-gate 		 *
3463*0Sstevel@tonic-gate 		 */
3464*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3465*0Sstevel@tonic-gate 		while (nd) {
3466*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3467*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3468*0Sstevel@tonic-gate 				nd = nd->nd_next;
3469*0Sstevel@tonic-gate 				continue;
3470*0Sstevel@tonic-gate 			}
3471*0Sstevel@tonic-gate 
3472*0Sstevel@tonic-gate 			/* Only checking non-owner nodes this time around */
3473*0Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3474*0Sstevel@tonic-gate 				nd = nd->nd_next;
3475*0Sstevel@tonic-gate 				continue;
3476*0Sstevel@tonic-gate 			}
3477*0Sstevel@tonic-gate 
3478*0Sstevel@tonic-gate 			/* Does node have its own node record set to OK? */
3479*0Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3480*0Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
3481*0Sstevel@tonic-gate 				/*
3482*0Sstevel@tonic-gate 				 * If set doesn't exist on non-owner node,
3483*0Sstevel@tonic-gate 				 * don't consider this node for master.
3484*0Sstevel@tonic-gate 				 */
3485*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
3486*0Sstevel@tonic-gate 					nd = nd->nd_next;
3487*0Sstevel@tonic-gate 					continue;
3488*0Sstevel@tonic-gate 				} else if ((mdanyrpcerror(ep)) &&
3489*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3490*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
3491*0Sstevel@tonic-gate 					/* RPC failure to another node */
3492*0Sstevel@tonic-gate 					return (205);
3493*0Sstevel@tonic-gate 				} else {
3494*0Sstevel@tonic-gate 					/* Any other failure */
3495*0Sstevel@tonic-gate 					return (-1);
3496*0Sstevel@tonic-gate 				}
3497*0Sstevel@tonic-gate 			}
3498*0Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
3499*0Sstevel@tonic-gate 			while (nr) {
3500*0Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
3501*0Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OK) {
3502*0Sstevel@tonic-gate 						/* Found a master */
3503*0Sstevel@tonic-gate 						free_sr(
3504*0Sstevel@tonic-gate 						    (md_set_record *)mnsr);
3505*0Sstevel@tonic-gate 						goto found_master;
3506*0Sstevel@tonic-gate 					}
3507*0Sstevel@tonic-gate 				}
3508*0Sstevel@tonic-gate 				nr = nr->nr_next;
3509*0Sstevel@tonic-gate 			}
3510*0Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
3511*0Sstevel@tonic-gate 			nd = nd->nd_next;
3512*0Sstevel@tonic-gate 		}
3513*0Sstevel@tonic-gate 
3514*0Sstevel@tonic-gate 		/*
3515*0Sstevel@tonic-gate 		 * - If no node can be found that has its own node record on
3516*0Sstevel@tonic-gate 		 *	its node to be set to OK, then all alive nodes
3517*0Sstevel@tonic-gate 		 * 	were in the process of being added to or deleted
3518*0Sstevel@tonic-gate 		 *	from set.  Each alive node will remove all
3519*0Sstevel@tonic-gate 		 *	information pertaining to this set from its node.
3520*0Sstevel@tonic-gate 		 *
3521*0Sstevel@tonic-gate 		 * If all nodes in set are ALIVE, then call sdssc end routines
3522*0Sstevel@tonic-gate 		 * since set was truly being initially created or destroyed.
3523*0Sstevel@tonic-gate 		 */
3524*0Sstevel@tonic-gate 		goto delete_set;
3525*0Sstevel@tonic-gate 	} else {
3526*0Sstevel@tonic-gate 
3527*0Sstevel@tonic-gate 		/*
3528*0Sstevel@tonic-gate 		 * If node is not joined to diskset, then this
3529*0Sstevel@tonic-gate 		 * node uses the following algorithm:
3530*0Sstevel@tonic-gate 		 * - If unjoined node doesn't have a node record for itself,
3531*0Sstevel@tonic-gate 		 *	just delete the diskset since diskset was in the
3532*0Sstevel@tonic-gate 		 *	process of being created.
3533*0Sstevel@tonic-gate 		 * - node needs to find master of diskset before
3534*0Sstevel@tonic-gate 		 *	reconfig cycle, if a master existed.
3535*0Sstevel@tonic-gate 		 * - node calls RPC routine clnt_ownset to get latest
3536*0Sstevel@tonic-gate 		 * 	information on which nodes are owners of diskset.
3537*0Sstevel@tonic-gate 		 *	clnt_ownset checks on each node to see if its
3538*0Sstevel@tonic-gate 		 *	kernel has that diskset snarfed.
3539*0Sstevel@tonic-gate 		 */
3540*0Sstevel@tonic-gate 
3541*0Sstevel@tonic-gate 		/*
3542*0Sstevel@tonic-gate 		 * Is my node in the set description?
3543*0Sstevel@tonic-gate 		 * If not, delete the set from this node.
3544*0Sstevel@tonic-gate 		 * sr2setdesc sets sd_mn_mynode pointer to the node
3545*0Sstevel@tonic-gate 		 * descriptor for this node if there was a node
3546*0Sstevel@tonic-gate 		 * record for this node.
3547*0Sstevel@tonic-gate 		 *
3548*0Sstevel@tonic-gate 		 */
3549*0Sstevel@tonic-gate 		if (sd->sd_mn_mynode == NULL) {
3550*0Sstevel@tonic-gate 			goto delete_set;
3551*0Sstevel@tonic-gate 		}
3552*0Sstevel@tonic-gate 
3553*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3554*0Sstevel@tonic-gate 		while (nd) {
3555*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3556*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3557*0Sstevel@tonic-gate 				nd = nd->nd_next;
3558*0Sstevel@tonic-gate 				continue;
3559*0Sstevel@tonic-gate 			}
3560*0Sstevel@tonic-gate 
3561*0Sstevel@tonic-gate 			if (clnt_ownset(nd->nd_nodename, sp,
3562*0Sstevel@tonic-gate 			    &is_owner, ep) == -1) {
3563*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
3564*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
3565*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3566*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
3567*0Sstevel@tonic-gate 					return (205);
3568*0Sstevel@tonic-gate 				} else {
3569*0Sstevel@tonic-gate 					/* Any other failure */
3570*0Sstevel@tonic-gate 					return (-1);
3571*0Sstevel@tonic-gate 				}
3572*0Sstevel@tonic-gate 			}
3573*0Sstevel@tonic-gate 
3574*0Sstevel@tonic-gate 			/*
3575*0Sstevel@tonic-gate 			 * Set owner flag for each node based on whether
3576*0Sstevel@tonic-gate 			 * that node really has a diskset mddb snarfed in
3577*0Sstevel@tonic-gate 			 * or not.
3578*0Sstevel@tonic-gate 			 */
3579*0Sstevel@tonic-gate 			if (is_owner == TRUE)
3580*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_OWN;
3581*0Sstevel@tonic-gate 			else
3582*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
3583*0Sstevel@tonic-gate 
3584*0Sstevel@tonic-gate 			nd = nd->nd_next;
3585*0Sstevel@tonic-gate 		}
3586*0Sstevel@tonic-gate 
3587*0Sstevel@tonic-gate 		/*
3588*0Sstevel@tonic-gate 		 * - node walks through nodelist looking for nodes that
3589*0Sstevel@tonic-gate 		 *	are owners of the diskset that are in
3590*0Sstevel@tonic-gate 		 *	the membership list.
3591*0Sstevel@tonic-gate 		 * - for each owner, node calls RPC routine clnt_getset to
3592*0Sstevel@tonic-gate 		 *	see if that node has a master set and to get the
3593*0Sstevel@tonic-gate 		 *	diskset description.
3594*0Sstevel@tonic-gate 		 * - If the owner node has a set description that doesn't
3595*0Sstevel@tonic-gate 		 *	include the non-joined node in the nodelist, this node
3596*0Sstevel@tonic-gate 		 *	removes its set description of that diskset
3597*0Sstevel@tonic-gate 		 *	(i.e. removes the set from its local mddbs).  This is
3598*0Sstevel@tonic-gate 		 *	handling the case of when a node was removed from a
3599*0Sstevel@tonic-gate 		 *	diskset while it was not in the cluster membership
3600*0Sstevel@tonic-gate 		 *	list.
3601*0Sstevel@tonic-gate 		 * - If that node has a master set and the master is in the
3602*0Sstevel@tonic-gate 		 *	membership list and is an owner, then either this was
3603*0Sstevel@tonic-gate 		 *	the master from before the reconfig cycle or this
3604*0Sstevel@tonic-gate 		 *	node has already chosen a new master - either way,
3605*0Sstevel@tonic-gate 		 *	the master value is valid as long as it is in the
3606*0Sstevel@tonic-gate 		 *	membership list and is an owner
3607*0Sstevel@tonic-gate 		 * - master is chosen to be owner node's master
3608*0Sstevel@tonic-gate 		 */
3609*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3610*0Sstevel@tonic-gate 		while (nd) {
3611*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3612*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3613*0Sstevel@tonic-gate 				nd = nd->nd_next;
3614*0Sstevel@tonic-gate 				continue;
3615*0Sstevel@tonic-gate 			}
3616*0Sstevel@tonic-gate 
3617*0Sstevel@tonic-gate 			/* Don't consider a node that isn't an owner */
3618*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3619*0Sstevel@tonic-gate 				nd = nd->nd_next;
3620*0Sstevel@tonic-gate 				continue;
3621*0Sstevel@tonic-gate 			}
3622*0Sstevel@tonic-gate 
3623*0Sstevel@tonic-gate 			/* Get owner node's set record */
3624*0Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3625*0Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
3626*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
3627*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
3628*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3629*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
3630*0Sstevel@tonic-gate 					return (205);
3631*0Sstevel@tonic-gate 				} else {
3632*0Sstevel@tonic-gate 					/* Any other failure */
3633*0Sstevel@tonic-gate 					return (-1);
3634*0Sstevel@tonic-gate 				}
3635*0Sstevel@tonic-gate 			}
3636*0Sstevel@tonic-gate 
3637*0Sstevel@tonic-gate 			/* Is this node in the owner node's set record */
3638*0Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
3639*0Sstevel@tonic-gate 			while (nr) {
3640*0Sstevel@tonic-gate 				if (sd->sd_mn_mynode->nd_nodeid ==
3641*0Sstevel@tonic-gate 				    nr->nr_nodeid) {
3642*0Sstevel@tonic-gate 					break;
3643*0Sstevel@tonic-gate 				}
3644*0Sstevel@tonic-gate 				nr = nr->nr_next;
3645*0Sstevel@tonic-gate 			}
3646*0Sstevel@tonic-gate 			if (nr == NULL) {
3647*0Sstevel@tonic-gate 				/* my node not found - delete set */
3648*0Sstevel@tonic-gate 				free_sr((md_set_record *)mnsr);
3649*0Sstevel@tonic-gate 				goto delete_set;
3650*0Sstevel@tonic-gate 			}
3651*0Sstevel@tonic-gate 
3652*0Sstevel@tonic-gate 			/* Is owner's node's master valid? */
3653*0Sstevel@tonic-gate 			master_nodeid = mnsr->sr_master_nodeid;
3654*0Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
3655*0Sstevel@tonic-gate 			if (master_nodeid == MD_MN_INVALID_NID) {
3656*0Sstevel@tonic-gate 				nd = nd->nd_next;
3657*0Sstevel@tonic-gate 				continue;
3658*0Sstevel@tonic-gate 			}
3659*0Sstevel@tonic-gate 
3660*0Sstevel@tonic-gate 			nd2 = sd->sd_nodelist;
3661*0Sstevel@tonic-gate 			while (nd2) {
3662*0Sstevel@tonic-gate 				if ((nd2->nd_nodeid == master_nodeid) &&
3663*0Sstevel@tonic-gate 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
3664*0Sstevel@tonic-gate 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
3665*0Sstevel@tonic-gate 						nd = nd2;
3666*0Sstevel@tonic-gate 						goto found_master;
3667*0Sstevel@tonic-gate 				}
3668*0Sstevel@tonic-gate 				nd2 = nd2->nd_next;
3669*0Sstevel@tonic-gate 			}
3670*0Sstevel@tonic-gate 			nd = nd->nd_next;
3671*0Sstevel@tonic-gate 		}
3672*0Sstevel@tonic-gate 
3673*0Sstevel@tonic-gate 		/*
3674*0Sstevel@tonic-gate 		 * - If no owner node has a valid master, then follow
3675*0Sstevel@tonic-gate 		 * 	algorithm of when a node is joined to the diskset.
3676*0Sstevel@tonic-gate 		 * - node walks through nodelist looking for nodes that are
3677*0Sstevel@tonic-gate 		 *	owners of the diskset that are in the membership list.
3678*0Sstevel@tonic-gate 		 * - for each owner, node calls RPC routine clnt_getset to
3679*0Sstevel@tonic-gate 		 *	 see if that node has its node record set to OK.
3680*0Sstevel@tonic-gate 		 * - If so, master is chosen to be this owner node.
3681*0Sstevel@tonic-gate 		 */
3682*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3683*0Sstevel@tonic-gate 		while (nd) {
3684*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3685*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3686*0Sstevel@tonic-gate 				nd = nd->nd_next;
3687*0Sstevel@tonic-gate 				continue;
3688*0Sstevel@tonic-gate 			}
3689*0Sstevel@tonic-gate 
3690*0Sstevel@tonic-gate 			/* Don't consider a node that isn't an owner */
3691*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
3692*0Sstevel@tonic-gate 				nd = nd->nd_next;
3693*0Sstevel@tonic-gate 				continue;
3694*0Sstevel@tonic-gate 			}
3695*0Sstevel@tonic-gate 
3696*0Sstevel@tonic-gate 			/* Does node have its own node record set to OK? */
3697*0Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3698*0Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
3699*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
3700*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
3701*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
3702*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
3703*0Sstevel@tonic-gate 					return (205);
3704*0Sstevel@tonic-gate 				} else {
3705*0Sstevel@tonic-gate 					/* Any other failure */
3706*0Sstevel@tonic-gate 					return (-1);
3707*0Sstevel@tonic-gate 				}
3708*0Sstevel@tonic-gate 			}
3709*0Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
3710*0Sstevel@tonic-gate 			while (nr) {
3711*0Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
3712*0Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OK) {
3713*0Sstevel@tonic-gate 						/* Found a master */
3714*0Sstevel@tonic-gate 						free_sr(
3715*0Sstevel@tonic-gate 						    (md_set_record *)mnsr);
3716*0Sstevel@tonic-gate 						goto found_master;
3717*0Sstevel@tonic-gate 					}
3718*0Sstevel@tonic-gate 				}
3719*0Sstevel@tonic-gate 				nr = nr->nr_next;
3720*0Sstevel@tonic-gate 			}
3721*0Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
3722*0Sstevel@tonic-gate 			nd = nd->nd_next;
3723*0Sstevel@tonic-gate 		}
3724*0Sstevel@tonic-gate 
3725*0Sstevel@tonic-gate 		/*
3726*0Sstevel@tonic-gate 		 * - If no owner node has its own node record on its own node
3727*0Sstevel@tonic-gate 		 *	set to OK, then this node checks all of the non-owner
3728*0Sstevel@tonic-gate 		 *	nodes that are in the membership list.
3729*0Sstevel@tonic-gate 		 * - for each non-owner, node calls RPC routine clnt_getset to
3730*0Sstevel@tonic-gate 		 *	see if that node has its node record set to OK.
3731*0Sstevel@tonic-gate 		 * - If set doesn't exist, don't choose node for master.
3732*0Sstevel@tonic-gate 		 * - If this node doesn't exist in the nodelist on any of the
3733*0Sstevel@tonic-gate 		 *	non-owner nodes, this node removes its set description
3734*0Sstevel@tonic-gate 		 *	of that diskset (i.e. removes the set from its local
3735*0Sstevel@tonic-gate 		 *	mddbs). This is handling the case of when a node was
3736*0Sstevel@tonic-gate 		 *	removed from a diskset while it was not in the
3737*0Sstevel@tonic-gate 		 *	cluster membership list.
3738*0Sstevel@tonic-gate 		 * - If non-owner node has its node record set to OK and if
3739*0Sstevel@tonic-gate 		 *	this node hasn't removed this diskset (step directly
3740*0Sstevel@tonic-gate 		 *	before this one), then the master is chosen to be this
3741*0Sstevel@tonic-gate 		 *	non-owner node.
3742*0Sstevel@tonic-gate 		 */
3743*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3744*0Sstevel@tonic-gate 		while (nd) {
3745*0Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
3746*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3747*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
3748*0Sstevel@tonic-gate 				nd = nd->nd_next;
3749*0Sstevel@tonic-gate 				continue;
3750*0Sstevel@tonic-gate 			}
3751*0Sstevel@tonic-gate 
3752*0Sstevel@tonic-gate 			/* Don't consider owner nodes since none are OK */
3753*0Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_OWN) {
3754*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
3755*0Sstevel@tonic-gate 				nd = nd->nd_next;
3756*0Sstevel@tonic-gate 				continue;
3757*0Sstevel@tonic-gate 			}
3758*0Sstevel@tonic-gate 
3759*0Sstevel@tonic-gate 			/*
3760*0Sstevel@tonic-gate 			 * Don't need to get nodelist from my node since
3761*0Sstevel@tonic-gate 			 * this is where sd_nodelist was obtained.
3762*0Sstevel@tonic-gate 			 */
3763*0Sstevel@tonic-gate 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3764*0Sstevel@tonic-gate 				nd = nd->nd_next;
3765*0Sstevel@tonic-gate 				continue;
3766*0Sstevel@tonic-gate 			}
3767*0Sstevel@tonic-gate 
3768*0Sstevel@tonic-gate 			/*
3769*0Sstevel@tonic-gate 			 * If node has already been decided against for
3770*0Sstevel@tonic-gate 			 * master, then skip it.
3771*0Sstevel@tonic-gate 			 */
3772*0Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_DEL) {
3773*0Sstevel@tonic-gate 				nd = nd->nd_next;
3774*0Sstevel@tonic-gate 				continue;
3775*0Sstevel@tonic-gate 			}
3776*0Sstevel@tonic-gate 
3777*0Sstevel@tonic-gate 			/*
3778*0Sstevel@tonic-gate 			 * Does node in my nodelist have its own node
3779*0Sstevel@tonic-gate 			 * record marked OK on its node?  And does node
3780*0Sstevel@tonic-gate 			 * in my nodelist exist on all other nodes?
3781*0Sstevel@tonic-gate 			 * Don't want to choose a node for master unless
3782*0Sstevel@tonic-gate 			 * that node is marked OK on its own node and that
3783*0Sstevel@tonic-gate 			 * node exists on all other alive nodes.
3784*0Sstevel@tonic-gate 			 *
3785*0Sstevel@tonic-gate 			 * This is guarding against the case when several
3786*0Sstevel@tonic-gate 			 * nodes are down and one of the downed nodes is
3787*0Sstevel@tonic-gate 			 * deleted from the diskset.  When the down nodes
3788*0Sstevel@tonic-gate 			 * are rebooted into the cluster, you don't want
3789*0Sstevel@tonic-gate 			 * any node to pick the deleted node as the master.
3790*0Sstevel@tonic-gate 			 */
3791*0Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
3792*0Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
3793*0Sstevel@tonic-gate 				/*
3794*0Sstevel@tonic-gate 				 * If set doesn't exist on non-owner node,
3795*0Sstevel@tonic-gate 				 * don't consider this node for master.
3796*0Sstevel@tonic-gate 				 */
3797*0Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
3798*0Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_DEL;
3799*0Sstevel@tonic-gate 					nd = nd->nd_next;
3800*0Sstevel@tonic-gate 					continue;
3801*0Sstevel@tonic-gate 				} else if (mdanyrpcerror(ep)) {
3802*0Sstevel@tonic-gate 					/* RPC failure to another node */
3803*0Sstevel@tonic-gate 					return (205);
3804*0Sstevel@tonic-gate 				} else {
3805*0Sstevel@tonic-gate 					/* Any other failure */
3806*0Sstevel@tonic-gate 					return (-1);
3807*0Sstevel@tonic-gate 				}
3808*0Sstevel@tonic-gate 			}
3809*0Sstevel@tonic-gate 			/*
3810*0Sstevel@tonic-gate 			 * Is my node in the nodelist gotten from the other
3811*0Sstevel@tonic-gate 			 * node?  If not, then remove the set from my node
3812*0Sstevel@tonic-gate 			 * since set was deleted from my node while my node
3813*0Sstevel@tonic-gate 			 * was out of the cluster.
3814*0Sstevel@tonic-gate 			 */
3815*0Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
3816*0Sstevel@tonic-gate 			while (nr) {
3817*0Sstevel@tonic-gate 				if (sd->sd_mn_mynode->nd_nodeid ==
3818*0Sstevel@tonic-gate 				    nr->nr_nodeid) {
3819*0Sstevel@tonic-gate 					break;
3820*0Sstevel@tonic-gate 				}
3821*0Sstevel@tonic-gate 				nr = nr->nr_next;
3822*0Sstevel@tonic-gate 			}
3823*0Sstevel@tonic-gate 			if (nr == NULL) {
3824*0Sstevel@tonic-gate 				/* my node not found - delete set */
3825*0Sstevel@tonic-gate 				free_sr((md_set_record *)mnsr);
3826*0Sstevel@tonic-gate 				goto delete_set;
3827*0Sstevel@tonic-gate 			}
3828*0Sstevel@tonic-gate 
3829*0Sstevel@tonic-gate 			/* Is node being checked marked OK on its own node? */
3830*0Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
3831*0Sstevel@tonic-gate 			while (nr) {
3832*0Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
3833*0Sstevel@tonic-gate 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
3834*0Sstevel@tonic-gate 						nd->nd_flags |= MD_MN_NODE_DEL;
3835*0Sstevel@tonic-gate 					}
3836*0Sstevel@tonic-gate 					break;
3837*0Sstevel@tonic-gate 				}
3838*0Sstevel@tonic-gate 				nr = nr->nr_next;
3839*0Sstevel@tonic-gate 			}
3840*0Sstevel@tonic-gate 			/*
3841*0Sstevel@tonic-gate 			 * If node being checked doesn't exist on its
3842*0Sstevel@tonic-gate 			 * own node - don't choose it as master.
3843*0Sstevel@tonic-gate 			 */
3844*0Sstevel@tonic-gate 			if (nr == NULL) {
3845*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
3846*0Sstevel@tonic-gate 			}
3847*0Sstevel@tonic-gate 
3848*0Sstevel@tonic-gate 			/*
3849*0Sstevel@tonic-gate 			 * Check every node in my node's nodelist against
3850*0Sstevel@tonic-gate 			 * the nodelist gotten from the other node.
3851*0Sstevel@tonic-gate 			 * If a node in my node's nodelist is not found in the
3852*0Sstevel@tonic-gate 			 * other node's nodelist, then set the DEL flag.
3853*0Sstevel@tonic-gate 			 */
3854*0Sstevel@tonic-gate 			nd2 = sd->sd_nodelist;
3855*0Sstevel@tonic-gate 			while (nd2) {
3856*0Sstevel@tonic-gate 				nr = mnsr->sr_nodechain;
3857*0Sstevel@tonic-gate 				while (nr) {
3858*0Sstevel@tonic-gate 					if (nd2->nd_nodeid == nr->nr_nodeid) {
3859*0Sstevel@tonic-gate 						break;
3860*0Sstevel@tonic-gate 					}
3861*0Sstevel@tonic-gate 					nr = nr->nr_next;
3862*0Sstevel@tonic-gate 				}
3863*0Sstevel@tonic-gate 				/* nd2 not found in other node's nodelist */
3864*0Sstevel@tonic-gate 				if (nr == NULL) {
3865*0Sstevel@tonic-gate 					nd2->nd_flags |= MD_MN_NODE_DEL;
3866*0Sstevel@tonic-gate 				}
3867*0Sstevel@tonic-gate 				nd2 = nd2->nd_next;
3868*0Sstevel@tonic-gate 			}
3869*0Sstevel@tonic-gate 
3870*0Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
3871*0Sstevel@tonic-gate 			nd = nd->nd_next;
3872*0Sstevel@tonic-gate 		}
3873*0Sstevel@tonic-gate 
3874*0Sstevel@tonic-gate 		/*
3875*0Sstevel@tonic-gate 		 * Rescan list looking for a node that has not been marked DEL.
3876*0Sstevel@tonic-gate 		 * First node found is the master.
3877*0Sstevel@tonic-gate 		 */
3878*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
3879*0Sstevel@tonic-gate 		while (nd) {
3880*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
3881*0Sstevel@tonic-gate 				break;
3882*0Sstevel@tonic-gate 			}
3883*0Sstevel@tonic-gate 			nd = nd->nd_next;
3884*0Sstevel@tonic-gate 			continue;
3885*0Sstevel@tonic-gate 		}
3886*0Sstevel@tonic-gate 		if (nd) {
3887*0Sstevel@tonic-gate 			/* Found a master */
3888*0Sstevel@tonic-gate 			goto found_master;
3889*0Sstevel@tonic-gate 		}
3890*0Sstevel@tonic-gate 
3891*0Sstevel@tonic-gate 		/*
3892*0Sstevel@tonic-gate 		 * - If no node can be found that has its own node record on
3893*0Sstevel@tonic-gate 		 *	its node to be set to OK, then all alive nodes
3894*0Sstevel@tonic-gate 		 * 	were in the process of being added to or deleted
3895*0Sstevel@tonic-gate 		 *	from set.  Each alive node will remove all
3896*0Sstevel@tonic-gate 		 *	information pertaining to this set from its node.
3897*0Sstevel@tonic-gate 		 *
3898*0Sstevel@tonic-gate 		 * If all nodes in set are ALIVE, then call sdssc end routines
3899*0Sstevel@tonic-gate 		 * since set was truly being initially created or destroyed.
3900*0Sstevel@tonic-gate 		 */
3901*0Sstevel@tonic-gate 		goto delete_set;
3902*0Sstevel@tonic-gate 	}
3903*0Sstevel@tonic-gate 
3904*0Sstevel@tonic-gate found_master:
3905*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3906*0Sstevel@tonic-gate 	    "Set %s master chosen %s (%d): %s"),
3907*0Sstevel@tonic-gate 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
3908*0Sstevel@tonic-gate 	    meta_print_hrtime(gethrtime() - start_time));
3909*0Sstevel@tonic-gate 
3910*0Sstevel@tonic-gate 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
3911*0Sstevel@tonic-gate 		return (-1);
3912*0Sstevel@tonic-gate 	}
3913*0Sstevel@tonic-gate 
3914*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
3915*0Sstevel@tonic-gate 
3916*0Sstevel@tonic-gate 	if (clnt_mnsetmaster(mynode(), sp,
3917*0Sstevel@tonic-gate 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
3918*0Sstevel@tonic-gate 		rval = -1;
3919*0Sstevel@tonic-gate 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
3920*0Sstevel@tonic-gate 		/* If this node is new master, set flag in this node's kernel */
3921*0Sstevel@tonic-gate 		(void) memset(&sf, 0, sizeof (sf));
3922*0Sstevel@tonic-gate 		sf.sf_setno = sp->setno;
3923*0Sstevel@tonic-gate 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
3924*0Sstevel@tonic-gate 		/* Use magic to help protect ioctl against attack. */
3925*0Sstevel@tonic-gate 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
3926*0Sstevel@tonic-gate 		sf.sf_flags = MDDB_NM_SET;
3927*0Sstevel@tonic-gate 
3928*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3929*0Sstevel@tonic-gate 		    "Setting new master flag for set %s: %s"),
3930*0Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3931*0Sstevel@tonic-gate 
3932*0Sstevel@tonic-gate 		/*
3933*0Sstevel@tonic-gate 		 * Fail reconfig cycle if ioctl fails since it is critical
3934*0Sstevel@tonic-gate 		 * to set new master flag.
3935*0Sstevel@tonic-gate 		 */
3936*0Sstevel@tonic-gate 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
3937*0Sstevel@tonic-gate 		    NULL) != NULL) {
3938*0Sstevel@tonic-gate 			(void) mdstealerror(ep, &sf.sf_mde);
3939*0Sstevel@tonic-gate 			rval = -1;
3940*0Sstevel@tonic-gate 		}
3941*0Sstevel@tonic-gate 	}
3942*0Sstevel@tonic-gate 
3943*0Sstevel@tonic-gate 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
3944*0Sstevel@tonic-gate 		if (rval == 0) {
3945*0Sstevel@tonic-gate 			(void) mdstealerror(ep, &xep);
3946*0Sstevel@tonic-gate 			rval = -1;
3947*0Sstevel@tonic-gate 		}
3948*0Sstevel@tonic-gate 	}
3949*0Sstevel@tonic-gate 
3950*0Sstevel@tonic-gate 	cl_set_setkey(NULL);
3951*0Sstevel@tonic-gate 
3952*0Sstevel@tonic-gate 	metaflushsetname(sp);
3953*0Sstevel@tonic-gate 
3954*0Sstevel@tonic-gate 	return (rval);
3955*0Sstevel@tonic-gate 
3956*0Sstevel@tonic-gate delete_set:
3957*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
3958*0Sstevel@tonic-gate 	    "Master not chosen, deleting set %s: %s"),
3959*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
3960*0Sstevel@tonic-gate 
3961*0Sstevel@tonic-gate 	/*
3962*0Sstevel@tonic-gate 	 * Remove all set information from this node:
3963*0Sstevel@tonic-gate 	 *	- node records for this set
3964*0Sstevel@tonic-gate 	 *	- drive records for this set
3965*0Sstevel@tonic-gate 	 *	- set record for this set
3966*0Sstevel@tonic-gate 	 * (Only do this on this node since each node
3967*0Sstevel@tonic-gate 	 * will do it for its own local mddb.)
3968*0Sstevel@tonic-gate 	 *
3969*0Sstevel@tonic-gate 	 * If all nodes in set are ALIVE, then
3970*0Sstevel@tonic-gate 	 * the lowest numbered ALIVE nodeid in set
3971*0Sstevel@tonic-gate 	 * (irregardless of whether an owner node or not) will
3972*0Sstevel@tonic-gate 	 * call the DCS service to cleanup for create/delete of set.
3973*0Sstevel@tonic-gate 	 *   sdssc_create_end(cleanup) if set was being created or
3974*0Sstevel@tonic-gate 	 *   sdssc_delete_end(cleanup) if set was being deleted.
3975*0Sstevel@tonic-gate 	 * A node record with flag ADD denotes a set being
3976*0Sstevel@tonic-gate 	 * created.  A node record with flag DEL denotes a
3977*0Sstevel@tonic-gate 	 * set being deleted.
3978*0Sstevel@tonic-gate 	 */
3979*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
3980*0Sstevel@tonic-gate 	while (nd) {
3981*0Sstevel@tonic-gate 		/* Found a node that isn't alive */
3982*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
3983*0Sstevel@tonic-gate 			break;
3984*0Sstevel@tonic-gate 
3985*0Sstevel@tonic-gate 		/* Is my node the lowest numbered ALIVE node? */
3986*0Sstevel@tonic-gate 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
3987*0Sstevel@tonic-gate 			break;
3988*0Sstevel@tonic-gate 		}
3989*0Sstevel@tonic-gate 		nd = nd->nd_next;
3990*0Sstevel@tonic-gate 	}
3991*0Sstevel@tonic-gate 	if (nd == NULL) {
3992*0Sstevel@tonic-gate 		/* All nodes ALIVE and this is the lowest nodeid */
3993*0Sstevel@tonic-gate 		lowest_alive_nodeid = 1;
3994*0Sstevel@tonic-gate 	}
3995*0Sstevel@tonic-gate 
3996*0Sstevel@tonic-gate 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
3997*0Sstevel@tonic-gate 		return (-1);
3998*0Sstevel@tonic-gate 	}
3999*0Sstevel@tonic-gate 
4000*0Sstevel@tonic-gate 
4001*0Sstevel@tonic-gate 	/*
4002*0Sstevel@tonic-gate 	 * If this node had been joined, withdraw and reset master.
4003*0Sstevel@tonic-gate 	 *
4004*0Sstevel@tonic-gate 	 * This could happen if a node was being added to or removed
4005*0Sstevel@tonic-gate 	 * from a diskset and the node doing the add/delete operation and
4006*0Sstevel@tonic-gate 	 * all other nodes in the diskset have left the cluster.
4007*0Sstevel@tonic-gate 	 */
4008*0Sstevel@tonic-gate 	if (sd->sd_mn_mynode) {
4009*0Sstevel@tonic-gate 		nd = sd->sd_mn_mynode;
4010*0Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_OWN) {
4011*0Sstevel@tonic-gate 			if (clnt_withdrawset(mynode(), sp, ep)) {
4012*0Sstevel@tonic-gate 				rval = -1;
4013*0Sstevel@tonic-gate 				goto out;
4014*0Sstevel@tonic-gate 			}
4015*0Sstevel@tonic-gate 			if (clnt_mnsetmaster(mynode(), sp, "",
4016*0Sstevel@tonic-gate 			    MD_MN_INVALID_NID, ep)) {
4017*0Sstevel@tonic-gate 				rval = -1;
4018*0Sstevel@tonic-gate 				goto out;
4019*0Sstevel@tonic-gate 			}
4020*0Sstevel@tonic-gate 		}
4021*0Sstevel@tonic-gate 	}
4022*0Sstevel@tonic-gate 
4023*0Sstevel@tonic-gate 	/*
4024*0Sstevel@tonic-gate 	 * Remove side records for this node (side) from local mddb
4025*0Sstevel@tonic-gate 	 * (clnt_deldrvs does this) if there are drives in the set.
4026*0Sstevel@tonic-gate 	 *
4027*0Sstevel@tonic-gate 	 * Don't need to mark this node as DEL since already marked as
4028*0Sstevel@tonic-gate 	 * ADD or DEL (or this node would have been chosen as master).
4029*0Sstevel@tonic-gate 	 * Don't need to mark other node records, drive records or
4030*0Sstevel@tonic-gate 	 * set records as DEL.  If a panic occurs during clnt_delset,
4031*0Sstevel@tonic-gate 	 * these records will be deleted the next time this node
4032*0Sstevel@tonic-gate 	 * becomes a member and goes through the reconfig cycle.
4033*0Sstevel@tonic-gate 	 */
4034*0Sstevel@tonic-gate 	/* Get the drive descriptors for this set */
4035*0Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4036*0Sstevel@tonic-gate 	    ep)) == NULL) {
4037*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
4038*0Sstevel@tonic-gate 			/*
4039*0Sstevel@tonic-gate 			 * Ignore and clear out any failures from
4040*0Sstevel@tonic-gate 			 * metaget_drivedesc since a panic could have
4041*0Sstevel@tonic-gate 			 * occurred when a node was partially added to a set.
4042*0Sstevel@tonic-gate 			 */
4043*0Sstevel@tonic-gate 			mdclrerror(ep);
4044*0Sstevel@tonic-gate 		}
4045*0Sstevel@tonic-gate 	} else {
4046*0Sstevel@tonic-gate 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
4047*0Sstevel@tonic-gate 			rval = -1;
4048*0Sstevel@tonic-gate 			goto out;
4049*0Sstevel@tonic-gate 		}
4050*0Sstevel@tonic-gate 	}
4051*0Sstevel@tonic-gate 
4052*0Sstevel@tonic-gate 	/*
4053*0Sstevel@tonic-gate 	 * Now, delete the set - this removes the node, drive
4054*0Sstevel@tonic-gate 	 * and set records from the local mddb.
4055*0Sstevel@tonic-gate 	 */
4056*0Sstevel@tonic-gate 	if (clnt_delset(mynode(), sp, ep)) {
4057*0Sstevel@tonic-gate 		rval = -1;
4058*0Sstevel@tonic-gate 		goto out;
4059*0Sstevel@tonic-gate 	}
4060*0Sstevel@tonic-gate 
4061*0Sstevel@tonic-gate out:
4062*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4063*0Sstevel@tonic-gate 
4064*0Sstevel@tonic-gate 	/*
4065*0Sstevel@tonic-gate 	 * Ignore errors from unlock of set since set is no longer
4066*0Sstevel@tonic-gate 	 * known (if clnt_delset worked).
4067*0Sstevel@tonic-gate 	 */
4068*0Sstevel@tonic-gate 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
4069*0Sstevel@tonic-gate 		mdclrerror(&xep);
4070*0Sstevel@tonic-gate 	}
4071*0Sstevel@tonic-gate 
4072*0Sstevel@tonic-gate 	cl_set_setkey(NULL);
4073*0Sstevel@tonic-gate 
4074*0Sstevel@tonic-gate 	metaflushsetname(sp);
4075*0Sstevel@tonic-gate 
4076*0Sstevel@tonic-gate 	/*
4077*0Sstevel@tonic-gate 	 * If this node is the lowest numbered nodeid then
4078*0Sstevel@tonic-gate 	 * call sdssc_create/delete_end depending on whether
4079*0Sstevel@tonic-gate 	 * this node is marked as ADD or DEL in the node record.
4080*0Sstevel@tonic-gate 	 */
4081*0Sstevel@tonic-gate 	if (lowest_alive_nodeid) {
4082*0Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_ADD)
4083*0Sstevel@tonic-gate 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
4084*0Sstevel@tonic-gate 		else if (nd->nd_flags & MD_MN_NODE_DEL)
4085*0Sstevel@tonic-gate 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
4086*0Sstevel@tonic-gate 	}
4087*0Sstevel@tonic-gate 
4088*0Sstevel@tonic-gate 	/* Finished with this set -- return */
4089*0Sstevel@tonic-gate 	return (rval);
4090*0Sstevel@tonic-gate }
4091*0Sstevel@tonic-gate 
4092*0Sstevel@tonic-gate /*
4093*0Sstevel@tonic-gate  * Reconfig step to choose a new master for all MN disksets.
4094*0Sstevel@tonic-gate  * Return values:
4095*0Sstevel@tonic-gate  *	0 - Everything is great.
4096*0Sstevel@tonic-gate  *	1 - This node failed to reconfig.
4097*0Sstevel@tonic-gate  *	205 - Cause another reconfig due to a nodelist problem
4098*0Sstevel@tonic-gate  *		or RPC failure to another node
4099*0Sstevel@tonic-gate  */
4100*0Sstevel@tonic-gate int
4101*0Sstevel@tonic-gate meta_reconfig_choose_master(
4102*0Sstevel@tonic-gate 	md_error_t	*ep
4103*0Sstevel@tonic-gate )
4104*0Sstevel@tonic-gate {
4105*0Sstevel@tonic-gate 	set_t				max_sets, setno;
4106*0Sstevel@tonic-gate 	int				nodecnt;
4107*0Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl;
4108*0Sstevel@tonic-gate 	md_set_desc			*sd;
4109*0Sstevel@tonic-gate 	mdsetname_t			*sp;
4110*0Sstevel@tonic-gate 	int				rval = 0;
4111*0Sstevel@tonic-gate 	mddb_setflags_config_t		sf;
4112*0Sstevel@tonic-gate 	int				start_node_delayed = 0;
4113*0Sstevel@tonic-gate 
4114*0Sstevel@tonic-gate 	if ((max_sets = get_max_sets(ep)) == 0) {
4115*0Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
4116*0Sstevel@tonic-gate 		    "Unable to get number of sets"));
4117*0Sstevel@tonic-gate 		return (1);
4118*0Sstevel@tonic-gate 	}
4119*0Sstevel@tonic-gate 
4120*0Sstevel@tonic-gate 	/*
4121*0Sstevel@tonic-gate 	 * Get membershiplist from API routine.  If there's
4122*0Sstevel@tonic-gate 	 * an error, return a 205 to cause another reconfig.
4123*0Sstevel@tonic-gate 	 */
4124*0Sstevel@tonic-gate 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
4125*0Sstevel@tonic-gate 		mde_perror(ep, "");
4126*0Sstevel@tonic-gate 		return (205);
4127*0Sstevel@tonic-gate 	}
4128*0Sstevel@tonic-gate 
4129*0Sstevel@tonic-gate 	for (setno = 1; setno < max_sets; setno++) {
4130*0Sstevel@tonic-gate 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
4131*0Sstevel@tonic-gate 			if (mdiserror(ep, MDE_NO_SET)) {
4132*0Sstevel@tonic-gate 				/* No set for this setno - continue */
4133*0Sstevel@tonic-gate 				mdclrerror(ep);
4134*0Sstevel@tonic-gate 				continue;
4135*0Sstevel@tonic-gate 			} else {
4136*0Sstevel@tonic-gate 				/*
4137*0Sstevel@tonic-gate 				 * If encountered an RPC error from my node,
4138*0Sstevel@tonic-gate 				 * then immediately fail.
4139*0Sstevel@tonic-gate 				 */
4140*0Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
4141*0Sstevel@tonic-gate 					mde_perror(ep, "");
4142*0Sstevel@tonic-gate 					return (1);
4143*0Sstevel@tonic-gate 				}
4144*0Sstevel@tonic-gate 				/* Can't get set information */
4145*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4146*0Sstevel@tonic-gate 					"Unable to get information for "
4147*0Sstevel@tonic-gate 					"set number %d"), setno);
4148*0Sstevel@tonic-gate 				mdclrerror(ep);
4149*0Sstevel@tonic-gate 				continue;
4150*0Sstevel@tonic-gate 			}
4151*0Sstevel@tonic-gate 		}
4152*0Sstevel@tonic-gate 
4153*0Sstevel@tonic-gate 		/* If setname is there, set desc should exist. */
4154*0Sstevel@tonic-gate 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4155*0Sstevel@tonic-gate 			/*
4156*0Sstevel@tonic-gate 			 * If encountered an RPC error from my node,
4157*0Sstevel@tonic-gate 			 * then immediately fail.
4158*0Sstevel@tonic-gate 			 */
4159*0Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
4160*0Sstevel@tonic-gate 				mde_perror(ep, "");
4161*0Sstevel@tonic-gate 				return (1);
4162*0Sstevel@tonic-gate 			}
4163*0Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
4164*0Sstevel@tonic-gate 				"Unable to get set %s desc information"),
4165*0Sstevel@tonic-gate 				sp->setname);
4166*0Sstevel@tonic-gate 			mdclrerror(ep);
4167*0Sstevel@tonic-gate 			continue;
4168*0Sstevel@tonic-gate 		}
4169*0Sstevel@tonic-gate 
4170*0Sstevel@tonic-gate 		/* Only reconfig MN disksets */
4171*0Sstevel@tonic-gate 		if (!MD_MNSET_DESC(sd)) {
4172*0Sstevel@tonic-gate 			continue;
4173*0Sstevel@tonic-gate 		}
4174*0Sstevel@tonic-gate 
4175*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4176*0Sstevel@tonic-gate 		    "Begin choose master for set %s: %s"),
4177*0Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4178*0Sstevel@tonic-gate 
4179*0Sstevel@tonic-gate 		/* Update nodelist with member information. */
4180*0Sstevel@tonic-gate 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
4181*0Sstevel@tonic-gate 			/*
4182*0Sstevel@tonic-gate 			 * If encountered an RPC error from my node,
4183*0Sstevel@tonic-gate 			 * then immediately fail.
4184*0Sstevel@tonic-gate 			 */
4185*0Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
4186*0Sstevel@tonic-gate 				mde_perror(ep, "");
4187*0Sstevel@tonic-gate 				return (1);
4188*0Sstevel@tonic-gate 			}
4189*0Sstevel@tonic-gate 			mde_perror(ep, "");
4190*0Sstevel@tonic-gate 			mdclrerror(ep);
4191*0Sstevel@tonic-gate 			continue;
4192*0Sstevel@tonic-gate 		}
4193*0Sstevel@tonic-gate 
4194*0Sstevel@tonic-gate 		/*
4195*0Sstevel@tonic-gate 		 * If all nodes in a cluster are starting, then
4196*0Sstevel@tonic-gate 		 * all nodes will attempt to contact all other nodes
4197*0Sstevel@tonic-gate 		 * to determine a master node.  This can lead to a
4198*0Sstevel@tonic-gate 		 * problem where node 1 is trying to contact the rpc.metad
4199*0Sstevel@tonic-gate 		 * node 2 and node 2 is trying to contact the rpc.metad
4200*0Sstevel@tonic-gate 		 * on node 1 -- and this causes the rpc call to fail
4201*0Sstevel@tonic-gate 		 * on both nodes and causes a new reconfig cycle.
4202*0Sstevel@tonic-gate 		 *
4203*0Sstevel@tonic-gate 		 * In order to break this problem, a newly starting node
4204*0Sstevel@tonic-gate 		 * will delay a small amount of time (nodeid mod 4 seconds)
4205*0Sstevel@tonic-gate 		 * and will then run the code to choose a master for the
4206*0Sstevel@tonic-gate 		 * first set.  Delay will only be done once regardless of the
4207*0Sstevel@tonic-gate 		 * number of sets.
4208*0Sstevel@tonic-gate 		 */
4209*0Sstevel@tonic-gate 		if (start_node_delayed == 0) {
4210*0Sstevel@tonic-gate 			(void) memset(&sf, 0, sizeof (sf));
4211*0Sstevel@tonic-gate 			sf.sf_setno = sp->setno;
4212*0Sstevel@tonic-gate 			sf.sf_flags = MDDB_NM_GET;
4213*0Sstevel@tonic-gate 			/* Use magic to help protect ioctl against attack. */
4214*0Sstevel@tonic-gate 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
4215*0Sstevel@tonic-gate 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
4216*0Sstevel@tonic-gate 			    &sf.sf_mde, NULL) == 0) &&
4217*0Sstevel@tonic-gate 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
4218*0Sstevel@tonic-gate 			    MD_SET_MN_START_RC)) {
4219*0Sstevel@tonic-gate 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
4220*0Sstevel@tonic-gate 			}
4221*0Sstevel@tonic-gate 			start_node_delayed = 1;
4222*0Sstevel@tonic-gate 		}
4223*0Sstevel@tonic-gate 
4224*0Sstevel@tonic-gate 		/* Choose master for this set */
4225*0Sstevel@tonic-gate 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
4226*0Sstevel@tonic-gate 		if (rval == -1) {
4227*0Sstevel@tonic-gate 			mde_perror(ep, "");
4228*0Sstevel@tonic-gate 			return (1);
4229*0Sstevel@tonic-gate 		} else if (rval == 205) {
4230*0Sstevel@tonic-gate 			mde_perror(ep, "");
4231*0Sstevel@tonic-gate 			return (205);
4232*0Sstevel@tonic-gate 		}
4233*0Sstevel@tonic-gate 
4234*0Sstevel@tonic-gate 		/* Send new nodelist to rpc.mdcommd */
4235*0Sstevel@tonic-gate 		(void) mdmn_reinit_set(sp->setno);
4236*0Sstevel@tonic-gate 
4237*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4238*0Sstevel@tonic-gate 		    "Choose master for set %s completed: %s"),
4239*0Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4240*0Sstevel@tonic-gate 	}
4241*0Sstevel@tonic-gate 
4242*0Sstevel@tonic-gate 	/*
4243*0Sstevel@tonic-gate 	 * Each node turns on I/Os for all MN disksets.
4244*0Sstevel@tonic-gate 	 * This is to recover from the situation where the master died
4245*0Sstevel@tonic-gate 	 * during a previous reconfig cycle when I/Os were suspended
4246*0Sstevel@tonic-gate 	 * for a MN diskset.
4247*0Sstevel@tonic-gate 	 * If a failure occurs return a 1 which will force this node to
4248*0Sstevel@tonic-gate 	 * panic.  Cannot leave node in the situation where I/Os are
4249*0Sstevel@tonic-gate 	 * not resumed.
4250*0Sstevel@tonic-gate 	 */
4251*0Sstevel@tonic-gate 	setno = 0; /* 0 means all MN sets */
4252*0Sstevel@tonic-gate 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
4253*0Sstevel@tonic-gate 		mde_perror(ep, "");
4254*0Sstevel@tonic-gate 		return (1);
4255*0Sstevel@tonic-gate 	}
4256*0Sstevel@tonic-gate 
4257*0Sstevel@tonic-gate 	/* Free the nodelist */
4258*0Sstevel@tonic-gate 	if (nodecnt)
4259*0Sstevel@tonic-gate 		meta_free_nodelist(nl);
4260*0Sstevel@tonic-gate 
4261*0Sstevel@tonic-gate 	return (0);
4262*0Sstevel@tonic-gate }
4263*0Sstevel@tonic-gate 
4264*0Sstevel@tonic-gate /*
4265*0Sstevel@tonic-gate  * meta_mnsync_user_records will synchronize the diskset user records across
4266*0Sstevel@tonic-gate  * all nodes in the diskset.  The diskset user records are stored in
4267*0Sstevel@tonic-gate  * each node's local set mddb.
4268*0Sstevel@tonic-gate  *
4269*0Sstevel@tonic-gate  * This needs to be done even if there is no master change during the
4270*0Sstevel@tonic-gate  * reconfig cycle since this routine should clean up any mess left by
4271*0Sstevel@tonic-gate  * the untimely termination of a metaset or metadb command (due to a
4272*0Sstevel@tonic-gate  * node panic or to user intervention).
4273*0Sstevel@tonic-gate  *
4274*0Sstevel@tonic-gate  * Caller is the Master node.
4275*0Sstevel@tonic-gate  *
4276*0Sstevel@tonic-gate  * Returns	 0 - Success
4277*0Sstevel@tonic-gate  *		205 - Failure during RPC to another node
4278*0Sstevel@tonic-gate  *		-1 - Any other failure and ep is filled in.
4279*0Sstevel@tonic-gate  */
4280*0Sstevel@tonic-gate int
4281*0Sstevel@tonic-gate meta_mnsync_user_records(
4282*0Sstevel@tonic-gate 	mdsetname_t	*sp,
4283*0Sstevel@tonic-gate 	md_error_t	*ep
4284*0Sstevel@tonic-gate )
4285*0Sstevel@tonic-gate {
4286*0Sstevel@tonic-gate 	md_set_desc		*sd;
4287*0Sstevel@tonic-gate 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
4288*0Sstevel@tonic-gate 	md_mnset_record		*mnsr;
4289*0Sstevel@tonic-gate 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
4290*0Sstevel@tonic-gate 	md_mnnode_record	*nr;
4291*0Sstevel@tonic-gate 	md_drive_record		*dr;
4292*0Sstevel@tonic-gate 	int			dr_cnt, dd_cnt;
4293*0Sstevel@tonic-gate 	int			found_my_nr;
4294*0Sstevel@tonic-gate 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
4295*0Sstevel@tonic-gate 	int			all_drives_ok;
4296*0Sstevel@tonic-gate 	int			rval = 0;
4297*0Sstevel@tonic-gate 	int			max_genid = 0;
4298*0Sstevel@tonic-gate 	int			num_alive_nodes, num_alive_nodes_del = 0;
4299*0Sstevel@tonic-gate 	int			set_locked = 0;
4300*0Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
4301*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
4302*0Sstevel@tonic-gate 	char			*anode[1];
4303*0Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
4304*0Sstevel@tonic-gate 
4305*0Sstevel@tonic-gate 	/*
4306*0Sstevel@tonic-gate 	 * Sync up node records first.
4307*0Sstevel@tonic-gate 	 * Construct a master nodelist using the nodelist from this
4308*0Sstevel@tonic-gate 	 * node's rpc.metad node records and then setting the state of each
4309*0Sstevel@tonic-gate 	 * node following these rules:
4310*0Sstevel@tonic-gate 	 *	- If a node record is marked OK on its node, mark it OK
4311*0Sstevel@tonic-gate 	 *		in the master nodelist (and later OK on all nodes)
4312*0Sstevel@tonic-gate 	 *		If a node record is also marked OWN on its node,
4313*0Sstevel@tonic-gate 	 *		mark it OWN in the master nodelist.
4314*0Sstevel@tonic-gate 	 *	- If a node record is not marked OK on its node, then mark
4315*0Sstevel@tonic-gate 	 *		it as DEL in the master list (later deleting it)
4316*0Sstevel@tonic-gate 	 *	- If node record doesn't exist on that node, then mark it DEL
4317*0Sstevel@tonic-gate 	 *		(later deleting it)
4318*0Sstevel@tonic-gate 	 *	- If set record doesn't exist on that node, mark node as DEL
4319*0Sstevel@tonic-gate 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
4320*0Sstevel@tonic-gate 	 *	- If a node is not ALIVE, then
4321*0Sstevel@tonic-gate 	 *		- If that node marked DEL on any node - mark it DEL
4322*0Sstevel@tonic-gate 	 *			in master list but leave in nodelist
4323*0Sstevel@tonic-gate 	 *		- If that node is marked as ADD on any node, mark it
4324*0Sstevel@tonic-gate 	 *			ADD in the master list but leave in nodelist
4325*0Sstevel@tonic-gate 	 *		- When that node returns to the living, the DEL
4326*0Sstevel@tonic-gate 	 *			node record will be removed and the ADD node
4327*0Sstevel@tonic-gate 	 *			record may be removed if marked ADD on that
4328*0Sstevel@tonic-gate 	 *			node.
4329*0Sstevel@tonic-gate 	 * The key rule is to not remove a node from the nodelist until
4330*0Sstevel@tonic-gate 	 * that node record is removed from its own node.  Do not want to
4331*0Sstevel@tonic-gate 	 * remove a node's record from all other nodes and then have
4332*0Sstevel@tonic-gate 	 * that node have its own record marked OK so that a node will pick
4333*0Sstevel@tonic-gate 	 * a different master than the other nodes.
4334*0Sstevel@tonic-gate 	 *
4335*0Sstevel@tonic-gate 	 * Next,
4336*0Sstevel@tonic-gate 	 * If node is ALIVE and node record is marked DEL in master nodelist,
4337*0Sstevel@tonic-gate 	 * remove node from set.
4338*0Sstevel@tonic-gate 	 * If node is ALIVE and node record is marked OK in master nodelist,
4339*0Sstevel@tonic-gate 	 * mark it OK on all other nodes.
4340*0Sstevel@tonic-gate 	 * If node is not ALIVE and node record is marked DEL in master
4341*0Sstevel@tonic-gate 	 * nodelist, mark it DEL on all other nodes.
4342*0Sstevel@tonic-gate 	 * If node is not ALIVE and node record is marked ADD in master,
4343*0Sstevel@tonic-gate 	 * nodelist, mark it ADD on all other nodes.
4344*0Sstevel@tonic-gate 	 */
4345*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4346*0Sstevel@tonic-gate 		return (-1);
4347*0Sstevel@tonic-gate 	}
4348*0Sstevel@tonic-gate 	master_nodelist = sd->sd_nodelist;
4349*0Sstevel@tonic-gate 
4350*0Sstevel@tonic-gate 	/*
4351*0Sstevel@tonic-gate 	 * Walk through nodelist creating a master nodelist.
4352*0Sstevel@tonic-gate 	 */
4353*0Sstevel@tonic-gate 	num_alive_nodes = 0;
4354*0Sstevel@tonic-gate 	nd = master_nodelist;
4355*0Sstevel@tonic-gate 	while (nd) {
4356*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4357*0Sstevel@tonic-gate 			nd = nd->nd_next;
4358*0Sstevel@tonic-gate 			continue;
4359*0Sstevel@tonic-gate 		}
4360*0Sstevel@tonic-gate 		num_alive_nodes++;
4361*0Sstevel@tonic-gate 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
4362*0Sstevel@tonic-gate 		    MD_SET_BAD, &mnsr, ep) == -1) {
4363*0Sstevel@tonic-gate 			if (mdiserror(ep, MDE_NO_SET)) {
4364*0Sstevel@tonic-gate 				/* set doesn't exist, mark node as DEL */
4365*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OK;
4366*0Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_ADD;
4367*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
4368*0Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_NOSET;
4369*0Sstevel@tonic-gate 				nd = nd->nd_next;
4370*0Sstevel@tonic-gate 				continue;
4371*0Sstevel@tonic-gate 			} else {
4372*0Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
4373*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
4374*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
4375*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
4376*0Sstevel@tonic-gate 					rval = 205;
4377*0Sstevel@tonic-gate 				} else {
4378*0Sstevel@tonic-gate 					/* Any other failure */
4379*0Sstevel@tonic-gate 					rval = -1;
4380*0Sstevel@tonic-gate 				}
4381*0Sstevel@tonic-gate 				goto out;
4382*0Sstevel@tonic-gate 			}
4383*0Sstevel@tonic-gate 		}
4384*0Sstevel@tonic-gate 		/* Find biggest genid in records for this diskset */
4385*0Sstevel@tonic-gate 		if (mnsr->sr_genid > max_genid)
4386*0Sstevel@tonic-gate 			max_genid = mnsr->sr_genid;
4387*0Sstevel@tonic-gate 
4388*0Sstevel@tonic-gate 		dr = mnsr->sr_drivechain;
4389*0Sstevel@tonic-gate 		while (dr) {
4390*0Sstevel@tonic-gate 			/* Find biggest genid in records for this diskset */
4391*0Sstevel@tonic-gate 			if (dr->dr_genid > max_genid) {
4392*0Sstevel@tonic-gate 				max_genid = dr->dr_genid;
4393*0Sstevel@tonic-gate 			}
4394*0Sstevel@tonic-gate 			dr = dr->dr_next;
4395*0Sstevel@tonic-gate 		}
4396*0Sstevel@tonic-gate 
4397*0Sstevel@tonic-gate 		found_my_nr = 0;
4398*0Sstevel@tonic-gate 		nr = mnsr->sr_nodechain;
4399*0Sstevel@tonic-gate 		/* nr is the list of node recs from nd_nodename node */
4400*0Sstevel@tonic-gate 		while (nr) {
4401*0Sstevel@tonic-gate 			/* Find biggest genid in records for this diskset */
4402*0Sstevel@tonic-gate 			if (nr->nr_genid > max_genid)
4403*0Sstevel@tonic-gate 				max_genid = nr->nr_genid;
4404*0Sstevel@tonic-gate 			nd2 = master_nodelist;
4405*0Sstevel@tonic-gate 			ndtail = NULL;
4406*0Sstevel@tonic-gate 			/* For each node record, is it in master list? */
4407*0Sstevel@tonic-gate 			while (nd2) {
4408*0Sstevel@tonic-gate 				if (nd2->nd_nodeid == nr->nr_nodeid)
4409*0Sstevel@tonic-gate 					break;
4410*0Sstevel@tonic-gate 				if (nd2->nd_next == NULL)
4411*0Sstevel@tonic-gate 					ndtail = nd2;
4412*0Sstevel@tonic-gate 				nd2 = nd2->nd_next;
4413*0Sstevel@tonic-gate 			}
4414*0Sstevel@tonic-gate 			/*
4415*0Sstevel@tonic-gate 			 * Found node record not in master list -- add it
4416*0Sstevel@tonic-gate 			 * to list marking it as DEL since node record
4417*0Sstevel@tonic-gate 			 * should exist on all nodes unless a panic occurred
4418*0Sstevel@tonic-gate 			 * during addition or deletion of host to diskset.
4419*0Sstevel@tonic-gate 			 */
4420*0Sstevel@tonic-gate 			if (nd2 == NULL) {
4421*0Sstevel@tonic-gate 				nd2 = Zalloc(sizeof (*nd2));
4422*0Sstevel@tonic-gate 				(void) strcpy(nd2->nd_nodename,
4423*0Sstevel@tonic-gate 				    nr->nr_nodename);
4424*0Sstevel@tonic-gate 				nd2->nd_flags = nr->nr_flags;
4425*0Sstevel@tonic-gate 				nd2->nd_flags |= MD_MN_NODE_DEL;
4426*0Sstevel@tonic-gate 				nd2->nd_nodeid = nr->nr_nodeid;
4427*0Sstevel@tonic-gate 				nd2->nd_next = NULL;
4428*0Sstevel@tonic-gate 				ndtail->nd_next = nd2;
4429*0Sstevel@tonic-gate 				nd2 = NULL;
4430*0Sstevel@tonic-gate 				nr = nr->nr_next;
4431*0Sstevel@tonic-gate 				continue;
4432*0Sstevel@tonic-gate 			}
4433*0Sstevel@tonic-gate 			/*
4434*0Sstevel@tonic-gate 			 * Is this the node record for the node that
4435*0Sstevel@tonic-gate 			 * we requested the set desc from?
4436*0Sstevel@tonic-gate 			 * If so, check if node has its own node record
4437*0Sstevel@tonic-gate 			 * marked OK. If marked OK, check for the OWN bit.
4438*0Sstevel@tonic-gate 			 */
4439*0Sstevel@tonic-gate 			if (nr->nr_nodeid == nd->nd_nodeid) {
4440*0Sstevel@tonic-gate 				found_my_nr = 1;
4441*0Sstevel@tonic-gate 				if (nr->nr_flags & MD_MN_NODE_OK) {
4442*0Sstevel@tonic-gate 					/*
4443*0Sstevel@tonic-gate 					 * If node record is marked OK
4444*0Sstevel@tonic-gate 					 * on its own node, then mark it OK
4445*0Sstevel@tonic-gate 					 * in the master list.  Node record
4446*0Sstevel@tonic-gate 					 * would have to exist on all nodes
4447*0Sstevel@tonic-gate 					 * in the ADD state before it could
4448*0Sstevel@tonic-gate 					 * be put into the OK state.
4449*0Sstevel@tonic-gate 					 */
4450*0Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_OK;
4451*0Sstevel@tonic-gate 					nd->nd_flags &=
4452*0Sstevel@tonic-gate 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
4453*0Sstevel@tonic-gate 					/*
4454*0Sstevel@tonic-gate 					 * Mark own in master list as marked
4455*0Sstevel@tonic-gate 					 * on own node.
4456*0Sstevel@tonic-gate 					 */
4457*0Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OWN)
4458*0Sstevel@tonic-gate 						nd->nd_flags |= MD_MN_NODE_OWN;
4459*0Sstevel@tonic-gate 					else
4460*0Sstevel@tonic-gate 						nd->nd_flags &= ~MD_MN_NODE_OWN;
4461*0Sstevel@tonic-gate 				} else {
4462*0Sstevel@tonic-gate 					/* Otherwise, mark node as DEL */
4463*0Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_OK;
4464*0Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4465*0Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_DEL;
4466*0Sstevel@tonic-gate 				}
4467*0Sstevel@tonic-gate 			}
4468*0Sstevel@tonic-gate 			/*
4469*0Sstevel@tonic-gate 			 * If node is not ALIVE and marked DEL
4470*0Sstevel@tonic-gate 			 * on any node, make it DEL in master list.
4471*0Sstevel@tonic-gate 			 * If node is not ALIVE and marked ADD
4472*0Sstevel@tonic-gate 			 * on any node, make it ADD in master list
4473*0Sstevel@tonic-gate 			 * unless node record has already been marked DEL.
4474*0Sstevel@tonic-gate 			 */
4475*0Sstevel@tonic-gate 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
4476*0Sstevel@tonic-gate 				if (nr->nr_flags & MD_MN_NODE_ADD) {
4477*0Sstevel@tonic-gate 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
4478*0Sstevel@tonic-gate 						/* If not DEL - mark it ADD */
4479*0Sstevel@tonic-gate 						nd->nd_flags |= MD_MN_NODE_ADD;
4480*0Sstevel@tonic-gate 						nd->nd_flags &= ~MD_MN_NODE_OK;
4481*0Sstevel@tonic-gate 					}
4482*0Sstevel@tonic-gate 				}
4483*0Sstevel@tonic-gate 				if (nr->nr_flags & MD_MN_NODE_DEL) {
4484*0Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_DEL;
4485*0Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_OK;
4486*0Sstevel@tonic-gate 					/* Could already be ADD - make it DEL */
4487*0Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_ADD;
4488*0Sstevel@tonic-gate 				}
4489*0Sstevel@tonic-gate 			}
4490*0Sstevel@tonic-gate 			nr = nr->nr_next;
4491*0Sstevel@tonic-gate 		}
4492*0Sstevel@tonic-gate 		/*
4493*0Sstevel@tonic-gate 		 * If a node record doesn't exist on its own node,
4494*0Sstevel@tonic-gate 		 * then mark node as DEL.
4495*0Sstevel@tonic-gate 		 */
4496*0Sstevel@tonic-gate 		if (found_my_nr == 0) {
4497*0Sstevel@tonic-gate 			nd->nd_flags &= ~MD_MN_NODE_OK;
4498*0Sstevel@tonic-gate 			nd->nd_flags |= MD_MN_NODE_DEL;
4499*0Sstevel@tonic-gate 		}
4500*0Sstevel@tonic-gate 
4501*0Sstevel@tonic-gate 		/*
4502*0Sstevel@tonic-gate 		 * If node is OK - put mnsr onto master_mnsr_node list for
4503*0Sstevel@tonic-gate 		 * later use when syncing up the drive records in the set.
4504*0Sstevel@tonic-gate 		 */
4505*0Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_OK) {
4506*0Sstevel@tonic-gate 			mnsr_node = Zalloc(sizeof (*mnsr_node));
4507*0Sstevel@tonic-gate 			mnsr_node->mmn_mnsr = mnsr;
4508*0Sstevel@tonic-gate 			(void) strncpy(mnsr_node->mmn_nodename,
4509*0Sstevel@tonic-gate 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
4510*0Sstevel@tonic-gate 			mnsr_node->mmn_next = master_mnsr_node;
4511*0Sstevel@tonic-gate 			master_mnsr_node = mnsr_node;
4512*0Sstevel@tonic-gate 		} else {
4513*0Sstevel@tonic-gate 			free_sr((struct md_set_record *)mnsr);
4514*0Sstevel@tonic-gate 		}
4515*0Sstevel@tonic-gate 
4516*0Sstevel@tonic-gate 		nd = nd->nd_next;
4517*0Sstevel@tonic-gate 	}
4518*0Sstevel@tonic-gate 
4519*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4520*0Sstevel@tonic-gate 	    "Master nodelist created for set %s: %s"),
4521*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4522*0Sstevel@tonic-gate 
4523*0Sstevel@tonic-gate 	/*
4524*0Sstevel@tonic-gate 	 * Send master nodelist to the rpc.metad on all nodes (including
4525*0Sstevel@tonic-gate 	 * myself) and each node will update itself.  This will set the
4526*0Sstevel@tonic-gate 	 * ADD and DEL flags on each node as setup in the master nodelist.
4527*0Sstevel@tonic-gate 	 * Don't send nodelist to node where set doesn't exist.
4528*0Sstevel@tonic-gate 	 */
4529*0Sstevel@tonic-gate 	nd = master_nodelist;
4530*0Sstevel@tonic-gate 	while (nd) {
4531*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4532*0Sstevel@tonic-gate 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4533*0Sstevel@tonic-gate 			nd = nd->nd_next;
4534*0Sstevel@tonic-gate 			continue;
4535*0Sstevel@tonic-gate 		}
4536*0Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4537*0Sstevel@tonic-gate 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
4538*0Sstevel@tonic-gate 			/* If RPC failure to another node return 205 */
4539*0Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
4540*0Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
4541*0Sstevel@tonic-gate 			    nd->nd_nodeid)) {
4542*0Sstevel@tonic-gate 				rval = 205;
4543*0Sstevel@tonic-gate 			} else {
4544*0Sstevel@tonic-gate 				/* Any other failure */
4545*0Sstevel@tonic-gate 				rval = -1;
4546*0Sstevel@tonic-gate 			}
4547*0Sstevel@tonic-gate 			goto out;
4548*0Sstevel@tonic-gate 		}
4549*0Sstevel@tonic-gate 		nd = nd->nd_next;
4550*0Sstevel@tonic-gate 	}
4551*0Sstevel@tonic-gate 
4552*0Sstevel@tonic-gate 	/*
4553*0Sstevel@tonic-gate 	 * Now, delete nodes that need to be deleted.
4554*0Sstevel@tonic-gate 	 */
4555*0Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4556*0Sstevel@tonic-gate 	    ep))  == NULL) {
4557*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
4558*0Sstevel@tonic-gate 			rval = -1;
4559*0Sstevel@tonic-gate 			goto out;
4560*0Sstevel@tonic-gate 		}
4561*0Sstevel@tonic-gate 	}
4562*0Sstevel@tonic-gate 
4563*0Sstevel@tonic-gate 	/*
4564*0Sstevel@tonic-gate 	 * May be doing lots of RPC commands to the nodes, so lock the
4565*0Sstevel@tonic-gate 	 * ALIVE members of the set since most of the rpc.metad routines
4566*0Sstevel@tonic-gate 	 * require this for security reasons.
4567*0Sstevel@tonic-gate 	 */
4568*0Sstevel@tonic-gate 	nd = master_nodelist;
4569*0Sstevel@tonic-gate 	while (nd) {
4570*0Sstevel@tonic-gate 		/* Skip non-alive nodes and node without set */
4571*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4572*0Sstevel@tonic-gate 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4573*0Sstevel@tonic-gate 			nd = nd->nd_next;
4574*0Sstevel@tonic-gate 			continue;
4575*0Sstevel@tonic-gate 		}
4576*0Sstevel@tonic-gate 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
4577*0Sstevel@tonic-gate 			/* If RPC failure to another node return 205 */
4578*0Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
4579*0Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
4580*0Sstevel@tonic-gate 			    nd->nd_nodeid)) {
4581*0Sstevel@tonic-gate 				rval = 205;
4582*0Sstevel@tonic-gate 			} else {
4583*0Sstevel@tonic-gate 				/* Any other failure */
4584*0Sstevel@tonic-gate 				rval = -1;
4585*0Sstevel@tonic-gate 			}
4586*0Sstevel@tonic-gate 			goto out;
4587*0Sstevel@tonic-gate 		}
4588*0Sstevel@tonic-gate 		set_locked = 1;
4589*0Sstevel@tonic-gate 		nd = nd->nd_next;
4590*0Sstevel@tonic-gate 	}
4591*0Sstevel@tonic-gate 
4592*0Sstevel@tonic-gate 	nd = master_nodelist;
4593*0Sstevel@tonic-gate 	while (nd) {
4594*0Sstevel@tonic-gate 		/* Skip non-alive nodes */
4595*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4596*0Sstevel@tonic-gate 			nd = nd->nd_next;
4597*0Sstevel@tonic-gate 			continue;
4598*0Sstevel@tonic-gate 		}
4599*0Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_DEL) {
4600*0Sstevel@tonic-gate 			num_alive_nodes_del++;
4601*0Sstevel@tonic-gate 			/*
4602*0Sstevel@tonic-gate 			 * Delete this node rec from all ALIVE nodes in diskset.
4603*0Sstevel@tonic-gate 			 */
4604*0Sstevel@tonic-gate 			nd2 = master_nodelist;
4605*0Sstevel@tonic-gate 			while (nd2) {
4606*0Sstevel@tonic-gate 				/* Skip non-alive nodes and node without set */
4607*0Sstevel@tonic-gate 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
4608*0Sstevel@tonic-gate 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
4609*0Sstevel@tonic-gate 					nd2 = nd2->nd_next;
4610*0Sstevel@tonic-gate 					continue;
4611*0Sstevel@tonic-gate 				}
4612*0Sstevel@tonic-gate 
4613*0Sstevel@tonic-gate 				/* This is a node being deleted from set */
4614*0Sstevel@tonic-gate 				if (nd2->nd_nodeid == nd->nd_nodeid) {
4615*0Sstevel@tonic-gate 					/* Mark set record as DEL */
4616*0Sstevel@tonic-gate 					if (clnt_upd_sr_flags(nd->nd_nodename,
4617*0Sstevel@tonic-gate 					    sp, MD_SR_DEL, ep)) {
4618*0Sstevel@tonic-gate 						/* RPC failure to !my node */
4619*0Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
4620*0Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
4621*0Sstevel@tonic-gate 						    nd_nodeid
4622*0Sstevel@tonic-gate 						    != nd->nd_nodeid)) {
4623*0Sstevel@tonic-gate 							rval = 205;
4624*0Sstevel@tonic-gate 						} else {
4625*0Sstevel@tonic-gate 							/* Any other failure */
4626*0Sstevel@tonic-gate 							rval = -1;
4627*0Sstevel@tonic-gate 						}
4628*0Sstevel@tonic-gate 						goto out;
4629*0Sstevel@tonic-gate 					}
4630*0Sstevel@tonic-gate 					if (clnt_deldrvs(nd->nd_nodename, sp,
4631*0Sstevel@tonic-gate 					    dd, ep)) {
4632*0Sstevel@tonic-gate 						/* RPC failure to !my node */
4633*0Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
4634*0Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
4635*0Sstevel@tonic-gate 						    nd_nodeid
4636*0Sstevel@tonic-gate 						    != nd->nd_nodeid)) {
4637*0Sstevel@tonic-gate 							rval = 205;
4638*0Sstevel@tonic-gate 						} else {
4639*0Sstevel@tonic-gate 							/* Any other failure */
4640*0Sstevel@tonic-gate 							rval = -1;
4641*0Sstevel@tonic-gate 						}
4642*0Sstevel@tonic-gate 						goto out;
4643*0Sstevel@tonic-gate 					}
4644*0Sstevel@tonic-gate 					if (clnt_delset(nd->nd_nodename, sp,
4645*0Sstevel@tonic-gate 					    ep) == -1) {
4646*0Sstevel@tonic-gate 						/* RPC failure to !my node */
4647*0Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
4648*0Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
4649*0Sstevel@tonic-gate 						    nd_nodeid
4650*0Sstevel@tonic-gate 						    != nd->nd_nodeid)) {
4651*0Sstevel@tonic-gate 							rval = 205;
4652*0Sstevel@tonic-gate 						} else {
4653*0Sstevel@tonic-gate 							/* Any other failure */
4654*0Sstevel@tonic-gate 							rval = -1;
4655*0Sstevel@tonic-gate 						}
4656*0Sstevel@tonic-gate 						goto out;
4657*0Sstevel@tonic-gate 					}
4658*0Sstevel@tonic-gate 				} else {
4659*0Sstevel@tonic-gate 					/*
4660*0Sstevel@tonic-gate 					 * Delete host from sets on hosts
4661*0Sstevel@tonic-gate 					 * not being deleted.
4662*0Sstevel@tonic-gate 					 */
4663*0Sstevel@tonic-gate 					anode[0] = Strdup(nd->nd_nodename);
4664*0Sstevel@tonic-gate 					if (clnt_delhosts(nd2->nd_nodename, sp,
4665*0Sstevel@tonic-gate 					    1, anode, ep) == -1) {
4666*0Sstevel@tonic-gate 						Free(anode[0]);
4667*0Sstevel@tonic-gate 						/* RPC failure to !my node */
4668*0Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
4669*0Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
4670*0Sstevel@tonic-gate 						    nd_nodeid
4671*0Sstevel@tonic-gate 						    != nd2->nd_nodeid)) {
4672*0Sstevel@tonic-gate 							rval = 205;
4673*0Sstevel@tonic-gate 						} else {
4674*0Sstevel@tonic-gate 							/* Any other failure */
4675*0Sstevel@tonic-gate 							rval = -1;
4676*0Sstevel@tonic-gate 						}
4677*0Sstevel@tonic-gate 						goto out;
4678*0Sstevel@tonic-gate 					}
4679*0Sstevel@tonic-gate 
4680*0Sstevel@tonic-gate 					meta_mc_log(MC_LOG5,
4681*0Sstevel@tonic-gate 					    dgettext(TEXT_DOMAIN,
4682*0Sstevel@tonic-gate 					    "Deleted node %s (%d) on node %s "
4683*0Sstevel@tonic-gate 					    "from set %s: %s"),
4684*0Sstevel@tonic-gate 					    nd->nd_nodename, nd->nd_nodeid,
4685*0Sstevel@tonic-gate 					    nd2->nd_nodename,
4686*0Sstevel@tonic-gate 					    sp->setname,
4687*0Sstevel@tonic-gate 					    meta_print_hrtime(
4688*0Sstevel@tonic-gate 					    gethrtime() - start_time));
4689*0Sstevel@tonic-gate 
4690*0Sstevel@tonic-gate 					Free(anode[0]);
4691*0Sstevel@tonic-gate 				}
4692*0Sstevel@tonic-gate 				nd2 = nd2->nd_next;
4693*0Sstevel@tonic-gate 			}
4694*0Sstevel@tonic-gate 		}
4695*0Sstevel@tonic-gate 		nd = nd->nd_next;
4696*0Sstevel@tonic-gate 	}
4697*0Sstevel@tonic-gate 
4698*0Sstevel@tonic-gate 	nd = master_nodelist;
4699*0Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
4700*0Sstevel@tonic-gate 	while (nd) {
4701*0Sstevel@tonic-gate 		/* Skip non-alive nodes and node without set */
4702*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
4703*0Sstevel@tonic-gate 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
4704*0Sstevel@tonic-gate 			nd = nd->nd_next;
4705*0Sstevel@tonic-gate 			continue;
4706*0Sstevel@tonic-gate 		}
4707*0Sstevel@tonic-gate 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
4708*0Sstevel@tonic-gate 			/* If RPC failure to another node return 205 */
4709*0Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
4710*0Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
4711*0Sstevel@tonic-gate 			    nd->nd_nodeid)) {
4712*0Sstevel@tonic-gate 				rval = 205;
4713*0Sstevel@tonic-gate 			} else {
4714*0Sstevel@tonic-gate 				/* Any other failure */
4715*0Sstevel@tonic-gate 				rval = -1;
4716*0Sstevel@tonic-gate 			}
4717*0Sstevel@tonic-gate 			goto out;
4718*0Sstevel@tonic-gate 		}
4719*0Sstevel@tonic-gate 		nd = nd->nd_next;
4720*0Sstevel@tonic-gate 	}
4721*0Sstevel@tonic-gate 	cl_set_setkey(NULL);
4722*0Sstevel@tonic-gate 	set_locked = 0;
4723*0Sstevel@tonic-gate 
4724*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4725*0Sstevel@tonic-gate 	    "Nodelist syncronization complete for set %s: %s"),
4726*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4727*0Sstevel@tonic-gate 
4728*0Sstevel@tonic-gate 	metaflushsetname(sp);
4729*0Sstevel@tonic-gate 
4730*0Sstevel@tonic-gate 	/*
4731*0Sstevel@tonic-gate 	 * If all alive nodes have been deleted from set, just
4732*0Sstevel@tonic-gate 	 * return since nothing else can be done until non-alive
4733*0Sstevel@tonic-gate 	 * nodes (if there are any) rejoin the cluster.
4734*0Sstevel@tonic-gate 	 */
4735*0Sstevel@tonic-gate 	if (num_alive_nodes == num_alive_nodes_del) {
4736*0Sstevel@tonic-gate 		rval = 0;
4737*0Sstevel@tonic-gate 		goto out;
4738*0Sstevel@tonic-gate 	}
4739*0Sstevel@tonic-gate 
4740*0Sstevel@tonic-gate 	/*
4741*0Sstevel@tonic-gate 	 * Sync up drive records.
4742*0Sstevel@tonic-gate 	 *
4743*0Sstevel@tonic-gate 	 * If a node panic'd (or metaset command was killed) during the
4744*0Sstevel@tonic-gate 	 * addition or deletion of a drive to the diskset, the nodes
4745*0Sstevel@tonic-gate 	 * may have a different view of the drive list.  During cleanup
4746*0Sstevel@tonic-gate 	 * of the drive list during reconfig, a drive will be deleted
4747*0Sstevel@tonic-gate 	 * from the list if the master node sees that the drive has been
4748*0Sstevel@tonic-gate 	 * marked in the ADD state on any node or is marked in the DEL state
4749*0Sstevel@tonic-gate 	 * on all nodes.
4750*0Sstevel@tonic-gate 	 * This cleanup must occur even if not all nodes in the diskset are
4751*0Sstevel@tonic-gate 	 * part of the cluster so that all nodes have the same view
4752*0Sstevel@tonic-gate 	 * of the drivelist.
4753*0Sstevel@tonic-gate 	 * Then if the entire cluster goes down and comes back up, the
4754*0Sstevel@tonic-gate 	 * new master node could be a node that wasn't in the cluster when
4755*0Sstevel@tonic-gate 	 * the node was deleted.  This could lead to a situation where the
4756*0Sstevel@tonic-gate 	 * master node thinks that a drive is OK, but this drive isn't
4757*0Sstevel@tonic-gate 	 * known to the other nodes.
4758*0Sstevel@tonic-gate 	 * This situation can also occur during the addition of a drive
4759*0Sstevel@tonic-gate 	 * where a node has the drive marked OK, but the node executing the
4760*0Sstevel@tonic-gate 	 * metaset command encountered a failure before marking that drive OK
4761*0Sstevel@tonic-gate 	 * on the rest of the nodes.  If the node with the OK drive then
4762*0Sstevel@tonic-gate 	 * panics, then the rest of the nodes will remove that drive marked ADD
4763*0Sstevel@tonic-gate 	 * and when the node with the OK drive rejoins the cluster, it will
4764*0Sstevel@tonic-gate 	 * have a drive marked OK that is unknown by the other nodes.
4765*0Sstevel@tonic-gate 	 *
4766*0Sstevel@tonic-gate 	 * There are 2 situations to consider:
4767*0Sstevel@tonic-gate 	 * A) Master knows about a drive that other nodes don't know about.
4768*0Sstevel@tonic-gate 	 * B) At least one slave node knows about a drive that the master
4769*0Sstevel@tonic-gate 	 *    node doesn't know about.
4770*0Sstevel@tonic-gate 	 *
4771*0Sstevel@tonic-gate 	 * To handle these situations the following steps are followed:
4772*0Sstevel@tonic-gate 	 * 1) Count number of drives known by this master node and the
4773*0Sstevel@tonic-gate 	 *    other slave nodes.
4774*0Sstevel@tonic-gate 	 *    If all nodes have the same number of drives and the master has
4775*0Sstevel@tonic-gate 	 *    all drives marked OK, then skip to step4.
4776*0Sstevel@tonic-gate 	 *
4777*0Sstevel@tonic-gate 	 * 2) If a node has fewer drives listed than the master, the master
4778*0Sstevel@tonic-gate 	 *    must get the drive descriptor list from that node so that
4779*0Sstevel@tonic-gate 	 *    master can determine which drive it needs to delete from that
4780*0Sstevel@tonic-gate 	 *    node.  Master must get the drive descriptor list since the
4781*0Sstevel@tonic-gate 	 *    drive record list does not contain the name of the drive, but
4782*0Sstevel@tonic-gate 	 *    only a key and the key can only be interpreted on that other
4783*0Sstevel@tonic-gate 	 *    node.
4784*0Sstevel@tonic-gate 	 *
4785*0Sstevel@tonic-gate 	 * 3) The master will then create the master drive list by doing:
4786*0Sstevel@tonic-gate 	 *	- Master starts with drive list known by master.
4787*0Sstevel@tonic-gate 	 *	- Any drive marked ADD will be removed from the list.
4788*0Sstevel@tonic-gate 	 *	- Any drive not known by another node (from step2) will be
4789*0Sstevel@tonic-gate 	 *	removed from the drive list.
4790*0Sstevel@tonic-gate 	 *	- If a drive is marked DEL on the master, the master must
4791*0Sstevel@tonic-gate 	 *	verify that the drive record is marked DEL on all nodes.
4792*0Sstevel@tonic-gate 	 *	If any node has the drive record marked OK, mark it OK
4793*0Sstevel@tonic-gate 	 *	on the master.  (The reason why is described below).
4794*0Sstevel@tonic-gate 	 *
4795*0Sstevel@tonic-gate 	 * 4) The master sends out the master drive list and the slave
4796*0Sstevel@tonic-gate 	 *    nodes will force their drive lists to match the master
4797*0Sstevel@tonic-gate 	 *    drive list by deleting drives, if necessary and by changing
4798*0Sstevel@tonic-gate 	 *    the drive record states from ADD->OK if master has drive
4799*0Sstevel@tonic-gate 	 *    marked OK and slave has drive marked ADD.
4800*0Sstevel@tonic-gate 	 *
4801*0Sstevel@tonic-gate 	 * Interesting scenarios:
4802*0Sstevel@tonic-gate 	 *
4803*0Sstevel@tonic-gate 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
4804*0Sstevel@tonic-gate 	 *    to delete a drive record (drive record on node 1 is marked DEL),
4805*0Sstevel@tonic-gate 	 *    but is stopped when node 3 panics.  Node 1 also panics.
4806*0Sstevel@tonic-gate 	 *    During reconfig cycle, node 2 is picked as master and the drive
4807*0Sstevel@tonic-gate 	 *    record is left alone since all nodes in the cluster have it
4808*0Sstevel@tonic-gate 	 *    marked OK.  User now sees drive as part of diskset.
4809*0Sstevel@tonic-gate 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
4810*0Sstevel@tonic-gate 	 *    Node 1 is picked as the master and node 1 has drive record
4811*0Sstevel@tonic-gate 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
4812*0Sstevel@tonic-gate 	 *    and since at least one node has the drive record marked OK,
4813*0Sstevel@tonic-gate 	 *    the master marks the drive record OK.
4814*0Sstevel@tonic-gate 	 *    User continues to see the drive as part of the diskset.
4815*0Sstevel@tonic-gate 	 */
4816*0Sstevel@tonic-gate 
4817*0Sstevel@tonic-gate 	/* Reget set descriptor since flushed above */
4818*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
4819*0Sstevel@tonic-gate 		rval = -1;
4820*0Sstevel@tonic-gate 		goto out;
4821*0Sstevel@tonic-gate 	}
4822*0Sstevel@tonic-gate 
4823*0Sstevel@tonic-gate 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
4824*0Sstevel@tonic-gate 	if ((master_dd = metaget_drivedesc_sideno(sp,
4825*0Sstevel@tonic-gate 	    sd->sd_mn_mynode->nd_nodeid,
4826*0Sstevel@tonic-gate 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
4827*0Sstevel@tonic-gate 		/* No drives in list */
4828*0Sstevel@tonic-gate 		if (!mdisok(ep)) {
4829*0Sstevel@tonic-gate 			/*
4830*0Sstevel@tonic-gate 			 * Can't get drive list for this node, so
4831*0Sstevel@tonic-gate 			 * return -1 causing this node to be removed
4832*0Sstevel@tonic-gate 			 * from cluster config and fixed.
4833*0Sstevel@tonic-gate 			 */
4834*0Sstevel@tonic-gate 			rval = -1;
4835*0Sstevel@tonic-gate 			goto out;
4836*0Sstevel@tonic-gate 		}
4837*0Sstevel@tonic-gate 	}
4838*0Sstevel@tonic-gate 
4839*0Sstevel@tonic-gate 	/* Count the number of drives for all nodes */
4840*0Sstevel@tonic-gate 	mnsr_node = master_mnsr_node;
4841*0Sstevel@tonic-gate 	while (mnsr_node) {
4842*0Sstevel@tonic-gate 		dr_cnt = 0;
4843*0Sstevel@tonic-gate 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
4844*0Sstevel@tonic-gate 		while (dr) {
4845*0Sstevel@tonic-gate 			dr_cnt++;
4846*0Sstevel@tonic-gate 			dr = dr->dr_next;
4847*0Sstevel@tonic-gate 		}
4848*0Sstevel@tonic-gate 		mnsr_node->mmn_numdrives = dr_cnt;
4849*0Sstevel@tonic-gate 		mnsr_node = mnsr_node->mmn_next;
4850*0Sstevel@tonic-gate 	}
4851*0Sstevel@tonic-gate 
4852*0Sstevel@tonic-gate 	/* Count the number of drives for the master; also check flags */
4853*0Sstevel@tonic-gate 	all_drives_ok = 1;
4854*0Sstevel@tonic-gate 	dd_cnt = 0;
4855*0Sstevel@tonic-gate 	dd = master_dd;
4856*0Sstevel@tonic-gate 	while (dd) {
4857*0Sstevel@tonic-gate 		dd_cnt++;
4858*0Sstevel@tonic-gate 		if (!(dd->dd_flags & MD_DR_OK))
4859*0Sstevel@tonic-gate 			all_drives_ok = 0;
4860*0Sstevel@tonic-gate 		dd = dd->dd_next;
4861*0Sstevel@tonic-gate 	}
4862*0Sstevel@tonic-gate 
4863*0Sstevel@tonic-gate 	/* If all drives are ok, do quick check against number of drives */
4864*0Sstevel@tonic-gate 	if (all_drives_ok) {
4865*0Sstevel@tonic-gate 		/* If all nodes have same number of drives, almost done */
4866*0Sstevel@tonic-gate 		mnsr_node = master_mnsr_node;
4867*0Sstevel@tonic-gate 		while (mnsr_node) {
4868*0Sstevel@tonic-gate 			if (mnsr_node->mmn_numdrives != dd_cnt)
4869*0Sstevel@tonic-gate 				break;
4870*0Sstevel@tonic-gate 			mnsr_node = mnsr_node->mmn_next;
4871*0Sstevel@tonic-gate 		}
4872*0Sstevel@tonic-gate 		/* All nodes have same number of drives, just send flags */
4873*0Sstevel@tonic-gate 		if (mnsr_node == NULL) {
4874*0Sstevel@tonic-gate 			goto send_drive_list;
4875*0Sstevel@tonic-gate 		}
4876*0Sstevel@tonic-gate 	}
4877*0Sstevel@tonic-gate 
4878*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4879*0Sstevel@tonic-gate 	    "Begin detailed drive synchronization for set %s: %s"),
4880*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4881*0Sstevel@tonic-gate 
4882*0Sstevel@tonic-gate 	/* Detailed check required  */
4883*0Sstevel@tonic-gate 	mnsr_node = master_mnsr_node;
4884*0Sstevel@tonic-gate 	while (mnsr_node) {
4885*0Sstevel@tonic-gate 		/* Does slave node have less drives than master? */
4886*0Sstevel@tonic-gate 		if (mnsr_node->mmn_numdrives < dd_cnt) {
4887*0Sstevel@tonic-gate 			/* Yes - must determine which drive is missing */
4888*0Sstevel@tonic-gate 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
4889*0Sstevel@tonic-gate 			    &other_dd, ep)) {
4890*0Sstevel@tonic-gate 				/* RPC failure to !my node */
4891*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
4892*0Sstevel@tonic-gate 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
4893*0Sstevel@tonic-gate 				    != 0)) {
4894*0Sstevel@tonic-gate 					rval = 205;
4895*0Sstevel@tonic-gate 				} else {
4896*0Sstevel@tonic-gate 					/* Any other failure */
4897*0Sstevel@tonic-gate 					rval = -1;
4898*0Sstevel@tonic-gate 				}
4899*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
4900*0Sstevel@tonic-gate 				    "Master node %s unable to "
4901*0Sstevel@tonic-gate 				    "retrieve drive list from node %s"),
4902*0Sstevel@tonic-gate 				    mynode(), mnsr_node->mmn_nodename);
4903*0Sstevel@tonic-gate 				goto out;
4904*0Sstevel@tonic-gate 			}
4905*0Sstevel@tonic-gate 			mnsr_node->mmn_dd = other_dd;
4906*0Sstevel@tonic-gate 			dd = master_dd;
4907*0Sstevel@tonic-gate 			while (dd) {
4908*0Sstevel@tonic-gate 				if (!(dd->dd_flags & MD_DR_OK)) {
4909*0Sstevel@tonic-gate 					dd = dd->dd_next;
4910*0Sstevel@tonic-gate 					continue;
4911*0Sstevel@tonic-gate 				}
4912*0Sstevel@tonic-gate 				other_dd = mnsr_node->mmn_dd;
4913*0Sstevel@tonic-gate 				while (other_dd) {
4914*0Sstevel@tonic-gate 					/* Convert to devids, when available */
4915*0Sstevel@tonic-gate 					if (strcmp(other_dd->dd_dnp->cname,
4916*0Sstevel@tonic-gate 					    dd->dd_dnp->cname) == 0) {
4917*0Sstevel@tonic-gate 						break;
4918*0Sstevel@tonic-gate 					}
4919*0Sstevel@tonic-gate 					other_dd = other_dd->dd_next;
4920*0Sstevel@tonic-gate 				}
4921*0Sstevel@tonic-gate 				/*
4922*0Sstevel@tonic-gate 				 * dd not found on slave so mark it
4923*0Sstevel@tonic-gate 				 * ADD for later deletion (drives in ADD
4924*0Sstevel@tonic-gate 				 * state are deleted later in this routine).
4925*0Sstevel@tonic-gate 				 */
4926*0Sstevel@tonic-gate 				if (other_dd == NULL) {
4927*0Sstevel@tonic-gate 					dd->dd_flags = MD_DR_ADD;
4928*0Sstevel@tonic-gate 				}
4929*0Sstevel@tonic-gate 				dd = dd->dd_next;
4930*0Sstevel@tonic-gate 			}
4931*0Sstevel@tonic-gate 
4932*0Sstevel@tonic-gate 		}
4933*0Sstevel@tonic-gate 		mnsr_node = mnsr_node->mmn_next;
4934*0Sstevel@tonic-gate 	}
4935*0Sstevel@tonic-gate 
4936*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
4937*0Sstevel@tonic-gate 	    "Drive check completed for set %s: %s"),
4938*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
4939*0Sstevel@tonic-gate 
4940*0Sstevel@tonic-gate 	dd = master_dd;
4941*0Sstevel@tonic-gate 	dd_prev = 0;
4942*0Sstevel@tonic-gate 	while (dd) {
4943*0Sstevel@tonic-gate 		/* Remove any ADD drives from list */
4944*0Sstevel@tonic-gate 		if (dd->dd_flags & MD_DR_ADD) {
4945*0Sstevel@tonic-gate 			if (dd_prev) {
4946*0Sstevel@tonic-gate 				dd_prev->dd_next = dd->dd_next;
4947*0Sstevel@tonic-gate 				dd->dd_next = NULL;
4948*0Sstevel@tonic-gate 				metafreedrivedesc(&dd);
4949*0Sstevel@tonic-gate 				dd = dd_prev->dd_next;
4950*0Sstevel@tonic-gate 			} else {
4951*0Sstevel@tonic-gate 				/*
4952*0Sstevel@tonic-gate 				 * If removing drive descriptor from head
4953*0Sstevel@tonic-gate 				 * of linked list, also change sd->sd_drvs.
4954*0Sstevel@tonic-gate 				 */
4955*0Sstevel@tonic-gate 				master_dd = sd->sd_drvs = dd->dd_next;
4956*0Sstevel@tonic-gate 				dd->dd_next = NULL;
4957*0Sstevel@tonic-gate 				metafreedrivedesc(&dd);
4958*0Sstevel@tonic-gate 				dd = master_dd;
4959*0Sstevel@tonic-gate 			}
4960*0Sstevel@tonic-gate 			/* dd setup in if/else above */
4961*0Sstevel@tonic-gate 			continue;
4962*0Sstevel@tonic-gate 		}
4963*0Sstevel@tonic-gate 		/*
4964*0Sstevel@tonic-gate 		 * If drive is marked DEL, check all other nodes.
4965*0Sstevel@tonic-gate 		 * If drive on another node is marked OK, mark drive OK
4966*0Sstevel@tonic-gate 		 * in master list.  If drive is marked DEL or doesn't exist
4967*0Sstevel@tonic-gate 		 * on all nodes, remove drive from list.
4968*0Sstevel@tonic-gate 		 */
4969*0Sstevel@tonic-gate 		if (dd->dd_flags & MD_DR_DEL) {
4970*0Sstevel@tonic-gate 			mnsr_node = master_mnsr_node;
4971*0Sstevel@tonic-gate 			while (mnsr_node) {
4972*0Sstevel@tonic-gate 				if (mnsr_node->mmn_dd == NULL) {
4973*0Sstevel@tonic-gate 				    if (clnt_getdrivedesc(
4974*0Sstevel@tonic-gate 					mnsr_node->mmn_nodename, sp,
4975*0Sstevel@tonic-gate 					&other_dd, ep)) {
4976*0Sstevel@tonic-gate 					    /* RPC failure to !my node */
4977*0Sstevel@tonic-gate 					    if ((mdanyrpcerror(ep)) &&
4978*0Sstevel@tonic-gate 						(strcmp(mynode(),
4979*0Sstevel@tonic-gate 						mnsr_node->mmn_nodename)
4980*0Sstevel@tonic-gate 						!= 0)) {
4981*0Sstevel@tonic-gate 						    rval = 205;
4982*0Sstevel@tonic-gate 					    } else {
4983*0Sstevel@tonic-gate 						    /* Any other failure */
4984*0Sstevel@tonic-gate 						    rval = -1;
4985*0Sstevel@tonic-gate 					    }
4986*0Sstevel@tonic-gate 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
4987*0Sstevel@tonic-gate 						"Master node %s unable "
4988*0Sstevel@tonic-gate 						"to retrieve drive list from "
4989*0Sstevel@tonic-gate 						"node %s"), mynode(),
4990*0Sstevel@tonic-gate 						mnsr_node->mmn_nodename);
4991*0Sstevel@tonic-gate 					    goto out;
4992*0Sstevel@tonic-gate 				    }
4993*0Sstevel@tonic-gate 				    mnsr_node->mmn_dd = other_dd;
4994*0Sstevel@tonic-gate 				}
4995*0Sstevel@tonic-gate 				other_dd = mnsr_node->mmn_dd;
4996*0Sstevel@tonic-gate 				while (other_dd) {
4997*0Sstevel@tonic-gate 					/* Found drive (OK) from other node */
4998*0Sstevel@tonic-gate 					if (strcmp(dd->dd_dnp->cname,
4999*0Sstevel@tonic-gate 					    other_dd->dd_dnp->cname)
5000*0Sstevel@tonic-gate 					    == 0) {
5001*0Sstevel@tonic-gate 						/* Drive marked OK */
5002*0Sstevel@tonic-gate 						if (other_dd->dd_flags &
5003*0Sstevel@tonic-gate 						    MD_DR_OK) {
5004*0Sstevel@tonic-gate 						    dd->dd_flags = MD_DR_OK;
5005*0Sstevel@tonic-gate 						}
5006*0Sstevel@tonic-gate 						break;
5007*0Sstevel@tonic-gate 					}
5008*0Sstevel@tonic-gate 					other_dd = other_dd->dd_next;
5009*0Sstevel@tonic-gate 				}
5010*0Sstevel@tonic-gate 				if (dd->dd_flags == MD_DR_OK)
5011*0Sstevel@tonic-gate 					break;
5012*0Sstevel@tonic-gate 
5013*0Sstevel@tonic-gate 				mnsr_node = mnsr_node->mmn_next;
5014*0Sstevel@tonic-gate 			}
5015*0Sstevel@tonic-gate 			/*
5016*0Sstevel@tonic-gate 			 * If no node had this drive marked OK, delete it.
5017*0Sstevel@tonic-gate 			 */
5018*0Sstevel@tonic-gate 			if (dd->dd_flags & MD_DR_DEL) {
5019*0Sstevel@tonic-gate 				if (dd_prev) {
5020*0Sstevel@tonic-gate 					dd_prev->dd_next = dd->dd_next;
5021*0Sstevel@tonic-gate 					dd->dd_next = NULL;
5022*0Sstevel@tonic-gate 					metafreedrivedesc(&dd);
5023*0Sstevel@tonic-gate 					dd = dd_prev->dd_next;
5024*0Sstevel@tonic-gate 				} else {
5025*0Sstevel@tonic-gate 					/*
5026*0Sstevel@tonic-gate 					 * If removing drive descriptor from
5027*0Sstevel@tonic-gate 					 * head of linked list, also change
5028*0Sstevel@tonic-gate 					 * sd->sd_drvs.
5029*0Sstevel@tonic-gate 					 */
5030*0Sstevel@tonic-gate 					master_dd = sd->sd_drvs = dd->dd_next;
5031*0Sstevel@tonic-gate 					dd->dd_next = NULL;
5032*0Sstevel@tonic-gate 					metafreedrivedesc(&dd);
5033*0Sstevel@tonic-gate 					dd = master_dd;
5034*0Sstevel@tonic-gate 				}
5035*0Sstevel@tonic-gate 				/* dd setup in if/else above */
5036*0Sstevel@tonic-gate 				continue;
5037*0Sstevel@tonic-gate 			}
5038*0Sstevel@tonic-gate 		}
5039*0Sstevel@tonic-gate 		dd_prev = dd;
5040*0Sstevel@tonic-gate 		dd = dd->dd_next;
5041*0Sstevel@tonic-gate 	}
5042*0Sstevel@tonic-gate 
5043*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5044*0Sstevel@tonic-gate 	    "Setting drive states completed for set %s: %s"),
5045*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5046*0Sstevel@tonic-gate 
5047*0Sstevel@tonic-gate send_drive_list:
5048*0Sstevel@tonic-gate 	/*
5049*0Sstevel@tonic-gate 	 * Set genid on all drives to be the highest value seen.
5050*0Sstevel@tonic-gate 	 */
5051*0Sstevel@tonic-gate 	dd = master_dd;
5052*0Sstevel@tonic-gate 	while (dd) {
5053*0Sstevel@tonic-gate 		dd->dd_genid = max_genid;
5054*0Sstevel@tonic-gate 		dd = dd->dd_next;
5055*0Sstevel@tonic-gate 	}
5056*0Sstevel@tonic-gate 	/*
5057*0Sstevel@tonic-gate 	 * Send updated drive list to all alive nodes.
5058*0Sstevel@tonic-gate 	 * Will also set genid on set and node records to have same
5059*0Sstevel@tonic-gate 	 * as the drive records.
5060*0Sstevel@tonic-gate 	 */
5061*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
5062*0Sstevel@tonic-gate 	while (nd) {
5063*0Sstevel@tonic-gate 		/* Skip non-alive nodes */
5064*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5065*0Sstevel@tonic-gate 			nd = nd->nd_next;
5066*0Sstevel@tonic-gate 			continue;
5067*0Sstevel@tonic-gate 		}
5068*0Sstevel@tonic-gate 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
5069*0Sstevel@tonic-gate 			/* RPC failure to another node */
5070*0Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
5071*0Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
5072*0Sstevel@tonic-gate 				rval = 205;
5073*0Sstevel@tonic-gate 			} else {
5074*0Sstevel@tonic-gate 				/* Any other failure */
5075*0Sstevel@tonic-gate 				rval = -1;
5076*0Sstevel@tonic-gate 			}
5077*0Sstevel@tonic-gate 			goto out;
5078*0Sstevel@tonic-gate 		}
5079*0Sstevel@tonic-gate 		nd = nd->nd_next;
5080*0Sstevel@tonic-gate 	}
5081*0Sstevel@tonic-gate 
5082*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5083*0Sstevel@tonic-gate 	    "Sent drive list to all nodes for set %s: %s"),
5084*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5085*0Sstevel@tonic-gate 
5086*0Sstevel@tonic-gate 	/*
5087*0Sstevel@tonic-gate 	 * If no drive records left in set and nodes had been joined,
5088*0Sstevel@tonic-gate 	 * withdraw the nodes.  Always reset the master and mark
5089*0Sstevel@tonic-gate 	 * all nodes as withdrawn on all nodes.
5090*0Sstevel@tonic-gate 	 */
5091*0Sstevel@tonic-gate 	if (master_dd == NULL) {
5092*0Sstevel@tonic-gate 		/* Reset new master flag since no longer master */
5093*0Sstevel@tonic-gate 		(void) memset(&sf, 0, sizeof (sf));
5094*0Sstevel@tonic-gate 		sf.sf_setno = sp->setno;
5095*0Sstevel@tonic-gate 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5096*0Sstevel@tonic-gate 		sf.sf_flags = MDDB_NM_RESET;
5097*0Sstevel@tonic-gate 		/* Use magic to help protect ioctl against attack. */
5098*0Sstevel@tonic-gate 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5099*0Sstevel@tonic-gate 		/* Ignore failure, failure to reset flag isn't catastrophic */
5100*0Sstevel@tonic-gate 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5101*0Sstevel@tonic-gate 		    &sf.sf_mde, NULL);
5102*0Sstevel@tonic-gate 
5103*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5104*0Sstevel@tonic-gate 		    "Reset new master flag for " "set %s: %s"),
5105*0Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5106*0Sstevel@tonic-gate 
5107*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
5108*0Sstevel@tonic-gate 		while (nd) {
5109*0Sstevel@tonic-gate 			/* Skip non-alive nodes  */
5110*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5111*0Sstevel@tonic-gate 				nd = nd->nd_next;
5112*0Sstevel@tonic-gate 				continue;
5113*0Sstevel@tonic-gate 			}
5114*0Sstevel@tonic-gate 
5115*0Sstevel@tonic-gate 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
5116*0Sstevel@tonic-gate 				/* RPC failure to another node */
5117*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
5118*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
5119*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
5120*0Sstevel@tonic-gate 					rval = 205;
5121*0Sstevel@tonic-gate 				} else {
5122*0Sstevel@tonic-gate 					/* Any other failure */
5123*0Sstevel@tonic-gate 					rval = -1;
5124*0Sstevel@tonic-gate 				}
5125*0Sstevel@tonic-gate 				goto out;
5126*0Sstevel@tonic-gate 			}
5127*0Sstevel@tonic-gate 			set_locked = 1;
5128*0Sstevel@tonic-gate 
5129*0Sstevel@tonic-gate 			/* Withdraw node from set if owner */
5130*0Sstevel@tonic-gate 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
5131*0Sstevel@tonic-gate 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
5132*0Sstevel@tonic-gate 				/* RPC failure to another node */
5133*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
5134*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
5135*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
5136*0Sstevel@tonic-gate 					rval = 205;
5137*0Sstevel@tonic-gate 				} else {
5138*0Sstevel@tonic-gate 					/* Any other failure */
5139*0Sstevel@tonic-gate 					rval = -1;
5140*0Sstevel@tonic-gate 				}
5141*0Sstevel@tonic-gate 				goto out;
5142*0Sstevel@tonic-gate 			}
5143*0Sstevel@tonic-gate 
5144*0Sstevel@tonic-gate 			/* Mark all nodes as withdrawn on this node */
5145*0Sstevel@tonic-gate 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5146*0Sstevel@tonic-gate 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
5147*0Sstevel@tonic-gate 				/* RPC failure to another node */
5148*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
5149*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
5150*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
5151*0Sstevel@tonic-gate 					rval = 205;
5152*0Sstevel@tonic-gate 				} else {
5153*0Sstevel@tonic-gate 					/* Any other failure */
5154*0Sstevel@tonic-gate 					rval = -1;
5155*0Sstevel@tonic-gate 				}
5156*0Sstevel@tonic-gate 				goto out;
5157*0Sstevel@tonic-gate 			}
5158*0Sstevel@tonic-gate 
5159*0Sstevel@tonic-gate 			/* Resets master to no-master on this node */
5160*0Sstevel@tonic-gate 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
5161*0Sstevel@tonic-gate 			    "", MD_MN_INVALID_NID, ep)) {
5162*0Sstevel@tonic-gate 				/* RPC failure to another node */
5163*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
5164*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
5165*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
5166*0Sstevel@tonic-gate 					rval = 205;
5167*0Sstevel@tonic-gate 				} else {
5168*0Sstevel@tonic-gate 					/* Any other failure */
5169*0Sstevel@tonic-gate 					rval = -1;
5170*0Sstevel@tonic-gate 				}
5171*0Sstevel@tonic-gate 				goto out;
5172*0Sstevel@tonic-gate 			}
5173*0Sstevel@tonic-gate 
5174*0Sstevel@tonic-gate 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
5175*0Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
5176*0Sstevel@tonic-gate 				/* RPC failure to another node */
5177*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
5178*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
5179*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
5180*0Sstevel@tonic-gate 					rval = 205;
5181*0Sstevel@tonic-gate 				} else {
5182*0Sstevel@tonic-gate 					/* Any other failure */
5183*0Sstevel@tonic-gate 					rval = -1;
5184*0Sstevel@tonic-gate 				}
5185*0Sstevel@tonic-gate 				goto out;
5186*0Sstevel@tonic-gate 			}
5187*0Sstevel@tonic-gate 			set_locked = 0;
5188*0Sstevel@tonic-gate 			nd = nd->nd_next;
5189*0Sstevel@tonic-gate 		}
5190*0Sstevel@tonic-gate 	}
5191*0Sstevel@tonic-gate 
5192*0Sstevel@tonic-gate out:
5193*0Sstevel@tonic-gate 	/*
5194*0Sstevel@tonic-gate 	 * If got here and set is still locked, then an error has
5195*0Sstevel@tonic-gate 	 * occurred and master_nodelist is still valid.
5196*0Sstevel@tonic-gate 	 * If error is not an RPC error, then unlock.
5197*0Sstevel@tonic-gate 	 * If error is an RPC error, skip unlocks since this could cause
5198*0Sstevel@tonic-gate 	 * yet another RPC timeout if a node has failed.
5199*0Sstevel@tonic-gate 	 * Ignore failures in unlock since unlock is just trying to
5200*0Sstevel@tonic-gate 	 * clean things up.
5201*0Sstevel@tonic-gate 	 */
5202*0Sstevel@tonic-gate 	if ((set_locked) && !(mdanyrpcerror(ep))) {
5203*0Sstevel@tonic-gate 		nd = master_nodelist;
5204*0Sstevel@tonic-gate 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
5205*0Sstevel@tonic-gate 		while (nd) {
5206*0Sstevel@tonic-gate 			/* Skip non-alive nodes */
5207*0Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5208*0Sstevel@tonic-gate 				nd = nd->nd_next;
5209*0Sstevel@tonic-gate 				continue;
5210*0Sstevel@tonic-gate 			}
5211*0Sstevel@tonic-gate 			/*
5212*0Sstevel@tonic-gate 			 * If clnt_unlock fails, just break out since next
5213*0Sstevel@tonic-gate 			 * reconfig cycle will reset the locks anyway.
5214*0Sstevel@tonic-gate 			 */
5215*0Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
5216*0Sstevel@tonic-gate 				break;
5217*0Sstevel@tonic-gate 			}
5218*0Sstevel@tonic-gate 			nd = nd->nd_next;
5219*0Sstevel@tonic-gate 		}
5220*0Sstevel@tonic-gate 		cl_set_setkey(NULL);
5221*0Sstevel@tonic-gate 	}
5222*0Sstevel@tonic-gate 	/* Free master_mnsr and drive descs */
5223*0Sstevel@tonic-gate 	mnsr_node = master_mnsr_node;
5224*0Sstevel@tonic-gate 	while (mnsr_node) {
5225*0Sstevel@tonic-gate 		master_mnsr_node = mnsr_node->mmn_next;
5226*0Sstevel@tonic-gate 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
5227*0Sstevel@tonic-gate 		free_rem_dd(mnsr_node->mmn_dd);
5228*0Sstevel@tonic-gate 		Free(mnsr_node);
5229*0Sstevel@tonic-gate 		mnsr_node = master_mnsr_node;
5230*0Sstevel@tonic-gate 	}
5231*0Sstevel@tonic-gate 
5232*0Sstevel@tonic-gate 	/* Frees sd->sd_drvs (which is also master_dd) */
5233*0Sstevel@tonic-gate 	metaflushsetname(sp);
5234*0Sstevel@tonic-gate 	return (rval);
5235*0Sstevel@tonic-gate }
5236*0Sstevel@tonic-gate 
5237*0Sstevel@tonic-gate /*
5238*0Sstevel@tonic-gate  * meta_mnsync_diskset_mddbs
5239*0Sstevel@tonic-gate  * Calling node is guaranteed to be an owner node.
5240*0Sstevel@tonic-gate  * Calling node is the master node.
5241*0Sstevel@tonic-gate  *
5242*0Sstevel@tonic-gate  * Master node verifies that ondisk mddb format matches its incore format.
5243*0Sstevel@tonic-gate  * If no nodes are joined to set, remove the change log entries.
5244*0Sstevel@tonic-gate  * If a node is joined to set, play the change log.
5245*0Sstevel@tonic-gate  *
5246*0Sstevel@tonic-gate  * Returns	 0 - Success
5247*0Sstevel@tonic-gate  *		 1 - Master unable to join to set.
5248*0Sstevel@tonic-gate  *		205 - Failure during RPC to another node
5249*0Sstevel@tonic-gate  *		-1 - Any other failure and ep is filled in.
5250*0Sstevel@tonic-gate  *			-1 return will eventually cause node to panic
5251*0Sstevel@tonic-gate  *			in a SunCluster environment.
5252*0Sstevel@tonic-gate  */
5253*0Sstevel@tonic-gate int
5254*0Sstevel@tonic-gate meta_mnsync_diskset_mddbs(
5255*0Sstevel@tonic-gate 	mdsetname_t	*sp,
5256*0Sstevel@tonic-gate 	md_error_t	*ep
5257*0Sstevel@tonic-gate )
5258*0Sstevel@tonic-gate {
5259*0Sstevel@tonic-gate 	md_set_desc		*sd;
5260*0Sstevel@tonic-gate 	mddb_config_t		c;
5261*0Sstevel@tonic-gate 	md_mn_msgclass_t	class;
5262*0Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
5263*0Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2;
5264*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
5265*0Sstevel@tonic-gate 	int			stale_set = 0;
5266*0Sstevel@tonic-gate 
5267*0Sstevel@tonic-gate 	/* If setname is there, set desc should exist. */
5268*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5269*0Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5270*0Sstevel@tonic-gate 		    "Unable to get set %s desc information"), sp->setname);
5271*0Sstevel@tonic-gate 		return (-1);
5272*0Sstevel@tonic-gate 	}
5273*0Sstevel@tonic-gate 
5274*0Sstevel@tonic-gate 	/* Are there drives in the set? */
5275*0Sstevel@tonic-gate 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5276*0Sstevel@tonic-gate 	    ep) == NULL) {
5277*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
5278*0Sstevel@tonic-gate 			return (-1);
5279*0Sstevel@tonic-gate 		}
5280*0Sstevel@tonic-gate 		/* No drives in set -- nothing to sync up */
5281*0Sstevel@tonic-gate 		return (0);
5282*0Sstevel@tonic-gate 	}
5283*0Sstevel@tonic-gate 
5284*0Sstevel@tonic-gate 	/*
5285*0Sstevel@tonic-gate 	 * Is master node (which is this node) joined to set?
5286*0Sstevel@tonic-gate 	 * If master node isn't joined (which means that no nodes
5287*0Sstevel@tonic-gate 	 * are joined to diskset), remove the change log entries
5288*0Sstevel@tonic-gate 	 * since no need to replay them - all nodes will have same
5289*0Sstevel@tonic-gate 	 * view of mddbs since all nodes are reading in the mddbs
5290*0Sstevel@tonic-gate 	 * from disk.
5291*0Sstevel@tonic-gate 	 * There is also no need to sync up the master and ondisk mddbs
5292*0Sstevel@tonic-gate 	 * since master has no incore knowledge.
5293*0Sstevel@tonic-gate 	 * Need to join master to set in order to flush the change
5294*0Sstevel@tonic-gate 	 * log entries. Don't need to block I/O during join of master
5295*0Sstevel@tonic-gate 	 * to set since no other nodes are joined to set and so no I/O
5296*0Sstevel@tonic-gate 	 * can be occurring.
5297*0Sstevel@tonic-gate 	 */
5298*0Sstevel@tonic-gate 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
5299*0Sstevel@tonic-gate 		/* Join master to set */
5300*0Sstevel@tonic-gate 		if (clnt_joinset(mynode(), sp,
5301*0Sstevel@tonic-gate 		    MNSET_IN_RECONFIG, ep)) {
5302*0Sstevel@tonic-gate 			if (mdismddberror(ep, MDE_DB_STALE)) {
5303*0Sstevel@tonic-gate 				/*
5304*0Sstevel@tonic-gate 				 * If STALE, print message and continue on.
5305*0Sstevel@tonic-gate 				 * Don't do any writes or reads to mddbs
5306*0Sstevel@tonic-gate 				 * so don't clear change log.
5307*0Sstevel@tonic-gate 				 */
5308*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5309*0Sstevel@tonic-gate 				    "Join of master node to STALE set %s"),
5310*0Sstevel@tonic-gate 				    sp->setname);
5311*0Sstevel@tonic-gate 				stale_set = 1;
5312*0Sstevel@tonic-gate 				mdclrerror(ep);
5313*0Sstevel@tonic-gate 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
5314*0Sstevel@tonic-gate 				/* ACCOK means mediator provided extra vote */
5315*0Sstevel@tonic-gate 				mdclrerror(ep);
5316*0Sstevel@tonic-gate 			} else {
5317*0Sstevel@tonic-gate 				/*
5318*0Sstevel@tonic-gate 				 * If master is unable to join set, print an
5319*0Sstevel@tonic-gate 				 * error message.  Don't return failure or node
5320*0Sstevel@tonic-gate 				 * will panic during cluster reconfig cycle.
5321*0Sstevel@tonic-gate 				 * Also, withdraw node from set in order to
5322*0Sstevel@tonic-gate 				 * cleanup from failed join attempt.
5323*0Sstevel@tonic-gate 				 */
5324*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5325*0Sstevel@tonic-gate 				    "Join of master node in set %s failed"),
5326*0Sstevel@tonic-gate 				    sp->setname);
5327*0Sstevel@tonic-gate 				if (clnt_withdrawset(mynode(), sp, &xep))
5328*0Sstevel@tonic-gate 					mdclrerror(&xep);
5329*0Sstevel@tonic-gate 				return (1);
5330*0Sstevel@tonic-gate 			}
5331*0Sstevel@tonic-gate 		}
5332*0Sstevel@tonic-gate 		/*
5333*0Sstevel@tonic-gate 		 * Master node successfully joined.
5334*0Sstevel@tonic-gate 		 * Set local copy of flags to OWN and
5335*0Sstevel@tonic-gate 		 * send owner flag to rpc.metad. If not stale,
5336*0Sstevel@tonic-gate 		 * flush the change log.
5337*0Sstevel@tonic-gate 		 */
5338*0Sstevel@tonic-gate 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
5339*0Sstevel@tonic-gate 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
5340*0Sstevel@tonic-gate 		    MNSET_IN_RECONFIG, ep)) {
5341*0Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5342*0Sstevel@tonic-gate 			    "Flag update of master node join in set %s failed"),
5343*0Sstevel@tonic-gate 			    sp->setname);
5344*0Sstevel@tonic-gate 			return (-1);
5345*0Sstevel@tonic-gate 		}
5346*0Sstevel@tonic-gate 
5347*0Sstevel@tonic-gate 		if (!stale_set) {
5348*0Sstevel@tonic-gate 			if (mdmn_reset_changelog(sp, ep,
5349*0Sstevel@tonic-gate 			    MDMN_CLF_RESETLOG) != 0) {
5350*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5351*0Sstevel@tonic-gate 				    "Unable to reset changelog."));
5352*0Sstevel@tonic-gate 				return (-1);
5353*0Sstevel@tonic-gate 			}
5354*0Sstevel@tonic-gate 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5355*0Sstevel@tonic-gate 			    "Removed changelog entries for set %s: %s"),
5356*0Sstevel@tonic-gate 			    sp->setname,
5357*0Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
5358*0Sstevel@tonic-gate 		}
5359*0Sstevel@tonic-gate 		/* Reset new master flag before return */
5360*0Sstevel@tonic-gate 		(void) memset(&sf, 0, sizeof (sf));
5361*0Sstevel@tonic-gate 		sf.sf_setno = sp->setno;
5362*0Sstevel@tonic-gate 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5363*0Sstevel@tonic-gate 		sf.sf_flags = MDDB_NM_RESET;
5364*0Sstevel@tonic-gate 		/* Use magic to help protect ioctl against attack. */
5365*0Sstevel@tonic-gate 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5366*0Sstevel@tonic-gate 		/* Ignore failure, failure to reset flag isn't catastrophic */
5367*0Sstevel@tonic-gate 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
5368*0Sstevel@tonic-gate 		    &sf.sf_mde, NULL);
5369*0Sstevel@tonic-gate 
5370*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5371*0Sstevel@tonic-gate 		    "Reset new master flag for set %s: %s"),
5372*0Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5373*0Sstevel@tonic-gate 
5374*0Sstevel@tonic-gate 		return (0);
5375*0Sstevel@tonic-gate 	}
5376*0Sstevel@tonic-gate 
5377*0Sstevel@tonic-gate 	/*
5378*0Sstevel@tonic-gate 	 * Is master already joined to STALE set (< 50% mddbs avail)?
5379*0Sstevel@tonic-gate 	 * If so, can make no config changes to mddbs so don't check or play
5380*0Sstevel@tonic-gate 	 * changelog and don't sync master node to ondisk mddbs.
5381*0Sstevel@tonic-gate 	 * To get out of the stale state all nodes must be withdrawn
5382*0Sstevel@tonic-gate 	 * from set.  Then as nodes are re-joined, all nodes will
5383*0Sstevel@tonic-gate 	 * have same view of mddbs since all nodes are reading the
5384*0Sstevel@tonic-gate 	 * mddbs from disk.
5385*0Sstevel@tonic-gate 	 */
5386*0Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
5387*0Sstevel@tonic-gate 	c.c_id = 0;
5388*0Sstevel@tonic-gate 	c.c_setno = sp->setno;
5389*0Sstevel@tonic-gate 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
5390*0Sstevel@tonic-gate 		(void) mdstealerror(ep, &c.c_mde);
5391*0Sstevel@tonic-gate 		return (-1);
5392*0Sstevel@tonic-gate 	}
5393*0Sstevel@tonic-gate 	if (c.c_flags & MDDB_C_STALE) {
5394*0Sstevel@tonic-gate 		return (0);
5395*0Sstevel@tonic-gate 	}
5396*0Sstevel@tonic-gate 
5397*0Sstevel@tonic-gate 	/*
5398*0Sstevel@tonic-gate 	 * If this node is NOT a newly chosen master, then there's
5399*0Sstevel@tonic-gate 	 * nothing else to do since the change log should be empty and
5400*0Sstevel@tonic-gate 	 * the ondisk and incore mddbs are already consistent.
5401*0Sstevel@tonic-gate 	 *
5402*0Sstevel@tonic-gate 	 * A newly chosen master is a node that was not the master
5403*0Sstevel@tonic-gate 	 * at the beginning of the reconfig cycle.  If a node is a new
5404*0Sstevel@tonic-gate 	 * master, then the new master state is reset after the ondisk
5405*0Sstevel@tonic-gate 	 * and incore mddbs are consistent and the change log has
5406*0Sstevel@tonic-gate 	 * been replayed.
5407*0Sstevel@tonic-gate 	 */
5408*0Sstevel@tonic-gate 	(void) memset(&sf, 0, sizeof (sf));
5409*0Sstevel@tonic-gate 	sf.sf_setno = sp->setno;
5410*0Sstevel@tonic-gate 	sf.sf_flags = MDDB_NM_GET;
5411*0Sstevel@tonic-gate 	/* Use magic to help protect ioctl against attack. */
5412*0Sstevel@tonic-gate 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5413*0Sstevel@tonic-gate 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
5414*0Sstevel@tonic-gate 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
5415*0Sstevel@tonic-gate 		return (0);
5416*0Sstevel@tonic-gate 	}
5417*0Sstevel@tonic-gate 
5418*0Sstevel@tonic-gate 	/*
5419*0Sstevel@tonic-gate 	 * Now, sync up incore master view to ondisk mddbs.
5420*0Sstevel@tonic-gate 	 * This is needed in the case where a master node
5421*0Sstevel@tonic-gate 	 * had made a change to the mddb, but this change
5422*0Sstevel@tonic-gate 	 * may not have been relayed to the slaves yet.
5423*0Sstevel@tonic-gate 	 * So, the new master needs to verify that the ondisk
5424*0Sstevel@tonic-gate 	 * mddbs match what the new master has incore -
5425*0Sstevel@tonic-gate 	 * if different, new master rewrites all of the mddbs.
5426*0Sstevel@tonic-gate 	 * Then the new master will replay the changelog and the
5427*0Sstevel@tonic-gate 	 * new master will then execute what the old master had
5428*0Sstevel@tonic-gate 	 * done.
5429*0Sstevel@tonic-gate 	 *
5430*0Sstevel@tonic-gate 	 * Block all I/Os to disks in this diskset on all nodes in
5431*0Sstevel@tonic-gate 	 * the diskset.  This will allow the rewriting of the mddbs
5432*0Sstevel@tonic-gate 	 * (if needed), to proceed in a timely manner.
5433*0Sstevel@tonic-gate 	 *
5434*0Sstevel@tonic-gate 	 * If block of I/Os fail, return a -1.
5435*0Sstevel@tonic-gate 	 */
5436*0Sstevel@tonic-gate 
5437*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
5438*0Sstevel@tonic-gate 	while (nd) {
5439*0Sstevel@tonic-gate 		/* Skip non-alive and non-owner nodes  */
5440*0Sstevel@tonic-gate 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5441*0Sstevel@tonic-gate 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5442*0Sstevel@tonic-gate 			nd = nd->nd_next;
5443*0Sstevel@tonic-gate 			continue;
5444*0Sstevel@tonic-gate 		}
5445*0Sstevel@tonic-gate 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5446*0Sstevel@tonic-gate 		    MN_SUSP_IO, ep)) {
5447*0Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5448*0Sstevel@tonic-gate 			    "Unable to suspend I/O on node %s in set %s"),
5449*0Sstevel@tonic-gate 			    nd->nd_nodename, sp->setname);
5450*0Sstevel@tonic-gate 
5451*0Sstevel@tonic-gate 			/*
5452*0Sstevel@tonic-gate 			 * Resume all other nodes that had been suspended.
5453*0Sstevel@tonic-gate 			 * (Reconfig return step also resumes I/Os
5454*0Sstevel@tonic-gate 			 * for all sets.)
5455*0Sstevel@tonic-gate 			 */
5456*0Sstevel@tonic-gate 			nd2 = sd->sd_nodelist;
5457*0Sstevel@tonic-gate 			while (nd2) {
5458*0Sstevel@tonic-gate 				/* Stop when reaching failed node */
5459*0Sstevel@tonic-gate 				if (nd2->nd_nodeid == nd->nd_nodeid)
5460*0Sstevel@tonic-gate 					break;
5461*0Sstevel@tonic-gate 				/* Skip non-alive and non-owner nodes  */
5462*0Sstevel@tonic-gate 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5463*0Sstevel@tonic-gate 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5464*0Sstevel@tonic-gate 					nd2 = nd2->nd_next;
5465*0Sstevel@tonic-gate 					continue;
5466*0Sstevel@tonic-gate 				}
5467*0Sstevel@tonic-gate 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5468*0Sstevel@tonic-gate 					sp->setno, MN_RES_IO, &xep));
5469*0Sstevel@tonic-gate 				nd2 = nd2->nd_next;
5470*0Sstevel@tonic-gate 			}
5471*0Sstevel@tonic-gate 
5472*0Sstevel@tonic-gate 			/*
5473*0Sstevel@tonic-gate 			 * If an RPC failure on another node, return a 205.
5474*0Sstevel@tonic-gate 			 * Otherwise, exit with failure.
5475*0Sstevel@tonic-gate 			 */
5476*0Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
5477*0Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
5478*0Sstevel@tonic-gate 			    nd->nd_nodeid)) {
5479*0Sstevel@tonic-gate 				return (205);
5480*0Sstevel@tonic-gate 			} else {
5481*0Sstevel@tonic-gate 				return (-1);
5482*0Sstevel@tonic-gate 			}
5483*0Sstevel@tonic-gate 
5484*0Sstevel@tonic-gate 		}
5485*0Sstevel@tonic-gate 		nd = nd->nd_next;
5486*0Sstevel@tonic-gate 	}
5487*0Sstevel@tonic-gate 
5488*0Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
5489*0Sstevel@tonic-gate 	c.c_id = 0;
5490*0Sstevel@tonic-gate 	c.c_setno = sp->setno;
5491*0Sstevel@tonic-gate 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
5492*0Sstevel@tonic-gate 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
5493*0Sstevel@tonic-gate 		return (-1);
5494*0Sstevel@tonic-gate 
5495*0Sstevel@tonic-gate 	/*
5496*0Sstevel@tonic-gate 	 * Resume I/Os that were suspended above.
5497*0Sstevel@tonic-gate 	 */
5498*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
5499*0Sstevel@tonic-gate 	while (nd) {
5500*0Sstevel@tonic-gate 		/* Skip non-alive and non-owner nodes  */
5501*0Sstevel@tonic-gate 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5502*0Sstevel@tonic-gate 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5503*0Sstevel@tonic-gate 			nd = nd->nd_next;
5504*0Sstevel@tonic-gate 			continue;
5505*0Sstevel@tonic-gate 		}
5506*0Sstevel@tonic-gate 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5507*0Sstevel@tonic-gate 		    MN_RES_IO, ep)) {
5508*0Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
5509*0Sstevel@tonic-gate 			    "Unable to resume I/O on node %s in set %s"),
5510*0Sstevel@tonic-gate 			    nd->nd_nodename, sp->setname);
5511*0Sstevel@tonic-gate 
5512*0Sstevel@tonic-gate 			/*
5513*0Sstevel@tonic-gate 			 * If an RPC failure then don't do any
5514*0Sstevel@tonic-gate 			 * more RPC calls, since one timeout is enough
5515*0Sstevel@tonic-gate 			 * to endure.  If RPC failure to another node, return
5516*0Sstevel@tonic-gate 			 * 205.  If RPC failure to my node, return -1.
5517*0Sstevel@tonic-gate 			 * If not an RPC failure, continue resuming the
5518*0Sstevel@tonic-gate 			 * rest of the nodes and then return -1.
5519*0Sstevel@tonic-gate 			 */
5520*0Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
5521*0Sstevel@tonic-gate 				if (sd->sd_mn_mynode->nd_nodeid ==
5522*0Sstevel@tonic-gate 				    nd->nd_nodeid) {
5523*0Sstevel@tonic-gate 					return (-1);
5524*0Sstevel@tonic-gate 				} else {
5525*0Sstevel@tonic-gate 					return (205);
5526*0Sstevel@tonic-gate 				}
5527*0Sstevel@tonic-gate 			}
5528*0Sstevel@tonic-gate 
5529*0Sstevel@tonic-gate 			/*
5530*0Sstevel@tonic-gate 			 * If not an RPC error, continue resuming rest of
5531*0Sstevel@tonic-gate 			 * nodes, ignoring any failures except for an
5532*0Sstevel@tonic-gate 			 * RPC failure which constitutes an immediate exit.
5533*0Sstevel@tonic-gate 			 * Start in middle of list with failing node.
5534*0Sstevel@tonic-gate 			 */
5535*0Sstevel@tonic-gate 			nd2 = nd->nd_next;
5536*0Sstevel@tonic-gate 			while (nd2) {
5537*0Sstevel@tonic-gate 				/* Skip non-alive and non-owner nodes  */
5538*0Sstevel@tonic-gate 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
5539*0Sstevel@tonic-gate 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
5540*0Sstevel@tonic-gate 					nd2 = nd2->nd_next;
5541*0Sstevel@tonic-gate 					continue;
5542*0Sstevel@tonic-gate 				}
5543*0Sstevel@tonic-gate 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
5544*0Sstevel@tonic-gate 					sp->setno, MN_RES_IO, &xep));
5545*0Sstevel@tonic-gate 				if (mdanyrpcerror(&xep)) {
5546*0Sstevel@tonic-gate 					return (-1);
5547*0Sstevel@tonic-gate 				}
5548*0Sstevel@tonic-gate 				nd2 = nd2->nd_next;
5549*0Sstevel@tonic-gate 			}
5550*0Sstevel@tonic-gate 		}
5551*0Sstevel@tonic-gate 		nd = nd->nd_next;
5552*0Sstevel@tonic-gate 	}
5553*0Sstevel@tonic-gate 
5554*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
5555*0Sstevel@tonic-gate 	    "checking/writing the mddb for set %s: %s"), sp->setname,
5556*0Sstevel@tonic-gate 	    meta_print_hrtime(gethrtime() - start_time));
5557*0Sstevel@tonic-gate 
5558*0Sstevel@tonic-gate 	/*
5559*0Sstevel@tonic-gate 	 * Send (aka replay) all messages we find in the changelog.
5560*0Sstevel@tonic-gate 	 * Flag the messages with
5561*0Sstevel@tonic-gate 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
5562*0Sstevel@tonic-gate 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
5563*0Sstevel@tonic-gate 	 */
5564*0Sstevel@tonic-gate 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
5565*0Sstevel@tonic-gate 		mdmn_changelog_record_t	*lr;
5566*0Sstevel@tonic-gate 		md_error_t	xep = mdnullerror;
5567*0Sstevel@tonic-gate 		md_mn_result_t	*resultp = NULL;
5568*0Sstevel@tonic-gate 		int		ret;
5569*0Sstevel@tonic-gate 
5570*0Sstevel@tonic-gate 		lr = mdmn_get_changelogrec(sp->setno, class);
5571*0Sstevel@tonic-gate 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
5572*0Sstevel@tonic-gate 			/* no entry for this class */
5573*0Sstevel@tonic-gate 			continue;
5574*0Sstevel@tonic-gate 		}
5575*0Sstevel@tonic-gate 
5576*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5577*0Sstevel@tonic-gate 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
5578*0Sstevel@tonic-gate 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
5579*0Sstevel@tonic-gate 
5580*0Sstevel@tonic-gate 		ret = mdmn_send_message_with_msgid(
5581*0Sstevel@tonic-gate 			lr->lr_msg.msg_setno,
5582*0Sstevel@tonic-gate 			lr->lr_msg.msg_type,
5583*0Sstevel@tonic-gate 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
5584*0Sstevel@tonic-gate 						MD_MSGF_OVERRIDE_SUSPEND,
5585*0Sstevel@tonic-gate 			lr->lr_msg.msg_event_data,
5586*0Sstevel@tonic-gate 			lr->lr_msg.msg_event_size,
5587*0Sstevel@tonic-gate 			&resultp,
5588*0Sstevel@tonic-gate 			&lr->lr_msg.msg_msgid,
5589*0Sstevel@tonic-gate 			&xep);
5590*0Sstevel@tonic-gate 
5591*0Sstevel@tonic-gate 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
5592*0Sstevel@tonic-gate 		    "mdmn_send_message returned %d\n"), ret);
5593*0Sstevel@tonic-gate 
5594*0Sstevel@tonic-gate 		if (resultp)
5595*0Sstevel@tonic-gate 			free_result(resultp);
5596*0Sstevel@tonic-gate 	}
5597*0Sstevel@tonic-gate 
5598*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5599*0Sstevel@tonic-gate 	    "Playing changelog completed for set %s: %s"),
5600*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5601*0Sstevel@tonic-gate 
5602*0Sstevel@tonic-gate 	/*
5603*0Sstevel@tonic-gate 	 * Now that new master has ondisk and incore mddbs in sync, reset
5604*0Sstevel@tonic-gate 	 * this node's new master kernel flag (for this set).  If this node
5605*0Sstevel@tonic-gate 	 * re-enters another reconfig cycle before the completion of this
5606*0Sstevel@tonic-gate 	 * reconfig cycle, this master node won't need to check if the ondisk
5607*0Sstevel@tonic-gate 	 * and incore mddbs are in sync since this node won't be considered
5608*0Sstevel@tonic-gate 	 * a new master (since this flag is being reset here in the middle of
5609*0Sstevel@tonic-gate 	 * step2).  This will save time during any subsequent reconfig
5610*0Sstevel@tonic-gate 	 * cycles as long as this node continues to be master.
5611*0Sstevel@tonic-gate 	 */
5612*0Sstevel@tonic-gate 	(void) memset(&sf, 0, sizeof (sf));
5613*0Sstevel@tonic-gate 	sf.sf_setno = sp->setno;
5614*0Sstevel@tonic-gate 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
5615*0Sstevel@tonic-gate 	sf.sf_flags = MDDB_NM_RESET;
5616*0Sstevel@tonic-gate 	/* Use magic to help protect ioctl against attack. */
5617*0Sstevel@tonic-gate 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
5618*0Sstevel@tonic-gate 	/* Ignore failure, since failure to reset flag isn't catastrophic */
5619*0Sstevel@tonic-gate 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
5620*0Sstevel@tonic-gate 
5621*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5622*0Sstevel@tonic-gate 	    "Reset new master flag for set %s: %s"),
5623*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5624*0Sstevel@tonic-gate 
5625*0Sstevel@tonic-gate 	return (0);
5626*0Sstevel@tonic-gate }
5627*0Sstevel@tonic-gate 
5628*0Sstevel@tonic-gate /*
5629*0Sstevel@tonic-gate  * meta_mnjoin_all will join all starting nodes in the diskset.
5630*0Sstevel@tonic-gate  * A starting node is considered to be any node that is not
5631*0Sstevel@tonic-gate  * an owner of the set but is a member of the cluster.
5632*0Sstevel@tonic-gate  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
5633*0Sstevel@tonic-gate  *
5634*0Sstevel@tonic-gate  * Caller is the Master node.
5635*0Sstevel@tonic-gate  *
5636*0Sstevel@tonic-gate  * Returns	 0 - Success
5637*0Sstevel@tonic-gate  *		205 - Failure during RPC to another node
5638*0Sstevel@tonic-gate  *		-1 - Any other failure and ep is filled in.
5639*0Sstevel@tonic-gate  */
5640*0Sstevel@tonic-gate int
5641*0Sstevel@tonic-gate meta_mnjoin_all(
5642*0Sstevel@tonic-gate 	mdsetname_t	*sp,
5643*0Sstevel@tonic-gate 	md_error_t	*ep
5644*0Sstevel@tonic-gate )
5645*0Sstevel@tonic-gate {
5646*0Sstevel@tonic-gate 	md_set_desc		*sd;
5647*0Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2;
5648*0Sstevel@tonic-gate 	int			rval = 0;
5649*0Sstevel@tonic-gate 	int			stale_flag = 0;
5650*0Sstevel@tonic-gate 	mddb_config_t		c;
5651*0Sstevel@tonic-gate 	int			susp_res_flag = 0;
5652*0Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
5653*0Sstevel@tonic-gate 
5654*0Sstevel@tonic-gate 	/* If setname is there, set desc should exist. */
5655*0Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
5656*0Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
5657*0Sstevel@tonic-gate 		    "Unable to get set %s desc information"), sp->setname);
5658*0Sstevel@tonic-gate 		return (-1);
5659*0Sstevel@tonic-gate 	}
5660*0Sstevel@tonic-gate 
5661*0Sstevel@tonic-gate 	/* Are there drives in the set? */
5662*0Sstevel@tonic-gate 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
5663*0Sstevel@tonic-gate 	    ep) == NULL) {
5664*0Sstevel@tonic-gate 		if (! mdisok(ep)) {
5665*0Sstevel@tonic-gate 			return (-1);
5666*0Sstevel@tonic-gate 		}
5667*0Sstevel@tonic-gate 		/* No drives in set -- nothing to join */
5668*0Sstevel@tonic-gate 		return (0);
5669*0Sstevel@tonic-gate 	}
5670*0Sstevel@tonic-gate 
5671*0Sstevel@tonic-gate 	/*
5672*0Sstevel@tonic-gate 	 * Is set currently stale?
5673*0Sstevel@tonic-gate 	 */
5674*0Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
5675*0Sstevel@tonic-gate 	c.c_id = 0;
5676*0Sstevel@tonic-gate 	c.c_setno = sp->setno;
5677*0Sstevel@tonic-gate 	/* Ignore failure since master node may not be joined yet */
5678*0Sstevel@tonic-gate 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
5679*0Sstevel@tonic-gate 	if (c.c_flags & MDDB_C_STALE) {
5680*0Sstevel@tonic-gate 		stale_flag = MNSET_IS_STALE;
5681*0Sstevel@tonic-gate 	}
5682*0Sstevel@tonic-gate 
5683*0Sstevel@tonic-gate 	/*
5684*0Sstevel@tonic-gate 	 * If any nodes are going to be joined to diskset, then
5685*0Sstevel@tonic-gate 	 * suspend I/O to all disks in diskset so that nodes can join
5686*0Sstevel@tonic-gate 	 * (read in mddbs) in a reasonable amount of time even under
5687*0Sstevel@tonic-gate 	 * high I/O load.  Don't need to do this if set is STALE since
5688*0Sstevel@tonic-gate 	 * no I/O can be occurring to a STALE set.
5689*0Sstevel@tonic-gate 	 */
5690*0Sstevel@tonic-gate 	if (stale_flag != MNSET_IS_STALE) {
5691*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
5692*0Sstevel@tonic-gate 		while (nd) {
5693*0Sstevel@tonic-gate 			/* Found a node that will be joined to diskset */
5694*0Sstevel@tonic-gate 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5695*0Sstevel@tonic-gate 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5696*0Sstevel@tonic-gate 				/* Set flag that diskset should be suspended */
5697*0Sstevel@tonic-gate 				susp_res_flag = 1;
5698*0Sstevel@tonic-gate 				break;
5699*0Sstevel@tonic-gate 			}
5700*0Sstevel@tonic-gate 			nd = nd->nd_next;
5701*0Sstevel@tonic-gate 		}
5702*0Sstevel@tonic-gate 	}
5703*0Sstevel@tonic-gate 
5704*0Sstevel@tonic-gate 	if (susp_res_flag) {
5705*0Sstevel@tonic-gate 		/*
5706*0Sstevel@tonic-gate 		 * Block all I/Os to disks in this diskset on all joined
5707*0Sstevel@tonic-gate 		 * nodes in the diskset.
5708*0Sstevel@tonic-gate 		 * If block of I/Os fails due to an RPC failure on another
5709*0Sstevel@tonic-gate 		 * node, return 205; otherwise, return -1.
5710*0Sstevel@tonic-gate 		 */
5711*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
5712*0Sstevel@tonic-gate 		while (nd) {
5713*0Sstevel@tonic-gate 			/* Skip non-alive and non-owner nodes  */
5714*0Sstevel@tonic-gate 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5715*0Sstevel@tonic-gate 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5716*0Sstevel@tonic-gate 				nd = nd->nd_next;
5717*0Sstevel@tonic-gate 				continue;
5718*0Sstevel@tonic-gate 			}
5719*0Sstevel@tonic-gate 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5720*0Sstevel@tonic-gate 			    MN_SUSP_IO, ep)) {
5721*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5722*0Sstevel@tonic-gate 				    "Unable to suspend I/O on node %s"
5723*0Sstevel@tonic-gate 				    " in set %s"), nd->nd_nodename,
5724*0Sstevel@tonic-gate 				    sp->setname);
5725*0Sstevel@tonic-gate 				/*
5726*0Sstevel@tonic-gate 				 * Resume other nodes that had been suspended.
5727*0Sstevel@tonic-gate 				 * (Reconfig return step also resumes I/Os
5728*0Sstevel@tonic-gate 				 * for all sets.)
5729*0Sstevel@tonic-gate 				 */
5730*0Sstevel@tonic-gate 				nd2 = sd->sd_nodelist;
5731*0Sstevel@tonic-gate 				while (nd2) {
5732*0Sstevel@tonic-gate 					/* Stop when reaching failed node */
5733*0Sstevel@tonic-gate 					if (nd2->nd_nodeid == nd->nd_nodeid)
5734*0Sstevel@tonic-gate 						break;
5735*0Sstevel@tonic-gate 					/* Skip non-alive/non-owner nodes  */
5736*0Sstevel@tonic-gate 					if ((!(nd2->nd_flags &
5737*0Sstevel@tonic-gate 					    MD_MN_NODE_ALIVE)) ||
5738*0Sstevel@tonic-gate 					    (!(nd2->nd_flags &
5739*0Sstevel@tonic-gate 					    MD_MN_NODE_OWN))) {
5740*0Sstevel@tonic-gate 						nd2 = nd2->nd_next;
5741*0Sstevel@tonic-gate 						continue;
5742*0Sstevel@tonic-gate 					}
5743*0Sstevel@tonic-gate 					(void) (clnt_mn_susp_res_io(
5744*0Sstevel@tonic-gate 					    nd2->nd_nodename, sp->setno,
5745*0Sstevel@tonic-gate 					    MN_RES_IO, &xep));
5746*0Sstevel@tonic-gate 					nd2 = nd2->nd_next;
5747*0Sstevel@tonic-gate 				}
5748*0Sstevel@tonic-gate 
5749*0Sstevel@tonic-gate 				/*
5750*0Sstevel@tonic-gate 				 * If the suspend failed due to an
5751*0Sstevel@tonic-gate 				 * RPC failure on another node, return
5752*0Sstevel@tonic-gate 				 * a 205.
5753*0Sstevel@tonic-gate 				 * Otherwise, exit with failure.
5754*0Sstevel@tonic-gate 				 * The return reconfig step will resume
5755*0Sstevel@tonic-gate 				 * I/Os for all disksets.
5756*0Sstevel@tonic-gate 				 */
5757*0Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
5758*0Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
5759*0Sstevel@tonic-gate 				    nd->nd_nodeid)) {
5760*0Sstevel@tonic-gate 					return (205);
5761*0Sstevel@tonic-gate 				} else {
5762*0Sstevel@tonic-gate 					return (-1);
5763*0Sstevel@tonic-gate 				}
5764*0Sstevel@tonic-gate 			}
5765*0Sstevel@tonic-gate 			nd = nd->nd_next;
5766*0Sstevel@tonic-gate 		}
5767*0Sstevel@tonic-gate 	}
5768*0Sstevel@tonic-gate 
5769*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
5770*0Sstevel@tonic-gate 	while (nd) {
5771*0Sstevel@tonic-gate 		/*
5772*0Sstevel@tonic-gate 		 * If a node is in the membership list but isn't joined
5773*0Sstevel@tonic-gate 		 * to the set, try to join the node.
5774*0Sstevel@tonic-gate 		 */
5775*0Sstevel@tonic-gate 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
5776*0Sstevel@tonic-gate 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5777*0Sstevel@tonic-gate 			if (clnt_joinset(nd->nd_nodename, sp,
5778*0Sstevel@tonic-gate 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
5779*0Sstevel@tonic-gate 				/*
5780*0Sstevel@tonic-gate 				 * If RPC failure to another node
5781*0Sstevel@tonic-gate 				 * then exit without attempting anything else.
5782*0Sstevel@tonic-gate 				 * (Reconfig return step will resume I/Os
5783*0Sstevel@tonic-gate 				 * for all sets.)
5784*0Sstevel@tonic-gate 				 */
5785*0Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
5786*0Sstevel@tonic-gate 					mde_perror(ep, "");
5787*0Sstevel@tonic-gate 					return (205);
5788*0Sstevel@tonic-gate 				}
5789*0Sstevel@tonic-gate 				/*
5790*0Sstevel@tonic-gate 				 * STALE and ACCOK failures aren't true
5791*0Sstevel@tonic-gate 				 * failures.  STALE means that <50% mddbs
5792*0Sstevel@tonic-gate 				 * are available. ACCOK means that the
5793*0Sstevel@tonic-gate 				 * mediator provided the extra vote.
5794*0Sstevel@tonic-gate 				 * If a true failure, then print message
5795*0Sstevel@tonic-gate 				 * and withdraw node from set in order to
5796*0Sstevel@tonic-gate 				 * cleanup from failed join attempt.
5797*0Sstevel@tonic-gate 				 */
5798*0Sstevel@tonic-gate 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
5799*0Sstevel@tonic-gate 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
5800*0Sstevel@tonic-gate 					mde_perror(ep,
5801*0Sstevel@tonic-gate 					    "WARNING: Unable to join node %s "
5802*0Sstevel@tonic-gate 					    "to set %s", nd->nd_nodename,
5803*0Sstevel@tonic-gate 					    sp->setname);
5804*0Sstevel@tonic-gate 					mdclrerror(ep);
5805*0Sstevel@tonic-gate 					if (clnt_withdrawset(nd->nd_nodename,
5806*0Sstevel@tonic-gate 					    sp, &xep))
5807*0Sstevel@tonic-gate 						mdclrerror(&xep);
5808*0Sstevel@tonic-gate 					nd = nd->nd_next;
5809*0Sstevel@tonic-gate 					continue;
5810*0Sstevel@tonic-gate 				}
5811*0Sstevel@tonic-gate 			}
5812*0Sstevel@tonic-gate 			/* Set owner flag even if STALE or ACCOK */
5813*0Sstevel@tonic-gate 			nd->nd_flags |= MD_MN_NODE_OWN;
5814*0Sstevel@tonic-gate 		}
5815*0Sstevel@tonic-gate 		nd = nd->nd_next;
5816*0Sstevel@tonic-gate 	}
5817*0Sstevel@tonic-gate 	/*
5818*0Sstevel@tonic-gate 	 * Resume I/Os if suspended above.
5819*0Sstevel@tonic-gate 	 */
5820*0Sstevel@tonic-gate 	if (susp_res_flag) {
5821*0Sstevel@tonic-gate 		nd = sd->sd_nodelist;
5822*0Sstevel@tonic-gate 		while (nd) {
5823*0Sstevel@tonic-gate 			/*
5824*0Sstevel@tonic-gate 			 * Skip non-alive and non-owner nodes
5825*0Sstevel@tonic-gate 			 * (this list doesn't include any of
5826*0Sstevel@tonic-gate 			 * the nodes that were joined).
5827*0Sstevel@tonic-gate 			 */
5828*0Sstevel@tonic-gate 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
5829*0Sstevel@tonic-gate 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
5830*0Sstevel@tonic-gate 				nd = nd->nd_next;
5831*0Sstevel@tonic-gate 				continue;
5832*0Sstevel@tonic-gate 			}
5833*0Sstevel@tonic-gate 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
5834*0Sstevel@tonic-gate 			    MN_RES_IO, ep)) {
5835*0Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
5836*0Sstevel@tonic-gate 				    "Unable to resume I/O on node %s"
5837*0Sstevel@tonic-gate 				    " in set %s"), nd->nd_nodename,
5838*0Sstevel@tonic-gate 				    sp->setname);
5839*0Sstevel@tonic-gate 
5840*0Sstevel@tonic-gate 				/*
5841*0Sstevel@tonic-gate 				 * If an RPC failure then don't do any
5842*0Sstevel@tonic-gate 				 * more RPC calls, since one timeout is enough
5843*0Sstevel@tonic-gate 				 * to endure.  If RPC failure to another node,
5844*0Sstevel@tonic-gate 				 * return 205.  If RPC failure to my node,
5845*0Sstevel@tonic-gate 				 * return -1.
5846*0Sstevel@tonic-gate 				 * (Reconfig return step will resume I/Os
5847*0Sstevel@tonic-gate 				 * for all sets.)
5848*0Sstevel@tonic-gate 				 * If not an RPC failure, continue resuming the
5849*0Sstevel@tonic-gate 				 * rest of the nodes and then return -1.
5850*0Sstevel@tonic-gate 				 */
5851*0Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
5852*0Sstevel@tonic-gate 					if (sd->sd_mn_mynode->nd_nodeid ==
5853*0Sstevel@tonic-gate 					    nd->nd_nodeid) {
5854*0Sstevel@tonic-gate 						return (-1);
5855*0Sstevel@tonic-gate 					} else {
5856*0Sstevel@tonic-gate 						return (205);
5857*0Sstevel@tonic-gate 					}
5858*0Sstevel@tonic-gate 				}
5859*0Sstevel@tonic-gate 
5860*0Sstevel@tonic-gate 				/*
5861*0Sstevel@tonic-gate 				 * If not an RPC error, continue resuming rest
5862*0Sstevel@tonic-gate 				 * of nodes, ignoring any failures except for
5863*0Sstevel@tonic-gate 				 * an RPC failure which constitutes an
5864*0Sstevel@tonic-gate 				 * immediate exit.
5865*0Sstevel@tonic-gate 				 * Start with the node after the failing one.
5866*0Sstevel@tonic-gate 				 */
5867*0Sstevel@tonic-gate 				nd2 = nd->nd_next;
5868*0Sstevel@tonic-gate 				while (nd2) {
5869*0Sstevel@tonic-gate 					/* Skip non-owner nodes  */
5870*0Sstevel@tonic-gate 					if ((!(nd2->nd_flags &
5871*0Sstevel@tonic-gate 					    MD_MN_NODE_ALIVE)) ||
5872*0Sstevel@tonic-gate 					    (!(nd2->nd_flags &
5873*0Sstevel@tonic-gate 					    MD_MN_NODE_OWN))) {
5874*0Sstevel@tonic-gate 						nd2 = nd2->nd_next;
5875*0Sstevel@tonic-gate 						continue;
5876*0Sstevel@tonic-gate 					}
5877*0Sstevel@tonic-gate 					(void) (clnt_mn_susp_res_io(
5878*0Sstevel@tonic-gate 					    nd2->nd_nodename, sp->setno,
5879*0Sstevel@tonic-gate 					    MN_RES_IO, &xep));
5880*0Sstevel@tonic-gate 					if (mdanyrpcerror(&xep)) {
5881*0Sstevel@tonic-gate 						return (-1);
5882*0Sstevel@tonic-gate 					}
5883*0Sstevel@tonic-gate 					nd2 = nd2->nd_next;
5884*0Sstevel@tonic-gate 				}
5885*0Sstevel@tonic-gate 			}
5886*0Sstevel@tonic-gate 			nd = nd->nd_next;
5887*0Sstevel@tonic-gate 		}
5888*0Sstevel@tonic-gate 	}
5889*0Sstevel@tonic-gate 
5890*0Sstevel@tonic-gate 	nd = sd->sd_nodelist;
5891*0Sstevel@tonic-gate 	while (nd) {
5892*0Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
5893*0Sstevel@tonic-gate 			nd = nd->nd_next;
5894*0Sstevel@tonic-gate 			continue;
5895*0Sstevel@tonic-gate 		}
5896*0Sstevel@tonic-gate 		/*
5897*0Sstevel@tonic-gate 		 * If 1 node fails - go ahead and update the rest except
5898*0Sstevel@tonic-gate 		 * in the case of an RPC failure, fail immediately.
5899*0Sstevel@tonic-gate 		 */
5900*0Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5901*0Sstevel@tonic-gate 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
5902*0Sstevel@tonic-gate 			/* RPC failure to another node */
5903*0Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
5904*0Sstevel@tonic-gate 				return (205);
5905*0Sstevel@tonic-gate 			}
5906*0Sstevel@tonic-gate 			nd = nd->nd_next;
5907*0Sstevel@tonic-gate 			rval = -1;
5908*0Sstevel@tonic-gate 			continue;
5909*0Sstevel@tonic-gate 		}
5910*0Sstevel@tonic-gate 		nd = nd->nd_next;
5911*0Sstevel@tonic-gate 	}
5912*0Sstevel@tonic-gate 
5913*0Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
5914*0Sstevel@tonic-gate 	    "Join of all nodes completed for set %s: %s"),
5915*0Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
5916*0Sstevel@tonic-gate 
5917*0Sstevel@tonic-gate 	return (rval);
5918*0Sstevel@tonic-gate }
5919