10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*1623Stw21770  * Common Development and Distribution License (the "License").
6*1623Stw21770  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*1623Stw21770  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * Just in case we're not in a build environment, make sure that
300Sstevel@tonic-gate  * TEXT_DOMAIN gets set to something.
310Sstevel@tonic-gate  */
320Sstevel@tonic-gate #if !defined(TEXT_DOMAIN)
330Sstevel@tonic-gate #define	TEXT_DOMAIN "SYS_TEST"
340Sstevel@tonic-gate #endif
350Sstevel@tonic-gate 
360Sstevel@tonic-gate /*
370Sstevel@tonic-gate  * Metadevice diskset interfaces
380Sstevel@tonic-gate  */
390Sstevel@tonic-gate 
400Sstevel@tonic-gate #include "meta_set_prv.h"
410Sstevel@tonic-gate #include <meta.h>
420Sstevel@tonic-gate #include <metad.h>
430Sstevel@tonic-gate #include <mdmn_changelog.h>
440Sstevel@tonic-gate #include <sys/lvm/md_crc.h>
450Sstevel@tonic-gate #include <sys/utsname.h>
460Sstevel@tonic-gate #include <sdssc.h>
470Sstevel@tonic-gate 
480Sstevel@tonic-gate #include <sys/sysevent/eventdefs.h>
490Sstevel@tonic-gate #include <sys/sysevent/svm.h>
500Sstevel@tonic-gate extern	char	*blkname(char *);
510Sstevel@tonic-gate 
520Sstevel@tonic-gate static md_drive_desc *
530Sstevel@tonic-gate dr2drivedesc(
540Sstevel@tonic-gate 	mdsetname_t	*sp,
550Sstevel@tonic-gate 	side_t		sideno,
560Sstevel@tonic-gate 	int		flags,
570Sstevel@tonic-gate 	md_error_t	*ep
580Sstevel@tonic-gate )
590Sstevel@tonic-gate {
600Sstevel@tonic-gate 	md_set_record	*sr;
610Sstevel@tonic-gate 	md_drive_record	*dr;
620Sstevel@tonic-gate 	mddrivename_t	*dnp;
630Sstevel@tonic-gate 	md_drive_desc	*dd_head = NULL;
640Sstevel@tonic-gate 	md_set_desc	*sd;
650Sstevel@tonic-gate 
660Sstevel@tonic-gate 	if (flags & MD_BYPASS_DAEMON) {
670Sstevel@tonic-gate 		if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL)
680Sstevel@tonic-gate 			return (NULL);
690Sstevel@tonic-gate 		sd = metaget_setdesc(sp, ep);
700Sstevel@tonic-gate 		sideno = getnodeside(mynode(), sd);
710Sstevel@tonic-gate 		sp = metafakesetname(sp->setno, sr->sr_setname);
720Sstevel@tonic-gate 	} else {
730Sstevel@tonic-gate 		if ((sr = getsetbyname(sp->setname, ep)) == NULL)
740Sstevel@tonic-gate 			return (NULL);
750Sstevel@tonic-gate 	}
760Sstevel@tonic-gate 
770Sstevel@tonic-gate 	assert(sideno != MD_SIDEWILD);
780Sstevel@tonic-gate 
790Sstevel@tonic-gate 	/*
800Sstevel@tonic-gate 	 * WARNING:
810Sstevel@tonic-gate 	 * The act of getting the dnp from the namespace means that we
820Sstevel@tonic-gate 	 * will get the devid of the disk as recorded in the namespace.
830Sstevel@tonic-gate 	 * This devid has the potential to be stale if the disk is being
840Sstevel@tonic-gate 	 * replaced via a rebind, this means that any code that relies
850Sstevel@tonic-gate 	 * on any of the dnp information should take the appropriate action
860Sstevel@tonic-gate 	 * to preserve that information. For example in the rebind code the
870Sstevel@tonic-gate 	 * devid of the new disk is saved off and then copied back in once
880Sstevel@tonic-gate 	 * the code that has called this function has completed.
890Sstevel@tonic-gate 	 */
900Sstevel@tonic-gate 	for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) {
910Sstevel@tonic-gate 		if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key,
920Sstevel@tonic-gate 		    flags, ep)) == NULL) {
930Sstevel@tonic-gate 			if (!(flags & MD_BYPASS_DAEMON))
940Sstevel@tonic-gate 				free_sr(sr);
950Sstevel@tonic-gate 			metafreedrivedesc(&dd_head);
960Sstevel@tonic-gate 			return (NULL);
970Sstevel@tonic-gate 		}
980Sstevel@tonic-gate 
990Sstevel@tonic-gate 		(void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt,
1000Sstevel@tonic-gate 		    dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags);
1010Sstevel@tonic-gate 	}
1020Sstevel@tonic-gate 
1030Sstevel@tonic-gate 	if (!(flags & MD_BYPASS_DAEMON)) {
1040Sstevel@tonic-gate 		free_sr(sr);
1050Sstevel@tonic-gate 	}
1060Sstevel@tonic-gate 	return (dd_head);
1070Sstevel@tonic-gate }
1080Sstevel@tonic-gate 
1090Sstevel@tonic-gate static int
1100Sstevel@tonic-gate get_sidenmlist(
1110Sstevel@tonic-gate 	mdsetname_t	*sp,
1120Sstevel@tonic-gate 	mddrivename_t	*dnp,
1130Sstevel@tonic-gate 	md_error_t	*ep
1140Sstevel@tonic-gate )
1150Sstevel@tonic-gate {
1160Sstevel@tonic-gate 	md_set_desc	*sd;
1170Sstevel@tonic-gate 	mdsidenames_t	*sn, **sn_next;
1180Sstevel@tonic-gate 	int		i;
1190Sstevel@tonic-gate 
1200Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1210Sstevel@tonic-gate 		return (-1);
1220Sstevel@tonic-gate 
1230Sstevel@tonic-gate 	metaflushsidenames(dnp);
1240Sstevel@tonic-gate 	sn_next = &dnp->side_names;
1250Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
1260Sstevel@tonic-gate 		/*
1270Sstevel@tonic-gate 		 * Only get sidenames for this node since
1280Sstevel@tonic-gate 		 * that is the only side information stored in
1290Sstevel@tonic-gate 		 * the local mddb for a multi-node diskset.
1300Sstevel@tonic-gate 		 */
1310Sstevel@tonic-gate 		if (sd->sd_mn_mynode) {
1320Sstevel@tonic-gate 			sn = Zalloc(sizeof (*sn));
1330Sstevel@tonic-gate 			sn->sideno = sd->sd_mn_mynode->nd_nodeid;
1340Sstevel@tonic-gate 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
1350Sstevel@tonic-gate 			    sn->sideno, dnp->side_names_key, &sn->dname,
1360Sstevel@tonic-gate 			    &sn->mnum, NULL, ep)) == NULL) {
1370Sstevel@tonic-gate 				if (sn->dname != NULL)
1380Sstevel@tonic-gate 					Free(sn->dname);
1390Sstevel@tonic-gate 				Free(sn);
1400Sstevel@tonic-gate 				return (-1);
1410Sstevel@tonic-gate 			}
1420Sstevel@tonic-gate 
1430Sstevel@tonic-gate 			/* Add to the end of the linked list */
1440Sstevel@tonic-gate 			assert(*sn_next == NULL);
1450Sstevel@tonic-gate 			*sn_next = sn;
1460Sstevel@tonic-gate 			sn_next = &sn->next;
1470Sstevel@tonic-gate 		}
1480Sstevel@tonic-gate 	} else {
1490Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
1500Sstevel@tonic-gate 			/* Skip empty slots */
1510Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
1520Sstevel@tonic-gate 				continue;
1530Sstevel@tonic-gate 
1540Sstevel@tonic-gate 			sn = Zalloc(sizeof (*sn));
1550Sstevel@tonic-gate 			sn->sideno = i;
1560Sstevel@tonic-gate 			if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET,
1570Sstevel@tonic-gate 			    i+SKEW, dnp->side_names_key, &sn->dname,
1580Sstevel@tonic-gate 			    &sn->mnum, NULL, ep)) == NULL) {
1590Sstevel@tonic-gate 				/*
1600Sstevel@tonic-gate 				 * It is possible that during the add of a
1610Sstevel@tonic-gate 				 * host to have a 'missing' side as the side
1620Sstevel@tonic-gate 				 * for this disk will be added later. So ignore
1630Sstevel@tonic-gate 				 * the error. The 'missing' side will be added
1640Sstevel@tonic-gate 				 * once the addhosts process has completed.
1650Sstevel@tonic-gate 				 */
1660Sstevel@tonic-gate 				if (mdissyserror(ep, ENOENT)) {
1670Sstevel@tonic-gate 					mdclrerror(ep);
1680Sstevel@tonic-gate 					Free(sn);
1690Sstevel@tonic-gate 					continue;
1700Sstevel@tonic-gate 				}
1710Sstevel@tonic-gate 
1720Sstevel@tonic-gate 				if (sn->dname != NULL)
1730Sstevel@tonic-gate 					Free(sn->dname);
1740Sstevel@tonic-gate 				Free(sn);
1750Sstevel@tonic-gate 				return (-1);
1760Sstevel@tonic-gate 			}
1770Sstevel@tonic-gate 
1780Sstevel@tonic-gate 			/* Add to the end of the linked list */
1790Sstevel@tonic-gate 			assert(*sn_next == NULL);
1800Sstevel@tonic-gate 			*sn_next = sn;
1810Sstevel@tonic-gate 			sn_next = &sn->next;
1820Sstevel@tonic-gate 		}
1830Sstevel@tonic-gate 	}
1840Sstevel@tonic-gate 
1850Sstevel@tonic-gate 	return (0);
1860Sstevel@tonic-gate }
1870Sstevel@tonic-gate 
1880Sstevel@tonic-gate static md_drive_desc *
1890Sstevel@tonic-gate rl_to_dd(
1900Sstevel@tonic-gate 	mdsetname_t		*sp,
1910Sstevel@tonic-gate 	md_replicalist_t	*rlp,
1920Sstevel@tonic-gate 	md_error_t		*ep
1930Sstevel@tonic-gate )
1940Sstevel@tonic-gate {
1950Sstevel@tonic-gate 	md_replicalist_t	*rl;
1960Sstevel@tonic-gate 	md_replica_t		*r;
1970Sstevel@tonic-gate 	md_drive_desc		*dd = NULL;
1980Sstevel@tonic-gate 	md_drive_desc		*d;
1990Sstevel@tonic-gate 	int			found;
2000Sstevel@tonic-gate 	md_set_desc		*sd;
2010Sstevel@tonic-gate 	daddr_t			nblks = 0;
2020Sstevel@tonic-gate 
2030Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2040Sstevel@tonic-gate 		return (NULL);
2050Sstevel@tonic-gate 
2060Sstevel@tonic-gate 	/* find the smallest existing replica */
2070Sstevel@tonic-gate 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2080Sstevel@tonic-gate 		r = rl->rl_repp;
2090Sstevel@tonic-gate 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2100Sstevel@tonic-gate 	}
2110Sstevel@tonic-gate 
2120Sstevel@tonic-gate 	if (nblks <= 0)
2130Sstevel@tonic-gate 		nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
2140Sstevel@tonic-gate 
2150Sstevel@tonic-gate 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2160Sstevel@tonic-gate 		r = rl->rl_repp;
2170Sstevel@tonic-gate 
2180Sstevel@tonic-gate 		found = 0;
2190Sstevel@tonic-gate 		for (d = dd; d != NULL; d = d->dd_next) {
2200Sstevel@tonic-gate 			if (strcmp(r->r_namep->drivenamep->cname,
2210Sstevel@tonic-gate 			    d->dd_dnp->cname) == 0) {
2220Sstevel@tonic-gate 				found = 1;
2230Sstevel@tonic-gate 				dd->dd_dbcnt++;
2240Sstevel@tonic-gate 				break;
2250Sstevel@tonic-gate 			}
2260Sstevel@tonic-gate 		}
2270Sstevel@tonic-gate 
2280Sstevel@tonic-gate 		if (! found)
2290Sstevel@tonic-gate 			(void) metadrivedesc_append(&dd, r->r_namep->drivenamep,
2300Sstevel@tonic-gate 			    1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK);
2310Sstevel@tonic-gate 	}
2320Sstevel@tonic-gate 
2330Sstevel@tonic-gate 	return (dd);
2340Sstevel@tonic-gate }
2350Sstevel@tonic-gate 
2360Sstevel@tonic-gate /*
2370Sstevel@tonic-gate  * Exported Entry Points
2380Sstevel@tonic-gate  */
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate set_t
2410Sstevel@tonic-gate get_max_sets(md_error_t *ep)
2420Sstevel@tonic-gate {
2430Sstevel@tonic-gate 
2440Sstevel@tonic-gate 	static set_t		max_sets = 0;
2450Sstevel@tonic-gate 
2460Sstevel@tonic-gate 	if (max_sets == 0)
2470Sstevel@tonic-gate 		if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0)
2480Sstevel@tonic-gate 			return (0);
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate 	return (max_sets);
2510Sstevel@tonic-gate }
2520Sstevel@tonic-gate 
2530Sstevel@tonic-gate int
2540Sstevel@tonic-gate get_max_meds(md_error_t *ep)
2550Sstevel@tonic-gate {
2560Sstevel@tonic-gate 	static int		max_meds = 0;
2570Sstevel@tonic-gate 
2580Sstevel@tonic-gate 	if (max_meds == 0)
2590Sstevel@tonic-gate 		if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0)
2600Sstevel@tonic-gate 			return (0);
2610Sstevel@tonic-gate 
2620Sstevel@tonic-gate 	return (max_meds);
2630Sstevel@tonic-gate }
2640Sstevel@tonic-gate 
2650Sstevel@tonic-gate side_t
2660Sstevel@tonic-gate getmyside(mdsetname_t *sp, md_error_t *ep)
2670Sstevel@tonic-gate {
2680Sstevel@tonic-gate 	md_set_desc		*sd;
2690Sstevel@tonic-gate 	char 			*node = NULL;
2700Sstevel@tonic-gate 	side_t			sideno;
2710Sstevel@tonic-gate 
2720Sstevel@tonic-gate 	if (sp->setno == 0)
2730Sstevel@tonic-gate 		return (0);
2740Sstevel@tonic-gate 
2750Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2760Sstevel@tonic-gate 		return (MD_SIDEWILD);
2770Sstevel@tonic-gate 
2780Sstevel@tonic-gate 	node = mynode();
2790Sstevel@tonic-gate 
2800Sstevel@tonic-gate 	assert(node != NULL);
2810Sstevel@tonic-gate 
2820Sstevel@tonic-gate 	sideno = getnodeside(node, sd);
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	if (sideno != MD_SIDEWILD)
2850Sstevel@tonic-gate 		return (sideno);
2860Sstevel@tonic-gate 
2870Sstevel@tonic-gate 	return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node));
2880Sstevel@tonic-gate }
2890Sstevel@tonic-gate 
2900Sstevel@tonic-gate /*
2910Sstevel@tonic-gate  * get set info from name
2920Sstevel@tonic-gate  */
2930Sstevel@tonic-gate md_set_record *
2940Sstevel@tonic-gate getsetbyname(char *setname, md_error_t *ep)
2950Sstevel@tonic-gate {
2960Sstevel@tonic-gate 	md_set_record		*sr = NULL;
2970Sstevel@tonic-gate 	md_mnset_record		*mnsr = NULL;
2980Sstevel@tonic-gate 	char			*p;
2990Sstevel@tonic-gate 	size_t			len;
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate 	/* get set info from daemon */
3020Sstevel@tonic-gate 	if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1)
3030Sstevel@tonic-gate 		return (NULL);
3040Sstevel@tonic-gate 	if (sr != NULL) {
3050Sstevel@tonic-gate 		/*
3060Sstevel@tonic-gate 		 * Returned record could be for a multi-node set or a
3070Sstevel@tonic-gate 		 * non-multi-node set.
3080Sstevel@tonic-gate 		 */
3090Sstevel@tonic-gate 		if (MD_MNSET_REC(sr)) {
3100Sstevel@tonic-gate 			/*
3110Sstevel@tonic-gate 			 * Record is for a multi-node set.  Reissue call
3120Sstevel@tonic-gate 			 * to get mnset information.  Need to free
3130Sstevel@tonic-gate 			 * record as if a non-multi-node set record since
3140Sstevel@tonic-gate 			 * that is what clnt_getset gave us.  If in
3150Sstevel@tonic-gate 			 * the daemon, don't free since this is a pointer
3160Sstevel@tonic-gate 			 * into the setrecords array.
3170Sstevel@tonic-gate 			 */
3180Sstevel@tonic-gate 			if (! md_in_daemon) {
3190Sstevel@tonic-gate 				sr->sr_flags &= ~MD_SR_MN;
3200Sstevel@tonic-gate 				free_sr(sr);
3210Sstevel@tonic-gate 			}
3220Sstevel@tonic-gate 			if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr,
3230Sstevel@tonic-gate 			    ep) == -1)
3240Sstevel@tonic-gate 				return (NULL);
3250Sstevel@tonic-gate 			if (mnsr != NULL)
3260Sstevel@tonic-gate 				return ((struct md_set_record *)mnsr);
3270Sstevel@tonic-gate 		} else {
3280Sstevel@tonic-gate 			return (sr);
3290Sstevel@tonic-gate 		}
3300Sstevel@tonic-gate 	}
3310Sstevel@tonic-gate 
3320Sstevel@tonic-gate 	/* no such set */
3330Sstevel@tonic-gate 	len = strlen(setname) + 30;
3340Sstevel@tonic-gate 	p = Malloc(len);
3350Sstevel@tonic-gate 	(void) snprintf(p, len, "setname \"%s\"", setname);
3360Sstevel@tonic-gate 	(void) mderror(ep, MDE_NO_SET, p);
3370Sstevel@tonic-gate 	Free(p);
3380Sstevel@tonic-gate 	return (NULL);
3390Sstevel@tonic-gate }
3400Sstevel@tonic-gate 
3410Sstevel@tonic-gate /*
3420Sstevel@tonic-gate  * get set info from number
3430Sstevel@tonic-gate  */
3440Sstevel@tonic-gate md_set_record *
3450Sstevel@tonic-gate getsetbynum(set_t setno, md_error_t *ep)
3460Sstevel@tonic-gate {
3470Sstevel@tonic-gate 	md_set_record		*sr;
3480Sstevel@tonic-gate 	md_mnset_record		*mnsr = NULL;
3490Sstevel@tonic-gate 	char			buf[100];
3500Sstevel@tonic-gate 
3510Sstevel@tonic-gate 	if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1)
3520Sstevel@tonic-gate 		return (NULL);
3530Sstevel@tonic-gate 
3540Sstevel@tonic-gate 	if (sr != NULL) {
3550Sstevel@tonic-gate 		/*
3560Sstevel@tonic-gate 		 * Record is for a multi-node set.  Reissue call
3570Sstevel@tonic-gate 		 * to get mnset information.  Need to free
3580Sstevel@tonic-gate 		 * record as if a non-multi-node set record since
3590Sstevel@tonic-gate 		 * that is what clnt_getset gave us.  If in
3600Sstevel@tonic-gate 		 * the daemon, don't free since this is a pointer
3610Sstevel@tonic-gate 		 * into the setrecords array.
3620Sstevel@tonic-gate 		 */
3630Sstevel@tonic-gate 		if (MD_MNSET_REC(sr)) {
3640Sstevel@tonic-gate 			/*
3650Sstevel@tonic-gate 			 * Record is for a multi-node set.  Reissue call
3660Sstevel@tonic-gate 			 * to get mnset information.
3670Sstevel@tonic-gate 			 */
3680Sstevel@tonic-gate 			if (! md_in_daemon) {
3690Sstevel@tonic-gate 				sr->sr_flags &= ~MD_SR_MN;
3700Sstevel@tonic-gate 				free_sr(sr);
3710Sstevel@tonic-gate 			}
3720Sstevel@tonic-gate 			if (clnt_mngetset(mynode(), NULL, setno, &mnsr,
3730Sstevel@tonic-gate 			    ep) == -1)
3740Sstevel@tonic-gate 				return (NULL);
3750Sstevel@tonic-gate 			if (mnsr != NULL)
3760Sstevel@tonic-gate 				return ((struct md_set_record *)mnsr);
3770Sstevel@tonic-gate 		} else {
3780Sstevel@tonic-gate 			return (sr);
3790Sstevel@tonic-gate 		}
3800Sstevel@tonic-gate 	}
3810Sstevel@tonic-gate 
3820Sstevel@tonic-gate 	(void) sprintf(buf, "setno %u", setno);
3830Sstevel@tonic-gate 	(void) mderror(ep, MDE_NO_SET, buf);
3840Sstevel@tonic-gate 	return (NULL);
3850Sstevel@tonic-gate }
3860Sstevel@tonic-gate 
3870Sstevel@tonic-gate int
3880Sstevel@tonic-gate meta_check_drive_inuse(
3890Sstevel@tonic-gate 	mdsetname_t	*sp,
3900Sstevel@tonic-gate 	mddrivename_t	*dnp,
3910Sstevel@tonic-gate 	int		check_db,
3920Sstevel@tonic-gate 	md_error_t	*ep
3930Sstevel@tonic-gate )
3940Sstevel@tonic-gate {
3950Sstevel@tonic-gate 	mdnamelist_t	*nlp = NULL;
3960Sstevel@tonic-gate 	mdnamelist_t	*p;
3970Sstevel@tonic-gate 	int		rval = 0;
3980Sstevel@tonic-gate 
3990Sstevel@tonic-gate 	/* get all underlying partitions */
4000Sstevel@tonic-gate 	if (meta_getalldevs(sp, &nlp, check_db, ep) != 0)
4010Sstevel@tonic-gate 		return (-1);
4020Sstevel@tonic-gate 
4030Sstevel@tonic-gate 	/* search for drive */
4040Sstevel@tonic-gate 	for (p = nlp; (p != NULL); p = p->next) {
4050Sstevel@tonic-gate 		mdname_t	*np = p->namep;
4060Sstevel@tonic-gate 
4070Sstevel@tonic-gate 		if (strcmp(dnp->cname, np->drivenamep->cname) == 0) {
4080Sstevel@tonic-gate 			rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno,
4090Sstevel@tonic-gate 			    NULL, dnp->cname, sp->setname));
4100Sstevel@tonic-gate 			break;
4110Sstevel@tonic-gate 		}
4120Sstevel@tonic-gate 	}
4130Sstevel@tonic-gate 
4140Sstevel@tonic-gate 	/* cleanup, return success */
4150Sstevel@tonic-gate 	metafreenamelist(nlp);
4160Sstevel@tonic-gate 	return (rval);
4170Sstevel@tonic-gate }
4180Sstevel@tonic-gate 
4190Sstevel@tonic-gate /*
4200Sstevel@tonic-gate  * simple check for ownership
4210Sstevel@tonic-gate  */
4220Sstevel@tonic-gate int
4230Sstevel@tonic-gate meta_check_ownership(mdsetname_t *sp, md_error_t *ep)
4240Sstevel@tonic-gate {
4250Sstevel@tonic-gate 	int			ownset;
4260Sstevel@tonic-gate 	md_set_desc		*sd;
4270Sstevel@tonic-gate 	md_drive_desc		*dd;
4280Sstevel@tonic-gate 	md_replicalist_t	*rlp = NULL;
4290Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
4300Sstevel@tonic-gate 
4310Sstevel@tonic-gate 	if (metaislocalset(sp))
4320Sstevel@tonic-gate 		return (0);
4330Sstevel@tonic-gate 
4340Sstevel@tonic-gate 	ownset = own_set(sp, NULL, TRUE, ep);
4350Sstevel@tonic-gate 	if (! mdisok(ep))
4360Sstevel@tonic-gate 		return (-1);
4370Sstevel@tonic-gate 
4380Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
4390Sstevel@tonic-gate 		return (-1);
4400Sstevel@tonic-gate 
4410Sstevel@tonic-gate 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
4420Sstevel@tonic-gate 	if (! mdisok(ep))
4430Sstevel@tonic-gate 		return (-1);
4440Sstevel@tonic-gate 
4450Sstevel@tonic-gate 	/* If we have no drive descriptors, check for no ownership */
4460Sstevel@tonic-gate 	if (dd == NULL) {
4470Sstevel@tonic-gate 		if (ownset == MD_SETOWNER_NONE)
4480Sstevel@tonic-gate 			return (0);
4490Sstevel@tonic-gate 
4500Sstevel@tonic-gate 		/* If ownership somehow has come to exist, we must clean up */
4510Sstevel@tonic-gate 
4520Sstevel@tonic-gate 		if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
4530Sstevel@tonic-gate 		    &xep) < 0)
4540Sstevel@tonic-gate 			mdclrerror(&xep);
4550Sstevel@tonic-gate 
4560Sstevel@tonic-gate 		if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL)
4570Sstevel@tonic-gate 			if (! mdisok(&xep))
4580Sstevel@tonic-gate 				mdclrerror(&xep);
4590Sstevel@tonic-gate 
4600Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
4610Sstevel@tonic-gate 			if (rel_own_bydd(sp, dd, TRUE, &xep))
4620Sstevel@tonic-gate 				mdclrerror(&xep);
4630Sstevel@tonic-gate 		}
4640Sstevel@tonic-gate 
4650Sstevel@tonic-gate 		if (halt_set(sp, &xep))
4660Sstevel@tonic-gate 			mdclrerror(&xep);
4670Sstevel@tonic-gate 
4680Sstevel@tonic-gate 		metafreereplicalist(rlp);
4690Sstevel@tonic-gate 
4700Sstevel@tonic-gate 		metafreedrivedesc(&dd);
4710Sstevel@tonic-gate 
4720Sstevel@tonic-gate 		return (0);
4730Sstevel@tonic-gate 	}
4740Sstevel@tonic-gate 
4750Sstevel@tonic-gate 	metafreedrivedesc(&sd->sd_drvs);
4760Sstevel@tonic-gate 
4770Sstevel@tonic-gate 	if (ownset == MD_SETOWNER_YES)
4780Sstevel@tonic-gate 		return (0);
4790Sstevel@tonic-gate 
4800Sstevel@tonic-gate 	return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL,
4810Sstevel@tonic-gate 	    sp->setname));
4820Sstevel@tonic-gate }
4830Sstevel@tonic-gate 
4840Sstevel@tonic-gate /*
4850Sstevel@tonic-gate  * simple check for ownership
4860Sstevel@tonic-gate  */
4870Sstevel@tonic-gate int
4880Sstevel@tonic-gate meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep)
4890Sstevel@tonic-gate {
4900Sstevel@tonic-gate 	md_set_desc	*sd;
4910Sstevel@tonic-gate 	md_drive_desc	*dd;
4920Sstevel@tonic-gate 	int		bool;
4930Sstevel@tonic-gate 
4940Sstevel@tonic-gate 	if (metaislocalset(sp))
4950Sstevel@tonic-gate 		return (0);
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
4980Sstevel@tonic-gate 		return (-1);
4990Sstevel@tonic-gate 
5000Sstevel@tonic-gate 	if (getnodeside(hostname, sd) == MD_SIDEWILD)
5010Sstevel@tonic-gate 		return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
5020Sstevel@tonic-gate 		    hostname, NULL, sp->setname));
5030Sstevel@tonic-gate 
5040Sstevel@tonic-gate 	dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep);
5050Sstevel@tonic-gate 	if (! mdisok(ep))
5060Sstevel@tonic-gate 		return (-1);
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 	if (clnt_ownset(hostname, sp, &bool, ep) == -1)
5090Sstevel@tonic-gate 		return (-1);
5100Sstevel@tonic-gate 
5110Sstevel@tonic-gate 	if (dd == NULL)
5120Sstevel@tonic-gate 		return (0);
5130Sstevel@tonic-gate 
5140Sstevel@tonic-gate 	metafreedrivedesc(&sd->sd_drvs);
5150Sstevel@tonic-gate 
5160Sstevel@tonic-gate 	if (bool == TRUE)
5170Sstevel@tonic-gate 		return (0);
5180Sstevel@tonic-gate 
5190Sstevel@tonic-gate 	return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL,
5200Sstevel@tonic-gate 	    sp->setname));
5210Sstevel@tonic-gate }
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate /*
5240Sstevel@tonic-gate  * Function that determines if a node is in the multinode diskset
5250Sstevel@tonic-gate  * membership list.  Calling node passes in node to be checked and
5260Sstevel@tonic-gate  * the nodelist as returned from meta_read_nodelist.  This routine
5270Sstevel@tonic-gate  * anticipates being called many times using the same diskset membership
5280Sstevel@tonic-gate  * list which is why the alloc and free of the diskset membership list
5290Sstevel@tonic-gate  * is left to the calling routine.
5300Sstevel@tonic-gate  * Returns:
5310Sstevel@tonic-gate  *	1 - if a member
5320Sstevel@tonic-gate  *	0 - not a member
5330Sstevel@tonic-gate  */
5340Sstevel@tonic-gate int
5350Sstevel@tonic-gate meta_is_member(
5360Sstevel@tonic-gate 	char				*node_name,
5370Sstevel@tonic-gate 	md_mn_nodeid_t			node_id,
5380Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl
5390Sstevel@tonic-gate )
5400Sstevel@tonic-gate {
5410Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl2;
5420Sstevel@tonic-gate 	int				flag_check_name;
5430Sstevel@tonic-gate 
5440Sstevel@tonic-gate 	if (node_id != 0)
5450Sstevel@tonic-gate 		flag_check_name = 0;
5460Sstevel@tonic-gate 	else if (node_name != NULL)
5470Sstevel@tonic-gate 		flag_check_name = 1;
5480Sstevel@tonic-gate 	else
5490Sstevel@tonic-gate 		return (0);
5500Sstevel@tonic-gate 
5510Sstevel@tonic-gate 	nl2 = nl;
5520Sstevel@tonic-gate 	while (nl2) {
5530Sstevel@tonic-gate 		if (flag_check_name) {
5540Sstevel@tonic-gate 			/* Compare given name against name in member list */
5550Sstevel@tonic-gate 			if (strcmp(nl2->msl_node_name, node_name) == 0)
5560Sstevel@tonic-gate 				break;
5570Sstevel@tonic-gate 		} else {
5580Sstevel@tonic-gate 			/* Compare given nodeid against nodeid in member list */
5590Sstevel@tonic-gate 			if (nl2->msl_node_id == node_id)
5600Sstevel@tonic-gate 				break;
5610Sstevel@tonic-gate 		}
5620Sstevel@tonic-gate 		nl2 = nl2->next;
5630Sstevel@tonic-gate 	}
5640Sstevel@tonic-gate 	/* No match found in member list */
5650Sstevel@tonic-gate 	if (nl2 == NULL) {
5660Sstevel@tonic-gate 		return (0);
5670Sstevel@tonic-gate 	}
5680Sstevel@tonic-gate 	/* Return 1 if node is in member list */
5690Sstevel@tonic-gate 	return (1);
5700Sstevel@tonic-gate }
5710Sstevel@tonic-gate 
5720Sstevel@tonic-gate /*
5730Sstevel@tonic-gate  * meta_getnext_devinfo should go to the host that
5740Sstevel@tonic-gate  * has the device, to return the device name, driver name, minor num.
5750Sstevel@tonic-gate  * We can take the big cheat for now, since it is a requirement
5760Sstevel@tonic-gate  * that the device names and device numbers are the same, and
5770Sstevel@tonic-gate  * just get the info locally.
5780Sstevel@tonic-gate  *
5790Sstevel@tonic-gate  * This routine is very similar to meta_getnextside_devinfo except
5800Sstevel@tonic-gate  * that the specific side to be used is being passed in.
5810Sstevel@tonic-gate  *
5820Sstevel@tonic-gate  * Exit status:
5830Sstevel@tonic-gate  *	 0 - No more side info to return
5840Sstevel@tonic-gate  *	 1 - More side info's to return
5850Sstevel@tonic-gate  *	-1 - An error has been detected
5860Sstevel@tonic-gate  */
5870Sstevel@tonic-gate /*ARGSUSED*/
5880Sstevel@tonic-gate int
5890Sstevel@tonic-gate meta_getside_devinfo(
5900Sstevel@tonic-gate 	mdsetname_t	*sp,		/* for this set */
5910Sstevel@tonic-gate 	char		*bname,		/* local block name (myside) */
5920Sstevel@tonic-gate 	side_t		sideno,		/* sideno */
5930Sstevel@tonic-gate 	char		**ret_bname,	/* block device name of returned side */
5940Sstevel@tonic-gate 	char		**ret_dname,	/* driver name of returned side */
5950Sstevel@tonic-gate 	minor_t		*ret_mnum,	/* minor number of returned side */
5960Sstevel@tonic-gate 	md_error_t	*ep
5970Sstevel@tonic-gate )
5980Sstevel@tonic-gate {
5990Sstevel@tonic-gate 	mdname_t	*np;
6000Sstevel@tonic-gate 
6010Sstevel@tonic-gate 	if (ret_bname != NULL)
6020Sstevel@tonic-gate 		*ret_bname = NULL;
6030Sstevel@tonic-gate 	if (ret_dname != NULL)
6040Sstevel@tonic-gate 		*ret_dname = NULL;
6050Sstevel@tonic-gate 	if (ret_mnum != NULL)
6060Sstevel@tonic-gate 		*ret_mnum = NODEV32;
6070Sstevel@tonic-gate 
6080Sstevel@tonic-gate 
609*1623Stw21770 	if ((np = metaname(&sp, bname, LOGICAL_DEVICE, ep)) == NULL)
6100Sstevel@tonic-gate 		return (-1);
6110Sstevel@tonic-gate 
6120Sstevel@tonic-gate /*
6130Sstevel@tonic-gate  * NOTE (future) - There will be more work here once devids are integrated
6140Sstevel@tonic-gate  * into disksets.  Then the side should be used to find the correct
6150Sstevel@tonic-gate  * host and the b/d names should be gotten from that host.
6160Sstevel@tonic-gate  */
6170Sstevel@tonic-gate 
6180Sstevel@tonic-gate 	/*
6190Sstevel@tonic-gate 	 * Return the side info.
6200Sstevel@tonic-gate 	 */
6210Sstevel@tonic-gate 	if (ret_bname != NULL)
6220Sstevel@tonic-gate 		*ret_bname = Strdup(np->bname);
6230Sstevel@tonic-gate 
6240Sstevel@tonic-gate 	if (ret_dname != NULL) {
6250Sstevel@tonic-gate 		mdcinfo_t	*cinfo;
6260Sstevel@tonic-gate 
6270Sstevel@tonic-gate 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
6280Sstevel@tonic-gate 			return (-1);
6290Sstevel@tonic-gate 
6300Sstevel@tonic-gate 		*ret_dname = Strdup(cinfo->dname);
6310Sstevel@tonic-gate 	}
6320Sstevel@tonic-gate 
6330Sstevel@tonic-gate 	if (ret_mnum != NULL)
6340Sstevel@tonic-gate 		*ret_mnum = meta_getminor(np->dev);
6350Sstevel@tonic-gate 
6360Sstevel@tonic-gate 	return (1);
6370Sstevel@tonic-gate }
6380Sstevel@tonic-gate 
6390Sstevel@tonic-gate /*
6400Sstevel@tonic-gate  * Get the information on the device from the remote node using the devid
6410Sstevel@tonic-gate  * of the disk.
6420Sstevel@tonic-gate  *
6430Sstevel@tonic-gate  * Exit status:
6440Sstevel@tonic-gate  *	 0 - No more side info to return
6450Sstevel@tonic-gate  *	 1 - More side info's to return
6460Sstevel@tonic-gate  *	-1 - An error has been detected
6470Sstevel@tonic-gate  */
6480Sstevel@tonic-gate int
6490Sstevel@tonic-gate meta_getnextside_devinfo(
6500Sstevel@tonic-gate 	mdsetname_t	*sp,		/* for this set */
6510Sstevel@tonic-gate 	char		*bname,		/* local block name (myside) */
6520Sstevel@tonic-gate 	side_t		*sideno,	/* previous sideno & returned sideno */
6530Sstevel@tonic-gate 	char		**ret_bname,	/* block device name of returned side */
6540Sstevel@tonic-gate 	char		**ret_dname,	/* driver name of returned side */
6550Sstevel@tonic-gate 	minor_t		*ret_mnum,	/* minor number of returned side */
6560Sstevel@tonic-gate 	md_error_t	*ep
6570Sstevel@tonic-gate )
6580Sstevel@tonic-gate {
6590Sstevel@tonic-gate 	md_set_desc	*sd;
6600Sstevel@tonic-gate 	int		i;
6610Sstevel@tonic-gate 	mdname_t	*np;
6620Sstevel@tonic-gate 	mddrivename_t	*dnp;
6630Sstevel@tonic-gate 	char		*devidstr = NULL;
6640Sstevel@tonic-gate 	int		devidstrlen;
6650Sstevel@tonic-gate 	md_dev64_t	retdev = NODEV64;
6660Sstevel@tonic-gate 	char		*ret_devname = NULL;
6670Sstevel@tonic-gate 	char		*ret_blkdevname = NULL;
6680Sstevel@tonic-gate 	char		*ret_driver = NULL;
6690Sstevel@tonic-gate 	char		*nodename;
6700Sstevel@tonic-gate 	int		fd;
6710Sstevel@tonic-gate 	int		ret = -1;
6720Sstevel@tonic-gate 	char		*minor_name = NULL;
6730Sstevel@tonic-gate 	md_mnnode_desc	*nd;
6740Sstevel@tonic-gate 
6750Sstevel@tonic-gate 
6760Sstevel@tonic-gate 	if (ret_bname != NULL)
6770Sstevel@tonic-gate 		*ret_bname = NULL;
6780Sstevel@tonic-gate 	if (ret_dname != NULL)
6790Sstevel@tonic-gate 		*ret_dname = NULL;
6800Sstevel@tonic-gate 	if (ret_mnum != NULL)
6810Sstevel@tonic-gate 		*ret_mnum = NODEV32;
6820Sstevel@tonic-gate 
6830Sstevel@tonic-gate 	if (metaislocalset(sp)) {
6840Sstevel@tonic-gate 		/* no more sides - we are done */
6850Sstevel@tonic-gate 		if (*sideno != MD_SIDEWILD)
6860Sstevel@tonic-gate 			return (0);
6870Sstevel@tonic-gate 
6880Sstevel@tonic-gate 		/* First time through -  set up return sideno */
6890Sstevel@tonic-gate 		*sideno = 0;
6900Sstevel@tonic-gate 	} else {
6910Sstevel@tonic-gate 
6920Sstevel@tonic-gate 		/*
6930Sstevel@tonic-gate 		 * Find the next sideno, starting after the one given.
6940Sstevel@tonic-gate 		 */
6950Sstevel@tonic-gate 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
6960Sstevel@tonic-gate 			return (-1);
6970Sstevel@tonic-gate 
6980Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
6990Sstevel@tonic-gate 			nd = sd->sd_nodelist;
7000Sstevel@tonic-gate 			if ((*sideno == MD_SIDEWILD) &&
7010Sstevel@tonic-gate 			    (nd != (struct md_mnnode_desc *)NULL)) {
7020Sstevel@tonic-gate 				*sideno = nd->nd_nodeid;
7030Sstevel@tonic-gate 			} else {
7040Sstevel@tonic-gate 				while (nd) {
7050Sstevel@tonic-gate 					/*
7060Sstevel@tonic-gate 					 * Found given sideno, now find
7070Sstevel@tonic-gate 					 * next sideno, if there is one.
7080Sstevel@tonic-gate 					 */
7090Sstevel@tonic-gate 					if ((*sideno == nd->nd_nodeid) &&
7100Sstevel@tonic-gate 					    (nd->nd_next !=
7110Sstevel@tonic-gate 					    (struct md_mnnode_desc *)NULL)) {
7120Sstevel@tonic-gate 						*sideno =
7130Sstevel@tonic-gate 						    nd->nd_next->nd_nodeid;
7140Sstevel@tonic-gate 						break;
7150Sstevel@tonic-gate 					}
7160Sstevel@tonic-gate 					nd = nd->nd_next;
7170Sstevel@tonic-gate 				}
7180Sstevel@tonic-gate 				if (nd == NULL) {
7190Sstevel@tonic-gate 					return (0);
7200Sstevel@tonic-gate 				}
7210Sstevel@tonic-gate 			}
7220Sstevel@tonic-gate 			if (*sideno == MD_SIDEWILD)
7230Sstevel@tonic-gate 				return (0);
7240Sstevel@tonic-gate 		} else {
7250Sstevel@tonic-gate 			for (i = (*sideno)+1; i < MD_MAXSIDES; i++)
7260Sstevel@tonic-gate 				/* Find next full slot */
7270Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] != '\0')
7280Sstevel@tonic-gate 					break;
7290Sstevel@tonic-gate 
7300Sstevel@tonic-gate 			/* No more sides - we are done */
7310Sstevel@tonic-gate 			if (i == MD_MAXSIDES)
7320Sstevel@tonic-gate 				return (0);
7330Sstevel@tonic-gate 
7340Sstevel@tonic-gate 			/* Set up the return sideno */
7350Sstevel@tonic-gate 			*sideno = i;
7360Sstevel@tonic-gate 			nodename = (char *)sd->sd_nodes[i];
7370Sstevel@tonic-gate 		}
7380Sstevel@tonic-gate 	}
7390Sstevel@tonic-gate 
7400Sstevel@tonic-gate 	/*
7410Sstevel@tonic-gate 	 * Need to pass the node the devid of the disk and get it to
7420Sstevel@tonic-gate 	 * send back the details of the disk from that side.
7430Sstevel@tonic-gate 	 */
744*1623Stw21770 	if ((np = metaname(&sp, bname, UNKNOWN, ep)) == NULL)
7450Sstevel@tonic-gate 		return (-1);
7460Sstevel@tonic-gate 
7470Sstevel@tonic-gate 	dnp = np->drivenamep;
7480Sstevel@tonic-gate 
7490Sstevel@tonic-gate 	/*
7500Sstevel@tonic-gate 	 * By default, set up the parameters so that they are copied out.
7510Sstevel@tonic-gate 	 */
7520Sstevel@tonic-gate 	if (ret_bname != NULL)
7530Sstevel@tonic-gate 		*ret_bname = Strdup(np->bname);
7540Sstevel@tonic-gate 
7550Sstevel@tonic-gate 	if (ret_dname != NULL) {
7560Sstevel@tonic-gate 		mdcinfo_t	*cinfo;
7570Sstevel@tonic-gate 
7580Sstevel@tonic-gate 		if ((cinfo = metagetcinfo(np, ep)) == NULL)
7590Sstevel@tonic-gate 			return (-1);
7600Sstevel@tonic-gate 
7610Sstevel@tonic-gate 		*ret_dname = Strdup(cinfo->dname);
7620Sstevel@tonic-gate 	}
7630Sstevel@tonic-gate 
7640Sstevel@tonic-gate 	if (ret_mnum != NULL)
7650Sstevel@tonic-gate 		*ret_mnum = meta_getminor(np->dev);
7660Sstevel@tonic-gate 
7670Sstevel@tonic-gate 	/*
7680Sstevel@tonic-gate 	 * Try some optimization. If this is the local set or the device
7690Sstevel@tonic-gate 	 * is a metadevice then just copy the information. If the device
7700Sstevel@tonic-gate 	 * does not have a devid (due to not having a minor name) then
7710Sstevel@tonic-gate 	 * fall back to the pre-devid behaviour of copying the information
7720Sstevel@tonic-gate 	 * on the device: this is okay because the sanity checks before this
7730Sstevel@tonic-gate 	 * call would have found any issues with the device. If it's a
7740Sstevel@tonic-gate 	 * multi-node diskset also just return ie. copy.
7750Sstevel@tonic-gate 	 */
7760Sstevel@tonic-gate 	if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) ||
7770Sstevel@tonic-gate 	    (MD_MNSET_DESC(sd)))
7780Sstevel@tonic-gate 		return (1);
7790Sstevel@tonic-gate 
7800Sstevel@tonic-gate 	if (np->minor_name == (char *)NULL) {
7810Sstevel@tonic-gate 		/*
7820Sstevel@tonic-gate 		 * Have to get the minor name then. The slice should exist
7830Sstevel@tonic-gate 		 * on the disk because it will have already been repartitioned
7840Sstevel@tonic-gate 		 * up prior to getting to this point.
7850Sstevel@tonic-gate 		 */
7860Sstevel@tonic-gate 		if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) {
7870Sstevel@tonic-gate 			(void) mdsyserror(ep, errno, np->bname);
7880Sstevel@tonic-gate 			return (-1);
7890Sstevel@tonic-gate 		}
7900Sstevel@tonic-gate 		(void) devid_get_minor_name(fd, &minor_name);
7910Sstevel@tonic-gate 		np->minor_name = Strdup(minor_name);
7920Sstevel@tonic-gate 		devid_str_free(minor_name);
7930Sstevel@tonic-gate 		(void) close(fd);
7940Sstevel@tonic-gate 	}
7950Sstevel@tonic-gate 
7960Sstevel@tonic-gate 	/* allocate extra space for "/" and NULL hence +2 */
7970Sstevel@tonic-gate 	devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2;
7980Sstevel@tonic-gate 	devidstr = (char *)Malloc(devidstrlen);
7990Sstevel@tonic-gate 
8000Sstevel@tonic-gate 	/*
8010Sstevel@tonic-gate 	 * As a minor name is supplied then the ret_devname will be
8020Sstevel@tonic-gate 	 * appropriate to that minor_name and in this case it will be
8030Sstevel@tonic-gate 	 * a block device ie /dev/dsk.
8040Sstevel@tonic-gate 	 */
8050Sstevel@tonic-gate 	(void) snprintf(devidstr, devidstrlen,
8060Sstevel@tonic-gate 		"%s/%s", dnp->devid, np->minor_name);
8070Sstevel@tonic-gate 
8080Sstevel@tonic-gate 	ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev,
8090Sstevel@tonic-gate 	    np->bname, &ret_devname, &ret_driver, ep);
8100Sstevel@tonic-gate 
8110Sstevel@tonic-gate 	Free(devidstr);
8120Sstevel@tonic-gate 
8130Sstevel@tonic-gate 	/*
8140Sstevel@tonic-gate 	 * If the other side is not running device id in disksets,
8150Sstevel@tonic-gate 	 * 'ret' is set to ENOTSUP in which case we fallback to
8160Sstevel@tonic-gate 	 * the existing behaviour
8170Sstevel@tonic-gate 	 */
8180Sstevel@tonic-gate 	if (ret == ENOTSUP)
8190Sstevel@tonic-gate 		return (1);
8200Sstevel@tonic-gate 	else if (ret == -1)
8210Sstevel@tonic-gate 		return (-1);
8220Sstevel@tonic-gate 
8230Sstevel@tonic-gate 	/*
8240Sstevel@tonic-gate 	 * ret_devname comes from the rpc call and is a
8250Sstevel@tonic-gate 	 * raw device name. We need to make this into a
8260Sstevel@tonic-gate 	 * block device via blkname for further processing.
8270Sstevel@tonic-gate 	 * Unfortunately, when our device id isn't found in
8280Sstevel@tonic-gate 	 * the system, the rpc call will return a " " in
8290Sstevel@tonic-gate 	 * ret_devname in which case we need to fill that in
8300Sstevel@tonic-gate 	 * as ret_blkname because blkname of " " returns NULL.
8310Sstevel@tonic-gate 	 */
8320Sstevel@tonic-gate 	if (ret_bname != NULL && ret_devname != NULL) {
8330Sstevel@tonic-gate 		ret_blkdevname = blkname(ret_devname);
8340Sstevel@tonic-gate 		if (ret_blkdevname == NULL)
8350Sstevel@tonic-gate 			*ret_bname = Strdup(ret_devname);
8360Sstevel@tonic-gate 		else
8370Sstevel@tonic-gate 			*ret_bname = Strdup(ret_blkdevname);
8380Sstevel@tonic-gate 	}
8390Sstevel@tonic-gate 
8400Sstevel@tonic-gate 	if (ret_dname != NULL && ret_driver != NULL)
8410Sstevel@tonic-gate 		*ret_dname = Strdup(ret_driver);
8420Sstevel@tonic-gate 
8430Sstevel@tonic-gate 	if (ret_mnum != NULL)
8440Sstevel@tonic-gate 		*ret_mnum = meta_getminor(retdev);
8450Sstevel@tonic-gate 
8460Sstevel@tonic-gate 	return (1);
8470Sstevel@tonic-gate }
8480Sstevel@tonic-gate 
8490Sstevel@tonic-gate int
8500Sstevel@tonic-gate meta_is_drive_in_anyset(
8510Sstevel@tonic-gate 	mddrivename_t	*dnp,
8520Sstevel@tonic-gate 	mdsetname_t	**spp,
8530Sstevel@tonic-gate 	int		bypass_daemon,
8540Sstevel@tonic-gate 	md_error_t 	*ep
8550Sstevel@tonic-gate )
8560Sstevel@tonic-gate {
8570Sstevel@tonic-gate 	set_t		setno;
8580Sstevel@tonic-gate 	mdsetname_t	*this_sp;
8590Sstevel@tonic-gate 	int		is_it;
8600Sstevel@tonic-gate 	set_t		max_sets;
8610Sstevel@tonic-gate 
8620Sstevel@tonic-gate 	if ((max_sets = get_max_sets(ep)) == 0)
8630Sstevel@tonic-gate 		return (-1);
8640Sstevel@tonic-gate 
8650Sstevel@tonic-gate 	assert(spp != NULL);
8660Sstevel@tonic-gate 	*spp = NULL;
8670Sstevel@tonic-gate 
8680Sstevel@tonic-gate 	for (setno = 1; setno < max_sets; setno++) {
8690Sstevel@tonic-gate 		if (!bypass_daemon) {
8700Sstevel@tonic-gate 			if ((this_sp = metasetnosetname(setno, ep)) == NULL) {
8710Sstevel@tonic-gate 				if (mdismddberror(ep, MDE_DB_NODB)) {
8720Sstevel@tonic-gate 					mdclrerror(ep);
8730Sstevel@tonic-gate 					return (0);
8740Sstevel@tonic-gate 				}
8750Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
8760Sstevel@tonic-gate 					mdclrerror(ep);
8770Sstevel@tonic-gate 					continue;
8780Sstevel@tonic-gate 				}
8790Sstevel@tonic-gate 				return (-1);
8800Sstevel@tonic-gate 			}
8810Sstevel@tonic-gate 		} else
8820Sstevel@tonic-gate 			this_sp = metafakesetname(setno, NULL);
8830Sstevel@tonic-gate 
8840Sstevel@tonic-gate 		if ((is_it = meta_is_drive_in_thisset(this_sp, dnp,
8850Sstevel@tonic-gate 		    bypass_daemon, ep)) == -1) {
8860Sstevel@tonic-gate 			if (mdiserror(ep, MDE_NO_SET)) {
8870Sstevel@tonic-gate 				mdclrerror(ep);
8880Sstevel@tonic-gate 				continue;
8890Sstevel@tonic-gate 			}
8900Sstevel@tonic-gate 			return (-1);
8910Sstevel@tonic-gate 		}
8920Sstevel@tonic-gate 		if (is_it) {
8930Sstevel@tonic-gate 			*spp = this_sp;
8940Sstevel@tonic-gate 			return (0);
8950Sstevel@tonic-gate 		}
8960Sstevel@tonic-gate 	}
8970Sstevel@tonic-gate 	return (0);
8980Sstevel@tonic-gate }
8990Sstevel@tonic-gate 
9000Sstevel@tonic-gate int
9010Sstevel@tonic-gate meta_is_drive_in_thisset(
9020Sstevel@tonic-gate 	mdsetname_t	*sp,
9030Sstevel@tonic-gate 	mddrivename_t	*dnp,
9040Sstevel@tonic-gate 	int		bypass_daemon,
9050Sstevel@tonic-gate 	md_error_t	*ep
9060Sstevel@tonic-gate )
9070Sstevel@tonic-gate {
9080Sstevel@tonic-gate 	md_drive_desc	*dd, *p;
9090Sstevel@tonic-gate 
9100Sstevel@tonic-gate 	if (bypass_daemon)
9110Sstevel@tonic-gate 		dd = dr2drivedesc(sp, MD_SIDEWILD,
9120Sstevel@tonic-gate 		    (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep);
9130Sstevel@tonic-gate 	else
9140Sstevel@tonic-gate 		dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
9150Sstevel@tonic-gate 
9160Sstevel@tonic-gate 	if (dd == NULL) {
9170Sstevel@tonic-gate 		if (! mdisok(ep))
9180Sstevel@tonic-gate 			return (-1);
9190Sstevel@tonic-gate 		return (0);
9200Sstevel@tonic-gate 	}
9210Sstevel@tonic-gate 
9220Sstevel@tonic-gate 
9230Sstevel@tonic-gate 	for (p = dd; p != NULL; p = p->dd_next)
9240Sstevel@tonic-gate 		if (strcmp(p->dd_dnp->cname, dnp->cname) == 0)
9250Sstevel@tonic-gate 			return (1);
9260Sstevel@tonic-gate 	return (0);
9270Sstevel@tonic-gate }
9280Sstevel@tonic-gate 
9290Sstevel@tonic-gate int
9300Sstevel@tonic-gate meta_set_balance(
9310Sstevel@tonic-gate 	mdsetname_t		*sp,
9320Sstevel@tonic-gate 	md_error_t		*ep
9330Sstevel@tonic-gate )
9340Sstevel@tonic-gate {
9350Sstevel@tonic-gate 	md_set_desc		*sd;
9360Sstevel@tonic-gate 	md_drive_desc		*dd, *curdd;
9370Sstevel@tonic-gate 	daddr_t			dbsize;
9380Sstevel@tonic-gate 	daddr_t			nblks;
9390Sstevel@tonic-gate 	int			i;
9400Sstevel@tonic-gate 	int			rval = 0;
9410Sstevel@tonic-gate 	sigset_t		oldsigs;
9420Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
9430Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
9440Sstevel@tonic-gate 	md_mnnode_desc		*nd;
9450Sstevel@tonic-gate 	int			suspend1_flag = 0;
9460Sstevel@tonic-gate 
9470Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
9480Sstevel@tonic-gate 		return (-1);
9490Sstevel@tonic-gate 
9500Sstevel@tonic-gate 	dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
9510Sstevel@tonic-gate 
9520Sstevel@tonic-gate 	/* Make sure we own the set */
9530Sstevel@tonic-gate 	if (meta_check_ownership(sp, ep) != 0)
9540Sstevel@tonic-gate 		return (-1);
9550Sstevel@tonic-gate 
9560Sstevel@tonic-gate 	/* END CHECK CODE */
9570Sstevel@tonic-gate 
9580Sstevel@tonic-gate 	/*
9590Sstevel@tonic-gate 	 * Get drive descriptors for the drives that are currently in the set.
9600Sstevel@tonic-gate 	 */
9610Sstevel@tonic-gate 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
9620Sstevel@tonic-gate 
9630Sstevel@tonic-gate 	if (! mdisok(ep))
9640Sstevel@tonic-gate 		return (-1);
9650Sstevel@tonic-gate 
9660Sstevel@tonic-gate 	/* Find the minimum replica size in use is or use the default */
9670Sstevel@tonic-gate 	if ((nblks = meta_db_minreplica(sp, ep)) < 0)
9680Sstevel@tonic-gate 		mdclrerror(ep);
9690Sstevel@tonic-gate 	else
9700Sstevel@tonic-gate 		dbsize = nblks;	/* adjust replica size */
9710Sstevel@tonic-gate 
9720Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
9730Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
9740Sstevel@tonic-gate 		mdclrerror(&xep);
9750Sstevel@tonic-gate 
9760Sstevel@tonic-gate 	/*
9770Sstevel@tonic-gate 	 * Lock the set on current set members.
9780Sstevel@tonic-gate 	 * For MN diskset lock_set and SUSPEND are used to protect against
9790Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
9800Sstevel@tonic-gate 	 */
9810Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
9820Sstevel@tonic-gate 		nd = sd->sd_nodelist;
9830Sstevel@tonic-gate 		while (nd) {
9840Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
9850Sstevel@tonic-gate 				nd = nd->nd_next;
9860Sstevel@tonic-gate 				continue;
9870Sstevel@tonic-gate 			}
9880Sstevel@tonic-gate 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
9890Sstevel@tonic-gate 				rval = -1;
9900Sstevel@tonic-gate 				goto out;
9910Sstevel@tonic-gate 			}
9920Sstevel@tonic-gate 			nd = nd->nd_next;
9930Sstevel@tonic-gate 		}
9940Sstevel@tonic-gate 		/*
9950Sstevel@tonic-gate 		 * Lock out other meta* commands by suspending
9960Sstevel@tonic-gate 		 * class 1 messages across the diskset.
9970Sstevel@tonic-gate 		 */
9980Sstevel@tonic-gate 		nd = sd->sd_nodelist;
9990Sstevel@tonic-gate 		while (nd) {
10000Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
10010Sstevel@tonic-gate 				nd = nd->nd_next;
10020Sstevel@tonic-gate 				continue;
10030Sstevel@tonic-gate 			}
10040Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename,
10050Sstevel@tonic-gate 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
10060Sstevel@tonic-gate 			    MD_MSCF_NO_FLAGS, ep)) {
10070Sstevel@tonic-gate 				rval = -1;
10080Sstevel@tonic-gate 				goto out;
10090Sstevel@tonic-gate 			}
10100Sstevel@tonic-gate 			suspend1_flag = 1;
10110Sstevel@tonic-gate 			nd = nd->nd_next;
10120Sstevel@tonic-gate 		}
10130Sstevel@tonic-gate 	} else {
10140Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
10150Sstevel@tonic-gate 			/* Skip empty slots */
10160Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0') continue;
10170Sstevel@tonic-gate 
10180Sstevel@tonic-gate 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
10190Sstevel@tonic-gate 				rval = -1;
10200Sstevel@tonic-gate 				goto out;
10210Sstevel@tonic-gate 			}
10220Sstevel@tonic-gate 		}
10230Sstevel@tonic-gate 	}
10240Sstevel@tonic-gate 
10250Sstevel@tonic-gate 	/* We are not adding or deleting any drives, just balancing */
10260Sstevel@tonic-gate 	dd = NULL;
10270Sstevel@tonic-gate 
10280Sstevel@tonic-gate 	/*
10290Sstevel@tonic-gate 	 * Balance the DB's according to the list of existing drives and the
10300Sstevel@tonic-gate 	 * list of added drives.
10310Sstevel@tonic-gate 	 */
10320Sstevel@tonic-gate 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
10330Sstevel@tonic-gate 		goto out;
10340Sstevel@tonic-gate 
10350Sstevel@tonic-gate out:
10360Sstevel@tonic-gate 	/*
10370Sstevel@tonic-gate 	 * Unlock diskset by resuming class 1 messages across the diskset.
10380Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
10390Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
10400Sstevel@tonic-gate 	 */
10410Sstevel@tonic-gate 	if (suspend1_flag) {
10420Sstevel@tonic-gate 		nd = sd->sd_nodelist;
10430Sstevel@tonic-gate 		while (nd) {
10440Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
10450Sstevel@tonic-gate 				nd = nd->nd_next;
10460Sstevel@tonic-gate 				continue;
10470Sstevel@tonic-gate 			}
10480Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
10490Sstevel@tonic-gate 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
10500Sstevel@tonic-gate 				/*
10510Sstevel@tonic-gate 				 * We are here because we failed to resume
10520Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
10530Sstevel@tonic-gate 				 * an error from the previous call
10540Sstevel@tonic-gate 				 * (meta_db_balance). If the previous call
10550Sstevel@tonic-gate 				 * did fail,  we capture that error and
10560Sstevel@tonic-gate 				 * generate a perror withthe string,
10570Sstevel@tonic-gate 				 * "Unable to resume...".
10580Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
10590Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
10600Sstevel@tonic-gate 				 * clobbered.
10610Sstevel@tonic-gate 				 */
10620Sstevel@tonic-gate 				if (rval == 0)
10630Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
10640Sstevel@tonic-gate 				else
10650Sstevel@tonic-gate 					mdclrerror(&xep);
10660Sstevel@tonic-gate 				rval = -1;
10670Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
10680Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd."));
10690Sstevel@tonic-gate 			}
10700Sstevel@tonic-gate 			nd = nd->nd_next;
10710Sstevel@tonic-gate 		}
10720Sstevel@tonic-gate 	}
10730Sstevel@tonic-gate 
10740Sstevel@tonic-gate 	/* Unlock the set */
10750Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
10760Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
10770Sstevel@tonic-gate 		nd = sd->sd_nodelist;
10780Sstevel@tonic-gate 		while (nd) {
10790Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
10800Sstevel@tonic-gate 				nd = nd->nd_next;
10810Sstevel@tonic-gate 				continue;
10820Sstevel@tonic-gate 			}
10830Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
10840Sstevel@tonic-gate 				if (rval == 0)
10850Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
10860Sstevel@tonic-gate 				else
10870Sstevel@tonic-gate 					mdclrerror(&xep);
10880Sstevel@tonic-gate 				rval = -1;
10890Sstevel@tonic-gate 			}
10900Sstevel@tonic-gate 			nd = nd->nd_next;
10910Sstevel@tonic-gate 		}
10920Sstevel@tonic-gate 	} else {
10930Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
10940Sstevel@tonic-gate 			/* Skip empty slots */
10950Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
10960Sstevel@tonic-gate 				continue;
10970Sstevel@tonic-gate 
10980Sstevel@tonic-gate 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
10990Sstevel@tonic-gate 				if (rval == 0)
11000Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
11010Sstevel@tonic-gate 				rval = -1;
11020Sstevel@tonic-gate 			}
11030Sstevel@tonic-gate 		}
11040Sstevel@tonic-gate 	}
11050Sstevel@tonic-gate 
11060Sstevel@tonic-gate 	/* release signals back to what they were on entry */
11070Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
11080Sstevel@tonic-gate 		mdclrerror(&xep);
11090Sstevel@tonic-gate 
11100Sstevel@tonic-gate 	cl_set_setkey(NULL);
11110Sstevel@tonic-gate 
11120Sstevel@tonic-gate 	metaflushsetname(sp);
11130Sstevel@tonic-gate 
11140Sstevel@tonic-gate 	return (rval);
11150Sstevel@tonic-gate }
11160Sstevel@tonic-gate 
11170Sstevel@tonic-gate int
11180Sstevel@tonic-gate meta_set_destroy(
11190Sstevel@tonic-gate 	mdsetname_t	*sp,
11200Sstevel@tonic-gate 	int		lock_set,
11210Sstevel@tonic-gate 	md_error_t	*ep
11220Sstevel@tonic-gate )
11230Sstevel@tonic-gate {
11240Sstevel@tonic-gate 	int		i;
11250Sstevel@tonic-gate 	med_rec_t	medr;
11260Sstevel@tonic-gate 	md_set_desc	*sd;
11270Sstevel@tonic-gate 	md_drive_desc	*dd, *p, *p1;
11280Sstevel@tonic-gate 	mddrivename_t	*dnp;
11290Sstevel@tonic-gate 	mdname_t	*np;
11300Sstevel@tonic-gate 	mdnamelist_t	*nlp = NULL;
11310Sstevel@tonic-gate 	int		num_users = 0;
11320Sstevel@tonic-gate 	int		has_set;
11330Sstevel@tonic-gate 	side_t		mysideno;
11340Sstevel@tonic-gate 	sigset_t	oldsigs;
11350Sstevel@tonic-gate 	md_error_t	xep = mdnullerror;
11360Sstevel@tonic-gate 	md_setkey_t	*cl_sk;
11370Sstevel@tonic-gate 	int		rval = 0;
11380Sstevel@tonic-gate 	int		delete_end = 1;
11390Sstevel@tonic-gate 
11400Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
11410Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, ep) < 0)
11420Sstevel@tonic-gate 		return (-1);
11430Sstevel@tonic-gate 
11440Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
11450Sstevel@tonic-gate 		if (! mdisok(ep))
11460Sstevel@tonic-gate 			rval = -1;
11470Sstevel@tonic-gate 		goto out;
11480Sstevel@tonic-gate 	}
11490Sstevel@tonic-gate 
11500Sstevel@tonic-gate 	/*
11510Sstevel@tonic-gate 	 * meta_set_destroy should not be called for a MN diskset.
11520Sstevel@tonic-gate 	 * This routine destroys a set without communicating this information
11530Sstevel@tonic-gate 	 * to the other nodes which would lead to an inconsistency in
11540Sstevel@tonic-gate 	 * the MN diskset.
11550Sstevel@tonic-gate 	 */
11560Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
11570Sstevel@tonic-gate 		rval = -1;
11580Sstevel@tonic-gate 		goto out;
11590Sstevel@tonic-gate 	}
11600Sstevel@tonic-gate 
11610Sstevel@tonic-gate 	/* Continue if a traditional diskset */
11620Sstevel@tonic-gate 
11630Sstevel@tonic-gate 	/*
11640Sstevel@tonic-gate 	 * Check to see who has the set.  If we are not the last user of the
11650Sstevel@tonic-gate 	 * set, we will not touch the replicas.
11660Sstevel@tonic-gate 	 */
11670Sstevel@tonic-gate 	for (i = 0; i < MD_MAXSIDES; i++) {
11680Sstevel@tonic-gate 		/* Skip empty slots */
11690Sstevel@tonic-gate 		if (sd->sd_nodes[i][0] == '\0')
11700Sstevel@tonic-gate 			continue;
11710Sstevel@tonic-gate 
11720Sstevel@tonic-gate 		has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ,
11730Sstevel@tonic-gate 		    ep);
11740Sstevel@tonic-gate 
11750Sstevel@tonic-gate 		if (has_set < 0) {
11760Sstevel@tonic-gate 			mdclrerror(ep);
11770Sstevel@tonic-gate 		} else
11780Sstevel@tonic-gate 			num_users++;
11790Sstevel@tonic-gate 	}
11800Sstevel@tonic-gate 
11810Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) {
11820Sstevel@tonic-gate 		if (! mdisok(ep)) {
11830Sstevel@tonic-gate 			rval = -1;
11840Sstevel@tonic-gate 			goto out;
11850Sstevel@tonic-gate 		}
11860Sstevel@tonic-gate 	}
11870Sstevel@tonic-gate 
11880Sstevel@tonic-gate 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
11890Sstevel@tonic-gate 		rval = -1;
11900Sstevel@tonic-gate 		goto out;
11910Sstevel@tonic-gate 	}
11920Sstevel@tonic-gate 
11930Sstevel@tonic-gate 	if (lock_set == TRUE) {
11940Sstevel@tonic-gate 		/* Lock the set on our side */
11950Sstevel@tonic-gate 		if (clnt_lock_set(mynode(), sp, ep)) {
11960Sstevel@tonic-gate 			rval = -1;
11970Sstevel@tonic-gate 			goto out;
11980Sstevel@tonic-gate 		}
11990Sstevel@tonic-gate 	}
12000Sstevel@tonic-gate 
12010Sstevel@tonic-gate 	/*
12020Sstevel@tonic-gate 	 * A traditional diskset has no diskset stale information to send
12030Sstevel@tonic-gate 	 * since there can only be one owner node at a time.
12040Sstevel@tonic-gate 	 */
12050Sstevel@tonic-gate 	if (snarf_set(sp, FALSE, ep))
12060Sstevel@tonic-gate 		mdclrerror(ep);
12070Sstevel@tonic-gate 
12080Sstevel@tonic-gate 	if (dd != NULL) {
12090Sstevel@tonic-gate 		/*
12100Sstevel@tonic-gate 		 * Make sure that no drives are in use as parts of metadrives
12110Sstevel@tonic-gate 		 * or hot spare pools, this is one of the few error conditions
12120Sstevel@tonic-gate 		 * that will stop this routine, unless the environment has
12130Sstevel@tonic-gate 		 * META_DESTROY_SET_OK set, in which case, the operation will
12140Sstevel@tonic-gate 		 * proceed.
12150Sstevel@tonic-gate 		 */
12160Sstevel@tonic-gate 		if (getenv("META_DESTROY_SET_OK") == NULL) {
12170Sstevel@tonic-gate 			for (p = dd; p != NULL; p = p->dd_next) {
12180Sstevel@tonic-gate 				dnp = p->dd_dnp;
12190Sstevel@tonic-gate 
12200Sstevel@tonic-gate 				i = meta_check_drive_inuse(sp, dnp, FALSE, ep);
12210Sstevel@tonic-gate 				if (i == -1) {
12220Sstevel@tonic-gate 					/* need xep - wire calls clear error */
12230Sstevel@tonic-gate 					i = metaget_setownership(sp, &xep);
12240Sstevel@tonic-gate 					if (i == -1) {
12250Sstevel@tonic-gate 						rval = -1;
12260Sstevel@tonic-gate 						goto out;
12270Sstevel@tonic-gate 					}
12280Sstevel@tonic-gate 
12290Sstevel@tonic-gate 					mysideno = getmyside(sp, &xep);
12300Sstevel@tonic-gate 
12310Sstevel@tonic-gate 					if (mysideno == MD_SIDEWILD) {
12320Sstevel@tonic-gate 						rval = -1;
12330Sstevel@tonic-gate 						goto out;
12340Sstevel@tonic-gate 					}
12350Sstevel@tonic-gate 
12360Sstevel@tonic-gate 					if (sd->sd_isown[mysideno] == FALSE)
12370Sstevel@tonic-gate 						if (halt_set(sp, &xep)) {
12380Sstevel@tonic-gate 							rval = -1;
12390Sstevel@tonic-gate 							goto out;
12400Sstevel@tonic-gate 						}
12410Sstevel@tonic-gate 
12420Sstevel@tonic-gate 					rval = -1;
12430Sstevel@tonic-gate 					goto out;
12440Sstevel@tonic-gate 				}
12450Sstevel@tonic-gate 			}
12460Sstevel@tonic-gate 		}
12470Sstevel@tonic-gate 
12480Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
12490Sstevel@tonic-gate 			/* Skip empty slots */
12500Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
12510Sstevel@tonic-gate 				continue;
12520Sstevel@tonic-gate 
12530Sstevel@tonic-gate 			/* Skip non local nodes */
12540Sstevel@tonic-gate 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
12550Sstevel@tonic-gate 				continue;
12560Sstevel@tonic-gate 
12570Sstevel@tonic-gate 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep))
12580Sstevel@tonic-gate 				mdclrerror(ep);
12590Sstevel@tonic-gate 		}
12600Sstevel@tonic-gate 
12610Sstevel@tonic-gate 		/*
12620Sstevel@tonic-gate 		 * Go thru each drive and individually delete the replicas.
12630Sstevel@tonic-gate 		 * This way we can ignore individual errors.
12640Sstevel@tonic-gate 		 */
12650Sstevel@tonic-gate 		for (p = dd; p != NULL; p = p->dd_next) {
12660Sstevel@tonic-gate 			uint_t	rep_slice;
12670Sstevel@tonic-gate 
12680Sstevel@tonic-gate 			dnp = p->dd_dnp;
12690Sstevel@tonic-gate 			if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) ||
12700Sstevel@tonic-gate 			    (((np = metaslicename(dnp, rep_slice, ep))
12710Sstevel@tonic-gate 				== NULL) &&
12720Sstevel@tonic-gate 				((np = metaslicename(dnp, MD_SLICE0, ep))
12730Sstevel@tonic-gate 				    == NULL))) {
12740Sstevel@tonic-gate 				rval = -1;
12750Sstevel@tonic-gate 				goto out;
12760Sstevel@tonic-gate 			}
12770Sstevel@tonic-gate 
12780Sstevel@tonic-gate 			if ((np = metaslicename(dnp,
12790Sstevel@tonic-gate 			    rep_slice, ep)) == NULL) {
12800Sstevel@tonic-gate 				if ((np = metaslicename(dnp,
12810Sstevel@tonic-gate 				    MD_SLICE0, ep)) == NULL) {
12820Sstevel@tonic-gate 					rval = -1;
12830Sstevel@tonic-gate 					goto out;
12840Sstevel@tonic-gate 				}
12850Sstevel@tonic-gate 				mdclrerror(ep);
12860Sstevel@tonic-gate 			}
12870Sstevel@tonic-gate 
12880Sstevel@tonic-gate 			/* Yes this is UGLY!!! */
12890Sstevel@tonic-gate 			p1 = p->dd_next;
12900Sstevel@tonic-gate 			p->dd_next = NULL;
12910Sstevel@tonic-gate 			if (rel_own_bydd(sp, p, FALSE, ep))
12920Sstevel@tonic-gate 				mdclrerror(ep);
12930Sstevel@tonic-gate 			p->dd_next = p1;
12940Sstevel@tonic-gate 
12950Sstevel@tonic-gate 			if (p->dd_dbcnt == 0)
12960Sstevel@tonic-gate 				continue;
12970Sstevel@tonic-gate 
12980Sstevel@tonic-gate 			/*
12990Sstevel@tonic-gate 			 * Skip the replica removal if we are not the last user
13000Sstevel@tonic-gate 			 */
13010Sstevel@tonic-gate 			if (num_users != 1)
13020Sstevel@tonic-gate 				continue;
13030Sstevel@tonic-gate 
13040Sstevel@tonic-gate 			nlp = NULL;
13050Sstevel@tonic-gate 			(void) metanamelist_append(&nlp, np);
13060Sstevel@tonic-gate 			if (meta_db_detach(sp, nlp,
13070Sstevel@tonic-gate 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep))
13080Sstevel@tonic-gate 				mdclrerror(ep);
13090Sstevel@tonic-gate 			metafreenamelist(nlp);
13100Sstevel@tonic-gate 		}
13110Sstevel@tonic-gate 	}
13120Sstevel@tonic-gate 
13130Sstevel@tonic-gate 	if (halt_set(sp, ep)) {
13140Sstevel@tonic-gate 		rval = -1;
13150Sstevel@tonic-gate 		goto out;
13160Sstevel@tonic-gate 	}
13170Sstevel@tonic-gate 
13180Sstevel@tonic-gate 	/* Setup the mediator record */
13190Sstevel@tonic-gate 	(void) memset(&medr, '\0', sizeof (med_rec_t));
13200Sstevel@tonic-gate 	medr.med_rec_mag = MED_REC_MAGIC;
13210Sstevel@tonic-gate 	medr.med_rec_rev = MED_REC_REV;
13220Sstevel@tonic-gate 	medr.med_rec_fl  = 0;
13230Sstevel@tonic-gate 	medr.med_rec_sn  = sp->setno;
13240Sstevel@tonic-gate 	(void) strcpy(medr.med_rec_snm, sp->setname);
13250Sstevel@tonic-gate 	medr.med_rec_meds = sd->sd_med;	/* structure assigment */
13260Sstevel@tonic-gate 	(void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t));
13270Sstevel@tonic-gate 	medr.med_rec_foff = 0;
13280Sstevel@tonic-gate 
13290Sstevel@tonic-gate 	/*
13300Sstevel@tonic-gate 	 * If we are the last remaining user, then remove the mediator hosts
13310Sstevel@tonic-gate 	 */
13320Sstevel@tonic-gate 	if (num_users == 1) {
13330Sstevel@tonic-gate 		for (i = 0; i < MED_MAX_HOSTS; i++) {
13340Sstevel@tonic-gate 			if (medr.med_rec_meds.n_lst[i].a_cnt != 0)
13350Sstevel@tonic-gate 				SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE,
13360Sstevel@tonic-gate 				    SVM_TAG_MEDIATOR, sp->setno, i);
13370Sstevel@tonic-gate 			(void) memset(&medr.med_rec_meds.n_lst[i], '\0',
13380Sstevel@tonic-gate 			    sizeof (md_h_t));
13390Sstevel@tonic-gate 		}
13400Sstevel@tonic-gate 		medr.med_rec_meds.n_cnt = 0;
13410Sstevel@tonic-gate 	} else { 	/* Remove this host from the mediator node list. */
13420Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
13430Sstevel@tonic-gate 			/* Skip empty slots */
13440Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
13450Sstevel@tonic-gate 				continue;
13460Sstevel@tonic-gate 
13470Sstevel@tonic-gate 			/* Copy non local node */
13480Sstevel@tonic-gate 			if (strcmp(mynode(), sd->sd_nodes[i]) != 0) {
13490Sstevel@tonic-gate 				(void) strcpy(medr.med_rec_nodes[i],
13500Sstevel@tonic-gate 				    sd->sd_nodes[i]);
13510Sstevel@tonic-gate 				continue;
13520Sstevel@tonic-gate 			}
13530Sstevel@tonic-gate 
13540Sstevel@tonic-gate 			/* Clear local node */
13550Sstevel@tonic-gate 			(void) memset(&medr.med_rec_nodes[i], '\0',
13560Sstevel@tonic-gate 			    sizeof (md_node_nm_t));
13570Sstevel@tonic-gate 		}
13580Sstevel@tonic-gate 	}
13590Sstevel@tonic-gate 
13600Sstevel@tonic-gate 	crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
13610Sstevel@tonic-gate 
13620Sstevel@tonic-gate 	/*
13630Sstevel@tonic-gate 	 * If the client is part of a cluster put the DCS service
13640Sstevel@tonic-gate 	 * into a deleteing state.
13650Sstevel@tonic-gate 	 */
13660Sstevel@tonic-gate 	if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
13670Sstevel@tonic-gate 		if (metad_isautotakebyname(sp->setname)) {
13680Sstevel@tonic-gate 			delete_end = 0;
13690Sstevel@tonic-gate 		} else {
13700Sstevel@tonic-gate 			mdclrerror(ep);
13710Sstevel@tonic-gate 			goto out;
13720Sstevel@tonic-gate 		}
13730Sstevel@tonic-gate 	}
13740Sstevel@tonic-gate 
13750Sstevel@tonic-gate 	/* Inform the mediator hosts of the new information */
13760Sstevel@tonic-gate 	for (i = 0; i < MED_MAX_HOSTS; i++) {
13770Sstevel@tonic-gate 		if (sd->sd_med.n_lst[i].a_cnt == 0)
13780Sstevel@tonic-gate 			continue;
13790Sstevel@tonic-gate 
13800Sstevel@tonic-gate 		if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
13810Sstevel@tonic-gate 			mdclrerror(ep);
13820Sstevel@tonic-gate 	}
13830Sstevel@tonic-gate 
13840Sstevel@tonic-gate 	/* Delete the set locally */
13850Sstevel@tonic-gate 	for (i = 0; i < MD_MAXSIDES; i++) {
13860Sstevel@tonic-gate 		/* Skip empty slots */
13870Sstevel@tonic-gate 		if (sd->sd_nodes[i][0] == '\0')
13880Sstevel@tonic-gate 			continue;
13890Sstevel@tonic-gate 
13900Sstevel@tonic-gate 		/* Skip non local nodes */
13910Sstevel@tonic-gate 		if (strcmp(mynode(), sd->sd_nodes[i]) != 0)
13920Sstevel@tonic-gate 			continue;
13930Sstevel@tonic-gate 
13940Sstevel@tonic-gate 		if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1)
13950Sstevel@tonic-gate 			mdclrerror(ep);
13960Sstevel@tonic-gate 	}
13970Sstevel@tonic-gate 	if (delete_end &&
13980Sstevel@tonic-gate 	    sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
13990Sstevel@tonic-gate 		rval = -1;
14000Sstevel@tonic-gate 
14010Sstevel@tonic-gate out:
14020Sstevel@tonic-gate 	/* release signals back to what they were on entry */
14030Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0) {
14040Sstevel@tonic-gate 		if (rval == 0)
14050Sstevel@tonic-gate 			(void) mdstealerror(ep, &xep);
14060Sstevel@tonic-gate 		rval = -1;
14070Sstevel@tonic-gate 	}
14080Sstevel@tonic-gate 
14090Sstevel@tonic-gate 	if (lock_set == TRUE) {
14100Sstevel@tonic-gate 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
14110Sstevel@tonic-gate 		if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
14120Sstevel@tonic-gate 			if (rval == 0)
14130Sstevel@tonic-gate 				(void) mdstealerror(ep, &xep);
14140Sstevel@tonic-gate 			rval = -1;
14150Sstevel@tonic-gate 		}
14160Sstevel@tonic-gate 		cl_set_setkey(NULL);
14170Sstevel@tonic-gate 	}
14180Sstevel@tonic-gate 
14190Sstevel@tonic-gate 	metaflushsetname(sp);
14200Sstevel@tonic-gate 	return (rval);
14210Sstevel@tonic-gate }
14220Sstevel@tonic-gate 
14230Sstevel@tonic-gate int
14240Sstevel@tonic-gate meta_set_purge(
14250Sstevel@tonic-gate 	mdsetname_t	*sp,
14260Sstevel@tonic-gate 	int		bypass_cluster,
14270Sstevel@tonic-gate 	int		forceflg,
14280Sstevel@tonic-gate 	md_error_t	*ep
14290Sstevel@tonic-gate )
14300Sstevel@tonic-gate {
14310Sstevel@tonic-gate 	char		*thishost = mynode();
14320Sstevel@tonic-gate 	md_set_desc	*sd;
14330Sstevel@tonic-gate 	md_setkey_t	*cl_sk;
14340Sstevel@tonic-gate 	md_error_t	xep = mdnullerror;
14350Sstevel@tonic-gate 	int		rval = 0;
14360Sstevel@tonic-gate 	int		i, num_hosts = 0;
14370Sstevel@tonic-gate 	int		has_set = 0;
14380Sstevel@tonic-gate 	int		max_node = 0;
14390Sstevel@tonic-gate 	int		delete_end = 1;
14400Sstevel@tonic-gate 	md_mnnode_desc	*nd;
14410Sstevel@tonic-gate 
14420Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
14430Sstevel@tonic-gate 		/* unable to find set description */
14440Sstevel@tonic-gate 		rval = 1;
14450Sstevel@tonic-gate 		return (rval);
14460Sstevel@tonic-gate 	}
14470Sstevel@tonic-gate 
14480Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
14490Sstevel@tonic-gate 		/*
14500Sstevel@tonic-gate 		 * Get a count of the hosts in the set and also lock the set
14510Sstevel@tonic-gate 		 * on those hosts that know about it.
14520Sstevel@tonic-gate 		 */
14530Sstevel@tonic-gate 		nd = sd->sd_nodelist;
14540Sstevel@tonic-gate 		while (nd) {
14550Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
14560Sstevel@tonic-gate 				nd = nd->nd_next;
14570Sstevel@tonic-gate 				continue;
14580Sstevel@tonic-gate 			}
14590Sstevel@tonic-gate 			has_set = nodehasset(sp, nd->nd_nodename,
14600Sstevel@tonic-gate 				NHS_NST_EQ, ep);
14610Sstevel@tonic-gate 
14620Sstevel@tonic-gate 			/*
14630Sstevel@tonic-gate 			 * The host is not aware of this set (has_set < 0) or
14640Sstevel@tonic-gate 			 * the set does not match (has_set == 0). This check
14650Sstevel@tonic-gate 			 * prevents the code getting confused by an apparent
14660Sstevel@tonic-gate 			 * inconsistancy in the set's state, this is in the
14670Sstevel@tonic-gate 			 * purge code so something is broken in any case and
14680Sstevel@tonic-gate 			 * this is just trying to fix the brokeness.
14690Sstevel@tonic-gate 			 */
14700Sstevel@tonic-gate 			if (has_set <= 0) {
14710Sstevel@tonic-gate 				mdclrerror(ep);
14720Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_NOSET;
14730Sstevel@tonic-gate 			} else {
14740Sstevel@tonic-gate 				num_hosts++;
14750Sstevel@tonic-gate 				if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
14760Sstevel@tonic-gate 					/*
14770Sstevel@tonic-gate 					 * If the force flag is set then
14780Sstevel@tonic-gate 					 * ignore any RPC failures because we
14790Sstevel@tonic-gate 					 * are only really interested with
14800Sstevel@tonic-gate 					 * the set on local node.
14810Sstevel@tonic-gate 					 */
14820Sstevel@tonic-gate 					if (forceflg && mdanyrpcerror(ep)) {
14830Sstevel@tonic-gate 						mdclrerror(ep);
14840Sstevel@tonic-gate 					} else {
14850Sstevel@tonic-gate 						/*
14860Sstevel@tonic-gate 						 * set max_node so that in the
14870Sstevel@tonic-gate 						 * unlock code nodes in the
14880Sstevel@tonic-gate 						 * set that have not been
14890Sstevel@tonic-gate 						 * locked are not unlocked.
14900Sstevel@tonic-gate 						 */
14910Sstevel@tonic-gate 						max_node = nd->nd_nodeid;
14920Sstevel@tonic-gate 						rval = 2;
14930Sstevel@tonic-gate 						goto out1;
14940Sstevel@tonic-gate 					}
14950Sstevel@tonic-gate 				}
14960Sstevel@tonic-gate 
14970Sstevel@tonic-gate 			}
14980Sstevel@tonic-gate 			nd = nd->nd_next;
14990Sstevel@tonic-gate 		}
15000Sstevel@tonic-gate 		max_node = 0;
15010Sstevel@tonic-gate 	} else {
15020Sstevel@tonic-gate 		/*
15030Sstevel@tonic-gate 		 * Get a count of the hosts in the set and also lock the set
15040Sstevel@tonic-gate 		 * on those hosts that know about it.
15050Sstevel@tonic-gate 		 */
15060Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
15070Sstevel@tonic-gate 			/* Skip empty slots */
15080Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
15090Sstevel@tonic-gate 				continue;
15100Sstevel@tonic-gate 
15110Sstevel@tonic-gate 			has_set = nodehasset(sp, sd->sd_nodes[i],
15120Sstevel@tonic-gate 				NHS_NST_EQ, ep);
15130Sstevel@tonic-gate 
15140Sstevel@tonic-gate 			/*
15150Sstevel@tonic-gate 			 * The host is not aware of this set (has_set < 0) or
15160Sstevel@tonic-gate 			 * the set does not match (has_set == 0). This check
15170Sstevel@tonic-gate 			 * prevents the code getting confused by an apparent
15180Sstevel@tonic-gate 			 * inconsistancy in the set's state, this is in the
15190Sstevel@tonic-gate 			 * purge code so something is broken in any case and
15200Sstevel@tonic-gate 			 * this is just trying to fix the brokeness.
15210Sstevel@tonic-gate 			 */
15220Sstevel@tonic-gate 			if (has_set <= 0) {
15230Sstevel@tonic-gate 				mdclrerror(ep);
15240Sstevel@tonic-gate 				/*
15250Sstevel@tonic-gate 				 * set the node to NULL to prevent further
15260Sstevel@tonic-gate 				 * requests to this unresponsive node.
15270Sstevel@tonic-gate 				 */
15280Sstevel@tonic-gate 				sd->sd_nodes[i][0] = '\0';
15290Sstevel@tonic-gate 			} else {
15300Sstevel@tonic-gate 				num_hosts++;
15310Sstevel@tonic-gate 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
15320Sstevel@tonic-gate 					/*
15330Sstevel@tonic-gate 					 * If the force flag is set then
15340Sstevel@tonic-gate 					 * ignore any RPC failures because we
15350Sstevel@tonic-gate 					 * are only really interested with
15360Sstevel@tonic-gate 					 * the set on local node.
15370Sstevel@tonic-gate 					 */
15380Sstevel@tonic-gate 					if (forceflg && mdanyrpcerror(ep)) {
15390Sstevel@tonic-gate 						mdclrerror(ep);
15400Sstevel@tonic-gate 					} else {
15410Sstevel@tonic-gate 						rval = 2;
15420Sstevel@tonic-gate 						/*
15430Sstevel@tonic-gate 						 * set max_node so that in the
15440Sstevel@tonic-gate 						 * unlock code nodes in the
15450Sstevel@tonic-gate 						 * set that have not been
15460Sstevel@tonic-gate 						 * locked are not unlocked.
15470Sstevel@tonic-gate 						 */
15480Sstevel@tonic-gate 						max_node = i;
15490Sstevel@tonic-gate 						goto out1;
15500Sstevel@tonic-gate 					}
15510Sstevel@tonic-gate 				}
15520Sstevel@tonic-gate 			}
15530Sstevel@tonic-gate 		}
15540Sstevel@tonic-gate 		max_node = i;	/* now MD_MAXSIDES */
15550Sstevel@tonic-gate 	}
15560Sstevel@tonic-gate 	if (!bypass_cluster) {
15570Sstevel@tonic-gate 		/*
15580Sstevel@tonic-gate 		 * If there is only one host associated with the
15590Sstevel@tonic-gate 		 * set then remove the set from the cluster.
15600Sstevel@tonic-gate 		 */
15610Sstevel@tonic-gate 		if (num_hosts == 1) {
15620Sstevel@tonic-gate 			if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) {
15630Sstevel@tonic-gate 				if (metad_isautotakebyname(sp->setname)) {
15640Sstevel@tonic-gate 					delete_end = 0;
15650Sstevel@tonic-gate 				} else {
15660Sstevel@tonic-gate 					mdclrerror(ep);
15670Sstevel@tonic-gate 					rval = 3;
15680Sstevel@tonic-gate 					goto out1;
15690Sstevel@tonic-gate 				}
15700Sstevel@tonic-gate 			}
15710Sstevel@tonic-gate 		}
15720Sstevel@tonic-gate 	}
15730Sstevel@tonic-gate 
15740Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
15750Sstevel@tonic-gate 		/*
15760Sstevel@tonic-gate 		 * Get a count of the hosts in the set and also lock the set
15770Sstevel@tonic-gate 		 * on those hosts that know about it.
15780Sstevel@tonic-gate 		 */
15790Sstevel@tonic-gate 		nd = sd->sd_nodelist;
15800Sstevel@tonic-gate 		while (nd) {
15810Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
15820Sstevel@tonic-gate 				nd = nd->nd_next;
15830Sstevel@tonic-gate 				continue;
15840Sstevel@tonic-gate 			}
15850Sstevel@tonic-gate 			if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) {
15860Sstevel@tonic-gate 				/*
15870Sstevel@tonic-gate 				 * Tell the remote node to remove this node
15880Sstevel@tonic-gate 				 */
15890Sstevel@tonic-gate 				if (clnt_delhosts(nd->nd_nodename, sp, 1,
15900Sstevel@tonic-gate 					&thishost, ep) == -1) {
15910Sstevel@tonic-gate 					/*
15920Sstevel@tonic-gate 					 * If we fail to delete ourselves
15930Sstevel@tonic-gate 					 * from the remote host it does not
15940Sstevel@tonic-gate 					 * really matter because the set is
15950Sstevel@tonic-gate 					 * being "purged" from this node. The
15960Sstevel@tonic-gate 					 * set can be purged from the other
15970Sstevel@tonic-gate 					 * node at a later time.
15980Sstevel@tonic-gate 					 */
15990Sstevel@tonic-gate 					mdclrerror(ep);
16000Sstevel@tonic-gate 				}
16010Sstevel@tonic-gate 				nd = nd->nd_next;
16020Sstevel@tonic-gate 				continue;
16030Sstevel@tonic-gate 			}
16040Sstevel@tonic-gate 			/* remove the set from this host */
16050Sstevel@tonic-gate 			if (clnt_delset(nd->nd_nodename, sp, ep) == -1) {
16060Sstevel@tonic-gate 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
16070Sstevel@tonic-gate 				if (!bypass_cluster && num_hosts == 1)
16080Sstevel@tonic-gate 					(void) sdssc_delete_end(sp->setname,
16090Sstevel@tonic-gate 					    SDSSC_CLEANUP);
16100Sstevel@tonic-gate 				mdclrerror(ep);
16110Sstevel@tonic-gate 				goto out1;
16120Sstevel@tonic-gate 			}
16130Sstevel@tonic-gate 			nd = nd->nd_next;
16140Sstevel@tonic-gate 		}
16150Sstevel@tonic-gate 	} else {
16160Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
16170Sstevel@tonic-gate 			/* Skip empty slots */
16180Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
16190Sstevel@tonic-gate 				continue;
16200Sstevel@tonic-gate 			if (strcmp(thishost, sd->sd_nodes[i]) != 0) {
16210Sstevel@tonic-gate 				/*
16220Sstevel@tonic-gate 				 * Tell the remote node to remove this node
16230Sstevel@tonic-gate 				 */
16240Sstevel@tonic-gate 				if (clnt_delhosts(sd->sd_nodes[i], sp, 1,
16250Sstevel@tonic-gate 				    &thishost, ep) == -1) {
16260Sstevel@tonic-gate 					/*
16270Sstevel@tonic-gate 					 * If we fail to delete ourselves
16280Sstevel@tonic-gate 					 * from the remote host it does not
16290Sstevel@tonic-gate 					 * really matter because the set is
16300Sstevel@tonic-gate 					 * being "purged" from this node. The
16310Sstevel@tonic-gate 					 * set can be purged from the other
16320Sstevel@tonic-gate 					 * node at a later time.
16330Sstevel@tonic-gate 					 */
16340Sstevel@tonic-gate 					mdclrerror(ep);
16350Sstevel@tonic-gate 				}
16360Sstevel@tonic-gate 				continue;
16370Sstevel@tonic-gate 			}
16380Sstevel@tonic-gate 
16390Sstevel@tonic-gate 			/* remove the set from this host */
16400Sstevel@tonic-gate 			if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) {
16410Sstevel@tonic-gate 				md_perror(dgettext(TEXT_DOMAIN, "delset"));
16420Sstevel@tonic-gate 				if (!bypass_cluster && num_hosts == 1)
16430Sstevel@tonic-gate 					(void) sdssc_delete_end(sp->setname,
16440Sstevel@tonic-gate 					    SDSSC_CLEANUP);
16450Sstevel@tonic-gate 				mdclrerror(ep);
16460Sstevel@tonic-gate 				goto out1;
16470Sstevel@tonic-gate 			}
16480Sstevel@tonic-gate 		}
16490Sstevel@tonic-gate 	}
16500Sstevel@tonic-gate 
16510Sstevel@tonic-gate 	if (!bypass_cluster && num_hosts == 1) {
16520Sstevel@tonic-gate 		if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) ==
16530Sstevel@tonic-gate 		    SDSSC_ERROR) {
16540Sstevel@tonic-gate 			rval = 4;
16550Sstevel@tonic-gate 		}
16560Sstevel@tonic-gate 	}
16570Sstevel@tonic-gate 
16580Sstevel@tonic-gate out1:
16590Sstevel@tonic-gate 
16600Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
16610Sstevel@tonic-gate 
16620Sstevel@tonic-gate 	/*
16630Sstevel@tonic-gate 	 * Remove the set lock on those nodes that had the set locked
16640Sstevel@tonic-gate 	 * max_node will either be MD_MAXSIDES or array index of the last
16650Sstevel@tonic-gate 	 * node contacted (or rather failed to contact) for traditional
16660Sstevel@tonic-gate 	 * diskset.  For a MN diskset, max_node is the node_id of the node
16670Sstevel@tonic-gate 	 * that failed the lock.
16680Sstevel@tonic-gate 	 */
16690Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
16700Sstevel@tonic-gate 		nd = sd->sd_nodelist;
16710Sstevel@tonic-gate 		while (nd) {
16720Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
16730Sstevel@tonic-gate 				nd = nd->nd_next;
16740Sstevel@tonic-gate 				continue;
16750Sstevel@tonic-gate 			}
16760Sstevel@tonic-gate 			if (nd->nd_nodeid == max_node)
16770Sstevel@tonic-gate 				break;
16780Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
16790Sstevel@tonic-gate 				if (forceflg && mdanyrpcerror(&xep)) {
16800Sstevel@tonic-gate 					mdclrerror(&xep);
16810Sstevel@tonic-gate 					nd = nd->nd_next;
16820Sstevel@tonic-gate 					continue;
16830Sstevel@tonic-gate 				}
16840Sstevel@tonic-gate 				if (rval == 0)
16850Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
16860Sstevel@tonic-gate 				rval = 5;
16870Sstevel@tonic-gate 			}
16880Sstevel@tonic-gate 			nd = nd->nd_next;
16890Sstevel@tonic-gate 		}
16900Sstevel@tonic-gate 	} else {
16910Sstevel@tonic-gate 		for (i = 0; i < max_node; i++) {
16920Sstevel@tonic-gate 			/* Skip empty slots */
16930Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
16940Sstevel@tonic-gate 				continue;
16950Sstevel@tonic-gate 
16960Sstevel@tonic-gate 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
16970Sstevel@tonic-gate 				if (forceflg && mdanyrpcerror(&xep)) {
16980Sstevel@tonic-gate 					mdclrerror(&xep);
16990Sstevel@tonic-gate 					continue;
17000Sstevel@tonic-gate 				}
17010Sstevel@tonic-gate 				if (rval == 0)
17020Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
17030Sstevel@tonic-gate 				rval = 5;
17040Sstevel@tonic-gate 			}
17050Sstevel@tonic-gate 		}
17060Sstevel@tonic-gate 	}
17070Sstevel@tonic-gate 
17080Sstevel@tonic-gate 	cl_set_setkey(NULL);
17090Sstevel@tonic-gate 
17100Sstevel@tonic-gate 	return (rval);
17110Sstevel@tonic-gate }
17120Sstevel@tonic-gate 
17130Sstevel@tonic-gate int
17140Sstevel@tonic-gate meta_set_query(
17150Sstevel@tonic-gate 	mdsetname_t		*sp,
17160Sstevel@tonic-gate 	mddb_dtag_lst_t		**dtlpp,
17170Sstevel@tonic-gate 	md_error_t		*ep
17180Sstevel@tonic-gate )
17190Sstevel@tonic-gate {
17200Sstevel@tonic-gate 	mddb_dtag_get_parm_t	dtgp;
17210Sstevel@tonic-gate 
17220Sstevel@tonic-gate 	(void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t));
17230Sstevel@tonic-gate 	dtgp.dtgp_setno = sp->setno;
17240Sstevel@tonic-gate 
17250Sstevel@tonic-gate 	/*CONSTCOND*/
17260Sstevel@tonic-gate 	while (1) {
17270Sstevel@tonic-gate 		if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0)
17280Sstevel@tonic-gate 			if (! mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) ||
17290Sstevel@tonic-gate 			    *dtlpp == NULL)
17300Sstevel@tonic-gate 				return (mdstealerror(ep, &dtgp.dtgp_mde));
17310Sstevel@tonic-gate 			else
17320Sstevel@tonic-gate 				break;
17330Sstevel@tonic-gate 
17340Sstevel@tonic-gate 		/*
17350Sstevel@tonic-gate 		 * Run to the end of the list
17360Sstevel@tonic-gate 		 */
17370Sstevel@tonic-gate 		for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx)
17380Sstevel@tonic-gate 			/* void */;
17390Sstevel@tonic-gate 
17400Sstevel@tonic-gate 		*dtlpp = Zalloc(sizeof (mddb_dtag_lst_t));
17410Sstevel@tonic-gate 
17420Sstevel@tonic-gate 		(void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt,
17430Sstevel@tonic-gate 		    sizeof (mddb_dtag_t));
17440Sstevel@tonic-gate 
17450Sstevel@tonic-gate 		dtgp.dtgp_dt.dt_id++;
17460Sstevel@tonic-gate 	}
17470Sstevel@tonic-gate 	return (0);
17480Sstevel@tonic-gate }
17490Sstevel@tonic-gate 
17500Sstevel@tonic-gate /*
17510Sstevel@tonic-gate  * return drivename get by key
17520Sstevel@tonic-gate  */
17530Sstevel@tonic-gate mddrivename_t *
17540Sstevel@tonic-gate metadrivename_withdrkey(
17550Sstevel@tonic-gate 	mdsetname_t	*sp,
17560Sstevel@tonic-gate 	side_t		sideno,
17570Sstevel@tonic-gate 	mdkey_t		key,
17580Sstevel@tonic-gate 	int		flags,
17590Sstevel@tonic-gate 	md_error_t	*ep
17600Sstevel@tonic-gate )
17610Sstevel@tonic-gate {
17620Sstevel@tonic-gate 	char		*nm;
17630Sstevel@tonic-gate 	mdname_t	*np;
17640Sstevel@tonic-gate 	mddrivename_t	*dnp;
17650Sstevel@tonic-gate 	ddi_devid_t	devidp;
17660Sstevel@tonic-gate 	md_set_desc	*sd;
17670Sstevel@tonic-gate 
17680Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
17690Sstevel@tonic-gate 		return (NULL);
17700Sstevel@tonic-gate 	}
17710Sstevel@tonic-gate 
17720Sstevel@tonic-gate 	/* get namespace info */
17730Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
17740Sstevel@tonic-gate 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno,
17750Sstevel@tonic-gate 		    key, ep)) == NULL)
17760Sstevel@tonic-gate 			return (NULL);
17770Sstevel@tonic-gate 	} else {
17780Sstevel@tonic-gate 		if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW,
17790Sstevel@tonic-gate 		    key, ep)) == NULL)
17800Sstevel@tonic-gate 			return (NULL);
17810Sstevel@tonic-gate 	}
17820Sstevel@tonic-gate 
17830Sstevel@tonic-gate 	/* get device name */
17840Sstevel@tonic-gate 	if (flags & PRINT_FAST) {
1785*1623Stw21770 		if ((np = metaname_fast(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) {
17860Sstevel@tonic-gate 			Free(nm);
17870Sstevel@tonic-gate 			return (NULL);
17880Sstevel@tonic-gate 		}
17890Sstevel@tonic-gate 	} else {
1790*1623Stw21770 		if ((np = metaname(&sp, nm, LOGICAL_DEVICE, ep)) == NULL) {
17910Sstevel@tonic-gate 			Free(nm);
17920Sstevel@tonic-gate 			return (NULL);
17930Sstevel@tonic-gate 		}
17940Sstevel@tonic-gate 	}
17950Sstevel@tonic-gate 	Free(nm);
17960Sstevel@tonic-gate 
17970Sstevel@tonic-gate 	/* make sure it's OK */
17980Sstevel@tonic-gate 	if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0))
17990Sstevel@tonic-gate 		return (NULL);
18000Sstevel@tonic-gate 
18010Sstevel@tonic-gate 	/* get drivename */
18020Sstevel@tonic-gate 	dnp = np->drivenamep;
18030Sstevel@tonic-gate 	dnp->side_names_key = key;
18040Sstevel@tonic-gate 
18050Sstevel@tonic-gate 	/*
18060Sstevel@tonic-gate 	 * Skip the following devid check if dnp is did device
18070Sstevel@tonic-gate 	 * The device id is disabled for did device due to the
18080Sstevel@tonic-gate 	 * lack of minor name support in the did driver. The following
18090Sstevel@tonic-gate 	 * devid code path can set and propagate the error and
18100Sstevel@tonic-gate 	 * eventually prevent did disks from being added to the
18110Sstevel@tonic-gate 	 * diskset under SunCluster systems
18120Sstevel@tonic-gate 	 */
18130Sstevel@tonic-gate 	if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) {
18140Sstevel@tonic-gate 		goto out;
18150Sstevel@tonic-gate 	}
18160Sstevel@tonic-gate 
18170Sstevel@tonic-gate 	/* Also, Skip the check if MN diskset, no devid's */
18180Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
18190Sstevel@tonic-gate 		goto out;
18200Sstevel@tonic-gate 	}
18210Sstevel@tonic-gate 
18220Sstevel@tonic-gate 	/*
18230Sstevel@tonic-gate 	 * Get the devid associated with the key.
18240Sstevel@tonic-gate 	 *
18250Sstevel@tonic-gate 	 * If a devid was returned, it MUST be valid even in
18260Sstevel@tonic-gate 	 * the case where a device id has been "updated". The
18270Sstevel@tonic-gate 	 * "update" of the device id may have occured due to
18280Sstevel@tonic-gate 	 * a firmware upgrade.
18290Sstevel@tonic-gate 	 */
18300Sstevel@tonic-gate 	if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep))
18310Sstevel@tonic-gate 	    != NULL) {
18320Sstevel@tonic-gate 		dnp->devid = devid_str_encode(devidp, NULL);
18330Sstevel@tonic-gate 		free(devidp);
18340Sstevel@tonic-gate 	} else {
18350Sstevel@tonic-gate 		/*
18360Sstevel@tonic-gate 		 * It is okay if replica is not in devid mode
18370Sstevel@tonic-gate 		 */
18380Sstevel@tonic-gate 		if (mdissyserror(ep, MDDB_F_NODEVID)) {
18390Sstevel@tonic-gate 			mdclrerror(ep);
18400Sstevel@tonic-gate 			goto out;
18410Sstevel@tonic-gate 		}
18420Sstevel@tonic-gate 
18430Sstevel@tonic-gate 		/*
18440Sstevel@tonic-gate 		 * devid is missing so this means that we have
18450Sstevel@tonic-gate 		 * just upgraded from a configuration where
18460Sstevel@tonic-gate 		 * devid's were not used so try to add in
18470Sstevel@tonic-gate 		 * the devid and requery.
18480Sstevel@tonic-gate 		 */
18490Sstevel@tonic-gate 		if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key,
18500Sstevel@tonic-gate 		    ep) < 0)
18510Sstevel@tonic-gate 			return (NULL);
18520Sstevel@tonic-gate 		if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET,
18530Sstevel@tonic-gate 		    sideno+SKEW, key, ep)) == NULL)
18540Sstevel@tonic-gate 			return (NULL);
18550Sstevel@tonic-gate 		dnp->devid = devid_str_encode(devidp, NULL);
18560Sstevel@tonic-gate 		devid_free(devidp);
18570Sstevel@tonic-gate 	}
18580Sstevel@tonic-gate 
18590Sstevel@tonic-gate out:
18600Sstevel@tonic-gate 	if (flags & MD_BYPASS_DAEMON)
18610Sstevel@tonic-gate 		return (dnp);
18620Sstevel@tonic-gate 
18630Sstevel@tonic-gate 	if (get_sidenmlist(sp, dnp, ep))
18640Sstevel@tonic-gate 		return (NULL);
18650Sstevel@tonic-gate 
18660Sstevel@tonic-gate 	/* return success */
18670Sstevel@tonic-gate 	return (dnp);
18680Sstevel@tonic-gate }
18690Sstevel@tonic-gate 
18700Sstevel@tonic-gate void
18710Sstevel@tonic-gate metafreedrivedesc(md_drive_desc **dd)
18720Sstevel@tonic-gate {
18730Sstevel@tonic-gate 	md_drive_desc	*p, *next = NULL;
18740Sstevel@tonic-gate 
18750Sstevel@tonic-gate 	for (p = *dd; p != NULL; p = next) {
18760Sstevel@tonic-gate 		next = p->dd_next;
18770Sstevel@tonic-gate 		Free(p);
18780Sstevel@tonic-gate 	}
18790Sstevel@tonic-gate 	*dd = NULL;
18800Sstevel@tonic-gate }
18810Sstevel@tonic-gate 
18820Sstevel@tonic-gate md_drive_desc *
18830Sstevel@tonic-gate metaget_drivedesc(
18840Sstevel@tonic-gate 	mdsetname_t	*sp,
18850Sstevel@tonic-gate 	int		flags,
18860Sstevel@tonic-gate 	md_error_t	*ep
18870Sstevel@tonic-gate )
18880Sstevel@tonic-gate {
18890Sstevel@tonic-gate 	side_t		sideno = MD_SIDEWILD;
18900Sstevel@tonic-gate 
18910Sstevel@tonic-gate 	assert(! (flags & MD_BYPASS_DAEMON));
18920Sstevel@tonic-gate 
18930Sstevel@tonic-gate 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
18940Sstevel@tonic-gate 		return (NULL);
18950Sstevel@tonic-gate 
18960Sstevel@tonic-gate 	return (metaget_drivedesc_sideno(sp, sideno, flags, ep));
18970Sstevel@tonic-gate }
18980Sstevel@tonic-gate 
18990Sstevel@tonic-gate md_drive_desc *
19000Sstevel@tonic-gate metaget_drivedesc_fromnamelist(
19010Sstevel@tonic-gate 	mdsetname_t	*sp,
19020Sstevel@tonic-gate 	mdnamelist_t	*nlp,
19030Sstevel@tonic-gate 	md_error_t	*ep
19040Sstevel@tonic-gate )
19050Sstevel@tonic-gate {
19060Sstevel@tonic-gate 	md_set_desc		*sd;
19070Sstevel@tonic-gate 	mdnamelist_t		*p;
19080Sstevel@tonic-gate 	md_drive_desc		*dd = NULL;
19090Sstevel@tonic-gate 
19100Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
19110Sstevel@tonic-gate 		return (NULL);
19120Sstevel@tonic-gate 
19130Sstevel@tonic-gate 	for (p = nlp; p != NULL; p = p->next)
19140Sstevel@tonic-gate 		(void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0,
19150Sstevel@tonic-gate 		    sd->sd_ctime, sd->sd_genid, MD_DR_ADD);
19160Sstevel@tonic-gate 
19170Sstevel@tonic-gate 	return (dd);
19180Sstevel@tonic-gate }
19190Sstevel@tonic-gate 
19200Sstevel@tonic-gate md_drive_desc *
19210Sstevel@tonic-gate metaget_drivedesc_sideno(
19220Sstevel@tonic-gate 	mdsetname_t *sp,
19230Sstevel@tonic-gate 	side_t sideno,
19240Sstevel@tonic-gate 	int flags,
19250Sstevel@tonic-gate 	md_error_t *ep
19260Sstevel@tonic-gate )
19270Sstevel@tonic-gate {
19280Sstevel@tonic-gate 	md_set_desc	*sd = NULL;
19290Sstevel@tonic-gate 
19300Sstevel@tonic-gate 	assert(! (flags & MD_BYPASS_DAEMON));
19310Sstevel@tonic-gate 
19320Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
19330Sstevel@tonic-gate 		return (NULL);
19340Sstevel@tonic-gate 
19350Sstevel@tonic-gate 	if (sd->sd_drvs)
19360Sstevel@tonic-gate 		return (sd->sd_drvs);
19370Sstevel@tonic-gate 
19380Sstevel@tonic-gate 	if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL)
19390Sstevel@tonic-gate 		return (NULL);
19400Sstevel@tonic-gate 
19410Sstevel@tonic-gate 	return (sd->sd_drvs);
19420Sstevel@tonic-gate }
19430Sstevel@tonic-gate 
19440Sstevel@tonic-gate int
19450Sstevel@tonic-gate metaget_setownership(
19460Sstevel@tonic-gate 	mdsetname_t	*sp,
19470Sstevel@tonic-gate 	md_error_t	*ep
19480Sstevel@tonic-gate )
19490Sstevel@tonic-gate {
19500Sstevel@tonic-gate 	md_set_desc	*sd;
19510Sstevel@tonic-gate 	int		bool;
19520Sstevel@tonic-gate 	int		i;
19530Sstevel@tonic-gate 	md_mnnode_desc	*nd;
19540Sstevel@tonic-gate 
19550Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
19560Sstevel@tonic-gate 		return (-1);
19570Sstevel@tonic-gate 
19580Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
19590Sstevel@tonic-gate 		nd = sd->sd_nodelist;
19600Sstevel@tonic-gate 		while (nd) {
19610Sstevel@tonic-gate 			/* If node isn't alive, can't own diskset */
19620Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
19630Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
19640Sstevel@tonic-gate 				nd = nd->nd_next;
19650Sstevel@tonic-gate 				continue;
19660Sstevel@tonic-gate 			}
19670Sstevel@tonic-gate 			/*
19680Sstevel@tonic-gate 			 * If can't communicate with rpc.metad, then mark
19690Sstevel@tonic-gate 			 * this node as not an owner.  That node may
19700Sstevel@tonic-gate 			 * in fact, be an owner, but without rpc.metad running
19710Sstevel@tonic-gate 			 * that node can't do much.
19720Sstevel@tonic-gate 			 */
19730Sstevel@tonic-gate 			if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) {
19740Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
19750Sstevel@tonic-gate 			} else if (bool == TRUE) {
19760Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_OWN;
19770Sstevel@tonic-gate 			} else {
19780Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
19790Sstevel@tonic-gate 			}
19800Sstevel@tonic-gate 			nd = nd->nd_next;
19810Sstevel@tonic-gate 		}
19820Sstevel@tonic-gate 		return (0);
19830Sstevel@tonic-gate 	}
19840Sstevel@tonic-gate 
19850Sstevel@tonic-gate 	/* Rest of code handles traditional disksets */
19860Sstevel@tonic-gate 
19870Sstevel@tonic-gate 	for (i = 0; i < MD_MAXSIDES; i++)
19880Sstevel@tonic-gate 		sd->sd_isown[i] = 0;
19890Sstevel@tonic-gate 
19900Sstevel@tonic-gate 	if (clnt_ownset(mynode(), sp, &bool, ep) == -1)
19910Sstevel@tonic-gate 		return (-1);
19920Sstevel@tonic-gate 
19930Sstevel@tonic-gate 	if (bool == TRUE)
19940Sstevel@tonic-gate 		sd->sd_isown[getmyside(sp, ep)] = 1;
19950Sstevel@tonic-gate 
19960Sstevel@tonic-gate 	return (0);
19970Sstevel@tonic-gate }
19980Sstevel@tonic-gate 
19990Sstevel@tonic-gate char *
20000Sstevel@tonic-gate mynode(void)
20010Sstevel@tonic-gate {
20020Sstevel@tonic-gate 	static struct utsname	myuname;
20030Sstevel@tonic-gate 	static int		done = 0;
20040Sstevel@tonic-gate 
20050Sstevel@tonic-gate 	if (! done) {
20060Sstevel@tonic-gate 		if (uname(&myuname) == -1) {
20070Sstevel@tonic-gate 			md_perror(dgettext(TEXT_DOMAIN, "uname"));
20080Sstevel@tonic-gate 			assert(0);
20090Sstevel@tonic-gate 		}
20100Sstevel@tonic-gate 		done = 1;
20110Sstevel@tonic-gate 	}
20120Sstevel@tonic-gate 	return (myuname.nodename);
20130Sstevel@tonic-gate }
20140Sstevel@tonic-gate 
/*
 * Report whether str occurs in the first cnt entries of lst.
 *
 * Entries are compared with strcmp().  Returns TRUE on the first
 * match, FALSE when there is none (including when cnt is not
 * positive).
 */
int
strinlst(char *str, int cnt, char **lst)
{
	int	remaining;

	for (remaining = cnt; remaining > 0; remaining--, lst++) {
		if (strcmp(*lst, str) == 0)
			return (TRUE);
	}

	return (FALSE);
}
20260Sstevel@tonic-gate 
20270Sstevel@tonic-gate /*
20280Sstevel@tonic-gate  * meta_get_reserved_names
20290Sstevel@tonic-gate  *  returns an mdnamelist_t of reserved slices
20300Sstevel@tonic-gate  *  reserved slices are those that are used but don't necessarily
20310Sstevel@tonic-gate  *  show up as metadevices (ex. reserved slice for db in sets, logs)
20320Sstevel@tonic-gate  */
20330Sstevel@tonic-gate 
20340Sstevel@tonic-gate /*ARGSUSED*/
20350Sstevel@tonic-gate int
20360Sstevel@tonic-gate meta_get_reserved_names(
20370Sstevel@tonic-gate 	mdsetname_t	*sp,
20380Sstevel@tonic-gate 	mdnamelist_t	**nlpp,
20390Sstevel@tonic-gate 	int		options,
20400Sstevel@tonic-gate 	md_error_t	*ep)
20410Sstevel@tonic-gate {
20420Sstevel@tonic-gate 	int		 count		= 0;
20430Sstevel@tonic-gate 	mdname_t	*np		= NULL;
20440Sstevel@tonic-gate 	mdnamelist_t	*transnlp	= NULL;
20450Sstevel@tonic-gate 	mdnamelist_t	**tailpp 	= nlpp;
20460Sstevel@tonic-gate 	mdnamelist_t	*nlp;
20470Sstevel@tonic-gate 	md_drive_desc	*dd, *di;
20480Sstevel@tonic-gate 
20490Sstevel@tonic-gate 	if (metaislocalset(sp))
20500Sstevel@tonic-gate 		goto out;
20510Sstevel@tonic-gate 
20520Sstevel@tonic-gate 	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
20530Sstevel@tonic-gate 		count = -1;
20540Sstevel@tonic-gate 		goto out;
20550Sstevel@tonic-gate 	}
20560Sstevel@tonic-gate 
20570Sstevel@tonic-gate 	/* db in for sets on reserved slice */
20580Sstevel@tonic-gate 	for (di = dd; di && count >= 0; di = di->dd_next) {
20590Sstevel@tonic-gate 		uint_t	rep_slice;
20600Sstevel@tonic-gate 
20610Sstevel@tonic-gate 		/*
20620Sstevel@tonic-gate 		 * Add the name struct to the end of the
20630Sstevel@tonic-gate 		 * namelist but keep a pointer to the last
20640Sstevel@tonic-gate 		 * element so that we don't incur the overhead
20650Sstevel@tonic-gate 		 * of traversing the list each time
20660Sstevel@tonic-gate 		 */
20670Sstevel@tonic-gate 		if (di->dd_dnp &&
20680Sstevel@tonic-gate 		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
20690Sstevel@tonic-gate 		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
20700Sstevel@tonic-gate 		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
20710Sstevel@tonic-gate 			count++;
20720Sstevel@tonic-gate 		else
20730Sstevel@tonic-gate 			count = -1;
20740Sstevel@tonic-gate 	}
20750Sstevel@tonic-gate 
20760Sstevel@tonic-gate 	/* now find logs */
20770Sstevel@tonic-gate 	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
20780Sstevel@tonic-gate 		count = -1;
20790Sstevel@tonic-gate 		goto out;
20800Sstevel@tonic-gate 	}
20810Sstevel@tonic-gate 
20820Sstevel@tonic-gate 	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
20830Sstevel@tonic-gate 		mdname_t	*transnp = nlp->namep;
20840Sstevel@tonic-gate 		md_trans_t	*transp;
20850Sstevel@tonic-gate 
20860Sstevel@tonic-gate 		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
20870Sstevel@tonic-gate 			count = -1;
20880Sstevel@tonic-gate 			goto out;
20890Sstevel@tonic-gate 		}
20900Sstevel@tonic-gate 		if (transp->lognamep) {
20910Sstevel@tonic-gate 			/*
20920Sstevel@tonic-gate 			 * Add the name struct to the end of the
20930Sstevel@tonic-gate 			 * namelist but keep a pointer to the last
20940Sstevel@tonic-gate 			 * element so that we don't incur the overhead
20950Sstevel@tonic-gate 			 * of traversing the list each time
20960Sstevel@tonic-gate 			 */
20970Sstevel@tonic-gate 			tailpp = meta_namelist_append_wrapper(
20980Sstevel@tonic-gate 			    tailpp, transp->lognamep);
20990Sstevel@tonic-gate 		}
21000Sstevel@tonic-gate 	}
21010Sstevel@tonic-gate out:
21020Sstevel@tonic-gate 	metafreenamelist(transnlp);
21030Sstevel@tonic-gate 	return (count);
21040Sstevel@tonic-gate }
21050Sstevel@tonic-gate 
21060Sstevel@tonic-gate /*
21070Sstevel@tonic-gate  * Entry point to join a node to MultiNode diskset.
21080Sstevel@tonic-gate  *
21090Sstevel@tonic-gate  * Validate host in diskset.
21100Sstevel@tonic-gate  *	- Should be in membership list from API
21110Sstevel@tonic-gate  *	- Should not already be joined into diskset.
21120Sstevel@tonic-gate  *	- Set must have drives
21130Sstevel@tonic-gate  * Assume valid configuration is stored in the set/drive/node records
21140Sstevel@tonic-gate  * in the local mddb since no node or drive can be added to the MNset
21150Sstevel@tonic-gate  * unless all drives and nodes are available.  Reconfig steps will
21160Sstevel@tonic-gate  * resync all ALIVE nodes in case of panic in critical areas.
21170Sstevel@tonic-gate  *
21180Sstevel@tonic-gate  * Lock down the set.
21190Sstevel@tonic-gate  * Verify host is a member of this diskset.
21200Sstevel@tonic-gate  * If drives exist in the configuration, load the mddbs.
21210Sstevel@tonic-gate  * Set this node to active by notifying master if one exists.
21220Sstevel@tonic-gate  * If this is the first node active in the diskset, this node
21230Sstevel@tonic-gate  * 	becomes the master.
21240Sstevel@tonic-gate  * Unlock the set.
21250Sstevel@tonic-gate  *
21260Sstevel@tonic-gate  * Mirror Resync:
21270Sstevel@tonic-gate  * If this node is the last node to join the set and clustering
21280Sstevel@tonic-gate  * isn't running, then start the 'metasync -r' type resync
21290Sstevel@tonic-gate  * on all mirrors in this diskset.
21300Sstevel@tonic-gate  * If clustering is running, this resync operation will
21310Sstevel@tonic-gate  * be handled by the reconfig steps and should NOT
21320Sstevel@tonic-gate  * be handled during a join operation.
21330Sstevel@tonic-gate  *
21340Sstevel@tonic-gate  * There are multiple return values in order to assist
21350Sstevel@tonic-gate  * the join operation of all sets in the metaset command.
21360Sstevel@tonic-gate  *
21370Sstevel@tonic-gate  * Return values:
21380Sstevel@tonic-gate  *	0  - Node successfully joined to set.
21390Sstevel@tonic-gate  *	-1 - Join attempted but failed
21400Sstevel@tonic-gate  *		- any failure from libmeta calls
21410Sstevel@tonic-gate  *		- node not in the member list
21420Sstevel@tonic-gate  *	-2 - Join not attempted since
21430Sstevel@tonic-gate  *		- this set had no drives in set
21440Sstevel@tonic-gate  *		- this node already joined to set
21450Sstevel@tonic-gate  *		- set is not a multinode set
21460Sstevel@tonic-gate  *	-3 - Node joined to STALE set.
21470Sstevel@tonic-gate  */
21480Sstevel@tonic-gate extern int
21490Sstevel@tonic-gate meta_set_join(
21500Sstevel@tonic-gate 	mdsetname_t	*sp,
21510Sstevel@tonic-gate 	md_error_t	*ep
21520Sstevel@tonic-gate )
21530Sstevel@tonic-gate {
21540Sstevel@tonic-gate 	md_set_desc		*sd;
21550Sstevel@tonic-gate 	md_drive_desc		*dd;
21560Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2, my_nd;
21570Sstevel@tonic-gate 	int			rval = 0;
21580Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
21590Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
21600Sstevel@tonic-gate 	md_error_t		ep_snarf = mdnullerror;
21610Sstevel@tonic-gate 	int			master_flag = 0;
21620Sstevel@tonic-gate 	md_mnset_record		*mas_mnsr = NULL;
21630Sstevel@tonic-gate 	int			clear_nr_flags = 0;
21640Sstevel@tonic-gate 	md_mnnode_record	*nr;
21650Sstevel@tonic-gate 	int			stale_set = 0;
21660Sstevel@tonic-gate 	int			rb_flags = 0;
21670Sstevel@tonic-gate 	int			stale_bool = FALSE;
21680Sstevel@tonic-gate 	int			suspendall_flag = 0;
21690Sstevel@tonic-gate 	int			suspend1_flag = 0;
21700Sstevel@tonic-gate 	sigset_t		oldsigs;
21710Sstevel@tonic-gate 	int			send_reinit = 0;
21720Sstevel@tonic-gate 
21730Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
21740Sstevel@tonic-gate 		return (-1);
21750Sstevel@tonic-gate 	}
21760Sstevel@tonic-gate 
21770Sstevel@tonic-gate 	/* Must be a multinode diskset */
21780Sstevel@tonic-gate 	if (!MD_MNSET_DESC(sd)) {
21790Sstevel@tonic-gate 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
21800Sstevel@tonic-gate 		return (-2);
21810Sstevel@tonic-gate 	}
21820Sstevel@tonic-gate 
21830Sstevel@tonic-gate 	/* Verify that the node is ALIVE (i.e. is in the API membership list) */
21840Sstevel@tonic-gate 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) {
21850Sstevel@tonic-gate 		(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno,
21860Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_nodename, NULL,
21870Sstevel@tonic-gate 			sp->setname);
21880Sstevel@tonic-gate 		return (-1);
21890Sstevel@tonic-gate 	}
21900Sstevel@tonic-gate 
21910Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
21920Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
21930Sstevel@tonic-gate 		mdclrerror(&xep);
21940Sstevel@tonic-gate 
21950Sstevel@tonic-gate 	/*
21960Sstevel@tonic-gate 	 * Lock the set on current set members.
21970Sstevel@tonic-gate 	 * For MN diskset lock_set and SUSPEND are used to protect against
21980Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
21990Sstevel@tonic-gate 	 */
22000Sstevel@tonic-gate 	nd = sd->sd_nodelist;
22010Sstevel@tonic-gate 	while (nd) {
22020Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
22030Sstevel@tonic-gate 			nd = nd->nd_next;
22040Sstevel@tonic-gate 			continue;
22050Sstevel@tonic-gate 		}
22060Sstevel@tonic-gate 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
22070Sstevel@tonic-gate 			rval = -1;
22080Sstevel@tonic-gate 			goto out;
22090Sstevel@tonic-gate 		}
22100Sstevel@tonic-gate 		nd = nd->nd_next;
22110Sstevel@tonic-gate 	}
22120Sstevel@tonic-gate 
22130Sstevel@tonic-gate 	/*
22140Sstevel@tonic-gate 	 * Lock out other meta* commands by suspending
22150Sstevel@tonic-gate 	 * class 1 messages across the diskset.
22160Sstevel@tonic-gate 	 */
22170Sstevel@tonic-gate 	nd = sd->sd_nodelist;
22180Sstevel@tonic-gate 	while (nd) {
22190Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
22200Sstevel@tonic-gate 			nd = nd->nd_next;
22210Sstevel@tonic-gate 			continue;
22220Sstevel@tonic-gate 		}
22230Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
22240Sstevel@tonic-gate 			    sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
22250Sstevel@tonic-gate 			rval = -1;
22260Sstevel@tonic-gate 			goto out;
22270Sstevel@tonic-gate 		}
22280Sstevel@tonic-gate 		suspend1_flag = 1;
22290Sstevel@tonic-gate 		nd = nd->nd_next;
22300Sstevel@tonic-gate 	}
22310Sstevel@tonic-gate 
22320Sstevel@tonic-gate 	/*
22330Sstevel@tonic-gate 	 * Verify that this host is a member (in the host list) of the set.
22340Sstevel@tonic-gate 	 */
22350Sstevel@tonic-gate 	nd = sd->sd_nodelist;
22360Sstevel@tonic-gate 	while (nd) {
22370Sstevel@tonic-gate 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
22380Sstevel@tonic-gate 			break;
22390Sstevel@tonic-gate 		}
22400Sstevel@tonic-gate 		nd = nd->nd_next;
22410Sstevel@tonic-gate 	}
22420Sstevel@tonic-gate 	if (!nd) {
22430Sstevel@tonic-gate 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
22440Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_nodename, NULL,
22450Sstevel@tonic-gate 			sp->setname);
22460Sstevel@tonic-gate 		rval = -1;
22470Sstevel@tonic-gate 		goto out;
22480Sstevel@tonic-gate 	}
22490Sstevel@tonic-gate 
22500Sstevel@tonic-gate 	/*
22510Sstevel@tonic-gate 	 * Need to return failure if host is already 'joined'
22520Sstevel@tonic-gate 	 * into the set.  This is done so that if later the user
22530Sstevel@tonic-gate 	 * issues a command to join all sets and a failure is
22540Sstevel@tonic-gate 	 * encountered - that the resulting cleanup effort
22550Sstevel@tonic-gate 	 * (withdrawing from all sets that were joined
22560Sstevel@tonic-gate 	 * during that command) won't withdraw from this set.
22570Sstevel@tonic-gate 	 */
22580Sstevel@tonic-gate 	if (nd->nd_flags & MD_MN_NODE_OWN) {
22590Sstevel@tonic-gate 		rval = -2;
22600Sstevel@tonic-gate 		goto out2;
22610Sstevel@tonic-gate 	}
22620Sstevel@tonic-gate 
22630Sstevel@tonic-gate 	/*
22640Sstevel@tonic-gate 	 * Call metaget_setownership that calls each node in diskset and
22650Sstevel@tonic-gate 	 * marks in set descriptor if node is an owner of the set or not.
22660Sstevel@tonic-gate 	 * metaget_setownership checks to see if a node is an owner by
22670Sstevel@tonic-gate 	 * checking to see if that node's kernel has the mddb loaded.
22680Sstevel@tonic-gate 	 * If a node had panic'd during a reconfig or an
22690Sstevel@tonic-gate 	 * add/delete/join/withdraw operation, the other nodes' node
22700Sstevel@tonic-gate 	 * records may not reflect the current state of the diskset,
22710Sstevel@tonic-gate 	 * so calling metaget_setownership is the safest thing to do.
22720Sstevel@tonic-gate 	 */
22730Sstevel@tonic-gate 	if (metaget_setownership(sp, ep) == -1) {
22740Sstevel@tonic-gate 		rval = -1;
22750Sstevel@tonic-gate 		goto out;
22760Sstevel@tonic-gate 	}
22770Sstevel@tonic-gate 
22780Sstevel@tonic-gate 	/* If first active member of diskset, become the master. */
22790Sstevel@tonic-gate 	nd = sd->sd_nodelist;
22800Sstevel@tonic-gate 	while (nd) {
22810Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_OWN)
22820Sstevel@tonic-gate 			break;
22830Sstevel@tonic-gate 		nd = nd->nd_next;
22840Sstevel@tonic-gate 	}
22850Sstevel@tonic-gate 	if (nd == NULL)
22860Sstevel@tonic-gate 		master_flag = 1;
22870Sstevel@tonic-gate 
22880Sstevel@tonic-gate 	/*
22890Sstevel@tonic-gate 	 * If not first active member of diskset, then get the
22900Sstevel@tonic-gate 	 * master information from a node that is already joined
22910Sstevel@tonic-gate 	 * and set the master information for this node.  Be sure
22920Sstevel@tonic-gate 	 * that this node (the already joined node) has its own
22930Sstevel@tonic-gate 	 * join flag set.  If not, then this diskset isn't currently
22940Sstevel@tonic-gate 	 * consistent and shouldn't allow a node to join.  This diskset
22950Sstevel@tonic-gate 	 * inconsistency should only occur when a node has panic'd in
22960Sstevel@tonic-gate 	 * the set while doing a metaset operation and the sysadmin is
22970Sstevel@tonic-gate 	 * attempting to join a node into the set.  This inconsistency
22980Sstevel@tonic-gate 	 * will be fixed during a reconfig cycle which should be occurring
22990Sstevel@tonic-gate 	 * soon since a node panic'd.
23000Sstevel@tonic-gate 	 *
23010Sstevel@tonic-gate 	 * If unable to get this information from an owning node, then
23020Sstevel@tonic-gate 	 * this diskset isn't currently consistent and shouldn't
23030Sstevel@tonic-gate 	 * allow a node to join.
23040Sstevel@tonic-gate 	 */
23050Sstevel@tonic-gate 	if (!master_flag) {
23060Sstevel@tonic-gate 		/* get master information from an owner (joined) node */
23070Sstevel@tonic-gate 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
23080Sstevel@tonic-gate 		    sp->setno, &mas_mnsr, ep) == -1) {
23090Sstevel@tonic-gate 			rval = -1;
23100Sstevel@tonic-gate 			goto out;
23110Sstevel@tonic-gate 		}
23120Sstevel@tonic-gate 
23130Sstevel@tonic-gate 		/* Verify that owner (joined) node has its own JOIN flag set */
23140Sstevel@tonic-gate 		nr = mas_mnsr->sr_nodechain;
23150Sstevel@tonic-gate 		while (nr) {
23160Sstevel@tonic-gate 			if ((nd->nd_nodeid == nr->nr_nodeid) &&
23170Sstevel@tonic-gate 			    ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) {
23180Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_NODENOSET,
23190Sstevel@tonic-gate 				    sp->setno, nd->nd_nodename, NULL,
23200Sstevel@tonic-gate 				    nd->nd_nodename);
23210Sstevel@tonic-gate 				free_sr((md_set_record *)mas_mnsr);
23220Sstevel@tonic-gate 				rval = -1;
23230Sstevel@tonic-gate 				goto out;
23240Sstevel@tonic-gate 			}
23250Sstevel@tonic-gate 			nr = nr->nr_next;
23260Sstevel@tonic-gate 		}
23270Sstevel@tonic-gate 
23280Sstevel@tonic-gate 		/*
23290Sstevel@tonic-gate 		 * Does master have set marked as STALE?
23300Sstevel@tonic-gate 		 * If so, need to pass this down to kernel when
23310Sstevel@tonic-gate 		 * this node snarfs the set.
23320Sstevel@tonic-gate 		 */
23330Sstevel@tonic-gate 		if (clnt_mn_is_stale(nd->nd_nodename, sp,
23340Sstevel@tonic-gate 		    &stale_bool, ep) == -1) {
23350Sstevel@tonic-gate 			rval = -1;
23360Sstevel@tonic-gate 			goto out;
23370Sstevel@tonic-gate 		}
23380Sstevel@tonic-gate 
23390Sstevel@tonic-gate 		/* set master information in my rpc.metad's set record */
23400Sstevel@tonic-gate 		if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm,
23410Sstevel@tonic-gate 		    mas_mnsr->sr_master_nodeid, ep)) {
23420Sstevel@tonic-gate 			free_sr((md_set_record *)mas_mnsr);
23430Sstevel@tonic-gate 			rval = -1;
23440Sstevel@tonic-gate 			goto out;
23450Sstevel@tonic-gate 		}
23460Sstevel@tonic-gate 
23470Sstevel@tonic-gate 		/* set master information in my cached set desc */
23480Sstevel@tonic-gate 		(void) strcpy(sd->sd_mn_master_nodenm,
23490Sstevel@tonic-gate 		    mas_mnsr->sr_master_nodenm);
23500Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid;
23510Sstevel@tonic-gate 		nd2 = sd->sd_nodelist;
23520Sstevel@tonic-gate 		while (nd2) {
23530Sstevel@tonic-gate 		    if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) {
23540Sstevel@tonic-gate 			sd->sd_mn_masternode = nd2;
23550Sstevel@tonic-gate 			break;
23560Sstevel@tonic-gate 		    }
23570Sstevel@tonic-gate 		    nd2 = nd2->nd_next;
23580Sstevel@tonic-gate 		}
23590Sstevel@tonic-gate 		free_sr((md_set_record *)mas_mnsr);
23600Sstevel@tonic-gate 
23610Sstevel@tonic-gate 		/*
23620Sstevel@tonic-gate 		 * Set the node flags in mynode's rpc.metad node records for
23630Sstevel@tonic-gate 		 * the nodes that are in the diskset.  Can use my sd
23640Sstevel@tonic-gate 		 * since earlier call to metaget_setownership set the
23650Sstevel@tonic-gate 		 * owner flags based on whether that node had snarfed
23660Sstevel@tonic-gate 		 * the MN diskset mddb.  Reconfig steps guarantee that
23670Sstevel@tonic-gate 		 * return of metaget_setownership will match the owning
23680Sstevel@tonic-gate 		 * node's owner list except in the case where a node
23690Sstevel@tonic-gate 		 * has just panic'd and in this case, a reconfig will
23700Sstevel@tonic-gate 		 * be starting immediately and the owner lists will
23710Sstevel@tonic-gate 		 * be sync'd up by the reconfig.
23720Sstevel@tonic-gate 		 *
23730Sstevel@tonic-gate 		 * Flag of SET means to take no action except to
23740Sstevel@tonic-gate 		 * set the node flags as given in the nodelist linked list.
23750Sstevel@tonic-gate 		 */
23760Sstevel@tonic-gate 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
23770Sstevel@tonic-gate 		    MD_NR_SET, NULL, ep)) {
23780Sstevel@tonic-gate 			rval = -1;
23790Sstevel@tonic-gate 			goto out;
23800Sstevel@tonic-gate 		}
23810Sstevel@tonic-gate 	}
23820Sstevel@tonic-gate 
23830Sstevel@tonic-gate 	/*
23840Sstevel@tonic-gate 	 * Read in the mddb if there are drives in the set.
23850Sstevel@tonic-gate 	 */
23860Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
23870Sstevel@tonic-gate 	    ep)) == NULL) {
23880Sstevel@tonic-gate 		/* No drives in list */
23890Sstevel@tonic-gate 		if (! mdisok(ep)) {
23900Sstevel@tonic-gate 			rval = -1;
23910Sstevel@tonic-gate 			goto out;
23920Sstevel@tonic-gate 		}
23930Sstevel@tonic-gate 		rval = -2;
23940Sstevel@tonic-gate 		goto out;
23950Sstevel@tonic-gate 	}
23960Sstevel@tonic-gate 
23970Sstevel@tonic-gate 	/*
23980Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
23990Sstevel@tonic-gate 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
24000Sstevel@tonic-gate 	 * then change the nodelist followed by a reinit and resume.
24010Sstevel@tonic-gate 	 */
24020Sstevel@tonic-gate 	nd = sd->sd_nodelist;
24030Sstevel@tonic-gate 	while (nd) {
24040Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
24050Sstevel@tonic-gate 			nd = nd->nd_next;
24060Sstevel@tonic-gate 			continue;
24070Sstevel@tonic-gate 		}
24080Sstevel@tonic-gate 
24090Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp,
24100Sstevel@tonic-gate 		    MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
24110Sstevel@tonic-gate 			rval = -1;
24120Sstevel@tonic-gate 			goto out;
24130Sstevel@tonic-gate 		}
24140Sstevel@tonic-gate 		suspendall_flag = 1;
24150Sstevel@tonic-gate 		nd = nd->nd_next;
24160Sstevel@tonic-gate 	}
24170Sstevel@tonic-gate 
24180Sstevel@tonic-gate 	/* Set master in my set record in rpc.metad */
24190Sstevel@tonic-gate 	if (master_flag) {
24200Sstevel@tonic-gate 		if (clnt_mnsetmaster(mynode(), sp,
24210Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodename,
24220Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
24230Sstevel@tonic-gate 			rval = -1;
24240Sstevel@tonic-gate 			goto out;
24250Sstevel@tonic-gate 		}
24260Sstevel@tonic-gate 	}
2427650Sskamm 	/*
2428650Sskamm 	 * Causes mddbs to be loaded into the kernel.
2429650Sskamm 	 * Set the force flag so that replica locations can be
2430650Sskamm 	 * loaded into the kernel even if a mediator node was
2431650Sskamm 	 * unavailable.  This allows a node to join an MO
2432650Sskamm 	 * diskset when there are sufficient replicas available,
2433650Sskamm 	 * but a mediator node in unavailable.
2434650Sskamm 	 */
2435650Sskamm 	if (setup_db_bydd(sp, dd, TRUE, ep) == -1) {
24360Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
24370Sstevel@tonic-gate 		    "Host not able to start diskset."));
24380Sstevel@tonic-gate 		rval = -1;
24390Sstevel@tonic-gate 		goto out;
24400Sstevel@tonic-gate 	}
24410Sstevel@tonic-gate 
24420Sstevel@tonic-gate 	if (! mdisok(ep)) {
24430Sstevel@tonic-gate 		rval = -1;
24440Sstevel@tonic-gate 		goto out;
24450Sstevel@tonic-gate 	}
24460Sstevel@tonic-gate 
24470Sstevel@tonic-gate 	/*
24480Sstevel@tonic-gate 	 * Set rollback flags to 1 so that halt_set is called if a failure
24490Sstevel@tonic-gate 	 * is seen after this point.  If snarf_set fails, still need to
24500Sstevel@tonic-gate 	 * call halt_set to cleanup the diskset.
24510Sstevel@tonic-gate 	 */
24520Sstevel@tonic-gate 	rb_flags = 1;
24530Sstevel@tonic-gate 
24540Sstevel@tonic-gate 	/* Starts the set */
24550Sstevel@tonic-gate 	if (snarf_set(sp, stale_bool, ep) != 0) {
24560Sstevel@tonic-gate 		if (mdismddberror(ep, MDE_DB_STALE)) {
24570Sstevel@tonic-gate 			/*
24580Sstevel@tonic-gate 			 * Don't fail join, STALE means that set has
24590Sstevel@tonic-gate 			 * < 50% mddbs.
24600Sstevel@tonic-gate 			 */
24610Sstevel@tonic-gate 			(void) mdstealerror(&ep_snarf, ep);
24620Sstevel@tonic-gate 			stale_set = 1;
24630Sstevel@tonic-gate 		} else if (mdisok(ep)) {
24640Sstevel@tonic-gate 			/* If snarf failed, but no error was set - set it */
246562Sjeanm 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
24660Sstevel@tonic-gate 			    sp->setno, 0, NULL);
24670Sstevel@tonic-gate 				rval = -1;
24680Sstevel@tonic-gate 				goto out;
24690Sstevel@tonic-gate 		} else if (!(mdismddberror(ep, MDE_DB_ACCOK))) {
24700Sstevel@tonic-gate 			/*
24710Sstevel@tonic-gate 			 * Don't fail join if ACCOK; ACCOK means that mediator
24720Sstevel@tonic-gate 			 * provided extra vote.
24730Sstevel@tonic-gate 			 */
24740Sstevel@tonic-gate 			rval = -1;
24750Sstevel@tonic-gate 			goto out;
24760Sstevel@tonic-gate 		}
24770Sstevel@tonic-gate 	}
24780Sstevel@tonic-gate 
24790Sstevel@tonic-gate 	/* Did set really get snarfed? */
24800Sstevel@tonic-gate 	if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) {
24810Sstevel@tonic-gate 		if (mdisok(ep)) {
24820Sstevel@tonic-gate 			/* If snarf failed, but no error was set - set it */
248362Sjeanm 			(void) mdmddberror(ep, MDE_DB_NOTNOW, (minor_t)NODEV64,
24840Sstevel@tonic-gate 				sp->setno, 0, NULL);
24850Sstevel@tonic-gate 		}
24860Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
24870Sstevel@tonic-gate 		    "Host not able to start diskset."));
24880Sstevel@tonic-gate 		rval = -1;
24890Sstevel@tonic-gate 		goto out;
24900Sstevel@tonic-gate 	}
24910Sstevel@tonic-gate 
24920Sstevel@tonic-gate 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
24930Sstevel@tonic-gate 	send_reinit = 1;
24940Sstevel@tonic-gate 
24950Sstevel@tonic-gate 	/* If first node to enter set, setup master and clear change log */
24960Sstevel@tonic-gate 	if (master_flag) {
24970Sstevel@tonic-gate 		/* Set master in my locally cached set descriptor */
24980Sstevel@tonic-gate 		(void) strcpy(sd->sd_mn_master_nodenm,
24990Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodename);
25000Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
25010Sstevel@tonic-gate 		sd->sd_mn_am_i_master = 1;
25020Sstevel@tonic-gate 
25030Sstevel@tonic-gate 		/*
25040Sstevel@tonic-gate 		 * If first node to join set, then clear out change log
25050Sstevel@tonic-gate 		 * entries.  Change log entries are only needed when a
25060Sstevel@tonic-gate 		 * change of master is occurring in a diskset that has
25070Sstevel@tonic-gate 		 * multiple owners.   Since this node is the first owner
25080Sstevel@tonic-gate 		 * of the diskset, clear the entries.
25090Sstevel@tonic-gate 		 *
25100Sstevel@tonic-gate 		 * Only do this if we are in a single node non-SC3.x
25110Sstevel@tonic-gate 		 * situation.
25120Sstevel@tonic-gate 		 */
25130Sstevel@tonic-gate 		if (meta_mn_singlenode() &&
25140Sstevel@tonic-gate 			mdmn_reset_changelog(sp, ep,  MDMN_CLF_RESETLOG) != 0) {
25150Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
25160Sstevel@tonic-gate 			    "Unable to reset changelog."));
25170Sstevel@tonic-gate 			rval = -1;
25180Sstevel@tonic-gate 			goto out;
25190Sstevel@tonic-gate 		}
25200Sstevel@tonic-gate 	}
25210Sstevel@tonic-gate 
25220Sstevel@tonic-gate 	/* Set my locally cached flag */
25230Sstevel@tonic-gate 	sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
25240Sstevel@tonic-gate 
25250Sstevel@tonic-gate 	/*
25260Sstevel@tonic-gate 	 * Set this node's own flag on all joined nodes in the set
25270Sstevel@tonic-gate 	 * (including my node).
25280Sstevel@tonic-gate 	 */
25290Sstevel@tonic-gate 	clear_nr_flags = 1;
25300Sstevel@tonic-gate 
25310Sstevel@tonic-gate 	my_nd = *(sd->sd_mn_mynode);
25320Sstevel@tonic-gate 	my_nd.nd_next = NULL;
25330Sstevel@tonic-gate 	nd = sd->sd_nodelist;
25340Sstevel@tonic-gate 	while (nd) {
25350Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
25360Sstevel@tonic-gate 			nd = nd->nd_next;
25370Sstevel@tonic-gate 			continue;
25380Sstevel@tonic-gate 		}
25390Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
25400Sstevel@tonic-gate 		    MD_NR_JOIN, NULL, ep)) {
25410Sstevel@tonic-gate 			rval = -1;
25420Sstevel@tonic-gate 			goto out;
25430Sstevel@tonic-gate 		}
25440Sstevel@tonic-gate 		nd = nd->nd_next;
25450Sstevel@tonic-gate 	}
25460Sstevel@tonic-gate 
25470Sstevel@tonic-gate out:
25480Sstevel@tonic-gate 	if (rval != NULL) {
25490Sstevel@tonic-gate 		/*
25500Sstevel@tonic-gate 		 * If rollback flag is 1, then node was joined to set.
25510Sstevel@tonic-gate 		 * Since an error occurred, withdraw node from set in
25520Sstevel@tonic-gate 		 * order to rollback to before command was run.
25530Sstevel@tonic-gate 		 * Need to preserve ep so that calling function can
25540Sstevel@tonic-gate 		 * get error information.
25550Sstevel@tonic-gate 		 */
25560Sstevel@tonic-gate 		if (rb_flags == 1) {
25570Sstevel@tonic-gate 			if (halt_set(sp, &xep)) {
25580Sstevel@tonic-gate 				mdclrerror(&xep);
25590Sstevel@tonic-gate 			}
25600Sstevel@tonic-gate 		}
25610Sstevel@tonic-gate 
25620Sstevel@tonic-gate 		/*
25630Sstevel@tonic-gate 		 * If error, reset master to INVALID.
25640Sstevel@tonic-gate 		 * Ignore error since (next) first node to successfully join
25650Sstevel@tonic-gate 		 * will set master on all nodes.
25660Sstevel@tonic-gate 		 */
25670Sstevel@tonic-gate 		(void) clnt_mnsetmaster(mynode(), sp, "",
25680Sstevel@tonic-gate 			MD_MN_INVALID_NID, &xep);
25690Sstevel@tonic-gate 		mdclrerror(&xep);
25700Sstevel@tonic-gate 		/* Reset master in my locally cached set descriptor */
25710Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
25720Sstevel@tonic-gate 		sd->sd_mn_am_i_master = 0;
25730Sstevel@tonic-gate 
25740Sstevel@tonic-gate 		/*
25750Sstevel@tonic-gate 		 * If nr flags set on other nodes, reset them.
25760Sstevel@tonic-gate 		 */
25770Sstevel@tonic-gate 		if (clear_nr_flags) {
25780Sstevel@tonic-gate 			nd = sd->sd_nodelist;
25790Sstevel@tonic-gate 			while (nd) {
25800Sstevel@tonic-gate 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
25810Sstevel@tonic-gate 					nd = nd->nd_next;
25820Sstevel@tonic-gate 					continue;
25830Sstevel@tonic-gate 				}
25840Sstevel@tonic-gate 				(void) clnt_upd_nr_flags(nd->nd_nodename, sp,
25850Sstevel@tonic-gate 					&my_nd, MD_NR_WITHDRAW, NULL, &xep);
25860Sstevel@tonic-gate 				mdclrerror(&xep);
25870Sstevel@tonic-gate 				nd = nd->nd_next;
25880Sstevel@tonic-gate 			}
25890Sstevel@tonic-gate 			/* Reset my locally cached flag */
25900Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN;
25910Sstevel@tonic-gate 		}
25920Sstevel@tonic-gate 	}
25930Sstevel@tonic-gate 
25940Sstevel@tonic-gate 	/*
25950Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
25960Sstevel@tonic-gate 	 * Send reinit command to mdcommd which forces it to get
25970Sstevel@tonic-gate 	 * fresh set description.
25980Sstevel@tonic-gate 	 */
25990Sstevel@tonic-gate 	if (send_reinit) {
26000Sstevel@tonic-gate 		/* Send reinit */
26010Sstevel@tonic-gate 		nd = sd->sd_nodelist;
26020Sstevel@tonic-gate 		while (nd) {
26030Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
26040Sstevel@tonic-gate 				nd = nd->nd_next;
26050Sstevel@tonic-gate 				continue;
26060Sstevel@tonic-gate 			}
26070Sstevel@tonic-gate 
26080Sstevel@tonic-gate 			/* Class is ignored for REINIT */
26090Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
26100Sstevel@tonic-gate 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
26110Sstevel@tonic-gate 				/*
26120Sstevel@tonic-gate 				 * We are here because we failed to resume
26130Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
26140Sstevel@tonic-gate 				 * an error from the previous call
26150Sstevel@tonic-gate 				 * If the previous call did fail,  we capture
26160Sstevel@tonic-gate 				 * that error and generate a perror with
26170Sstevel@tonic-gate 				 * the string, "Unable to resume...".
26180Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
26190Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
26200Sstevel@tonic-gate 				 * clobbered.
26210Sstevel@tonic-gate 				 */
26220Sstevel@tonic-gate 				if (rval == 0)
26230Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
26240Sstevel@tonic-gate 				else
26250Sstevel@tonic-gate 					mdclrerror(&xep);
26260Sstevel@tonic-gate 				rval = -1;
26270Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
26280Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd."));
26290Sstevel@tonic-gate 			}
26300Sstevel@tonic-gate 			nd = nd->nd_next;
26310Sstevel@tonic-gate 		}
26320Sstevel@tonic-gate 
26330Sstevel@tonic-gate 	}
26340Sstevel@tonic-gate 
26350Sstevel@tonic-gate out2:
26360Sstevel@tonic-gate 	/*
26370Sstevel@tonic-gate 	 * Unlock diskset by resuming messages across the diskset.
26380Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
26390Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
26400Sstevel@tonic-gate 	 */
26410Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
26420Sstevel@tonic-gate 		nd = sd->sd_nodelist;
26430Sstevel@tonic-gate 		while (nd) {
26440Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
26450Sstevel@tonic-gate 				nd = nd->nd_next;
26460Sstevel@tonic-gate 				continue;
26470Sstevel@tonic-gate 			}
26480Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
26490Sstevel@tonic-gate 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
26500Sstevel@tonic-gate 				/*
26510Sstevel@tonic-gate 				 * We are here because we failed to resume
26520Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
26530Sstevel@tonic-gate 				 * an error from the previous call
26540Sstevel@tonic-gate 				 * If the previous call did fail,  we capture
26550Sstevel@tonic-gate 				 * that error and generate a perror with
26560Sstevel@tonic-gate 				 * the string, "Unable to resume...".
26570Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
26580Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
26590Sstevel@tonic-gate 				 * clobbered.
26600Sstevel@tonic-gate 				 */
26610Sstevel@tonic-gate 				if (rval == 0)
26620Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
26630Sstevel@tonic-gate 				else
26640Sstevel@tonic-gate 					mdclrerror(&xep);
26650Sstevel@tonic-gate 				rval = -1;
26660Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
26670Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd."));
26680Sstevel@tonic-gate 			}
26690Sstevel@tonic-gate 			nd = nd->nd_next;
26700Sstevel@tonic-gate 		}
26710Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
26720Sstevel@tonic-gate 	}
26730Sstevel@tonic-gate 
26740Sstevel@tonic-gate 	/*
26750Sstevel@tonic-gate 	 * Unlock set.  This flushes the caches on the servers.
26760Sstevel@tonic-gate 	 */
26770Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
26780Sstevel@tonic-gate 	nd = sd->sd_nodelist;
26790Sstevel@tonic-gate 	while (nd) {
26800Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
26810Sstevel@tonic-gate 			nd = nd->nd_next;
26820Sstevel@tonic-gate 			continue;
26830Sstevel@tonic-gate 		}
26840Sstevel@tonic-gate 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
26850Sstevel@tonic-gate 			if (rval == 0)
26860Sstevel@tonic-gate 				(void) mdstealerror(ep, &xep);
26870Sstevel@tonic-gate 			else
26880Sstevel@tonic-gate 				mdclrerror(&xep);
26890Sstevel@tonic-gate 			rval = -1;
26900Sstevel@tonic-gate 		}
26910Sstevel@tonic-gate 		nd = nd->nd_next;
26920Sstevel@tonic-gate 	}
26930Sstevel@tonic-gate 
26940Sstevel@tonic-gate 	/*
26950Sstevel@tonic-gate 	 * If this node is the last to join the diskset and clustering isn't
26960Sstevel@tonic-gate 	 * running, then resync the mirrors in the diskset. We have to wait
26970Sstevel@tonic-gate 	 * until all nodes are joined so that the status gets propagated to
26980Sstevel@tonic-gate 	 * all of the members of the set.
26990Sstevel@tonic-gate 	 * Ignore any error from the resync as the join function shouldn't fail
27000Sstevel@tonic-gate 	 * because the mirror resync had a problem.
27010Sstevel@tonic-gate 	 *
27020Sstevel@tonic-gate 	 * Don't start resync if set is stale.
27030Sstevel@tonic-gate 	 */
27040Sstevel@tonic-gate 	if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) &&
27050Sstevel@tonic-gate 	    (stale_set != 1)) {
27060Sstevel@tonic-gate 		nd = sd->sd_nodelist;
27070Sstevel@tonic-gate 		while (nd) {
27080Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN))
27090Sstevel@tonic-gate 				break;
27100Sstevel@tonic-gate 			nd = nd->nd_next;
27110Sstevel@tonic-gate 		}
27120Sstevel@tonic-gate 		/*
27130Sstevel@tonic-gate 		 * nd set to NULL means that we have no nodes in the set that
27140Sstevel@tonic-gate 		 * haven't joined. In this case we start the resync.
27150Sstevel@tonic-gate 		 */
27160Sstevel@tonic-gate 		if (nd == NULL) {
27170Sstevel@tonic-gate 			(void) meta_mirror_resync_all(sp, 0, &xep);
27180Sstevel@tonic-gate 			mdclrerror(&xep);
27190Sstevel@tonic-gate 		}
27200Sstevel@tonic-gate 	}
27210Sstevel@tonic-gate 
27220Sstevel@tonic-gate 	/* Update ABR state for all soft partitions */
27230Sstevel@tonic-gate 	(void) meta_sp_update_abr(sp, &xep);
27240Sstevel@tonic-gate 	mdclrerror(&xep);
27250Sstevel@tonic-gate 
27260Sstevel@tonic-gate 	/*
27270Sstevel@tonic-gate 	 * call metaflushsetnames to reset local cache for master and
27280Sstevel@tonic-gate 	 * node information.
27290Sstevel@tonic-gate 	 */
27300Sstevel@tonic-gate 	metaflushsetname(sp);
27310Sstevel@tonic-gate 
27320Sstevel@tonic-gate 	/* release signals back to what they were on entry */
27330Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
27340Sstevel@tonic-gate 		mdclrerror(&xep);
27350Sstevel@tonic-gate 
27360Sstevel@tonic-gate 	/*
27370Sstevel@tonic-gate 	 * If no error and stale_set is set, then set ep back
27380Sstevel@tonic-gate 	 * to ep from snarf_set call and return -3.  If another error
27390Sstevel@tonic-gate 	 * occurred and rval is not 0, then that error would have
27400Sstevel@tonic-gate 	 * caused the node to be withdrawn from the set and would
27410Sstevel@tonic-gate 	 * have set ep to that error information.
27420Sstevel@tonic-gate 	 */
27430Sstevel@tonic-gate 	if ((rval == 0) && (stale_set)) {
27440Sstevel@tonic-gate 		(void) mdstealerror(ep, &ep_snarf);
27450Sstevel@tonic-gate 		return (-3);
27460Sstevel@tonic-gate 	}
27470Sstevel@tonic-gate 
27480Sstevel@tonic-gate 	return (rval);
27490Sstevel@tonic-gate }
27500Sstevel@tonic-gate 
27510Sstevel@tonic-gate /*
27520Sstevel@tonic-gate  * Entry point to withdraw a node from MultiNode diskset.
27530Sstevel@tonic-gate  *
27540Sstevel@tonic-gate  * Validate host in diskset.
27550Sstevel@tonic-gate  *	- Should be joined into diskset.
27560Sstevel@tonic-gate  * Assume valid configuration is stored in the set/drive/node records
27570Sstevel@tonic-gate  * in the local mddb since no node or drive can be added to the MNset
27580Sstevel@tonic-gate  * unless all drives and nodes are available.  Reconfig steps will
27590Sstevel@tonic-gate  * resync all ALIVE nodes in case of panic in critical areas.
27600Sstevel@tonic-gate  *
27610Sstevel@tonic-gate  * Lock down the set.
27620Sstevel@tonic-gate  * Verify that drives exist in configuration.
27630Sstevel@tonic-gate  * Verify host is a member of this diskset.
27640Sstevel@tonic-gate  * Verify host is an owner of the diskset (host is joined to diskset).
27650Sstevel@tonic-gate  * Only allow withdrawal of master node if master node is the only joined
27660Sstevel@tonic-gate  * in the diskset.
27670Sstevel@tonic-gate  * Halt the diskset on this node.
27680Sstevel@tonic-gate  * Reset Master on this node.
27690Sstevel@tonic-gate  * Updated node flags that this node with withdrawn.
27700Sstevel@tonic-gate  * Unlock the set.
27710Sstevel@tonic-gate  *
27720Sstevel@tonic-gate  * Return values:
27730Sstevel@tonic-gate  *	0  - Node successfully withdrew from set.
27740Sstevel@tonic-gate  *	-1 - Withdrawal attempted but failed
27750Sstevel@tonic-gate  *		- any failure from libmeta calls
27760Sstevel@tonic-gate  *		- node not in the member list
27770Sstevel@tonic-gate  *	-2 - Withdrawal not attempted since
27780Sstevel@tonic-gate  *		- this set had no drives in set
27790Sstevel@tonic-gate  *		- this node not joined to set
27800Sstevel@tonic-gate  *		- set is not a multinode set
27810Sstevel@tonic-gate  */
27820Sstevel@tonic-gate extern int
27830Sstevel@tonic-gate meta_set_withdraw(
27840Sstevel@tonic-gate 	mdsetname_t	*sp,
27850Sstevel@tonic-gate 	md_error_t	*ep
27860Sstevel@tonic-gate )
27870Sstevel@tonic-gate {
27880Sstevel@tonic-gate 	md_set_desc		*sd;
27890Sstevel@tonic-gate 	md_drive_desc		*dd = 0;
27900Sstevel@tonic-gate 	md_mnnode_desc		*nd, my_nd;
27910Sstevel@tonic-gate 	int			rval = 0;
27920Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
27930Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
27940Sstevel@tonic-gate 	int			set_halted = 0;
27950Sstevel@tonic-gate 	int			suspendall_flag = 0;
27960Sstevel@tonic-gate 	int			suspend1_flag = 0;
27970Sstevel@tonic-gate 	bool_t			stale_bool = FALSE;
27980Sstevel@tonic-gate 	mddb_config_t		c;
27990Sstevel@tonic-gate 	int			node_id_list[1];
28000Sstevel@tonic-gate 	sigset_t		oldsigs;
28010Sstevel@tonic-gate 	int			send_reinit = 0;
28020Sstevel@tonic-gate 
28030Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
28040Sstevel@tonic-gate 		return (-1);
28050Sstevel@tonic-gate 	}
28060Sstevel@tonic-gate 
28070Sstevel@tonic-gate 	/* Must be a multinode diskset */
28080Sstevel@tonic-gate 	if (!MD_MNSET_DESC(sd)) {
28090Sstevel@tonic-gate 		(void) mderror(ep, MDE_NOT_MN, sp->setname);
28100Sstevel@tonic-gate 		return (-1);
28110Sstevel@tonic-gate 	}
28120Sstevel@tonic-gate 
28130Sstevel@tonic-gate 	/* Make sure we are blocking all signals */
28140Sstevel@tonic-gate 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
28150Sstevel@tonic-gate 		mdclrerror(&xep);
28160Sstevel@tonic-gate 
28170Sstevel@tonic-gate 	/*
28180Sstevel@tonic-gate 	 * Lock the set on current set members.
28190Sstevel@tonic-gate 	 * For MN diskset lock_set and SUSPEND are used to protect against
28200Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
28210Sstevel@tonic-gate 	 */
28220Sstevel@tonic-gate 	nd = sd->sd_nodelist;
28230Sstevel@tonic-gate 	while (nd) {
28240Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
28250Sstevel@tonic-gate 			nd = nd->nd_next;
28260Sstevel@tonic-gate 			continue;
28270Sstevel@tonic-gate 		}
28280Sstevel@tonic-gate 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
28290Sstevel@tonic-gate 			rval = -1;
28300Sstevel@tonic-gate 			goto out;
28310Sstevel@tonic-gate 		}
28320Sstevel@tonic-gate 		nd = nd->nd_next;
28330Sstevel@tonic-gate 	}
28340Sstevel@tonic-gate 	/*
28350Sstevel@tonic-gate 	 * Lock out other meta* commands by suspending
28360Sstevel@tonic-gate 	 * class 1 messages across the diskset.
28370Sstevel@tonic-gate 	 */
28380Sstevel@tonic-gate 	nd = sd->sd_nodelist;
28390Sstevel@tonic-gate 	while (nd) {
28400Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
28410Sstevel@tonic-gate 			nd = nd->nd_next;
28420Sstevel@tonic-gate 			continue;
28430Sstevel@tonic-gate 		}
28440Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
28450Sstevel@tonic-gate 			sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) {
28460Sstevel@tonic-gate 			rval = -1;
28470Sstevel@tonic-gate 			goto out;
28480Sstevel@tonic-gate 		}
28490Sstevel@tonic-gate 		suspend1_flag = 1;
28500Sstevel@tonic-gate 		nd = nd->nd_next;
28510Sstevel@tonic-gate 	}
28520Sstevel@tonic-gate 
28530Sstevel@tonic-gate 	/* Get list of drives - needed in case of failure */
28540Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
28550Sstevel@tonic-gate 	    ep)) == NULL) {
28560Sstevel@tonic-gate 		/* Error getting drives in list */
28570Sstevel@tonic-gate 		if (! mdisok(ep)) {
28580Sstevel@tonic-gate 			rval = -1;
28590Sstevel@tonic-gate 			goto out2;
28600Sstevel@tonic-gate 		}
28610Sstevel@tonic-gate 		/* no drives in list */
28620Sstevel@tonic-gate 		rval = -2;
28630Sstevel@tonic-gate 		goto out2;
28640Sstevel@tonic-gate 	}
28650Sstevel@tonic-gate 
28660Sstevel@tonic-gate 	/*
28670Sstevel@tonic-gate 	 * Verify that this host is a member (in the host list) of the set.
28680Sstevel@tonic-gate 	 */
28690Sstevel@tonic-gate 	nd = sd->sd_nodelist;
28700Sstevel@tonic-gate 	while (nd) {
28710Sstevel@tonic-gate 		if (strcmp(mynode(), nd->nd_nodename) == 0) {
28720Sstevel@tonic-gate 			break;
28730Sstevel@tonic-gate 		}
28740Sstevel@tonic-gate 		nd = nd->nd_next;
28750Sstevel@tonic-gate 	}
28760Sstevel@tonic-gate 	if (!nd) {
28770Sstevel@tonic-gate 		(void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
28780Sstevel@tonic-gate 			sd->sd_mn_mynode->nd_nodename, NULL,
28790Sstevel@tonic-gate 			sp->setname);
28800Sstevel@tonic-gate 		rval = -1;
28810Sstevel@tonic-gate 		goto out2;
28820Sstevel@tonic-gate 	}
28830Sstevel@tonic-gate 
28840Sstevel@tonic-gate 	/*
28850Sstevel@tonic-gate 	 * Call metaget_setownership that calls each node in diskset and
28860Sstevel@tonic-gate 	 * marks in set descriptor if node is an owner of the set or not.
28870Sstevel@tonic-gate 	 * metaget_setownership checks to see if a node is an owner by
28880Sstevel@tonic-gate 	 * checking to see if that node's kernel has the mddb loaded.
28890Sstevel@tonic-gate 	 * If a node had panic'd during a reconfig or an
28900Sstevel@tonic-gate 	 * add/delete/join/withdraw operation, the other nodes' node
28910Sstevel@tonic-gate 	 * records may not reflect the current state of the diskset,
28920Sstevel@tonic-gate 	 * so calling metaget_setownership is the safest thing to do.
28930Sstevel@tonic-gate 	 */
28940Sstevel@tonic-gate 	if (metaget_setownership(sp, ep) == -1) {
28950Sstevel@tonic-gate 		rval = -1;
28960Sstevel@tonic-gate 		goto out2;
28970Sstevel@tonic-gate 	}
28980Sstevel@tonic-gate 
28990Sstevel@tonic-gate 	/*
29000Sstevel@tonic-gate 	 * Verify that this node is joined
29010Sstevel@tonic-gate 	 * to diskset (i.e. is an owner of the diskset).
29020Sstevel@tonic-gate 	 */
29030Sstevel@tonic-gate 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
29040Sstevel@tonic-gate 		rval = -2;
29050Sstevel@tonic-gate 		goto out2;
29060Sstevel@tonic-gate 	}
29070Sstevel@tonic-gate 
29080Sstevel@tonic-gate 	/*
29090Sstevel@tonic-gate 	 * For a MN diskset, only withdraw master if it is
29100Sstevel@tonic-gate 	 * the only joined node.
29110Sstevel@tonic-gate 	 */
29120Sstevel@tonic-gate 	if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) {
29130Sstevel@tonic-gate 		nd = sd->sd_nodelist;
29140Sstevel@tonic-gate 		while (nd) {
29150Sstevel@tonic-gate 			/* Skip my node since checking for other owners */
29160Sstevel@tonic-gate 			if (nd->nd_nodeid == sd->sd_mn_master_nodeid) {
29170Sstevel@tonic-gate 				nd = nd->nd_next;
29180Sstevel@tonic-gate 				continue;
29190Sstevel@tonic-gate 			}
29200Sstevel@tonic-gate 			/* If another owner node if found, error */
29210Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_OWN) {
29220Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_WITHDRAWMASTER,
29230Sstevel@tonic-gate 					sp->setno,
29240Sstevel@tonic-gate 					sd->sd_mn_mynode->nd_nodename, NULL,
29250Sstevel@tonic-gate 					sp->setname);
29260Sstevel@tonic-gate 				rval = -1;
29270Sstevel@tonic-gate 				goto out2;
29280Sstevel@tonic-gate 			}
29290Sstevel@tonic-gate 			nd = nd->nd_next;
29300Sstevel@tonic-gate 		}
29310Sstevel@tonic-gate 	}
29320Sstevel@tonic-gate 
29330Sstevel@tonic-gate 	/*
29340Sstevel@tonic-gate 	 * Is current set STALE?
29350Sstevel@tonic-gate 	 */
29360Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
29370Sstevel@tonic-gate 	c.c_id = 0;
29380Sstevel@tonic-gate 	c.c_setno = sp->setno;
29390Sstevel@tonic-gate 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
29400Sstevel@tonic-gate 		(void) mdstealerror(ep, &c.c_mde);
29410Sstevel@tonic-gate 		rval = -1;
29420Sstevel@tonic-gate 		goto out;
29430Sstevel@tonic-gate 	}
29440Sstevel@tonic-gate 	if (c.c_flags & MDDB_C_STALE) {
29450Sstevel@tonic-gate 		stale_bool = TRUE;
29460Sstevel@tonic-gate 	}
29470Sstevel@tonic-gate 
29480Sstevel@tonic-gate 	/*
29490Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
29500Sstevel@tonic-gate 	 * Start by suspending rpc.mdcommd (which drains it of all messages),
29510Sstevel@tonic-gate 	 * then change the nodelist followed by a reinit and resume.
29520Sstevel@tonic-gate 	 */
29530Sstevel@tonic-gate 	nd = sd->sd_nodelist;
29540Sstevel@tonic-gate 	while (nd) {
29550Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
29560Sstevel@tonic-gate 			nd = nd->nd_next;
29570Sstevel@tonic-gate 			continue;
29580Sstevel@tonic-gate 		}
29590Sstevel@tonic-gate 
29600Sstevel@tonic-gate 		if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
29610Sstevel@tonic-gate 		    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
29620Sstevel@tonic-gate 			rval = -1;
29630Sstevel@tonic-gate 			goto out;
29640Sstevel@tonic-gate 		}
29650Sstevel@tonic-gate 		suspendall_flag = 1;
29660Sstevel@tonic-gate 		nd = nd->nd_next;
29670Sstevel@tonic-gate 	}
29680Sstevel@tonic-gate 
29690Sstevel@tonic-gate 	/*
29700Sstevel@tonic-gate 	 * Withdraw the set - halt set.
29710Sstevel@tonic-gate 	 * This will fail if any I/O is occuring to any metadevice which
29720Sstevel@tonic-gate 	 * includes a resync to a mirror metadevice.
29730Sstevel@tonic-gate 	 */
29740Sstevel@tonic-gate 	set_halted = 1;
29750Sstevel@tonic-gate 	if (halt_set(sp, ep)) {
29760Sstevel@tonic-gate 		/* Was set actually halted? */
29770Sstevel@tonic-gate 		if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) {
29780Sstevel@tonic-gate 			set_halted = 0;
29790Sstevel@tonic-gate 		}
29800Sstevel@tonic-gate 		rval = -1;
29810Sstevel@tonic-gate 		goto out;
29820Sstevel@tonic-gate 	}
29830Sstevel@tonic-gate 
29840Sstevel@tonic-gate 	/* Change to nodelist so need to send reinit to rpc.mdcommd */
29850Sstevel@tonic-gate 	send_reinit = 1;
29860Sstevel@tonic-gate 
29870Sstevel@tonic-gate 	/* Reset master on withdrawn node */
29880Sstevel@tonic-gate 	if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "",
29890Sstevel@tonic-gate 	    MD_MN_INVALID_NID, ep)) {
29900Sstevel@tonic-gate 		rval = -1;
29910Sstevel@tonic-gate 		goto out;
29920Sstevel@tonic-gate 	}
29930Sstevel@tonic-gate 
29940Sstevel@tonic-gate 	/* Mark my node as withdrawn and send to other nodes */
29950Sstevel@tonic-gate 	nd = sd->sd_nodelist;
29960Sstevel@tonic-gate 	my_nd = *(sd->sd_mn_mynode);	/* structure copy */
29970Sstevel@tonic-gate 	my_nd.nd_next = NULL;
29980Sstevel@tonic-gate 	while (nd) {
29990Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
30000Sstevel@tonic-gate 			nd = nd->nd_next;
30010Sstevel@tonic-gate 			continue;
30020Sstevel@tonic-gate 		}
30030Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
30040Sstevel@tonic-gate 		    MD_NR_WITHDRAW, NULL, ep)) {
30050Sstevel@tonic-gate 			rval = -1;
30060Sstevel@tonic-gate 			goto out;
30070Sstevel@tonic-gate 		}
30080Sstevel@tonic-gate 		nd = nd->nd_next;
30090Sstevel@tonic-gate 	}
30100Sstevel@tonic-gate 
30110Sstevel@tonic-gate 	/*
30120Sstevel@tonic-gate 	 * If withdrawn node is a mirror owner, reset mirror owner
30130Sstevel@tonic-gate 	 * to NULL.  If an error occurs, print a warning and continue.
30140Sstevel@tonic-gate 	 * Don't fail metaset because of mirror owner reset problem since
30150Sstevel@tonic-gate 	 * next node to grab mirror will resolve this issue.
30160Sstevel@tonic-gate 	 * Before next node grabs mirrors, metaset will show the withdrawn
30170Sstevel@tonic-gate 	 * node as owner which is why an attempt to reset the mirror owner
30180Sstevel@tonic-gate 	 * is made.
30190Sstevel@tonic-gate 	 */
30200Sstevel@tonic-gate 	node_id_list[0] = sd->sd_mn_mynode->nd_nodeid;	/* Setup my nodeid */
30210Sstevel@tonic-gate 	nd = sd->sd_nodelist;
30220Sstevel@tonic-gate 	while (nd) {
30230Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
30240Sstevel@tonic-gate 			nd = nd->nd_next;
30250Sstevel@tonic-gate 			continue;
30260Sstevel@tonic-gate 		}
30270Sstevel@tonic-gate 		if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
30280Sstevel@tonic-gate 		    1, &node_id_list[0], &xep) == 01) {
30290Sstevel@tonic-gate 			mde_perror(&xep, dgettext(TEXT_DOMAIN,
30300Sstevel@tonic-gate 			    "Unable to reset mirror owner on node %s"),
30310Sstevel@tonic-gate 			    nd->nd_nodename);
30320Sstevel@tonic-gate 			mdclrerror(&xep);
30330Sstevel@tonic-gate 		}
30340Sstevel@tonic-gate 		nd = nd->nd_next;
30350Sstevel@tonic-gate 	}
30360Sstevel@tonic-gate 
30370Sstevel@tonic-gate out:
30380Sstevel@tonic-gate 	if (rval == -1) {
30390Sstevel@tonic-gate 		/* Rejoin node - Mark node as joined and send to other nodes */
30400Sstevel@tonic-gate 		nd = sd->sd_nodelist;
30410Sstevel@tonic-gate 		my_nd = *(sd->sd_mn_mynode);	/* structure copy */
30420Sstevel@tonic-gate 		my_nd.nd_next = NULL;
30430Sstevel@tonic-gate 		while (nd) {
30440Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
30450Sstevel@tonic-gate 				nd = nd->nd_next;
30460Sstevel@tonic-gate 				continue;
30470Sstevel@tonic-gate 			}
30480Sstevel@tonic-gate 			if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd,
30490Sstevel@tonic-gate 			    MD_NR_JOIN, NULL, &xep)) {
30500Sstevel@tonic-gate 				mdclrerror(&xep);
30510Sstevel@tonic-gate 			}
30520Sstevel@tonic-gate 			nd = nd->nd_next;
30530Sstevel@tonic-gate 		}
30540Sstevel@tonic-gate 
30550Sstevel@tonic-gate 		/* Set master on withdrawn node */
30560Sstevel@tonic-gate 		if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp,
30570Sstevel@tonic-gate 		    sd->sd_mn_master_nodenm,
30580Sstevel@tonic-gate 		    sd->sd_mn_master_nodeid, &xep)) {
30590Sstevel@tonic-gate 			mdclrerror(&xep);
30600Sstevel@tonic-gate 		}
30610Sstevel@tonic-gate 
30620Sstevel@tonic-gate 		/* Join set if halt_set had succeeded */
30630Sstevel@tonic-gate 		if (set_halted) {
3064650Sskamm 			/*
3065650Sskamm 			 * Causes mddbs to be loaded into the kernel.
3066650Sskamm 			 * Set the force flag so that replica locations can be
3067650Sskamm 			 * loaded into the kernel even if a mediator node was
3068650Sskamm 			 * unavailable.  This allows a node to join an MO
3069650Sskamm 			 * diskset when there are sufficient replicas available,
3070650Sskamm 			 * but a mediator node in unavailable.
3071650Sskamm 			 */
3072650Sskamm 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) {
30730Sstevel@tonic-gate 				mdclrerror(&xep);
30740Sstevel@tonic-gate 			}
30750Sstevel@tonic-gate 			/* If set previously stale - make it so at re-join */
30760Sstevel@tonic-gate 			if (snarf_set(sp, stale_bool, &xep) != 0) {
30770Sstevel@tonic-gate 				mdclrerror(&xep);
30780Sstevel@tonic-gate 				(void) halt_set(sp, &xep);
30790Sstevel@tonic-gate 				mdclrerror(&xep);
30800Sstevel@tonic-gate 			}
30810Sstevel@tonic-gate 		}
30820Sstevel@tonic-gate 	}
30830Sstevel@tonic-gate 
30840Sstevel@tonic-gate 	/*
30850Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
30860Sstevel@tonic-gate 	 * Send reinit command to mdcommd which forces it to get
30870Sstevel@tonic-gate 	 * fresh set description.
30880Sstevel@tonic-gate 	 */
30890Sstevel@tonic-gate 	if (send_reinit) {
30900Sstevel@tonic-gate 		/* Send reinit */
30910Sstevel@tonic-gate 		nd = sd->sd_nodelist;
30920Sstevel@tonic-gate 		while (nd) {
30930Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
30940Sstevel@tonic-gate 				nd = nd->nd_next;
30950Sstevel@tonic-gate 				continue;
30960Sstevel@tonic-gate 			}
30970Sstevel@tonic-gate 
30980Sstevel@tonic-gate 			/* Class is ignored for REINIT */
30990Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
31000Sstevel@tonic-gate 				sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
31010Sstevel@tonic-gate 				/*
31020Sstevel@tonic-gate 				 * We are here because we failed to resume
31030Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
31040Sstevel@tonic-gate 				 * an error from the previous call.
31050Sstevel@tonic-gate 				 * If the previous call did fail,  we
31060Sstevel@tonic-gate 				 * capture that error and generate a perror
31070Sstevel@tonic-gate 				 * withthe string,  "Unable to resume...".
31080Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
31090Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
31100Sstevel@tonic-gate 				 * clobbered.
31110Sstevel@tonic-gate 				 */
31120Sstevel@tonic-gate 				if (rval == 0)
31130Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
31140Sstevel@tonic-gate 				else
31150Sstevel@tonic-gate 					mdclrerror(&xep);
31160Sstevel@tonic-gate 				rval = -1;
31170Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
31180Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd."));
31190Sstevel@tonic-gate 			}
31200Sstevel@tonic-gate 			nd = nd->nd_next;
31210Sstevel@tonic-gate 		}
31220Sstevel@tonic-gate 	}
31230Sstevel@tonic-gate 
31240Sstevel@tonic-gate out2:
31250Sstevel@tonic-gate 	/*
31260Sstevel@tonic-gate 	 * Unlock diskset by resuming messages across the diskset.
31270Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
31280Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
31290Sstevel@tonic-gate 	 */
31300Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
31310Sstevel@tonic-gate 		nd = sd->sd_nodelist;
31320Sstevel@tonic-gate 		while (nd) {
31330Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
31340Sstevel@tonic-gate 				nd = nd->nd_next;
31350Sstevel@tonic-gate 				continue;
31360Sstevel@tonic-gate 			}
31370Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
31380Sstevel@tonic-gate 				sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
31390Sstevel@tonic-gate 				/*
31400Sstevel@tonic-gate 				 * We are here because we failed to resume
31410Sstevel@tonic-gate 				 * rpc.mdcommd.  However we potentially have
31420Sstevel@tonic-gate 				 * an error from the previous call
31430Sstevel@tonic-gate 				 * If the previous call did fail,  we capture
31440Sstevel@tonic-gate 				 * that error and generate a perror with
31450Sstevel@tonic-gate 				 * the string, "Unable to resume...".
31460Sstevel@tonic-gate 				 * Setting rval to -1 ensures that in the
31470Sstevel@tonic-gate 				 * next iteration of the loop, ep is not
31480Sstevel@tonic-gate 				 * clobbered.
31490Sstevel@tonic-gate 				 */
31500Sstevel@tonic-gate 				if (rval == 0)
31510Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
31520Sstevel@tonic-gate 				else
31530Sstevel@tonic-gate 					mdclrerror(&xep);
31540Sstevel@tonic-gate 				rval = -1;
31550Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
31560Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd."));
31570Sstevel@tonic-gate 			}
31580Sstevel@tonic-gate 			nd = nd->nd_next;
31590Sstevel@tonic-gate 		}
31600Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
31610Sstevel@tonic-gate 	}
31620Sstevel@tonic-gate 
31630Sstevel@tonic-gate 	/*
31640Sstevel@tonic-gate 	 * Unlock set.  This flushes the caches on the servers.
31650Sstevel@tonic-gate 	 */
31660Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
31670Sstevel@tonic-gate 	nd = sd->sd_nodelist;
31680Sstevel@tonic-gate 	while (nd) {
31690Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
31700Sstevel@tonic-gate 			nd = nd->nd_next;
31710Sstevel@tonic-gate 			continue;
31720Sstevel@tonic-gate 		}
31730Sstevel@tonic-gate 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
31740Sstevel@tonic-gate 			if (rval == 0)
31750Sstevel@tonic-gate 				(void) mdstealerror(ep, &xep);
31760Sstevel@tonic-gate 			else
31770Sstevel@tonic-gate 				mdclrerror(&xep);
31780Sstevel@tonic-gate 			rval = -1;
31790Sstevel@tonic-gate 		}
31800Sstevel@tonic-gate 		nd = nd->nd_next;
31810Sstevel@tonic-gate 	}
31820Sstevel@tonic-gate 
31830Sstevel@tonic-gate 	/*
31840Sstevel@tonic-gate 	 * call metaflushsetnames to reset local cache for master and
31850Sstevel@tonic-gate 	 * node information.
31860Sstevel@tonic-gate 	 */
31870Sstevel@tonic-gate 	metaflushsetname(sp);
31880Sstevel@tonic-gate 
31890Sstevel@tonic-gate 	/* release signals back to what they were on entry */
31900Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
31910Sstevel@tonic-gate 		mdclrerror(&xep);
31920Sstevel@tonic-gate 
31930Sstevel@tonic-gate 	return (rval);
31940Sstevel@tonic-gate 
31950Sstevel@tonic-gate }
31960Sstevel@tonic-gate 
31970Sstevel@tonic-gate /*
31980Sstevel@tonic-gate  * Update nodelist with cluster member information.
31990Sstevel@tonic-gate  * A node not in the member list will be marked
32000Sstevel@tonic-gate  * as not ALIVE and not OWN.
32010Sstevel@tonic-gate  * A node in the member list will be marked ALIVE, but
32020Sstevel@tonic-gate  * the OWN bit will not be changed.
32030Sstevel@tonic-gate  *
32040Sstevel@tonic-gate  * If mynode isn't in the membership list, fail causing
32050Sstevel@tonic-gate  * another reconfig cycle to be started since a non-member
32060Sstevel@tonic-gate  * node shouldn't be taking part in the reconfig cycle.
32070Sstevel@tonic-gate  *
32080Sstevel@tonic-gate  * Return values:
32090Sstevel@tonic-gate  *	0 - No problem.
32100Sstevel@tonic-gate  *	1 - Any failure including RPC failure to my node.
32110Sstevel@tonic-gate  */
32120Sstevel@tonic-gate int
32130Sstevel@tonic-gate meta_reconfig_update_nodelist(
32140Sstevel@tonic-gate 	mdsetname_t			*sp,
32150Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl,
32160Sstevel@tonic-gate 	md_set_desc			*sd,
32170Sstevel@tonic-gate 	md_error_t			*ep
32180Sstevel@tonic-gate )
32190Sstevel@tonic-gate {
32200Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl2;
32210Sstevel@tonic-gate 	md_mnnode_desc			*nd;
32220Sstevel@tonic-gate 	md_error_t			xep = mdnullerror;
32230Sstevel@tonic-gate 	int				rval = 0;
32240Sstevel@tonic-gate 
32250Sstevel@tonic-gate 	/*
32260Sstevel@tonic-gate 	 * Walk through nodelist, checking to see if each
32270Sstevel@tonic-gate 	 * node is in the member list.
32280Sstevel@tonic-gate 	 * If node is not a member, reset ALIVE and OWN node flag.
32290Sstevel@tonic-gate 	 * If node is a member, set ALIVE.
32300Sstevel@tonic-gate 	 * If mynode's OWN flag gets reset, then halt the diskset on this node.
32310Sstevel@tonic-gate 	 */
32320Sstevel@tonic-gate 	nd = sd->sd_nodelist;
32330Sstevel@tonic-gate 	while (nd) {
32340Sstevel@tonic-gate 		nl2 = nl;
32350Sstevel@tonic-gate 		while (nl2) {
32360Sstevel@tonic-gate 			/* If node is in member list, set ALIVE */
32370Sstevel@tonic-gate 			if (nl2->msl_node_id == nd->nd_nodeid) {
32380Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_ALIVE;
32390Sstevel@tonic-gate 				break;
32400Sstevel@tonic-gate 			} else {
32410Sstevel@tonic-gate 				nl2 = nl2->next;
32420Sstevel@tonic-gate 			}
32430Sstevel@tonic-gate 			/* node is not in member list, mark !ALIVE and !OWN */
32440Sstevel@tonic-gate 			if (nl2 == NULL) {
32450Sstevel@tonic-gate 				/* If node is mynode, then halt set if needed */
32460Sstevel@tonic-gate 				if (strcmp(mynode(), nd->nd_nodename) == 0) {
32470Sstevel@tonic-gate 					/*
32480Sstevel@tonic-gate 					 * This shouldn't happen, but just
32490Sstevel@tonic-gate 					 * in case...  Any node not in the
32500Sstevel@tonic-gate 					 * membership list should be dead and
32510Sstevel@tonic-gate 					 * not running reconfig step1.
32520Sstevel@tonic-gate 					 */
32530Sstevel@tonic-gate 					if (nd->nd_flags & MD_MN_NODE_OWN) {
32540Sstevel@tonic-gate 						if (halt_set(sp, &xep)) {
32550Sstevel@tonic-gate 							mde_perror(&xep, "");
32560Sstevel@tonic-gate 							mdclrerror(&xep);
32570Sstevel@tonic-gate 						}
32580Sstevel@tonic-gate 					}
32590Sstevel@tonic-gate 					/*
32600Sstevel@tonic-gate 					 * Return failure since this node
32610Sstevel@tonic-gate 					 * (mynode) is not in the membership
32620Sstevel@tonic-gate 					 * list, but process the rest of the
32630Sstevel@tonic-gate 					 * nodelist first so that rpc.metad
32640Sstevel@tonic-gate 					 * can be updated with the latest
32650Sstevel@tonic-gate 					 * membership information.
32660Sstevel@tonic-gate 					 */
32670Sstevel@tonic-gate 					(void) mddserror(ep,
32680Sstevel@tonic-gate 					    MDE_DS_NOTINMEMBERLIST,
32690Sstevel@tonic-gate 					    sp->setno, nd->nd_nodename, NULL,
32700Sstevel@tonic-gate 					    sp->setname);
32710Sstevel@tonic-gate 					rval = 1;
32720Sstevel@tonic-gate 				}
32730Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_ALIVE;
32740Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
32750Sstevel@tonic-gate 			}
32760Sstevel@tonic-gate 		}
32770Sstevel@tonic-gate 		nd = nd->nd_next;
32780Sstevel@tonic-gate 	}
32790Sstevel@tonic-gate 
32800Sstevel@tonic-gate 	/* Send this information to rpc.metad */
32810Sstevel@tonic-gate 	if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist,
32820Sstevel@tonic-gate 	    MD_NR_SET,  MNSET_IN_RECONFIG, &xep)) {
32830Sstevel@tonic-gate 		/* Return failure if can't send node flags to rpc.metad */
32840Sstevel@tonic-gate 		if (rval == 0) {
32850Sstevel@tonic-gate 			(void) mdstealerror(ep, &xep);
32860Sstevel@tonic-gate 			rval = 1;
32870Sstevel@tonic-gate 		}
32880Sstevel@tonic-gate 	}
32890Sstevel@tonic-gate 	return (rval);
32900Sstevel@tonic-gate }
32910Sstevel@tonic-gate 
32920Sstevel@tonic-gate /*
32930Sstevel@tonic-gate  * Choose master determines the master for a diskset.
32940Sstevel@tonic-gate  * Each node determines the master on its own and
32950Sstevel@tonic-gate  * adds this information to its local rpc.metad nodelist
32960Sstevel@tonic-gate  * and also sends it to the kernel.
32970Sstevel@tonic-gate  *
32980Sstevel@tonic-gate  * Nodelist in set descriptor (sd) is sorted in
32990Sstevel@tonic-gate  * monotonically increasing sequence of nodeid.
33000Sstevel@tonic-gate  *
33010Sstevel@tonic-gate  * Return values:
33020Sstevel@tonic-gate  *	0 - No problem.
33030Sstevel@tonic-gate  *	205 - There was an RPC problem to another node.
33040Sstevel@tonic-gate  *	-1 - There was an error.  This could be an RPC error to my node.
33050Sstevel@tonic-gate  *		This is a catastrophic failure causing node to panic.
33060Sstevel@tonic-gate  */
33070Sstevel@tonic-gate int
33080Sstevel@tonic-gate meta_reconfig_choose_master_for_set(
33090Sstevel@tonic-gate 	mdsetname_t	*sp,
33100Sstevel@tonic-gate 	md_set_desc	*sd,
33110Sstevel@tonic-gate 	md_error_t	*ep
33120Sstevel@tonic-gate )
33130Sstevel@tonic-gate {
33140Sstevel@tonic-gate 	int			is_owner;
33150Sstevel@tonic-gate 	md_mnset_record		*mnsr = NULL;
33160Sstevel@tonic-gate 	int			lowest_alive_nodeid = 0;
33170Sstevel@tonic-gate 	uint_t			master_nodeid;
33180Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2;
33190Sstevel@tonic-gate 	md_mnnode_record	*nr;
33200Sstevel@tonic-gate 	md_drive_desc		*dd;
33210Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
33220Sstevel@tonic-gate 	int			rval = 0;
33230Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
33240Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
33250Sstevel@tonic-gate 
33260Sstevel@tonic-gate 	/*
33270Sstevel@tonic-gate 	 * Is current node joined to diskset?
33280Sstevel@tonic-gate 	 * Don't trust flags, really check to see if mddb is snarfed.
33290Sstevel@tonic-gate 	 */
33300Sstevel@tonic-gate 	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
33310Sstevel@tonic-gate 		/*
33320Sstevel@tonic-gate 		 * If a node is joined to the diskset, this node checks
33330Sstevel@tonic-gate 		 * to see if the current master of the diskset is valid and
33340Sstevel@tonic-gate 		 * is still in the membership list (ALIVE) and is
33350Sstevel@tonic-gate 		 * still joined (OWN).  Need to verify if master is
33360Sstevel@tonic-gate 		 * really joined - don't trust the flags.  (Can trust
33370Sstevel@tonic-gate 		 * ALIVE since set during earlier part of reconfig cycle.)
33380Sstevel@tonic-gate 		 * If the current master is valid, still in the membership
33390Sstevel@tonic-gate 		 * list and joined, then master is not changed on this node.
33400Sstevel@tonic-gate 		 * Just return.
33410Sstevel@tonic-gate 		 *
33420Sstevel@tonic-gate 		 * Verify that nodeid is valid before accessing masternode.
33430Sstevel@tonic-gate 		 */
33440Sstevel@tonic-gate 		if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) &&
33450Sstevel@tonic-gate 		    (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) {
33460Sstevel@tonic-gate 			if (clnt_ownset(sd->sd_mn_master_nodenm, sp,
33470Sstevel@tonic-gate 			    &is_owner, ep) == -1) {
33480Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
33490Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
33500Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
33510Sstevel@tonic-gate 				    sd->sd_mn_master_nodeid)) {
33520Sstevel@tonic-gate 					return (205);
33530Sstevel@tonic-gate 				} else {
33540Sstevel@tonic-gate 					/* Any other failure */
33550Sstevel@tonic-gate 					return (-1);
33560Sstevel@tonic-gate 				}
33570Sstevel@tonic-gate 			} else {
33580Sstevel@tonic-gate 				if (is_owner == TRUE) {
33590Sstevel@tonic-gate 
33600Sstevel@tonic-gate 					meta_mc_log(MC_LOG5, dgettext(
33610Sstevel@tonic-gate 					    TEXT_DOMAIN, "Set %s previous "
33620Sstevel@tonic-gate 					    "master chosen %s (%d): %s"),
33630Sstevel@tonic-gate 					    sp->setname,
33640Sstevel@tonic-gate 					    sd->sd_mn_master_nodenm,
33650Sstevel@tonic-gate 					    sd->sd_mn_master_nodeid,
33660Sstevel@tonic-gate 					    meta_print_hrtime(gethrtime() -
33670Sstevel@tonic-gate 					    start_time));
33680Sstevel@tonic-gate 
33690Sstevel@tonic-gate 					/* Previous master is ok - done */
33700Sstevel@tonic-gate 					return (0);
33710Sstevel@tonic-gate 				}
33720Sstevel@tonic-gate 			}
33730Sstevel@tonic-gate 		}
33740Sstevel@tonic-gate 
33750Sstevel@tonic-gate 		/*
33760Sstevel@tonic-gate 		 * If current master is no longer in the membership list or
33770Sstevel@tonic-gate 		 * is no longer joined, then this node uses the following
33780Sstevel@tonic-gate 		 * algorithm:
33790Sstevel@tonic-gate 		 * - node calls RPC routine clnt_ownset to get latest
33800Sstevel@tonic-gate 		 *	information on which nodes are owners of diskset.
33810Sstevel@tonic-gate 		 * 	clnt_ownset checks on each node to see if its kernel
33820Sstevel@tonic-gate 		 *	has that diskset snarfed.
33830Sstevel@tonic-gate 		 */
33840Sstevel@tonic-gate 		nd = sd->sd_nodelist;
33850Sstevel@tonic-gate 		while (nd) {
33860Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
33870Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
33880Sstevel@tonic-gate 				nd = nd->nd_next;
33890Sstevel@tonic-gate 				continue;
33900Sstevel@tonic-gate 			}
33910Sstevel@tonic-gate 
33920Sstevel@tonic-gate 			if (clnt_ownset(nd->nd_nodename, sp,
33930Sstevel@tonic-gate 			    &is_owner, ep) == -1) {
33940Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
33950Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
33960Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
33970Sstevel@tonic-gate 				    nd->nd_nodeid)) {
33980Sstevel@tonic-gate 					return (205);
33990Sstevel@tonic-gate 				} else {
34000Sstevel@tonic-gate 					/* Any other failure */
34010Sstevel@tonic-gate 					return (-1);
34020Sstevel@tonic-gate 				}
34030Sstevel@tonic-gate 			}
34040Sstevel@tonic-gate 
34050Sstevel@tonic-gate 			/*
34060Sstevel@tonic-gate 			 * Set owner flag for each node based on whether
34070Sstevel@tonic-gate 			 * that node really has a diskset mddb snarfed in
34080Sstevel@tonic-gate 			 * or not.
34090Sstevel@tonic-gate 			 */
34100Sstevel@tonic-gate 			if (is_owner == TRUE)
34110Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_OWN;
34120Sstevel@tonic-gate 			else
34130Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
34140Sstevel@tonic-gate 
34150Sstevel@tonic-gate 			nd = nd->nd_next;
34160Sstevel@tonic-gate 		}
34170Sstevel@tonic-gate 
34180Sstevel@tonic-gate 		/*
34190Sstevel@tonic-gate 		 * - node walks through nodelist looking for nodes that are
34200Sstevel@tonic-gate 		 *	owners of the diskset that are in the membership list.
34210Sstevel@tonic-gate 		 * - for each owner, node calls RPC routine clnt_mngetset to
34220Sstevel@tonic-gate 		 *	 see if that node has its node record set to OK.
34230Sstevel@tonic-gate 		 * - If so, master is chosen to be this owner node.
34240Sstevel@tonic-gate 		 */
34250Sstevel@tonic-gate 		nd = sd->sd_nodelist;
34260Sstevel@tonic-gate 		while (nd) {
34270Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
34280Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
34290Sstevel@tonic-gate 				nd = nd->nd_next;
34300Sstevel@tonic-gate 				continue;
34310Sstevel@tonic-gate 			}
34320Sstevel@tonic-gate 
34330Sstevel@tonic-gate 			/* Don't consider a node that isn't an owner */
34340Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
34350Sstevel@tonic-gate 				nd = nd->nd_next;
34360Sstevel@tonic-gate 				continue;
34370Sstevel@tonic-gate 			}
34380Sstevel@tonic-gate 
34390Sstevel@tonic-gate 			/* Does node have its own node record set to OK? */
34400Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
34410Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
34420Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
34430Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
34440Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
34450Sstevel@tonic-gate 				    nd->nd_nodeid)) {
34460Sstevel@tonic-gate 					return (205);
34470Sstevel@tonic-gate 				} else {
34480Sstevel@tonic-gate 					/* Any other failure */
34490Sstevel@tonic-gate 					return (-1);
34500Sstevel@tonic-gate 				}
34510Sstevel@tonic-gate 			}
34520Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
34530Sstevel@tonic-gate 			while (nr) {
34540Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
34550Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OK) {
34560Sstevel@tonic-gate 						/* Found a master */
34570Sstevel@tonic-gate 						free_sr(
34580Sstevel@tonic-gate 						    (md_set_record *)mnsr);
34590Sstevel@tonic-gate 						goto found_master;
34600Sstevel@tonic-gate 					}
34610Sstevel@tonic-gate 				}
34620Sstevel@tonic-gate 				nr = nr->nr_next;
34630Sstevel@tonic-gate 			}
34640Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
34650Sstevel@tonic-gate 			nd = nd->nd_next;
34660Sstevel@tonic-gate 		}
34670Sstevel@tonic-gate 
34680Sstevel@tonic-gate 		/*
34690Sstevel@tonic-gate 		 * - If no owner node has its own node record on its own node
34700Sstevel@tonic-gate 		 *	set to OK, then this node checks all of the non-owner
34710Sstevel@tonic-gate 		 * 	nodes that are in the membership list.
34720Sstevel@tonic-gate 		 * - for each non-owner, node calls RPC routine clnt_mngetset to
34730Sstevel@tonic-gate 		 *	 see if that node has its node record set to OK.
34740Sstevel@tonic-gate 		 * - If set doesn't exist, don't choose node for master.
34750Sstevel@tonic-gate 		 * - If so, master is chosen to be this non-owner node.
34760Sstevel@tonic-gate 		 *
34770Sstevel@tonic-gate 		 */
34780Sstevel@tonic-gate 		nd = sd->sd_nodelist;
34790Sstevel@tonic-gate 		while (nd) {
34800Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
34810Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
34820Sstevel@tonic-gate 				nd = nd->nd_next;
34830Sstevel@tonic-gate 				continue;
34840Sstevel@tonic-gate 			}
34850Sstevel@tonic-gate 
34860Sstevel@tonic-gate 			/* Only checking non-owner nodes this time around */
34870Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_OWN) {
34880Sstevel@tonic-gate 				nd = nd->nd_next;
34890Sstevel@tonic-gate 				continue;
34900Sstevel@tonic-gate 			}
34910Sstevel@tonic-gate 
34920Sstevel@tonic-gate 			/* Does node have its own node record set to OK? */
34930Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
34940Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
34950Sstevel@tonic-gate 				/*
34960Sstevel@tonic-gate 				 * If set doesn't exist on non-owner node,
34970Sstevel@tonic-gate 				 * don't consider this node for master.
34980Sstevel@tonic-gate 				 */
34990Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
35000Sstevel@tonic-gate 					nd = nd->nd_next;
35010Sstevel@tonic-gate 					continue;
35020Sstevel@tonic-gate 				} else if ((mdanyrpcerror(ep)) &&
35030Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
35040Sstevel@tonic-gate 				    nd->nd_nodeid)) {
35050Sstevel@tonic-gate 					/* RPC failure to another node */
35060Sstevel@tonic-gate 					return (205);
35070Sstevel@tonic-gate 				} else {
35080Sstevel@tonic-gate 					/* Any other failure */
35090Sstevel@tonic-gate 					return (-1);
35100Sstevel@tonic-gate 				}
35110Sstevel@tonic-gate 			}
35120Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
35130Sstevel@tonic-gate 			while (nr) {
35140Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
35150Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OK) {
35160Sstevel@tonic-gate 						/* Found a master */
35170Sstevel@tonic-gate 						free_sr(
35180Sstevel@tonic-gate 						    (md_set_record *)mnsr);
35190Sstevel@tonic-gate 						goto found_master;
35200Sstevel@tonic-gate 					}
35210Sstevel@tonic-gate 				}
35220Sstevel@tonic-gate 				nr = nr->nr_next;
35230Sstevel@tonic-gate 			}
35240Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
35250Sstevel@tonic-gate 			nd = nd->nd_next;
35260Sstevel@tonic-gate 		}
35270Sstevel@tonic-gate 
35280Sstevel@tonic-gate 		/*
35290Sstevel@tonic-gate 		 * - If no node can be found that has its own node record on
35300Sstevel@tonic-gate 		 *	its node to be set to OK, then all alive nodes
35310Sstevel@tonic-gate 		 * 	were in the process of being added to or deleted
35320Sstevel@tonic-gate 		 *	from set.  Each alive node will remove all
35330Sstevel@tonic-gate 		 *	information pertaining to this set from its node.
35340Sstevel@tonic-gate 		 *
35350Sstevel@tonic-gate 		 * If all nodes in set are ALIVE, then call sdssc end routines
35360Sstevel@tonic-gate 		 * since set was truly being initially created or destroyed.
35370Sstevel@tonic-gate 		 */
35380Sstevel@tonic-gate 		goto delete_set;
35390Sstevel@tonic-gate 	} else {
35400Sstevel@tonic-gate 
35410Sstevel@tonic-gate 		/*
35420Sstevel@tonic-gate 		 * If node is not joined to diskset, then this
35430Sstevel@tonic-gate 		 * node uses the following algorithm:
35440Sstevel@tonic-gate 		 * - If unjoined node doesn't have a node record for itself,
35450Sstevel@tonic-gate 		 *	just delete the diskset since diskset was in the
35460Sstevel@tonic-gate 		 *	process of being created.
35470Sstevel@tonic-gate 		 * - node needs to find master of diskset before
35480Sstevel@tonic-gate 		 *	reconfig cycle, if a master existed.
35490Sstevel@tonic-gate 		 * - node calls RPC routine clnt_ownset to get latest
35500Sstevel@tonic-gate 		 * 	information on which nodes are owners of diskset.
35510Sstevel@tonic-gate 		 *	clnt_ownset checks on each node to see if its
35520Sstevel@tonic-gate 		 *	kernel has that diskset snarfed.
35530Sstevel@tonic-gate 		 */
35540Sstevel@tonic-gate 
35550Sstevel@tonic-gate 		/*
35560Sstevel@tonic-gate 		 * Is my node in the set description?
35570Sstevel@tonic-gate 		 * If not, delete the set from this node.
35580Sstevel@tonic-gate 		 * sr2setdesc sets sd_mn_mynode pointer to the node
35590Sstevel@tonic-gate 		 * descriptor for this node if there was a node
35600Sstevel@tonic-gate 		 * record for this node.
35610Sstevel@tonic-gate 		 *
35620Sstevel@tonic-gate 		 */
35630Sstevel@tonic-gate 		if (sd->sd_mn_mynode == NULL) {
35640Sstevel@tonic-gate 			goto delete_set;
35650Sstevel@tonic-gate 		}
35660Sstevel@tonic-gate 
35670Sstevel@tonic-gate 		nd = sd->sd_nodelist;
35680Sstevel@tonic-gate 		while (nd) {
35690Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
35700Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
35710Sstevel@tonic-gate 				nd = nd->nd_next;
35720Sstevel@tonic-gate 				continue;
35730Sstevel@tonic-gate 			}
35740Sstevel@tonic-gate 
35750Sstevel@tonic-gate 			if (clnt_ownset(nd->nd_nodename, sp,
35760Sstevel@tonic-gate 			    &is_owner, ep) == -1) {
35770Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
35780Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
35790Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
35800Sstevel@tonic-gate 				    nd->nd_nodeid)) {
35810Sstevel@tonic-gate 					return (205);
35820Sstevel@tonic-gate 				} else {
35830Sstevel@tonic-gate 					/* Any other failure */
35840Sstevel@tonic-gate 					return (-1);
35850Sstevel@tonic-gate 				}
35860Sstevel@tonic-gate 			}
35870Sstevel@tonic-gate 
35880Sstevel@tonic-gate 			/*
35890Sstevel@tonic-gate 			 * Set owner flag for each node based on whether
35900Sstevel@tonic-gate 			 * that node really has a diskset mddb snarfed in
35910Sstevel@tonic-gate 			 * or not.
35920Sstevel@tonic-gate 			 */
35930Sstevel@tonic-gate 			if (is_owner == TRUE)
35940Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_OWN;
35950Sstevel@tonic-gate 			else
35960Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OWN;
35970Sstevel@tonic-gate 
35980Sstevel@tonic-gate 			nd = nd->nd_next;
35990Sstevel@tonic-gate 		}
36000Sstevel@tonic-gate 
36010Sstevel@tonic-gate 		/*
36020Sstevel@tonic-gate 		 * - node walks through nodelist looking for nodes that
36030Sstevel@tonic-gate 		 *	are owners of the diskset that are in
36040Sstevel@tonic-gate 		 *	the membership list.
36050Sstevel@tonic-gate 		 * - for each owner, node calls RPC routine clnt_mngetset to
36060Sstevel@tonic-gate 		 *	see if that node has a master set and to get the
36070Sstevel@tonic-gate 		 *	diskset description.
36080Sstevel@tonic-gate 		 * - If the owner node has a set description that doesn't
36090Sstevel@tonic-gate 		 *	include the non-joined node in the nodelist, this node
36100Sstevel@tonic-gate 		 *	removes its set description of that diskset
36110Sstevel@tonic-gate 		 *	(i.e. removes the set from its local mddbs).  This is
36120Sstevel@tonic-gate 		 *	handling the case of when a node was removed from a
36130Sstevel@tonic-gate 		 *	diskset while it was not in the cluster membership
36140Sstevel@tonic-gate 		 *	list.
36150Sstevel@tonic-gate 		 * - If that node has a master set and the master is in the
36160Sstevel@tonic-gate 		 *	membership list and is an owner, then either this was
36170Sstevel@tonic-gate 		 *	the master from before the reconfig cycle or this
36180Sstevel@tonic-gate 		 *	node has already chosen a new master - either way,
36190Sstevel@tonic-gate 		 *	the master value is valid as long as it is in the
36200Sstevel@tonic-gate 		 *	membership list and is an owner
36210Sstevel@tonic-gate 		 * - master is chosen to be owner node's master
36220Sstevel@tonic-gate 		 */
36230Sstevel@tonic-gate 		nd = sd->sd_nodelist;
36240Sstevel@tonic-gate 		while (nd) {
36250Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
36260Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
36270Sstevel@tonic-gate 				nd = nd->nd_next;
36280Sstevel@tonic-gate 				continue;
36290Sstevel@tonic-gate 			}
36300Sstevel@tonic-gate 
36310Sstevel@tonic-gate 			/* Don't consider a node that isn't an owner */
36320Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
36330Sstevel@tonic-gate 				nd = nd->nd_next;
36340Sstevel@tonic-gate 				continue;
36350Sstevel@tonic-gate 			}
36360Sstevel@tonic-gate 
36370Sstevel@tonic-gate 			/* Get owner node's set record */
36380Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
36390Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
36400Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
36410Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
36420Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
36430Sstevel@tonic-gate 				    nd->nd_nodeid)) {
36440Sstevel@tonic-gate 					return (205);
36450Sstevel@tonic-gate 				} else {
36460Sstevel@tonic-gate 					/* Any other failure */
36470Sstevel@tonic-gate 					return (-1);
36480Sstevel@tonic-gate 				}
36490Sstevel@tonic-gate 			}
36500Sstevel@tonic-gate 
36510Sstevel@tonic-gate 			/* Is this node in the owner node's set record */
36520Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
36530Sstevel@tonic-gate 			while (nr) {
36540Sstevel@tonic-gate 				if (sd->sd_mn_mynode->nd_nodeid ==
36550Sstevel@tonic-gate 				    nr->nr_nodeid) {
36560Sstevel@tonic-gate 					break;
36570Sstevel@tonic-gate 				}
36580Sstevel@tonic-gate 				nr = nr->nr_next;
36590Sstevel@tonic-gate 			}
36600Sstevel@tonic-gate 			if (nr == NULL) {
36610Sstevel@tonic-gate 				/* my node not found - delete set */
36620Sstevel@tonic-gate 				free_sr((md_set_record *)mnsr);
36630Sstevel@tonic-gate 				goto delete_set;
36640Sstevel@tonic-gate 			}
36650Sstevel@tonic-gate 
36660Sstevel@tonic-gate 			/* Is owner's node's master valid? */
36670Sstevel@tonic-gate 			master_nodeid = mnsr->sr_master_nodeid;
36680Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
36690Sstevel@tonic-gate 			if (master_nodeid == MD_MN_INVALID_NID) {
36700Sstevel@tonic-gate 				nd = nd->nd_next;
36710Sstevel@tonic-gate 				continue;
36720Sstevel@tonic-gate 			}
36730Sstevel@tonic-gate 
36740Sstevel@tonic-gate 			nd2 = sd->sd_nodelist;
36750Sstevel@tonic-gate 			while (nd2) {
36760Sstevel@tonic-gate 				if ((nd2->nd_nodeid == master_nodeid) &&
36770Sstevel@tonic-gate 				    (nd2->nd_flags & MD_MN_NODE_ALIVE) &&
36780Sstevel@tonic-gate 				    (nd2->nd_flags & MD_MN_NODE_OWN)) {
36790Sstevel@tonic-gate 						nd = nd2;
36800Sstevel@tonic-gate 						goto found_master;
36810Sstevel@tonic-gate 				}
36820Sstevel@tonic-gate 				nd2 = nd2->nd_next;
36830Sstevel@tonic-gate 			}
36840Sstevel@tonic-gate 			nd = nd->nd_next;
36850Sstevel@tonic-gate 		}
36860Sstevel@tonic-gate 
36870Sstevel@tonic-gate 		/*
36880Sstevel@tonic-gate 		 * - If no owner node has a valid master, then follow
36890Sstevel@tonic-gate 		 * 	algorithm of when a node is joined to the diskset.
36900Sstevel@tonic-gate 		 * - node walks through nodelist looking for nodes that are
36910Sstevel@tonic-gate 		 *	owners of the diskset that are in the membership list.
36920Sstevel@tonic-gate 		 * - for each owner, node calls RPC routine clnt_mngetset to
36930Sstevel@tonic-gate 		 *	 see if that node has its node record set to OK.
36940Sstevel@tonic-gate 		 * - If so, master is chosen to be this owner node.
36950Sstevel@tonic-gate 		 */
36960Sstevel@tonic-gate 		nd = sd->sd_nodelist;
36970Sstevel@tonic-gate 		while (nd) {
36980Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
36990Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
37000Sstevel@tonic-gate 				nd = nd->nd_next;
37010Sstevel@tonic-gate 				continue;
37020Sstevel@tonic-gate 			}
37030Sstevel@tonic-gate 
37040Sstevel@tonic-gate 			/* Don't consider a node that isn't an owner */
37050Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
37060Sstevel@tonic-gate 				nd = nd->nd_next;
37070Sstevel@tonic-gate 				continue;
37080Sstevel@tonic-gate 			}
37090Sstevel@tonic-gate 
37100Sstevel@tonic-gate 			/* Does node have its own node record set to OK? */
37110Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
37120Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
37130Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
37140Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
37150Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
37160Sstevel@tonic-gate 				    nd->nd_nodeid)) {
37170Sstevel@tonic-gate 					return (205);
37180Sstevel@tonic-gate 				} else {
37190Sstevel@tonic-gate 					/* Any other failure */
37200Sstevel@tonic-gate 					return (-1);
37210Sstevel@tonic-gate 				}
37220Sstevel@tonic-gate 			}
37230Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
37240Sstevel@tonic-gate 			while (nr) {
37250Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
37260Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OK) {
37270Sstevel@tonic-gate 						/* Found a master */
37280Sstevel@tonic-gate 						free_sr(
37290Sstevel@tonic-gate 						    (md_set_record *)mnsr);
37300Sstevel@tonic-gate 						goto found_master;
37310Sstevel@tonic-gate 					}
37320Sstevel@tonic-gate 				}
37330Sstevel@tonic-gate 				nr = nr->nr_next;
37340Sstevel@tonic-gate 			}
37350Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
37360Sstevel@tonic-gate 			nd = nd->nd_next;
37370Sstevel@tonic-gate 		}
37380Sstevel@tonic-gate 
37390Sstevel@tonic-gate 		/*
37400Sstevel@tonic-gate 		 * - If no owner node has its own node record on its own node
37410Sstevel@tonic-gate 		 *	set to OK, then this node checks all of the non-owner
37420Sstevel@tonic-gate 		 *	nodes that are in the membership list.
37430Sstevel@tonic-gate 		 * - for each non-owner, node calls RPC routine clnt_mngetset to
37440Sstevel@tonic-gate 		 *	see if that node has its node record set to OK.
37450Sstevel@tonic-gate 		 * - If set doesn't exist, don't choose node for master.
37460Sstevel@tonic-gate 		 * - If this node doesn't exist in the nodelist on any of the
37470Sstevel@tonic-gate 		 *	non-owner nodes, this node removes its set description
37480Sstevel@tonic-gate 		 *	of that diskset (i.e. removes the set from its local
37490Sstevel@tonic-gate 		 *	mddbs). This is handling the case of when a node was
37500Sstevel@tonic-gate 		 *	removed from a diskset while it was not in the
37510Sstevel@tonic-gate 		 *	cluster membership list.
37520Sstevel@tonic-gate 		 * - If non-owner node has its node record set to OK and if
37530Sstevel@tonic-gate 		 *	this node hasn't removed this diskset (step directly
37540Sstevel@tonic-gate 		 *	before this one), then the master is chosen to be this
37550Sstevel@tonic-gate 		 *	non-owner node.
37560Sstevel@tonic-gate 		 */
37570Sstevel@tonic-gate 		nd = sd->sd_nodelist;
37580Sstevel@tonic-gate 		while (nd) {
37590Sstevel@tonic-gate 			/* Don't consider node that isn't in member list */
37600Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
37610Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
37620Sstevel@tonic-gate 				nd = nd->nd_next;
37630Sstevel@tonic-gate 				continue;
37640Sstevel@tonic-gate 			}
37650Sstevel@tonic-gate 
37660Sstevel@tonic-gate 			/* Don't consider owner nodes since none are OK */
37670Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_OWN) {
37680Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
37690Sstevel@tonic-gate 				nd = nd->nd_next;
37700Sstevel@tonic-gate 				continue;
37710Sstevel@tonic-gate 			}
37720Sstevel@tonic-gate 
37730Sstevel@tonic-gate 			/*
37740Sstevel@tonic-gate 			 * Don't need to get nodelist from my node since
37750Sstevel@tonic-gate 			 * this is where sd_nodelist was obtained.
37760Sstevel@tonic-gate 			 */
37770Sstevel@tonic-gate 			if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
37780Sstevel@tonic-gate 				nd = nd->nd_next;
37790Sstevel@tonic-gate 				continue;
37800Sstevel@tonic-gate 			}
37810Sstevel@tonic-gate 
37820Sstevel@tonic-gate 			/*
37830Sstevel@tonic-gate 			 * If node has already been decided against for
37840Sstevel@tonic-gate 			 * master, then skip it.
37850Sstevel@tonic-gate 			 */
37860Sstevel@tonic-gate 			if (nd->nd_flags & MD_MN_NODE_DEL) {
37870Sstevel@tonic-gate 				nd = nd->nd_next;
37880Sstevel@tonic-gate 				continue;
37890Sstevel@tonic-gate 			}
37900Sstevel@tonic-gate 
37910Sstevel@tonic-gate 			/*
37920Sstevel@tonic-gate 			 * Does node in my nodelist have its own node
37930Sstevel@tonic-gate 			 * record marked OK on its node?  And does node
37940Sstevel@tonic-gate 			 * in my nodelist exist on all other nodes?
37950Sstevel@tonic-gate 			 * Don't want to choose a node for master unless
37960Sstevel@tonic-gate 			 * that node is marked OK on its own node and that
37970Sstevel@tonic-gate 			 * node exists on all other alive nodes.
37980Sstevel@tonic-gate 			 *
37990Sstevel@tonic-gate 			 * This is guarding against the case when several
38000Sstevel@tonic-gate 			 * nodes are down and one of the downed nodes is
38010Sstevel@tonic-gate 			 * deleted from the diskset.  When the down nodes
38020Sstevel@tonic-gate 			 * are rebooted into the cluster, you don't want
38030Sstevel@tonic-gate 			 * any node to pick the deleted node as the master.
38040Sstevel@tonic-gate 			 */
38050Sstevel@tonic-gate 			if (clnt_mngetset(nd->nd_nodename, sp->setname,
38060Sstevel@tonic-gate 			    MD_SET_BAD, &mnsr, ep) == -1) {
38070Sstevel@tonic-gate 				/*
38080Sstevel@tonic-gate 				 * If set doesn't exist on non-owner node,
38090Sstevel@tonic-gate 				 * don't consider this node for master.
38100Sstevel@tonic-gate 				 */
38110Sstevel@tonic-gate 				if (mdiserror(ep, MDE_NO_SET)) {
38120Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_DEL;
38130Sstevel@tonic-gate 					nd = nd->nd_next;
38140Sstevel@tonic-gate 					continue;
38150Sstevel@tonic-gate 				} else if (mdanyrpcerror(ep)) {
38160Sstevel@tonic-gate 					/* RPC failure to another node */
38170Sstevel@tonic-gate 					return (205);
38180Sstevel@tonic-gate 				} else {
38190Sstevel@tonic-gate 					/* Any other failure */
38200Sstevel@tonic-gate 					return (-1);
38210Sstevel@tonic-gate 				}
38220Sstevel@tonic-gate 			}
38230Sstevel@tonic-gate 			/*
38240Sstevel@tonic-gate 			 * Is my node in the nodelist gotten from the other
38250Sstevel@tonic-gate 			 * node?  If not, then remove the set from my node
38260Sstevel@tonic-gate 			 * since set was deleted from my node while my node
38270Sstevel@tonic-gate 			 * was out of the cluster.
38280Sstevel@tonic-gate 			 */
38290Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
38300Sstevel@tonic-gate 			while (nr) {
38310Sstevel@tonic-gate 				if (sd->sd_mn_mynode->nd_nodeid ==
38320Sstevel@tonic-gate 				    nr->nr_nodeid) {
38330Sstevel@tonic-gate 					break;
38340Sstevel@tonic-gate 				}
38350Sstevel@tonic-gate 				nr = nr->nr_next;
38360Sstevel@tonic-gate 			}
38370Sstevel@tonic-gate 			if (nr == NULL) {
38380Sstevel@tonic-gate 				/* my node not found - delete set */
38390Sstevel@tonic-gate 				free_sr((md_set_record *)mnsr);
38400Sstevel@tonic-gate 				goto delete_set;
38410Sstevel@tonic-gate 			}
38420Sstevel@tonic-gate 
38430Sstevel@tonic-gate 			/* Is node being checked marked OK on its own node? */
38440Sstevel@tonic-gate 			nr = mnsr->sr_nodechain;
38450Sstevel@tonic-gate 			while (nr) {
38460Sstevel@tonic-gate 				if (nd->nd_nodeid == nr->nr_nodeid) {
38470Sstevel@tonic-gate 					if (!(nr->nr_flags & MD_MN_NODE_OK)) {
38480Sstevel@tonic-gate 						nd->nd_flags |= MD_MN_NODE_DEL;
38490Sstevel@tonic-gate 					}
38500Sstevel@tonic-gate 					break;
38510Sstevel@tonic-gate 				}
38520Sstevel@tonic-gate 				nr = nr->nr_next;
38530Sstevel@tonic-gate 			}
38540Sstevel@tonic-gate 			/*
38550Sstevel@tonic-gate 			 * If node being checked doesn't exist on its
38560Sstevel@tonic-gate 			 * own node - don't choose it as master.
38570Sstevel@tonic-gate 			 */
38580Sstevel@tonic-gate 			if (nr == NULL) {
38590Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
38600Sstevel@tonic-gate 			}
38610Sstevel@tonic-gate 
38620Sstevel@tonic-gate 			/*
38630Sstevel@tonic-gate 			 * Check every node in my node's nodelist against
38640Sstevel@tonic-gate 			 * the nodelist gotten from the other node.
38650Sstevel@tonic-gate 			 * If a node in my node's nodelist is not found in the
38660Sstevel@tonic-gate 			 * other node's nodelist, then set the DEL flag.
38670Sstevel@tonic-gate 			 */
38680Sstevel@tonic-gate 			nd2 = sd->sd_nodelist;
38690Sstevel@tonic-gate 			while (nd2) {
38700Sstevel@tonic-gate 				nr = mnsr->sr_nodechain;
38710Sstevel@tonic-gate 				while (nr) {
38720Sstevel@tonic-gate 					if (nd2->nd_nodeid == nr->nr_nodeid) {
38730Sstevel@tonic-gate 						break;
38740Sstevel@tonic-gate 					}
38750Sstevel@tonic-gate 					nr = nr->nr_next;
38760Sstevel@tonic-gate 				}
38770Sstevel@tonic-gate 				/* nd2 not found in other node's nodelist */
38780Sstevel@tonic-gate 				if (nr == NULL) {
38790Sstevel@tonic-gate 					nd2->nd_flags |= MD_MN_NODE_DEL;
38800Sstevel@tonic-gate 				}
38810Sstevel@tonic-gate 				nd2 = nd2->nd_next;
38820Sstevel@tonic-gate 			}
38830Sstevel@tonic-gate 
38840Sstevel@tonic-gate 			free_sr((md_set_record *)mnsr);
38850Sstevel@tonic-gate 			nd = nd->nd_next;
38860Sstevel@tonic-gate 		}
38870Sstevel@tonic-gate 
38880Sstevel@tonic-gate 		/*
38890Sstevel@tonic-gate 		 * Rescan list looking for node that has not been marked DEL.
38900Sstevel@tonic-gate 		 * First node found is the master.
38910Sstevel@tonic-gate 		 */
38920Sstevel@tonic-gate 		nd = sd->sd_nodelist;
38930Sstevel@tonic-gate 		while (nd) {
38940Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
38950Sstevel@tonic-gate 				break;
38960Sstevel@tonic-gate 			}
38970Sstevel@tonic-gate 			nd = nd->nd_next;
38980Sstevel@tonic-gate 			continue;
38990Sstevel@tonic-gate 		}
39000Sstevel@tonic-gate 		if (nd) {
39010Sstevel@tonic-gate 			/* Found a master */
39020Sstevel@tonic-gate 			goto found_master;
39030Sstevel@tonic-gate 		}
39040Sstevel@tonic-gate 
39050Sstevel@tonic-gate 		/*
39060Sstevel@tonic-gate 		 * - If no node can be found that has its own node record on
39070Sstevel@tonic-gate 		 *	its node to be set to OK, then all alive nodes
39080Sstevel@tonic-gate 		 * 	were in the process of being added to or deleted
39090Sstevel@tonic-gate 		 *	from set.  Each alive node will remove all
39100Sstevel@tonic-gate 		 *	information pertaining to this set from its node.
39110Sstevel@tonic-gate 		 *
39120Sstevel@tonic-gate 		 * If all nodes in set are ALIVE, then call sdssc end routines
39130Sstevel@tonic-gate 		 * since set was truly being initially created or destroyed.
39140Sstevel@tonic-gate 		 */
39150Sstevel@tonic-gate 		goto delete_set;
39160Sstevel@tonic-gate 	}
39170Sstevel@tonic-gate 
39180Sstevel@tonic-gate found_master:
39190Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
39200Sstevel@tonic-gate 	    "Set %s master chosen %s (%d): %s"),
39210Sstevel@tonic-gate 	    sp->setname, nd->nd_nodename, nd->nd_nodeid,
39220Sstevel@tonic-gate 	    meta_print_hrtime(gethrtime() - start_time));
39230Sstevel@tonic-gate 
39240Sstevel@tonic-gate 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
39250Sstevel@tonic-gate 		return (-1);
39260Sstevel@tonic-gate 	}
39270Sstevel@tonic-gate 
39280Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
39290Sstevel@tonic-gate 
39300Sstevel@tonic-gate 	if (clnt_mnsetmaster(mynode(), sp,
39310Sstevel@tonic-gate 	    nd->nd_nodename, nd->nd_nodeid, ep)) {
39320Sstevel@tonic-gate 		rval = -1;
39330Sstevel@tonic-gate 	} else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) {
39340Sstevel@tonic-gate 		/* If this node is new master, set flag in this node's kernel */
39350Sstevel@tonic-gate 		(void) memset(&sf, 0, sizeof (sf));
39360Sstevel@tonic-gate 		sf.sf_setno = sp->setno;
39370Sstevel@tonic-gate 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
39380Sstevel@tonic-gate 		/* Use magic to help protect ioctl against attack. */
39390Sstevel@tonic-gate 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
39400Sstevel@tonic-gate 		sf.sf_flags = MDDB_NM_SET;
39410Sstevel@tonic-gate 
39420Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
39430Sstevel@tonic-gate 		    "Setting new master flag for set %s: %s"),
39440Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
39450Sstevel@tonic-gate 
39460Sstevel@tonic-gate 		/*
39470Sstevel@tonic-gate 		 * Fail reconfig cycle if ioctl fails since it is critical
39480Sstevel@tonic-gate 		 * to set new master flag.
39490Sstevel@tonic-gate 		 */
39500Sstevel@tonic-gate 		if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde,
39510Sstevel@tonic-gate 		    NULL) != NULL) {
39520Sstevel@tonic-gate 			(void) mdstealerror(ep, &sf.sf_mde);
39530Sstevel@tonic-gate 			rval = -1;
39540Sstevel@tonic-gate 		}
39550Sstevel@tonic-gate 	}
39560Sstevel@tonic-gate 
39570Sstevel@tonic-gate 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
39580Sstevel@tonic-gate 		if (rval == 0) {
39590Sstevel@tonic-gate 			(void) mdstealerror(ep, &xep);
39600Sstevel@tonic-gate 			rval = -1;
39610Sstevel@tonic-gate 		}
39620Sstevel@tonic-gate 	}
39630Sstevel@tonic-gate 
39640Sstevel@tonic-gate 	cl_set_setkey(NULL);
39650Sstevel@tonic-gate 
39660Sstevel@tonic-gate 	metaflushsetname(sp);
39670Sstevel@tonic-gate 
39680Sstevel@tonic-gate 	return (rval);
39690Sstevel@tonic-gate 
39700Sstevel@tonic-gate delete_set:
39710Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
39720Sstevel@tonic-gate 	    "Master not chosen, deleting set %s: %s"),
39730Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
39740Sstevel@tonic-gate 
39750Sstevel@tonic-gate 	/*
39760Sstevel@tonic-gate 	 * Remove all set information from this node:
39770Sstevel@tonic-gate 	 *	- node records for this set
39780Sstevel@tonic-gate 	 *	- drive records for this set
39790Sstevel@tonic-gate 	 *	- set record for this set
39800Sstevel@tonic-gate 	 * (Only do this on this node since each node
39810Sstevel@tonic-gate 	 * will do it for its own local mddb.)
39820Sstevel@tonic-gate 	 *
39830Sstevel@tonic-gate 	 * If all nodes in set are ALIVE, then
39840Sstevel@tonic-gate 	 * the lowest numbered ALIVE nodeid in set
39850Sstevel@tonic-gate 	 * (regardless of whether it is an owner node or not) will
39860Sstevel@tonic-gate 	 * call the DCS service to cleanup for create/delete of set.
39870Sstevel@tonic-gate 	 *   sdssc_create_end(cleanup) if set was being created or
39880Sstevel@tonic-gate 	 *   sdssc_delete_end(cleanup) if set was being deleted.
39890Sstevel@tonic-gate 	 * A node record with flag ADD denotes a set being
39900Sstevel@tonic-gate 	 * created.  A node record with flag DEL denotes a
39910Sstevel@tonic-gate 	 * set being deleted.
39920Sstevel@tonic-gate 	 */
39930Sstevel@tonic-gate 	nd = sd->sd_nodelist;
39940Sstevel@tonic-gate 	while (nd) {
39950Sstevel@tonic-gate 		/* Found a node that isn't alive */
39960Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
39970Sstevel@tonic-gate 			break;
39980Sstevel@tonic-gate 
39990Sstevel@tonic-gate 		/* Is my node the lowest numbered ALIVE node? */
40000Sstevel@tonic-gate 		if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) {
40010Sstevel@tonic-gate 			break;
40020Sstevel@tonic-gate 		}
40030Sstevel@tonic-gate 		nd = nd->nd_next;
40040Sstevel@tonic-gate 	}
40050Sstevel@tonic-gate 	if (nd == NULL) {
40060Sstevel@tonic-gate 		/* All nodes ALIVE and this is the lowest nodeid */
40070Sstevel@tonic-gate 		lowest_alive_nodeid = 1;
40080Sstevel@tonic-gate 	}
40090Sstevel@tonic-gate 
40100Sstevel@tonic-gate 	if (clnt_lock_set(mynode(), sp, ep) == -1) {
40110Sstevel@tonic-gate 		return (-1);
40120Sstevel@tonic-gate 	}
40130Sstevel@tonic-gate 
40140Sstevel@tonic-gate 
40150Sstevel@tonic-gate 	/*
40160Sstevel@tonic-gate 	 * If this node had been joined, withdraw and reset master.
40170Sstevel@tonic-gate 	 *
40180Sstevel@tonic-gate 	 * This could happen if a node was being added to or removed
40190Sstevel@tonic-gate 	 * from a diskset and the node doing the add/delete operation and
40200Sstevel@tonic-gate 	 * all other nodes in the diskset have left the cluster.
40210Sstevel@tonic-gate 	 */
40220Sstevel@tonic-gate 	if (sd->sd_mn_mynode) {
40230Sstevel@tonic-gate 		nd = sd->sd_mn_mynode;
40240Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_OWN) {
40250Sstevel@tonic-gate 			if (clnt_withdrawset(mynode(), sp, ep)) {
40260Sstevel@tonic-gate 				rval = -1;
40270Sstevel@tonic-gate 				goto out;
40280Sstevel@tonic-gate 			}
40290Sstevel@tonic-gate 			if (clnt_mnsetmaster(mynode(), sp, "",
40300Sstevel@tonic-gate 			    MD_MN_INVALID_NID, ep)) {
40310Sstevel@tonic-gate 				rval = -1;
40320Sstevel@tonic-gate 				goto out;
40330Sstevel@tonic-gate 			}
40340Sstevel@tonic-gate 		}
40350Sstevel@tonic-gate 	}
40360Sstevel@tonic-gate 
40370Sstevel@tonic-gate 	/*
40380Sstevel@tonic-gate 	 * Remove side records for this node (side) from local mddb
40390Sstevel@tonic-gate 	 * (clnt_deldrvs does this) if there are drives in the set.
40400Sstevel@tonic-gate 	 *
40410Sstevel@tonic-gate 	 * Don't need to mark this node as DEL since already marked as
40420Sstevel@tonic-gate 	 * ADD or DEL (or this node would have been chosen as master).
40430Sstevel@tonic-gate 	 * Don't need to mark other node records, drive records or
40440Sstevel@tonic-gate 	 * set records as DEL.  If a panic occurs during clnt_delset,
40450Sstevel@tonic-gate 	 * these records will be deleted the next time this node
40460Sstevel@tonic-gate 	 * becomes a member and goes through the reconfig cycle.
40470Sstevel@tonic-gate 	 */
40480Sstevel@tonic-gate 	/* Get the drive descriptors for this set */
40490Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
40500Sstevel@tonic-gate 	    ep)) == NULL) {
40510Sstevel@tonic-gate 		if (! mdisok(ep)) {
40520Sstevel@tonic-gate 			/*
40530Sstevel@tonic-gate 			 * Ignore and clear out any failures from
40540Sstevel@tonic-gate 			 * metaget_drivedesc since a panic could have
40550Sstevel@tonic-gate 			 * occurred when a node was partially added to a set.
40560Sstevel@tonic-gate 			 */
40570Sstevel@tonic-gate 			mdclrerror(ep);
40580Sstevel@tonic-gate 		}
40590Sstevel@tonic-gate 	} else {
40600Sstevel@tonic-gate 		if (clnt_deldrvs(mynode(), sp, dd, ep)) {
40610Sstevel@tonic-gate 			rval = -1;
40620Sstevel@tonic-gate 			goto out;
40630Sstevel@tonic-gate 		}
40640Sstevel@tonic-gate 	}
40650Sstevel@tonic-gate 
40660Sstevel@tonic-gate 	/*
40670Sstevel@tonic-gate 	 * Now, delete the set - this removes the node, drive
40680Sstevel@tonic-gate 	 * and set records from the local mddb.
40690Sstevel@tonic-gate 	 */
40700Sstevel@tonic-gate 	if (clnt_delset(mynode(), sp, ep)) {
40710Sstevel@tonic-gate 		rval = -1;
40720Sstevel@tonic-gate 		goto out;
40730Sstevel@tonic-gate 	}
40740Sstevel@tonic-gate 
40750Sstevel@tonic-gate out:
40760Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
40770Sstevel@tonic-gate 
40780Sstevel@tonic-gate 	/*
40790Sstevel@tonic-gate 	 * Ignore errors from unlock of set since set is no longer
40800Sstevel@tonic-gate 	 * known (if clnt_delset worked).
40810Sstevel@tonic-gate 	 */
40820Sstevel@tonic-gate 	if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) {
40830Sstevel@tonic-gate 		mdclrerror(&xep);
40840Sstevel@tonic-gate 	}
40850Sstevel@tonic-gate 
40860Sstevel@tonic-gate 	cl_set_setkey(NULL);
40870Sstevel@tonic-gate 
40880Sstevel@tonic-gate 	metaflushsetname(sp);
40890Sstevel@tonic-gate 
40900Sstevel@tonic-gate 	/*
40910Sstevel@tonic-gate 	 * If this node is the lowest numbered nodeid then
40920Sstevel@tonic-gate 	 * call sdssc_create/delete_end depending on whether
40930Sstevel@tonic-gate 	 * this node is marked as ADD or DEL in the node record.
40940Sstevel@tonic-gate 	 */
40950Sstevel@tonic-gate 	if (lowest_alive_nodeid) {
40960Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_ADD)
40970Sstevel@tonic-gate 			sdssc_create_end(sp->setname, SDSSC_CLEANUP);
40980Sstevel@tonic-gate 		else if (nd->nd_flags & MD_MN_NODE_DEL)
40990Sstevel@tonic-gate 			sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
41000Sstevel@tonic-gate 	}
41010Sstevel@tonic-gate 
41020Sstevel@tonic-gate 	/* Finished with this set -- return */
41030Sstevel@tonic-gate 	return (rval);
41040Sstevel@tonic-gate }
41050Sstevel@tonic-gate 
41060Sstevel@tonic-gate /*
41070Sstevel@tonic-gate  * Reconfig step to choose a new master for all MN disksets.
41080Sstevel@tonic-gate  * Return values:
41090Sstevel@tonic-gate  *	0 - Everything is great.
41100Sstevel@tonic-gate  *	1 - This node failed to reconfig.
41110Sstevel@tonic-gate  *	205 - Cause another reconfig due to a nodelist problem
41120Sstevel@tonic-gate  *		or RPC failure to another node
41130Sstevel@tonic-gate  */
41140Sstevel@tonic-gate int
41150Sstevel@tonic-gate meta_reconfig_choose_master(
41160Sstevel@tonic-gate 	md_error_t	*ep
41170Sstevel@tonic-gate )
41180Sstevel@tonic-gate {
41190Sstevel@tonic-gate 	set_t				max_sets, setno;
41200Sstevel@tonic-gate 	int				nodecnt;
41210Sstevel@tonic-gate 	mndiskset_membershiplist_t	*nl;
41220Sstevel@tonic-gate 	md_set_desc			*sd;
41230Sstevel@tonic-gate 	mdsetname_t			*sp;
41240Sstevel@tonic-gate 	int				rval = 0;
41250Sstevel@tonic-gate 	mddb_setflags_config_t		sf;
41260Sstevel@tonic-gate 	int				start_node_delayed = 0;
41270Sstevel@tonic-gate 
41280Sstevel@tonic-gate 	if ((max_sets = get_max_sets(ep)) == 0) {
41290Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
41300Sstevel@tonic-gate 		    "Unable to get number of sets"));
41310Sstevel@tonic-gate 		return (1);
41320Sstevel@tonic-gate 	}
41330Sstevel@tonic-gate 
41340Sstevel@tonic-gate 	/*
41350Sstevel@tonic-gate 	 * Get membershiplist from API routine.  If there's
41360Sstevel@tonic-gate 	 * an error, return a 205 to cause another reconfig.
41370Sstevel@tonic-gate 	 */
41380Sstevel@tonic-gate 	if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
41390Sstevel@tonic-gate 		mde_perror(ep, "");
41400Sstevel@tonic-gate 		return (205);
41410Sstevel@tonic-gate 	}
41420Sstevel@tonic-gate 
41430Sstevel@tonic-gate 	for (setno = 1; setno < max_sets; setno++) {
41440Sstevel@tonic-gate 		if ((sp = metasetnosetname(setno, ep)) == NULL) {
41450Sstevel@tonic-gate 			if (mdiserror(ep, MDE_NO_SET)) {
41460Sstevel@tonic-gate 				/* No set for this setno - continue */
41470Sstevel@tonic-gate 				mdclrerror(ep);
41480Sstevel@tonic-gate 				continue;
41490Sstevel@tonic-gate 			} else {
41500Sstevel@tonic-gate 				/*
41510Sstevel@tonic-gate 				 * If encountered an RPC error from my node,
41520Sstevel@tonic-gate 				 * then immediately fail.
41530Sstevel@tonic-gate 				 */
41540Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
41550Sstevel@tonic-gate 					mde_perror(ep, "");
41560Sstevel@tonic-gate 					return (1);
41570Sstevel@tonic-gate 				}
41580Sstevel@tonic-gate 				/* Can't get set information */
41590Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
41600Sstevel@tonic-gate 					"Unable to get information for "
41610Sstevel@tonic-gate 					"set number %d"), setno);
41620Sstevel@tonic-gate 				mdclrerror(ep);
41630Sstevel@tonic-gate 				continue;
41640Sstevel@tonic-gate 			}
41650Sstevel@tonic-gate 		}
41660Sstevel@tonic-gate 
41670Sstevel@tonic-gate 		/* If setname is there, set desc should exist. */
41680Sstevel@tonic-gate 		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
41690Sstevel@tonic-gate 			/*
41700Sstevel@tonic-gate 			 * If encountered an RPC error from my node,
41710Sstevel@tonic-gate 			 * then immediately fail.
41720Sstevel@tonic-gate 			 */
41730Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
41740Sstevel@tonic-gate 				mde_perror(ep, "");
41750Sstevel@tonic-gate 				return (1);
41760Sstevel@tonic-gate 			}
41770Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
41780Sstevel@tonic-gate 				"Unable to get set %s desc information"),
41790Sstevel@tonic-gate 				sp->setname);
41800Sstevel@tonic-gate 			mdclrerror(ep);
41810Sstevel@tonic-gate 			continue;
41820Sstevel@tonic-gate 		}
41830Sstevel@tonic-gate 
41840Sstevel@tonic-gate 		/* Only reconfig MN disksets */
41850Sstevel@tonic-gate 		if (!MD_MNSET_DESC(sd)) {
41860Sstevel@tonic-gate 			continue;
41870Sstevel@tonic-gate 		}
41880Sstevel@tonic-gate 
41890Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
41900Sstevel@tonic-gate 		    "Begin choose master for set %s: %s"),
41910Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
41920Sstevel@tonic-gate 
41930Sstevel@tonic-gate 		/* Update nodelist with member information. */
41940Sstevel@tonic-gate 		if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) {
41950Sstevel@tonic-gate 			/*
41960Sstevel@tonic-gate 			 * If encountered an RPC error from my node,
41970Sstevel@tonic-gate 			 * then immediately fail.
41980Sstevel@tonic-gate 			 */
41990Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
42000Sstevel@tonic-gate 				mde_perror(ep, "");
42010Sstevel@tonic-gate 				return (1);
42020Sstevel@tonic-gate 			}
42030Sstevel@tonic-gate 			mde_perror(ep, "");
42040Sstevel@tonic-gate 			mdclrerror(ep);
42050Sstevel@tonic-gate 			continue;
42060Sstevel@tonic-gate 		}
42070Sstevel@tonic-gate 
42080Sstevel@tonic-gate 		/*
42090Sstevel@tonic-gate 		 * If all nodes in a cluster are starting, then
42100Sstevel@tonic-gate 		 * all nodes will attempt to contact all other nodes
42110Sstevel@tonic-gate 		 * to determine a master node.  This can lead to a
42120Sstevel@tonic-gate 		 * problem where node 1 is trying to contact the rpc.metad
42130Sstevel@tonic-gate 		 * node 2 and node 2 is trying to contact the rpc.metad
42140Sstevel@tonic-gate 		 * on node 1 -- and this causes the rpc call to fail
42150Sstevel@tonic-gate 		 * on both nodes and causes a new reconfig cycle.
42160Sstevel@tonic-gate 		 *
42170Sstevel@tonic-gate 		 * In order to break this problem, a newly starting node
42180Sstevel@tonic-gate 		 * will delay a small amount of time (nodeid mod 4 seconds)
42190Sstevel@tonic-gate 		 * and will then run the code to choose a master for the
42200Sstevel@tonic-gate 		 * first set.  Delay will only be done once regardless of the
42210Sstevel@tonic-gate 		 * number of sets.
42220Sstevel@tonic-gate 		 */
42230Sstevel@tonic-gate 		if (start_node_delayed == 0) {
42240Sstevel@tonic-gate 			(void) memset(&sf, 0, sizeof (sf));
42250Sstevel@tonic-gate 			sf.sf_setno = sp->setno;
42260Sstevel@tonic-gate 			sf.sf_flags = MDDB_NM_GET;
42270Sstevel@tonic-gate 			/* Use magic to help protect ioctl against attack. */
42280Sstevel@tonic-gate 			sf.sf_magic = MDDB_SETFLAGS_MAGIC;
42290Sstevel@tonic-gate 			if ((metaioctl(MD_MN_GET_SETFLAGS, &sf,
42300Sstevel@tonic-gate 			    &sf.sf_mde, NULL) == 0) &&
42310Sstevel@tonic-gate 			    ((sf.sf_setflags & MD_SET_MN_START_RC) ==
42320Sstevel@tonic-gate 			    MD_SET_MN_START_RC)) {
42330Sstevel@tonic-gate 				(void) sleep(sd->sd_mn_mynode->nd_nodeid % 4);
42340Sstevel@tonic-gate 			}
42350Sstevel@tonic-gate 			start_node_delayed = 1;
42360Sstevel@tonic-gate 		}
42370Sstevel@tonic-gate 
42380Sstevel@tonic-gate 		/* Choose master for this set */
42390Sstevel@tonic-gate 		rval = meta_reconfig_choose_master_for_set(sp, sd, ep);
42400Sstevel@tonic-gate 		if (rval == -1) {
42410Sstevel@tonic-gate 			mde_perror(ep, "");
42420Sstevel@tonic-gate 			return (1);
42430Sstevel@tonic-gate 		} else if (rval == 205) {
42440Sstevel@tonic-gate 			mde_perror(ep, "");
42450Sstevel@tonic-gate 			return (205);
42460Sstevel@tonic-gate 		}
42470Sstevel@tonic-gate 
42480Sstevel@tonic-gate 		/* Send new nodelist to rpc.mdcommd */
42490Sstevel@tonic-gate 		(void) mdmn_reinit_set(sp->setno);
42500Sstevel@tonic-gate 
42510Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
42520Sstevel@tonic-gate 		    "Choose master for set %s completed: %s"),
42530Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
42540Sstevel@tonic-gate 	}
42550Sstevel@tonic-gate 
42560Sstevel@tonic-gate 	/*
42570Sstevel@tonic-gate 	 * Each node turns on I/Os for all MN disksets.
42580Sstevel@tonic-gate 	 * This is to recover from the situation where the master died
42590Sstevel@tonic-gate 	 * during a previous reconfig cycle when I/Os were suspended
42600Sstevel@tonic-gate 	 * for a MN diskset.
42610Sstevel@tonic-gate 	 * If a failure occurs return a 1 which will force this node to
42620Sstevel@tonic-gate 	 * panic.  Cannot leave node in the situation where I/Os are
42630Sstevel@tonic-gate 	 * not resumed.
42640Sstevel@tonic-gate 	 */
42650Sstevel@tonic-gate 	setno = 0; /* 0 means all MN sets */
42660Sstevel@tonic-gate 	if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) {
42670Sstevel@tonic-gate 		mde_perror(ep, "");
42680Sstevel@tonic-gate 		return (1);
42690Sstevel@tonic-gate 	}
42700Sstevel@tonic-gate 
42710Sstevel@tonic-gate 	/* Free the nodelist */
42720Sstevel@tonic-gate 	if (nodecnt)
42730Sstevel@tonic-gate 		meta_free_nodelist(nl);
42740Sstevel@tonic-gate 
42750Sstevel@tonic-gate 	return (0);
42760Sstevel@tonic-gate }
42770Sstevel@tonic-gate 
42780Sstevel@tonic-gate /*
42790Sstevel@tonic-gate  * meta_mnsync_user_records will synchronize the diskset user records across
42800Sstevel@tonic-gate  * all nodes in the diskset.  The diskset user records are stored in
42810Sstevel@tonic-gate  * each node's local set mddb.
42820Sstevel@tonic-gate  *
42830Sstevel@tonic-gate  * This needs to be done even if there is no master change during the
42840Sstevel@tonic-gate  * reconfig cycle since this routine should clean up any mess left by
42850Sstevel@tonic-gate  * the untimely termination of a metaset or metadb command (due to a
42860Sstevel@tonic-gate  * node panic or to user intervention).
42870Sstevel@tonic-gate  *
42880Sstevel@tonic-gate  * Caller is the Master node.
42890Sstevel@tonic-gate  *
42900Sstevel@tonic-gate  * Returns	 0 - Success
42910Sstevel@tonic-gate  *		205 - Failure during RPC to another node
42920Sstevel@tonic-gate  *		-1 - Any other failure and ep is filled in.
42930Sstevel@tonic-gate  */
42940Sstevel@tonic-gate int
42950Sstevel@tonic-gate meta_mnsync_user_records(
42960Sstevel@tonic-gate 	mdsetname_t	*sp,
42970Sstevel@tonic-gate 	md_error_t	*ep
42980Sstevel@tonic-gate )
42990Sstevel@tonic-gate {
43000Sstevel@tonic-gate 	md_set_desc		*sd;
43010Sstevel@tonic-gate 	md_mnnode_desc		*master_nodelist, *nd, *nd2, *ndtail;
43020Sstevel@tonic-gate 	md_mnset_record		*mnsr;
43030Sstevel@tonic-gate 	md_mnsr_node_t		*master_mnsr_node = NULL, *mnsr_node = NULL;
43040Sstevel@tonic-gate 	md_mnnode_record	*nr;
43050Sstevel@tonic-gate 	md_drive_record		*dr;
43060Sstevel@tonic-gate 	int			dr_cnt, dd_cnt;
43070Sstevel@tonic-gate 	int			found_my_nr;
43080Sstevel@tonic-gate 	md_drive_desc		*dd, *dd_prev, *master_dd, *other_dd;
43090Sstevel@tonic-gate 	int			all_drives_ok;
43100Sstevel@tonic-gate 	int			rval = 0;
43110Sstevel@tonic-gate 	int			max_genid = 0;
43120Sstevel@tonic-gate 	int			num_alive_nodes, num_alive_nodes_del = 0;
43130Sstevel@tonic-gate 	int			set_locked = 0;
43140Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
43150Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
43160Sstevel@tonic-gate 	char			*anode[1];
43170Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
43180Sstevel@tonic-gate 
43190Sstevel@tonic-gate 	/*
43200Sstevel@tonic-gate 	 * Sync up node records first.
43210Sstevel@tonic-gate 	 * Construct a master nodelist using the nodelist from this
43220Sstevel@tonic-gate 	 * node's rpc.metad node records and then setting the state of each
43230Sstevel@tonic-gate 	 * node following these rules:
43240Sstevel@tonic-gate 	 *	- If a node record is marked OK on its node, mark it OK
43250Sstevel@tonic-gate 	 *		in the master nodelist (and later OK on all nodes)
43260Sstevel@tonic-gate 	 *		If a node record is also marked OWN on its node,
43270Sstevel@tonic-gate 	 *		mark it OWN in the master nodelist.
43280Sstevel@tonic-gate 	 *	- If a node record is not marked OK on its node, then mark
43290Sstevel@tonic-gate 	 *		it as DEL in the master list (later deleting it)
43300Sstevel@tonic-gate 	 *	- If node record doesn't exist on that node, then mark it DEL
43310Sstevel@tonic-gate 	 *		(later deleting it)
43320Sstevel@tonic-gate 	 *	- If set record doesn't exist on that node, mark node as DEL
43330Sstevel@tonic-gate 	 *	- If a node record doesn't exist on all nodes, then mark it DEL
43340Sstevel@tonic-gate 	 *	- If a node is not ALIVE, then
43350Sstevel@tonic-gate 	 *		- If that node marked DEL on any node - mark it DEL
43360Sstevel@tonic-gate 	 *			in master list but leave in nodelist
43370Sstevel@tonic-gate 	 *		- If that node is marked as ADD on any node, mark it
43380Sstevel@tonic-gate 	 *			ADD in the master list but leave in nodelist
43390Sstevel@tonic-gate 	 *		- When that node returns to the living, the DEL
43400Sstevel@tonic-gate 	 *			node record will be removed and the ADD node
43410Sstevel@tonic-gate 	 *			record may be removed if marked ADD on that
43420Sstevel@tonic-gate 	 *			node.
43430Sstevel@tonic-gate 	 * The key rule is to not remove a node from the nodelist until
43440Sstevel@tonic-gate 	 * that node record is removed from its own node.  Do not want to
43450Sstevel@tonic-gate 	 * remove a node's record from all other nodes and then have
43460Sstevel@tonic-gate 	 * that node have its own record marked OK so that a node will pick
43470Sstevel@tonic-gate 	 * a different master than the other nodes.
43480Sstevel@tonic-gate 	 *
43490Sstevel@tonic-gate 	 * Next,
43500Sstevel@tonic-gate 	 * If node is ALIVE and node record is marked DEL in master nodelist,
43510Sstevel@tonic-gate 	 * remove node from set.
43520Sstevel@tonic-gate 	 * If node is ALIVE and node record is marked OK in master nodelist,
43530Sstevel@tonic-gate 	 * mark it OK on all other nodes.
43540Sstevel@tonic-gate 	 * If node is not ALIVE and node record is marked DEL in master
43550Sstevel@tonic-gate 	 * nodelist, mark it DEL on all other nodes.
43560Sstevel@tonic-gate 	 * If node is not ALIVE and node record is marked ADD in master,
43570Sstevel@tonic-gate 	 * nodelist, mark it ADD on all other nodes.
43580Sstevel@tonic-gate 	 */
43590Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
43600Sstevel@tonic-gate 		return (-1);
43610Sstevel@tonic-gate 	}
43620Sstevel@tonic-gate 	master_nodelist = sd->sd_nodelist;
43630Sstevel@tonic-gate 
43640Sstevel@tonic-gate 	/*
43650Sstevel@tonic-gate 	 * Walk through nodelist creating a master nodelist.
43660Sstevel@tonic-gate 	 */
43670Sstevel@tonic-gate 	num_alive_nodes = 0;
43680Sstevel@tonic-gate 	nd = master_nodelist;
43690Sstevel@tonic-gate 	while (nd) {
43700Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
43710Sstevel@tonic-gate 			nd = nd->nd_next;
43720Sstevel@tonic-gate 			continue;
43730Sstevel@tonic-gate 		}
43740Sstevel@tonic-gate 		num_alive_nodes++;
43750Sstevel@tonic-gate 		if (clnt_mngetset(nd->nd_nodename, sp->setname,
43760Sstevel@tonic-gate 		    MD_SET_BAD, &mnsr, ep) == -1) {
43770Sstevel@tonic-gate 			if (mdiserror(ep, MDE_NO_SET)) {
43780Sstevel@tonic-gate 				/* set doesn't exist, mark node as DEL */
43790Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_OK;
43800Sstevel@tonic-gate 				nd->nd_flags &= ~MD_MN_NODE_ADD;
43810Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_DEL;
43820Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_NOSET;
43830Sstevel@tonic-gate 				nd = nd->nd_next;
43840Sstevel@tonic-gate 				continue;
43850Sstevel@tonic-gate 			} else {
43860Sstevel@tonic-gate 				/* If RPC failure to another node return 205 */
43870Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
43880Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
43890Sstevel@tonic-gate 				    nd->nd_nodeid)) {
43900Sstevel@tonic-gate 					rval = 205;
43910Sstevel@tonic-gate 				} else {
43920Sstevel@tonic-gate 					/* Any other failure */
43930Sstevel@tonic-gate 					rval = -1;
43940Sstevel@tonic-gate 				}
43950Sstevel@tonic-gate 				goto out;
43960Sstevel@tonic-gate 			}
43970Sstevel@tonic-gate 		}
43980Sstevel@tonic-gate 		/* Find biggest genid in records for this diskset */
43990Sstevel@tonic-gate 		if (mnsr->sr_genid > max_genid)
44000Sstevel@tonic-gate 			max_genid = mnsr->sr_genid;
44010Sstevel@tonic-gate 
44020Sstevel@tonic-gate 		dr = mnsr->sr_drivechain;
44030Sstevel@tonic-gate 		while (dr) {
44040Sstevel@tonic-gate 			/* Find biggest genid in records for this diskset */
44050Sstevel@tonic-gate 			if (dr->dr_genid > max_genid) {
44060Sstevel@tonic-gate 				max_genid = dr->dr_genid;
44070Sstevel@tonic-gate 			}
44080Sstevel@tonic-gate 			dr = dr->dr_next;
44090Sstevel@tonic-gate 		}
44100Sstevel@tonic-gate 
44110Sstevel@tonic-gate 		found_my_nr = 0;
44120Sstevel@tonic-gate 		nr = mnsr->sr_nodechain;
44130Sstevel@tonic-gate 		/* nr is the list of node recs from nd_nodename node */
44140Sstevel@tonic-gate 		while (nr) {
44150Sstevel@tonic-gate 			/* Find biggest genid in records for this diskset */
44160Sstevel@tonic-gate 			if (nr->nr_genid > max_genid)
44170Sstevel@tonic-gate 				max_genid = nr->nr_genid;
44180Sstevel@tonic-gate 			nd2 = master_nodelist;
44190Sstevel@tonic-gate 			ndtail = NULL;
44200Sstevel@tonic-gate 			/* For each node record, is it in master list? */
44210Sstevel@tonic-gate 			while (nd2) {
44220Sstevel@tonic-gate 				if (nd2->nd_nodeid == nr->nr_nodeid)
44230Sstevel@tonic-gate 					break;
44240Sstevel@tonic-gate 				if (nd2->nd_next == NULL)
44250Sstevel@tonic-gate 					ndtail = nd2;
44260Sstevel@tonic-gate 				nd2 = nd2->nd_next;
44270Sstevel@tonic-gate 			}
44280Sstevel@tonic-gate 			/*
44290Sstevel@tonic-gate 			 * Found node record not in master list -- add it
44300Sstevel@tonic-gate 			 * to list marking it as DEL since node record
44310Sstevel@tonic-gate 			 * should exist on all nodes unless a panic occurred
44320Sstevel@tonic-gate 			 * during addition or deletion of host to diskset.
44330Sstevel@tonic-gate 			 */
44340Sstevel@tonic-gate 			if (nd2 == NULL) {
44350Sstevel@tonic-gate 				nd2 = Zalloc(sizeof (*nd2));
44360Sstevel@tonic-gate 				(void) strcpy(nd2->nd_nodename,
44370Sstevel@tonic-gate 				    nr->nr_nodename);
44380Sstevel@tonic-gate 				nd2->nd_flags = nr->nr_flags;
44390Sstevel@tonic-gate 				nd2->nd_flags |= MD_MN_NODE_DEL;
44400Sstevel@tonic-gate 				nd2->nd_nodeid = nr->nr_nodeid;
44410Sstevel@tonic-gate 				nd2->nd_next = NULL;
44420Sstevel@tonic-gate 				ndtail->nd_next = nd2;
44430Sstevel@tonic-gate 				nd2 = NULL;
44440Sstevel@tonic-gate 				nr = nr->nr_next;
44450Sstevel@tonic-gate 				continue;
44460Sstevel@tonic-gate 			}
44470Sstevel@tonic-gate 			/*
44480Sstevel@tonic-gate 			 * Is this the node record for the node that
44490Sstevel@tonic-gate 			 * we requested the set desc from?
44500Sstevel@tonic-gate 			 * If so, check if node has its own node record
44510Sstevel@tonic-gate 			 * marked OK. If marked OK, check for the OWN bit.
44520Sstevel@tonic-gate 			 */
44530Sstevel@tonic-gate 			if (nr->nr_nodeid == nd->nd_nodeid) {
44540Sstevel@tonic-gate 				found_my_nr = 1;
44550Sstevel@tonic-gate 				if (nr->nr_flags & MD_MN_NODE_OK) {
44560Sstevel@tonic-gate 					/*
44570Sstevel@tonic-gate 					 * If node record is marked OK
44580Sstevel@tonic-gate 					 * on its own node, then mark it OK
44590Sstevel@tonic-gate 					 * in the master list.  Node record
44600Sstevel@tonic-gate 					 * would have to exist on all nodes
44610Sstevel@tonic-gate 					 * in the ADD state before it could
44620Sstevel@tonic-gate 					 * be put into the OK state.
44630Sstevel@tonic-gate 					 */
44640Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_OK;
44650Sstevel@tonic-gate 					nd->nd_flags &=
44660Sstevel@tonic-gate 					    ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL);
44670Sstevel@tonic-gate 					/*
44680Sstevel@tonic-gate 					 * Mark own in master list as marked
44690Sstevel@tonic-gate 					 * on own node.
44700Sstevel@tonic-gate 					 */
44710Sstevel@tonic-gate 					if (nr->nr_flags & MD_MN_NODE_OWN)
44720Sstevel@tonic-gate 						nd->nd_flags |= MD_MN_NODE_OWN;
44730Sstevel@tonic-gate 					else
44740Sstevel@tonic-gate 						nd->nd_flags &= ~MD_MN_NODE_OWN;
44750Sstevel@tonic-gate 				} else {
44760Sstevel@tonic-gate 					/* Otherwise, mark node as DEL */
44770Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_OK;
44780Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_ADD;
44790Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_DEL;
44800Sstevel@tonic-gate 				}
44810Sstevel@tonic-gate 			}
44820Sstevel@tonic-gate 			/*
44830Sstevel@tonic-gate 			 * If node is not ALIVE and marked DEL
44840Sstevel@tonic-gate 			 * on any node, make it DEL in master list.
44850Sstevel@tonic-gate 			 * If node is not ALIVE and marked ADD
44860Sstevel@tonic-gate 			 * on any node, make it ADD in master list
44870Sstevel@tonic-gate 			 * unless node record has already been marked DEL.
44880Sstevel@tonic-gate 			 */
44890Sstevel@tonic-gate 			if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) {
44900Sstevel@tonic-gate 				if (nr->nr_flags & MD_MN_NODE_ADD) {
44910Sstevel@tonic-gate 					if (!(nd->nd_flags & MD_MN_NODE_DEL)) {
44920Sstevel@tonic-gate 						/* If not DEL - mark it ADD */
44930Sstevel@tonic-gate 						nd->nd_flags |= MD_MN_NODE_ADD;
44940Sstevel@tonic-gate 						nd->nd_flags &= ~MD_MN_NODE_OK;
44950Sstevel@tonic-gate 					}
44960Sstevel@tonic-gate 				}
44970Sstevel@tonic-gate 				if (nr->nr_flags & MD_MN_NODE_DEL) {
44980Sstevel@tonic-gate 					nd->nd_flags |= MD_MN_NODE_DEL;
44990Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_OK;
45000Sstevel@tonic-gate 					/* Could already be ADD - make it DEL */
45010Sstevel@tonic-gate 					nd->nd_flags &= ~MD_MN_NODE_ADD;
45020Sstevel@tonic-gate 				}
45030Sstevel@tonic-gate 			}
45040Sstevel@tonic-gate 			nr = nr->nr_next;
45050Sstevel@tonic-gate 		}
45060Sstevel@tonic-gate 		/*
45070Sstevel@tonic-gate 		 * If a node record doesn't exist on its own node,
45080Sstevel@tonic-gate 		 * then mark node as DEL.
45090Sstevel@tonic-gate 		 */
45100Sstevel@tonic-gate 		if (found_my_nr == 0) {
45110Sstevel@tonic-gate 			nd->nd_flags &= ~MD_MN_NODE_OK;
45120Sstevel@tonic-gate 			nd->nd_flags |= MD_MN_NODE_DEL;
45130Sstevel@tonic-gate 		}
45140Sstevel@tonic-gate 
45150Sstevel@tonic-gate 		/*
45160Sstevel@tonic-gate 		 * If node is OK - put mnsr onto master_mnsr_node list for
45170Sstevel@tonic-gate 		 * later use when syncing up the drive records in the set.
45180Sstevel@tonic-gate 		 */
45190Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_OK) {
45200Sstevel@tonic-gate 			mnsr_node = Zalloc(sizeof (*mnsr_node));
45210Sstevel@tonic-gate 			mnsr_node->mmn_mnsr = mnsr;
45220Sstevel@tonic-gate 			(void) strncpy(mnsr_node->mmn_nodename,
45230Sstevel@tonic-gate 				nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1);
45240Sstevel@tonic-gate 			mnsr_node->mmn_next = master_mnsr_node;
45250Sstevel@tonic-gate 			master_mnsr_node = mnsr_node;
45260Sstevel@tonic-gate 		} else {
45270Sstevel@tonic-gate 			free_sr((struct md_set_record *)mnsr);
45280Sstevel@tonic-gate 		}
45290Sstevel@tonic-gate 
45300Sstevel@tonic-gate 		nd = nd->nd_next;
45310Sstevel@tonic-gate 	}
45320Sstevel@tonic-gate 
45330Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
45340Sstevel@tonic-gate 	    "Master nodelist created for set %s: %s"),
45350Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
45360Sstevel@tonic-gate 
45370Sstevel@tonic-gate 	/*
45380Sstevel@tonic-gate 	 * Send master nodelist to the rpc.metad on all nodes (including
45390Sstevel@tonic-gate 	 * myself) and each node will update itself.  This will set the
45400Sstevel@tonic-gate 	 * ADD and DEL flags on each node as setup in the master nodelist.
45410Sstevel@tonic-gate 	 * Don't send nodelist to node where set doesn't exist.
45420Sstevel@tonic-gate 	 */
45430Sstevel@tonic-gate 	nd = master_nodelist;
45440Sstevel@tonic-gate 	while (nd) {
45450Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
45460Sstevel@tonic-gate 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
45470Sstevel@tonic-gate 			nd = nd->nd_next;
45480Sstevel@tonic-gate 			continue;
45490Sstevel@tonic-gate 		}
45500Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
45510Sstevel@tonic-gate 		    master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
45520Sstevel@tonic-gate 			/* If RPC failure to another node return 205 */
45530Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
45540Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
45550Sstevel@tonic-gate 			    nd->nd_nodeid)) {
45560Sstevel@tonic-gate 				rval = 205;
45570Sstevel@tonic-gate 			} else {
45580Sstevel@tonic-gate 				/* Any other failure */
45590Sstevel@tonic-gate 				rval = -1;
45600Sstevel@tonic-gate 			}
45610Sstevel@tonic-gate 			goto out;
45620Sstevel@tonic-gate 		}
45630Sstevel@tonic-gate 		nd = nd->nd_next;
45640Sstevel@tonic-gate 	}
45650Sstevel@tonic-gate 
45660Sstevel@tonic-gate 	/*
45670Sstevel@tonic-gate 	 * Now, delete nodes that need to be deleted.
45680Sstevel@tonic-gate 	 */
45690Sstevel@tonic-gate 	if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
45700Sstevel@tonic-gate 	    ep))  == NULL) {
45710Sstevel@tonic-gate 		if (! mdisok(ep)) {
45720Sstevel@tonic-gate 			rval = -1;
45730Sstevel@tonic-gate 			goto out;
45740Sstevel@tonic-gate 		}
45750Sstevel@tonic-gate 	}
45760Sstevel@tonic-gate 
45770Sstevel@tonic-gate 	/*
45780Sstevel@tonic-gate 	 * May be doing lots of RPC commands to the nodes, so lock the
45790Sstevel@tonic-gate 	 * ALIVE members of the set since most of the rpc.metad routines
45800Sstevel@tonic-gate 	 * require this for security reasons.
45810Sstevel@tonic-gate 	 */
45820Sstevel@tonic-gate 	nd = master_nodelist;
45830Sstevel@tonic-gate 	while (nd) {
45840Sstevel@tonic-gate 		/* Skip non-alive nodes and node without set */
45850Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
45860Sstevel@tonic-gate 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
45870Sstevel@tonic-gate 			nd = nd->nd_next;
45880Sstevel@tonic-gate 			continue;
45890Sstevel@tonic-gate 		}
45900Sstevel@tonic-gate 		if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
45910Sstevel@tonic-gate 			/* If RPC failure to another node return 205 */
45920Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
45930Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
45940Sstevel@tonic-gate 			    nd->nd_nodeid)) {
45950Sstevel@tonic-gate 				rval = 205;
45960Sstevel@tonic-gate 			} else {
45970Sstevel@tonic-gate 				/* Any other failure */
45980Sstevel@tonic-gate 				rval = -1;
45990Sstevel@tonic-gate 			}
46000Sstevel@tonic-gate 			goto out;
46010Sstevel@tonic-gate 		}
46020Sstevel@tonic-gate 		set_locked = 1;
46030Sstevel@tonic-gate 		nd = nd->nd_next;
46040Sstevel@tonic-gate 	}
46050Sstevel@tonic-gate 
46060Sstevel@tonic-gate 	nd = master_nodelist;
46070Sstevel@tonic-gate 	while (nd) {
46080Sstevel@tonic-gate 		/* Skip non-alive nodes */
46090Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
46100Sstevel@tonic-gate 			nd = nd->nd_next;
46110Sstevel@tonic-gate 			continue;
46120Sstevel@tonic-gate 		}
46130Sstevel@tonic-gate 		if (nd->nd_flags & MD_MN_NODE_DEL) {
46140Sstevel@tonic-gate 			num_alive_nodes_del++;
46150Sstevel@tonic-gate 			/*
46160Sstevel@tonic-gate 			 * Delete this node rec from all ALIVE nodes in diskset.
46170Sstevel@tonic-gate 			 */
46180Sstevel@tonic-gate 			nd2 = master_nodelist;
46190Sstevel@tonic-gate 			while (nd2) {
46200Sstevel@tonic-gate 				/* Skip non-alive nodes and node without set */
46210Sstevel@tonic-gate 				if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) ||
46220Sstevel@tonic-gate 				    (nd2->nd_flags & MD_MN_NODE_NOSET)) {
46230Sstevel@tonic-gate 					nd2 = nd2->nd_next;
46240Sstevel@tonic-gate 					continue;
46250Sstevel@tonic-gate 				}
46260Sstevel@tonic-gate 
46270Sstevel@tonic-gate 				/* This is a node being deleted from set */
46280Sstevel@tonic-gate 				if (nd2->nd_nodeid == nd->nd_nodeid) {
46290Sstevel@tonic-gate 					/* Mark set record as DEL */
46300Sstevel@tonic-gate 					if (clnt_upd_sr_flags(nd->nd_nodename,
46310Sstevel@tonic-gate 					    sp, MD_SR_DEL, ep)) {
46320Sstevel@tonic-gate 						/* RPC failure to !my node */
46330Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
46340Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
46350Sstevel@tonic-gate 						    nd_nodeid
46360Sstevel@tonic-gate 						    != nd->nd_nodeid)) {
46370Sstevel@tonic-gate 							rval = 205;
46380Sstevel@tonic-gate 						} else {
46390Sstevel@tonic-gate 							/* Any other failure */
46400Sstevel@tonic-gate 							rval = -1;
46410Sstevel@tonic-gate 						}
46420Sstevel@tonic-gate 						goto out;
46430Sstevel@tonic-gate 					}
46440Sstevel@tonic-gate 					if (clnt_deldrvs(nd->nd_nodename, sp,
46450Sstevel@tonic-gate 					    dd, ep)) {
46460Sstevel@tonic-gate 						/* RPC failure to !my node */
46470Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
46480Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
46490Sstevel@tonic-gate 						    nd_nodeid
46500Sstevel@tonic-gate 						    != nd->nd_nodeid)) {
46510Sstevel@tonic-gate 							rval = 205;
46520Sstevel@tonic-gate 						} else {
46530Sstevel@tonic-gate 							/* Any other failure */
46540Sstevel@tonic-gate 							rval = -1;
46550Sstevel@tonic-gate 						}
46560Sstevel@tonic-gate 						goto out;
46570Sstevel@tonic-gate 					}
46580Sstevel@tonic-gate 					if (clnt_delset(nd->nd_nodename, sp,
46590Sstevel@tonic-gate 					    ep) == -1) {
46600Sstevel@tonic-gate 						/* RPC failure to !my node */
46610Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
46620Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
46630Sstevel@tonic-gate 						    nd_nodeid
46640Sstevel@tonic-gate 						    != nd->nd_nodeid)) {
46650Sstevel@tonic-gate 							rval = 205;
46660Sstevel@tonic-gate 						} else {
46670Sstevel@tonic-gate 							/* Any other failure */
46680Sstevel@tonic-gate 							rval = -1;
46690Sstevel@tonic-gate 						}
46700Sstevel@tonic-gate 						goto out;
46710Sstevel@tonic-gate 					}
46720Sstevel@tonic-gate 				} else {
46730Sstevel@tonic-gate 					/*
46740Sstevel@tonic-gate 					 * Delete host from sets on hosts
46750Sstevel@tonic-gate 					 * not being deleted.
46760Sstevel@tonic-gate 					 */
46770Sstevel@tonic-gate 					anode[0] = Strdup(nd->nd_nodename);
46780Sstevel@tonic-gate 					if (clnt_delhosts(nd2->nd_nodename, sp,
46790Sstevel@tonic-gate 					    1, anode, ep) == -1) {
46800Sstevel@tonic-gate 						Free(anode[0]);
46810Sstevel@tonic-gate 						/* RPC failure to !my node */
46820Sstevel@tonic-gate 						if ((mdanyrpcerror(ep)) &&
46830Sstevel@tonic-gate 						    (sd->sd_mn_mynode->
46840Sstevel@tonic-gate 						    nd_nodeid
46850Sstevel@tonic-gate 						    != nd2->nd_nodeid)) {
46860Sstevel@tonic-gate 							rval = 205;
46870Sstevel@tonic-gate 						} else {
46880Sstevel@tonic-gate 							/* Any other failure */
46890Sstevel@tonic-gate 							rval = -1;
46900Sstevel@tonic-gate 						}
46910Sstevel@tonic-gate 						goto out;
46920Sstevel@tonic-gate 					}
46930Sstevel@tonic-gate 
46940Sstevel@tonic-gate 					meta_mc_log(MC_LOG5,
46950Sstevel@tonic-gate 					    dgettext(TEXT_DOMAIN,
46960Sstevel@tonic-gate 					    "Deleted node %s (%d) on node %s "
46970Sstevel@tonic-gate 					    "from set %s: %s"),
46980Sstevel@tonic-gate 					    nd->nd_nodename, nd->nd_nodeid,
46990Sstevel@tonic-gate 					    nd2->nd_nodename,
47000Sstevel@tonic-gate 					    sp->setname,
47010Sstevel@tonic-gate 					    meta_print_hrtime(
47020Sstevel@tonic-gate 					    gethrtime() - start_time));
47030Sstevel@tonic-gate 
47040Sstevel@tonic-gate 					Free(anode[0]);
47050Sstevel@tonic-gate 				}
47060Sstevel@tonic-gate 				nd2 = nd2->nd_next;
47070Sstevel@tonic-gate 			}
47080Sstevel@tonic-gate 		}
47090Sstevel@tonic-gate 		nd = nd->nd_next;
47100Sstevel@tonic-gate 	}
47110Sstevel@tonic-gate 
47120Sstevel@tonic-gate 	nd = master_nodelist;
47130Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
47140Sstevel@tonic-gate 	while (nd) {
47150Sstevel@tonic-gate 		/* Skip non-alive nodes and node without set */
47160Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE) ||
47170Sstevel@tonic-gate 		    (nd->nd_flags & MD_MN_NODE_NOSET)) {
47180Sstevel@tonic-gate 			nd = nd->nd_next;
47190Sstevel@tonic-gate 			continue;
47200Sstevel@tonic-gate 		}
47210Sstevel@tonic-gate 		if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
47220Sstevel@tonic-gate 			/* If RPC failure to another node return 205 */
47230Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
47240Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
47250Sstevel@tonic-gate 			    nd->nd_nodeid)) {
47260Sstevel@tonic-gate 				rval = 205;
47270Sstevel@tonic-gate 			} else {
47280Sstevel@tonic-gate 				/* Any other failure */
47290Sstevel@tonic-gate 				rval = -1;
47300Sstevel@tonic-gate 			}
47310Sstevel@tonic-gate 			goto out;
47320Sstevel@tonic-gate 		}
47330Sstevel@tonic-gate 		nd = nd->nd_next;
47340Sstevel@tonic-gate 	}
47350Sstevel@tonic-gate 	cl_set_setkey(NULL);
47360Sstevel@tonic-gate 	set_locked = 0;
47370Sstevel@tonic-gate 
47380Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
47390Sstevel@tonic-gate 	    "Nodelist syncronization complete for set %s: %s"),
47400Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
47410Sstevel@tonic-gate 
47420Sstevel@tonic-gate 	metaflushsetname(sp);
47430Sstevel@tonic-gate 
47440Sstevel@tonic-gate 	/*
47450Sstevel@tonic-gate 	 * If all alive nodes have been deleted from set, just
47460Sstevel@tonic-gate 	 * return since nothing else can be done until non-alive
47470Sstevel@tonic-gate 	 * nodes (if there are any) rejoin the cluster.
47480Sstevel@tonic-gate 	 */
47490Sstevel@tonic-gate 	if (num_alive_nodes == num_alive_nodes_del) {
47500Sstevel@tonic-gate 		rval = 0;
47510Sstevel@tonic-gate 		goto out;
47520Sstevel@tonic-gate 	}
47530Sstevel@tonic-gate 
47540Sstevel@tonic-gate 	/*
47550Sstevel@tonic-gate 	 * Sync up drive records.
47560Sstevel@tonic-gate 	 *
47570Sstevel@tonic-gate 	 * If a node panic'd (or metaset command was killed) during the
47580Sstevel@tonic-gate 	 * addition or deletion of a drive to the diskset, the nodes
47590Sstevel@tonic-gate 	 * may have a different view of the drive list.  During cleanup
47600Sstevel@tonic-gate 	 * of the drive list during reconfig, a drive will be deleted
47610Sstevel@tonic-gate 	 * from the list if the master node sees that the drive has been
47620Sstevel@tonic-gate 	 * marked in the ADD state on any node or is marked in the DEL state
47630Sstevel@tonic-gate 	 * on all nodes.
47640Sstevel@tonic-gate 	 * This cleanup must occur even if not all nodes in the diskset are
47650Sstevel@tonic-gate 	 * part of the cluster so that all nodes have the same view
47660Sstevel@tonic-gate 	 * of the drivelist.
47670Sstevel@tonic-gate 	 * Then if the entire cluster goes down and comes back up, the
47680Sstevel@tonic-gate 	 * new master node could be a node that wasn't in the cluster when
47690Sstevel@tonic-gate 	 * the node was deleted.  This could lead to a situation where the
47700Sstevel@tonic-gate 	 * master node thinks that a drive is OK, but this drive isn't
47710Sstevel@tonic-gate 	 * known to the other nodes.
47720Sstevel@tonic-gate 	 * This situation can also occur during the addition of a drive
47730Sstevel@tonic-gate 	 * where a node has the drive marked OK, but the node executing the
47740Sstevel@tonic-gate 	 * metaset command encountered a failure before marking that drive OK
47750Sstevel@tonic-gate 	 * on the rest of the nodes.  If the node with the OK drive then
47760Sstevel@tonic-gate 	 * panics, then rest of the nodes will remove that drive marked ADD
47770Sstevel@tonic-gate 	 * and when the node with the OK drive rejoins the cluster, it will
47780Sstevel@tonic-gate 	 * have a drive marked OK that is unknown by the other nodes.
47790Sstevel@tonic-gate 	 *
47800Sstevel@tonic-gate 	 * There are 2 situations to consider:
47810Sstevel@tonic-gate 	 * A) Master knows about a drive that other nodes don't know about.
47820Sstevel@tonic-gate 	 * B) At least one slave node knows about a drive that the master
47830Sstevel@tonic-gate 	 *    node doesn't know about.
47840Sstevel@tonic-gate 	 *
47850Sstevel@tonic-gate 	 * To handle these situations the following steps are followed:
47860Sstevel@tonic-gate 	 * 1) Count number of drives known by this master node and the
47870Sstevel@tonic-gate 	 *    other slave nodes.
47880Sstevel@tonic-gate 	 *    If all nodes have the same number of drives and the master has
47890Sstevel@tonic-gate 	 *    all drives marked OK, then skip to step4.
47900Sstevel@tonic-gate 	 *
47910Sstevel@tonic-gate 	 * 2) If a node has less drives listed than the master, the master
47920Sstevel@tonic-gate 	 *    must get the drive descriptor list from that node so that
47930Sstevel@tonic-gate 	 *    master can determine which drive it needs to delete from that
47940Sstevel@tonic-gate 	 *    node.  Master must get the drive descriptor list since the
47950Sstevel@tonic-gate 	 *    drive record list does not contain the name of the drive, but
47960Sstevel@tonic-gate 	 *    only a key and the key can only be interpreted on that other
47970Sstevel@tonic-gate 	 *    node.
47980Sstevel@tonic-gate 	 *
47990Sstevel@tonic-gate 	 * 3) The master will then create the master drive list by doing:
48000Sstevel@tonic-gate 	 *	- Master starts with drive list known by master.
48010Sstevel@tonic-gate 	 *	- Any drive marked ADD will be removed from the list.
48020Sstevel@tonic-gate 	 *	- Any drive not known by another node (from step2) will be
48030Sstevel@tonic-gate 	 *	removed from the drive list.
48040Sstevel@tonic-gate 	 *	- If a drive is marked DEL on the master, the master must
48050Sstevel@tonic-gate 	 *	verify that the drive record is marked DEL on all nodes.
48060Sstevel@tonic-gate 	 *	If any node has the drive record marked OK, mark it OK
48070Sstevel@tonic-gate 	 *	on the master.  (The reason why is described below).
48080Sstevel@tonic-gate 	 *
48090Sstevel@tonic-gate 	 * 4) The master sends out the master drive list and the slave
48100Sstevel@tonic-gate 	 *    nodes will force their drive lists to match the master
48110Sstevel@tonic-gate 	 *    drive list by deleting drives, if necessary and by changing
48120Sstevel@tonic-gate 	 *    the drive record states from ADD->OK if master has drive
48130Sstevel@tonic-gate 	 *    marked OK and slave has drive marked ADD.
48140Sstevel@tonic-gate 	 *
48150Sstevel@tonic-gate 	 * Interesting scenarios:
48160Sstevel@tonic-gate 	 *
48170Sstevel@tonic-gate 	 * 1) System has 4 nodes with node 1 as the master.  Node 3 starts
48180Sstevel@tonic-gate 	 *    to delete a drive record (drive record on node 1 is marked DEL),
48190Sstevel@tonic-gate 	 *    but is stopped when node 3 panics.  Node 1 also panics.
48200Sstevel@tonic-gate 	 *    During reconfig cycle, node 2 is picked as master and the drive
48210Sstevel@tonic-gate 	 *    record is left alone since all nodes in the cluster have it
48220Sstevel@tonic-gate 	 *    marked OK.  User now sees drive as part of diskset.
48230Sstevel@tonic-gate 	 *    Now, entire cluster is rebooted and node 1 rejoins the cluster.
48240Sstevel@tonic-gate 	 *    Node 1 is picked as the master and node 1 has drive record
48250Sstevel@tonic-gate 	 *    marked DEL.  Node 1 contacts all other nodes in the cluster
48260Sstevel@tonic-gate 	 *    and since at least one node has the drive record marked OK,
48270Sstevel@tonic-gate 	 *    the master marks the drive record OK.
48280Sstevel@tonic-gate 	 *    User continues to see the drive as part of the diskset.
48290Sstevel@tonic-gate 	 */
48300Sstevel@tonic-gate 
48310Sstevel@tonic-gate 	/* Reget set descriptor since flushed above */
48320Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
48330Sstevel@tonic-gate 		rval = -1;
48340Sstevel@tonic-gate 		goto out;
48350Sstevel@tonic-gate 	}
48360Sstevel@tonic-gate 
48370Sstevel@tonic-gate 	/* Has side effect of setting sd->sd_drvs to same as master_dd */
48380Sstevel@tonic-gate 	if ((master_dd = metaget_drivedesc_sideno(sp,
48390Sstevel@tonic-gate 	    sd->sd_mn_mynode->nd_nodeid,
48400Sstevel@tonic-gate 	    (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) {
48410Sstevel@tonic-gate 		/* No drives in list */
48420Sstevel@tonic-gate 		if (!mdisok(ep)) {
48430Sstevel@tonic-gate 			/*
48440Sstevel@tonic-gate 			 * Can't get drive list for this node, so
48450Sstevel@tonic-gate 			 * return -1 causing this node to be removed
48460Sstevel@tonic-gate 			 * from cluster config and fixed.
48470Sstevel@tonic-gate 			 */
48480Sstevel@tonic-gate 			rval = -1;
48490Sstevel@tonic-gate 			goto out;
48500Sstevel@tonic-gate 		}
48510Sstevel@tonic-gate 	}
48520Sstevel@tonic-gate 
48530Sstevel@tonic-gate 	/* Count the number of drives for all nodes */
48540Sstevel@tonic-gate 	mnsr_node = master_mnsr_node;
48550Sstevel@tonic-gate 	while (mnsr_node) {
48560Sstevel@tonic-gate 		dr_cnt = 0;
48570Sstevel@tonic-gate 		dr = mnsr_node->mmn_mnsr->sr_drivechain;
48580Sstevel@tonic-gate 		while (dr) {
48590Sstevel@tonic-gate 			dr_cnt++;
48600Sstevel@tonic-gate 			dr = dr->dr_next;
48610Sstevel@tonic-gate 		}
48620Sstevel@tonic-gate 		mnsr_node->mmn_numdrives = dr_cnt;
48630Sstevel@tonic-gate 		mnsr_node = mnsr_node->mmn_next;
48640Sstevel@tonic-gate 	}
48650Sstevel@tonic-gate 
48660Sstevel@tonic-gate 	/* Count the number of drives for the master; also check flags */
48670Sstevel@tonic-gate 	all_drives_ok = 1;
48680Sstevel@tonic-gate 	dd_cnt = 0;
48690Sstevel@tonic-gate 	dd = master_dd;
48700Sstevel@tonic-gate 	while (dd) {
48710Sstevel@tonic-gate 		dd_cnt++;
48720Sstevel@tonic-gate 		if (!(dd->dd_flags & MD_DR_OK))
48730Sstevel@tonic-gate 			all_drives_ok = 0;
48740Sstevel@tonic-gate 		dd = dd->dd_next;
48750Sstevel@tonic-gate 	}
48760Sstevel@tonic-gate 
48770Sstevel@tonic-gate 	/* If all drives are ok, do quick check against number of drives */
48780Sstevel@tonic-gate 	if (all_drives_ok) {
48790Sstevel@tonic-gate 		/* If all nodes have same number of drives, almost done */
48800Sstevel@tonic-gate 		mnsr_node = master_mnsr_node;
48810Sstevel@tonic-gate 		while (mnsr_node) {
48820Sstevel@tonic-gate 			if (mnsr_node->mmn_numdrives != dd_cnt)
48830Sstevel@tonic-gate 				break;
48840Sstevel@tonic-gate 			mnsr_node = mnsr_node->mmn_next;
48850Sstevel@tonic-gate 		}
48860Sstevel@tonic-gate 		/* All nodes have same number of drives, just send flags */
48870Sstevel@tonic-gate 		if (mnsr_node == NULL) {
48880Sstevel@tonic-gate 			goto send_drive_list;
48890Sstevel@tonic-gate 		}
48900Sstevel@tonic-gate 	}
48910Sstevel@tonic-gate 
48920Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
48930Sstevel@tonic-gate 	    "Begin detailed drive synchronization for set %s: %s"),
48940Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
48950Sstevel@tonic-gate 
48960Sstevel@tonic-gate 	/* Detailed check required  */
48970Sstevel@tonic-gate 	mnsr_node = master_mnsr_node;
48980Sstevel@tonic-gate 	while (mnsr_node) {
48990Sstevel@tonic-gate 		/* Does slave node have less drives than master? */
49000Sstevel@tonic-gate 		if (mnsr_node->mmn_numdrives < dd_cnt) {
49010Sstevel@tonic-gate 			/* Yes - must determine which drive is missing */
49020Sstevel@tonic-gate 			if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp,
49030Sstevel@tonic-gate 			    &other_dd, ep)) {
49040Sstevel@tonic-gate 				/* RPC failure to !my node */
49050Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
49060Sstevel@tonic-gate 				    (strcmp(mynode(), mnsr_node->mmn_nodename)
49070Sstevel@tonic-gate 				    != 0)) {
49080Sstevel@tonic-gate 					rval = 205;
49090Sstevel@tonic-gate 				} else {
49100Sstevel@tonic-gate 					/* Any other failure */
49110Sstevel@tonic-gate 					rval = -1;
49120Sstevel@tonic-gate 				}
49130Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
49140Sstevel@tonic-gate 				    "Master node %s unable to "
49150Sstevel@tonic-gate 				    "retrieve drive list from node %s"),
49160Sstevel@tonic-gate 				    mynode(), mnsr_node->mmn_nodename);
49170Sstevel@tonic-gate 				goto out;
49180Sstevel@tonic-gate 			}
49190Sstevel@tonic-gate 			mnsr_node->mmn_dd = other_dd;
49200Sstevel@tonic-gate 			dd = master_dd;
49210Sstevel@tonic-gate 			while (dd) {
49220Sstevel@tonic-gate 				if (!(dd->dd_flags & MD_DR_OK)) {
49230Sstevel@tonic-gate 					dd = dd->dd_next;
49240Sstevel@tonic-gate 					continue;
49250Sstevel@tonic-gate 				}
49260Sstevel@tonic-gate 				other_dd = mnsr_node->mmn_dd;
49270Sstevel@tonic-gate 				while (other_dd) {
49280Sstevel@tonic-gate 					/* Convert to devids, when available */
49290Sstevel@tonic-gate 					if (strcmp(other_dd->dd_dnp->cname,
49300Sstevel@tonic-gate 					    dd->dd_dnp->cname) == 0) {
49310Sstevel@tonic-gate 						break;
49320Sstevel@tonic-gate 					}
49330Sstevel@tonic-gate 					other_dd = other_dd->dd_next;
49340Sstevel@tonic-gate 				}
49350Sstevel@tonic-gate 				/*
49360Sstevel@tonic-gate 				 * dd not found on slave so mark it
49370Sstevel@tonic-gate 				 * ADD for later deletion (drives in ADD
49380Sstevel@tonic-gate 				 * state are deleted later in this routine).
49390Sstevel@tonic-gate 				 */
49400Sstevel@tonic-gate 				if (other_dd == NULL) {
49410Sstevel@tonic-gate 					dd->dd_flags = MD_DR_ADD;
49420Sstevel@tonic-gate 				}
49430Sstevel@tonic-gate 				dd = dd->dd_next;
49440Sstevel@tonic-gate 			}
49450Sstevel@tonic-gate 
49460Sstevel@tonic-gate 		}
49470Sstevel@tonic-gate 		mnsr_node = mnsr_node->mmn_next;
49480Sstevel@tonic-gate 	}
49490Sstevel@tonic-gate 
49500Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
49510Sstevel@tonic-gate 	    "Drive check completed for set %s: %s"),
49520Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
49530Sstevel@tonic-gate 
49540Sstevel@tonic-gate 	dd = master_dd;
49550Sstevel@tonic-gate 	dd_prev = 0;
49560Sstevel@tonic-gate 	while (dd) {
49570Sstevel@tonic-gate 		/* Remove any ADD drives from list */
49580Sstevel@tonic-gate 		if (dd->dd_flags & MD_DR_ADD) {
49590Sstevel@tonic-gate 			if (dd_prev) {
49600Sstevel@tonic-gate 				dd_prev->dd_next = dd->dd_next;
49610Sstevel@tonic-gate 				dd->dd_next = NULL;
49620Sstevel@tonic-gate 				metafreedrivedesc(&dd);
49630Sstevel@tonic-gate 				dd = dd_prev->dd_next;
49640Sstevel@tonic-gate 			} else {
49650Sstevel@tonic-gate 				/*
49660Sstevel@tonic-gate 				 * If removing drive descriptor from head
49670Sstevel@tonic-gate 				 * of linked list, also change sd->sd_drvs.
49680Sstevel@tonic-gate 				 */
49690Sstevel@tonic-gate 				master_dd = sd->sd_drvs = dd->dd_next;
49700Sstevel@tonic-gate 				dd->dd_next = NULL;
49710Sstevel@tonic-gate 				metafreedrivedesc(&dd);
49720Sstevel@tonic-gate 				dd = master_dd;
49730Sstevel@tonic-gate 			}
49740Sstevel@tonic-gate 			/* dd setup in if/else above */
49750Sstevel@tonic-gate 			continue;
49760Sstevel@tonic-gate 		}
49770Sstevel@tonic-gate 		/*
49780Sstevel@tonic-gate 		 * If drive is marked DEL, check all other nodes.
49790Sstevel@tonic-gate 		 * If drive on another node is marked OK, mark drive OK
49800Sstevel@tonic-gate 		 * in master list.  If drive is marked DEL or doesn't exist
49810Sstevel@tonic-gate 		 * on all nodes, remove drive from list.
49820Sstevel@tonic-gate 		 */
49830Sstevel@tonic-gate 		if (dd->dd_flags & MD_DR_DEL) {
49840Sstevel@tonic-gate 			mnsr_node = master_mnsr_node;
49850Sstevel@tonic-gate 			while (mnsr_node) {
49860Sstevel@tonic-gate 				if (mnsr_node->mmn_dd == NULL) {
49870Sstevel@tonic-gate 				    if (clnt_getdrivedesc(
49880Sstevel@tonic-gate 					mnsr_node->mmn_nodename, sp,
49890Sstevel@tonic-gate 					&other_dd, ep)) {
49900Sstevel@tonic-gate 					    /* RPC failure to !my node */
49910Sstevel@tonic-gate 					    if ((mdanyrpcerror(ep)) &&
49920Sstevel@tonic-gate 						(strcmp(mynode(),
49930Sstevel@tonic-gate 						mnsr_node->mmn_nodename)
49940Sstevel@tonic-gate 						!= 0)) {
49950Sstevel@tonic-gate 						    rval = 205;
49960Sstevel@tonic-gate 					    } else {
49970Sstevel@tonic-gate 						    /* Any other failure */
49980Sstevel@tonic-gate 						    rval = -1;
49990Sstevel@tonic-gate 					    }
50000Sstevel@tonic-gate 					    mde_perror(ep, dgettext(TEXT_DOMAIN,
50010Sstevel@tonic-gate 						"Master node %s unable "
50020Sstevel@tonic-gate 						"to retrieve drive list from "
50030Sstevel@tonic-gate 						"node %s"), mynode(),
50040Sstevel@tonic-gate 						mnsr_node->mmn_nodename);
50050Sstevel@tonic-gate 					    goto out;
50060Sstevel@tonic-gate 				    }
50070Sstevel@tonic-gate 				    mnsr_node->mmn_dd = other_dd;
50080Sstevel@tonic-gate 				}
50090Sstevel@tonic-gate 				other_dd = mnsr_node->mmn_dd;
50100Sstevel@tonic-gate 				while (other_dd) {
50110Sstevel@tonic-gate 					/* Found drive (OK) from other node */
50120Sstevel@tonic-gate 					if (strcmp(dd->dd_dnp->cname,
50130Sstevel@tonic-gate 					    other_dd->dd_dnp->cname)
50140Sstevel@tonic-gate 					    == 0) {
50150Sstevel@tonic-gate 						/* Drive marked OK */
50160Sstevel@tonic-gate 						if (other_dd->dd_flags &
50170Sstevel@tonic-gate 						    MD_DR_OK) {
50180Sstevel@tonic-gate 						    dd->dd_flags = MD_DR_OK;
50190Sstevel@tonic-gate 						}
50200Sstevel@tonic-gate 						break;
50210Sstevel@tonic-gate 					}
50220Sstevel@tonic-gate 					other_dd = other_dd->dd_next;
50230Sstevel@tonic-gate 				}
50240Sstevel@tonic-gate 				if (dd->dd_flags == MD_DR_OK)
50250Sstevel@tonic-gate 					break;
50260Sstevel@tonic-gate 
50270Sstevel@tonic-gate 				mnsr_node = mnsr_node->mmn_next;
50280Sstevel@tonic-gate 			}
50290Sstevel@tonic-gate 			/*
50300Sstevel@tonic-gate 			 * If no node had this drive marked OK, delete it.
50310Sstevel@tonic-gate 			 */
50320Sstevel@tonic-gate 			if (dd->dd_flags & MD_DR_DEL) {
50330Sstevel@tonic-gate 				if (dd_prev) {
50340Sstevel@tonic-gate 					dd_prev->dd_next = dd->dd_next;
50350Sstevel@tonic-gate 					dd->dd_next = NULL;
50360Sstevel@tonic-gate 					metafreedrivedesc(&dd);
50370Sstevel@tonic-gate 					dd = dd_prev->dd_next;
50380Sstevel@tonic-gate 				} else {
50390Sstevel@tonic-gate 					/*
50400Sstevel@tonic-gate 					 * If removing drive descriptor from
50410Sstevel@tonic-gate 					 * head of linked list, also change
50420Sstevel@tonic-gate 					 * sd->sd_drvs.
50430Sstevel@tonic-gate 					 */
50440Sstevel@tonic-gate 					master_dd = sd->sd_drvs = dd->dd_next;
50450Sstevel@tonic-gate 					dd->dd_next = NULL;
50460Sstevel@tonic-gate 					metafreedrivedesc(&dd);
50470Sstevel@tonic-gate 					dd = master_dd;
50480Sstevel@tonic-gate 				}
50490Sstevel@tonic-gate 				/* dd setup in if/else above */
50500Sstevel@tonic-gate 				continue;
50510Sstevel@tonic-gate 			}
50520Sstevel@tonic-gate 		}
50530Sstevel@tonic-gate 		dd_prev = dd;
50540Sstevel@tonic-gate 		dd = dd->dd_next;
50550Sstevel@tonic-gate 	}
50560Sstevel@tonic-gate 
50570Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
50580Sstevel@tonic-gate 	    "Setting drive states completed for set %s: %s"),
50590Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
50600Sstevel@tonic-gate 
50610Sstevel@tonic-gate send_drive_list:
50620Sstevel@tonic-gate 	/*
50630Sstevel@tonic-gate 	 * Set genid on all drives to be the highest value seen.
50640Sstevel@tonic-gate 	 */
50650Sstevel@tonic-gate 	dd = master_dd;
50660Sstevel@tonic-gate 	while (dd) {
50670Sstevel@tonic-gate 		dd->dd_genid = max_genid;
50680Sstevel@tonic-gate 		dd = dd->dd_next;
50690Sstevel@tonic-gate 	}
50700Sstevel@tonic-gate 	/*
50710Sstevel@tonic-gate 	 * Send updated drive list to all alive nodes.
50720Sstevel@tonic-gate 	 * Will also set genid on set and node records to have same
50730Sstevel@tonic-gate 	 * as the drive records.
50740Sstevel@tonic-gate 	 */
50750Sstevel@tonic-gate 	nd = sd->sd_nodelist;
50760Sstevel@tonic-gate 	while (nd) {
50770Sstevel@tonic-gate 		/* Skip non-alive nodes */
50780Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
50790Sstevel@tonic-gate 			nd = nd->nd_next;
50800Sstevel@tonic-gate 			continue;
50810Sstevel@tonic-gate 		}
50820Sstevel@tonic-gate 		if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) {
50830Sstevel@tonic-gate 			/* RPC failure to another node */
50840Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
50850Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) {
50860Sstevel@tonic-gate 				rval = 205;
50870Sstevel@tonic-gate 			} else {
50880Sstevel@tonic-gate 				/* Any other failure */
50890Sstevel@tonic-gate 				rval = -1;
50900Sstevel@tonic-gate 			}
50910Sstevel@tonic-gate 			goto out;
50920Sstevel@tonic-gate 		}
50930Sstevel@tonic-gate 		nd = nd->nd_next;
50940Sstevel@tonic-gate 	}
50950Sstevel@tonic-gate 
50960Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
50970Sstevel@tonic-gate 	    "Sent drive list to all nodes for set %s: %s"),
50980Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
50990Sstevel@tonic-gate 
51000Sstevel@tonic-gate 	/*
51010Sstevel@tonic-gate 	 * If no drive records left in set and nodes had been joined,
51020Sstevel@tonic-gate 	 * withdraw the nodes.  Always reset the master and mark
51030Sstevel@tonic-gate 	 * all nodes as withdrawn on all nodes.
51040Sstevel@tonic-gate 	 */
51050Sstevel@tonic-gate 	if (master_dd == NULL) {
51060Sstevel@tonic-gate 		/* Reset new master flag since no longer master */
51070Sstevel@tonic-gate 		(void) memset(&sf, 0, sizeof (sf));
51080Sstevel@tonic-gate 		sf.sf_setno = sp->setno;
51090Sstevel@tonic-gate 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
51100Sstevel@tonic-gate 		sf.sf_flags = MDDB_NM_RESET;
51110Sstevel@tonic-gate 		/* Use magic to help protect ioctl against attack. */
51120Sstevel@tonic-gate 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
51130Sstevel@tonic-gate 		/* Ignore failure, failure to reset flag isn't catastrophic */
51140Sstevel@tonic-gate 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
51150Sstevel@tonic-gate 		    &sf.sf_mde, NULL);
51160Sstevel@tonic-gate 
51170Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
51180Sstevel@tonic-gate 		    "Reset new master flag for " "set %s: %s"),
51190Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
51200Sstevel@tonic-gate 
51210Sstevel@tonic-gate 		nd = sd->sd_nodelist;
51220Sstevel@tonic-gate 		while (nd) {
51230Sstevel@tonic-gate 			/* Skip non-alive nodes  */
51240Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
51250Sstevel@tonic-gate 				nd = nd->nd_next;
51260Sstevel@tonic-gate 				continue;
51270Sstevel@tonic-gate 			}
51280Sstevel@tonic-gate 
51290Sstevel@tonic-gate 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
51300Sstevel@tonic-gate 				/* RPC failure to another node */
51310Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
51320Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
51330Sstevel@tonic-gate 				    nd->nd_nodeid)) {
51340Sstevel@tonic-gate 					rval = 205;
51350Sstevel@tonic-gate 				} else {
51360Sstevel@tonic-gate 					/* Any other failure */
51370Sstevel@tonic-gate 					rval = -1;
51380Sstevel@tonic-gate 				}
51390Sstevel@tonic-gate 				goto out;
51400Sstevel@tonic-gate 			}
51410Sstevel@tonic-gate 			set_locked = 1;
51420Sstevel@tonic-gate 
51430Sstevel@tonic-gate 			/* Withdraw node from set if owner */
51440Sstevel@tonic-gate 			if ((nd->nd_flags & MD_MN_NODE_OWN) &&
51450Sstevel@tonic-gate 			    (clnt_withdrawset(nd->nd_nodename, sp, ep))) {
51460Sstevel@tonic-gate 				/* RPC failure to another node */
51470Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
51480Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
51490Sstevel@tonic-gate 				    nd->nd_nodeid)) {
51500Sstevel@tonic-gate 					rval = 205;
51510Sstevel@tonic-gate 				} else {
51520Sstevel@tonic-gate 					/* Any other failure */
51530Sstevel@tonic-gate 					rval = -1;
51540Sstevel@tonic-gate 				}
51550Sstevel@tonic-gate 				goto out;
51560Sstevel@tonic-gate 			}
51570Sstevel@tonic-gate 
51580Sstevel@tonic-gate 			/* Mark all nodes as withdrawn on this node */
51590Sstevel@tonic-gate 			if (clnt_upd_nr_flags(nd->nd_nodename, sp,
51600Sstevel@tonic-gate 			    sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) {
51610Sstevel@tonic-gate 				/* RPC failure to another node */
51620Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
51630Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
51640Sstevel@tonic-gate 				    nd->nd_nodeid)) {
51650Sstevel@tonic-gate 					rval = 205;
51660Sstevel@tonic-gate 				} else {
51670Sstevel@tonic-gate 					/* Any other failure */
51680Sstevel@tonic-gate 					rval = -1;
51690Sstevel@tonic-gate 				}
51700Sstevel@tonic-gate 				goto out;
51710Sstevel@tonic-gate 			}
51720Sstevel@tonic-gate 
51730Sstevel@tonic-gate 			/* Resets master to no-master on this node */
51740Sstevel@tonic-gate 			if (clnt_mnsetmaster(nd->nd_nodename, sp,
51750Sstevel@tonic-gate 			    "", MD_MN_INVALID_NID, ep)) {
51760Sstevel@tonic-gate 				/* RPC failure to another node */
51770Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
51780Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
51790Sstevel@tonic-gate 				    nd->nd_nodeid)) {
51800Sstevel@tonic-gate 					rval = 205;
51810Sstevel@tonic-gate 				} else {
51820Sstevel@tonic-gate 					/* Any other failure */
51830Sstevel@tonic-gate 					rval = -1;
51840Sstevel@tonic-gate 				}
51850Sstevel@tonic-gate 				goto out;
51860Sstevel@tonic-gate 			}
51870Sstevel@tonic-gate 
51880Sstevel@tonic-gate 			cl_sk = cl_get_setkey(sp->setno, sp->setname);
51890Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) {
51900Sstevel@tonic-gate 				/* RPC failure to another node */
51910Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
51920Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
51930Sstevel@tonic-gate 				    nd->nd_nodeid)) {
51940Sstevel@tonic-gate 					rval = 205;
51950Sstevel@tonic-gate 				} else {
51960Sstevel@tonic-gate 					/* Any other failure */
51970Sstevel@tonic-gate 					rval = -1;
51980Sstevel@tonic-gate 				}
51990Sstevel@tonic-gate 				goto out;
52000Sstevel@tonic-gate 			}
52010Sstevel@tonic-gate 			set_locked = 0;
52020Sstevel@tonic-gate 			nd = nd->nd_next;
52030Sstevel@tonic-gate 		}
52040Sstevel@tonic-gate 	}
52050Sstevel@tonic-gate 
52060Sstevel@tonic-gate out:
52070Sstevel@tonic-gate 	/*
52080Sstevel@tonic-gate 	 * If got here and set is still locked, then an error has
52090Sstevel@tonic-gate 	 * occurred and master_nodelist is still valid.
52100Sstevel@tonic-gate 	 * If error is not an RPC error, then unlock.
52110Sstevel@tonic-gate 	 * If error is an RPC error, skip unlocks since this could cause
52120Sstevel@tonic-gate 	 * yet another RPC timeout if a node has failed.
52130Sstevel@tonic-gate 	 * Ignore failures in unlock since unlock is just trying to
52140Sstevel@tonic-gate 	 * clean things up.
52150Sstevel@tonic-gate 	 */
52160Sstevel@tonic-gate 	if ((set_locked) && !(mdanyrpcerror(ep))) {
52170Sstevel@tonic-gate 		nd = master_nodelist;
52180Sstevel@tonic-gate 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
52190Sstevel@tonic-gate 		while (nd) {
52200Sstevel@tonic-gate 			/* Skip non-alive nodes */
52210Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
52220Sstevel@tonic-gate 				nd = nd->nd_next;
52230Sstevel@tonic-gate 				continue;
52240Sstevel@tonic-gate 			}
52250Sstevel@tonic-gate 			/*
52260Sstevel@tonic-gate 			 * If clnt_unlock fails, just break out since next
52270Sstevel@tonic-gate 			 * reconfig cycle will reset the locks anyway.
52280Sstevel@tonic-gate 			 */
52290Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
52300Sstevel@tonic-gate 				break;
52310Sstevel@tonic-gate 			}
52320Sstevel@tonic-gate 			nd = nd->nd_next;
52330Sstevel@tonic-gate 		}
52340Sstevel@tonic-gate 		cl_set_setkey(NULL);
52350Sstevel@tonic-gate 	}
52360Sstevel@tonic-gate 	/* Free master_mnsr and drive descs */
52370Sstevel@tonic-gate 	mnsr_node = master_mnsr_node;
52380Sstevel@tonic-gate 	while (mnsr_node) {
52390Sstevel@tonic-gate 		master_mnsr_node = mnsr_node->mmn_next;
52400Sstevel@tonic-gate 		free_sr((md_set_record *)mnsr_node->mmn_mnsr);
52410Sstevel@tonic-gate 		free_rem_dd(mnsr_node->mmn_dd);
52420Sstevel@tonic-gate 		Free(mnsr_node);
52430Sstevel@tonic-gate 		mnsr_node = master_mnsr_node;
52440Sstevel@tonic-gate 	}
52450Sstevel@tonic-gate 
52460Sstevel@tonic-gate 	/* Frees sd->sd_drvs (which is also master_dd) */
52470Sstevel@tonic-gate 	metaflushsetname(sp);
52480Sstevel@tonic-gate 	return (rval);
52490Sstevel@tonic-gate }
52500Sstevel@tonic-gate 
52510Sstevel@tonic-gate /*
52520Sstevel@tonic-gate  * meta_mnsync_diskset_mddbs
52530Sstevel@tonic-gate  * Calling node is guaranteed to be an owner node.
52540Sstevel@tonic-gate  * Calling node is the master node.
52550Sstevel@tonic-gate  *
52560Sstevel@tonic-gate  * Master node verifies that ondisk mddb format matches its incore format.
52570Sstevel@tonic-gate  * If no nodes are joined to set, remove the change log entries.
52580Sstevel@tonic-gate  * If a node is joined to set, play the change log.
52590Sstevel@tonic-gate  *
52600Sstevel@tonic-gate  * Returns	 0 - Success
52610Sstevel@tonic-gate  *		 1 - Master unable to join to set.
52620Sstevel@tonic-gate  *		205 - Failure during RPC to another node
52630Sstevel@tonic-gate  *		-1 - Any other failure and ep is filled in.
52640Sstevel@tonic-gate  *			-1 return will eventually cause node to panic
52650Sstevel@tonic-gate  *			in a SunCluster environment.
52660Sstevel@tonic-gate  */
52670Sstevel@tonic-gate int
52680Sstevel@tonic-gate meta_mnsync_diskset_mddbs(
52690Sstevel@tonic-gate 	mdsetname_t	*sp,
52700Sstevel@tonic-gate 	md_error_t	*ep
52710Sstevel@tonic-gate )
52720Sstevel@tonic-gate {
52730Sstevel@tonic-gate 	md_set_desc		*sd;
52740Sstevel@tonic-gate 	mddb_config_t		c;
52750Sstevel@tonic-gate 	md_mn_msgclass_t	class;
52760Sstevel@tonic-gate 	mddb_setflags_config_t	sf;
52770Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2;
52780Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
52790Sstevel@tonic-gate 	int			stale_set = 0;
52800Sstevel@tonic-gate 
52810Sstevel@tonic-gate 	/* If setname is there, set desc should exist. */
52820Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
52830Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
52840Sstevel@tonic-gate 		    "Unable to get set %s desc information"), sp->setname);
52850Sstevel@tonic-gate 		return (-1);
52860Sstevel@tonic-gate 	}
52870Sstevel@tonic-gate 
52880Sstevel@tonic-gate 	/* Are there drives in the set? */
52890Sstevel@tonic-gate 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
52900Sstevel@tonic-gate 	    ep) == NULL) {
52910Sstevel@tonic-gate 		if (! mdisok(ep)) {
52920Sstevel@tonic-gate 			return (-1);
52930Sstevel@tonic-gate 		}
52940Sstevel@tonic-gate 		/* No drives in set -- nothing to sync up */
52950Sstevel@tonic-gate 		return (0);
52960Sstevel@tonic-gate 	}
52970Sstevel@tonic-gate 
52980Sstevel@tonic-gate 	/*
52990Sstevel@tonic-gate 	 * Is master node (which is this node) joined to set?
53000Sstevel@tonic-gate 	 * If master node isn't joined (which means that no nodes
53010Sstevel@tonic-gate 	 * are joined to diskset), remove the change log entries
53020Sstevel@tonic-gate 	 * since no need to replay them - all nodes will have same
53030Sstevel@tonic-gate 	 * view of mddbs since all nodes are reading in the mddbs
53040Sstevel@tonic-gate 	 * from disk.
53050Sstevel@tonic-gate 	 * There is also no need to sync up the master and ondisk mddbs
53060Sstevel@tonic-gate 	 * since master has no incore knowledge.
53070Sstevel@tonic-gate 	 * Need to join master to set in order to flush the change
53080Sstevel@tonic-gate 	 * log entries. Don't need to block I/O during join of master
53090Sstevel@tonic-gate 	 * to set since no other nodes are joined to set and so no I/O
53100Sstevel@tonic-gate 	 * can be occurring.
53110Sstevel@tonic-gate 	 */
53120Sstevel@tonic-gate 	if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
53130Sstevel@tonic-gate 		/* Join master to set */
53140Sstevel@tonic-gate 		if (clnt_joinset(mynode(), sp,
53150Sstevel@tonic-gate 		    MNSET_IN_RECONFIG, ep)) {
53160Sstevel@tonic-gate 			if (mdismddberror(ep, MDE_DB_STALE)) {
53170Sstevel@tonic-gate 				/*
53180Sstevel@tonic-gate 				 * If STALE, print message and continue on.
53190Sstevel@tonic-gate 				 * Don't do any writes or reads to mddbs
53200Sstevel@tonic-gate 				 * so don't clear change log.
53210Sstevel@tonic-gate 				 */
53220Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
53230Sstevel@tonic-gate 				    "Join of master node to STALE set %s"),
53240Sstevel@tonic-gate 				    sp->setname);
53250Sstevel@tonic-gate 				stale_set = 1;
53260Sstevel@tonic-gate 				mdclrerror(ep);
53270Sstevel@tonic-gate 			} else if (mdismddberror(ep, MDE_DB_ACCOK)) {
53280Sstevel@tonic-gate 				/* ACCOK means mediator provided extra vote */
53290Sstevel@tonic-gate 				mdclrerror(ep);
53300Sstevel@tonic-gate 			} else {
53310Sstevel@tonic-gate 				/*
53320Sstevel@tonic-gate 				 * If master is unable to join set, print an
53330Sstevel@tonic-gate 				 * error message.  Don't return failure or node
53340Sstevel@tonic-gate 				 * will panic during cluster reconfig cycle.
53350Sstevel@tonic-gate 				 * Also, withdraw node from set in order to
53360Sstevel@tonic-gate 				 * cleanup from failed join attempt.
53370Sstevel@tonic-gate 				 */
53380Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
53390Sstevel@tonic-gate 				    "Join of master node in set %s failed"),
53400Sstevel@tonic-gate 				    sp->setname);
53410Sstevel@tonic-gate 				if (clnt_withdrawset(mynode(), sp, &xep))
53420Sstevel@tonic-gate 					mdclrerror(&xep);
53430Sstevel@tonic-gate 				return (1);
53440Sstevel@tonic-gate 			}
53450Sstevel@tonic-gate 		}
53460Sstevel@tonic-gate 		/*
53470Sstevel@tonic-gate 		 * Master node successfully joined.
53480Sstevel@tonic-gate 		 * Set local copy of flags to OWN and
53490Sstevel@tonic-gate 		 * send owner flag to rpc.metad. If not stale,
53500Sstevel@tonic-gate 		 * flush the change log.
53510Sstevel@tonic-gate 		 */
53520Sstevel@tonic-gate 		sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN;
53530Sstevel@tonic-gate 		if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET,
53540Sstevel@tonic-gate 		    MNSET_IN_RECONFIG, ep)) {
53550Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
53560Sstevel@tonic-gate 			    "Flag update of master node join in set %s failed"),
53570Sstevel@tonic-gate 			    sp->setname);
53580Sstevel@tonic-gate 			return (-1);
53590Sstevel@tonic-gate 		}
53600Sstevel@tonic-gate 
53610Sstevel@tonic-gate 		if (!stale_set) {
53620Sstevel@tonic-gate 			if (mdmn_reset_changelog(sp, ep,
53630Sstevel@tonic-gate 			    MDMN_CLF_RESETLOG) != 0) {
53640Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
53650Sstevel@tonic-gate 				    "Unable to reset changelog."));
53660Sstevel@tonic-gate 				return (-1);
53670Sstevel@tonic-gate 			}
53680Sstevel@tonic-gate 			meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
53690Sstevel@tonic-gate 			    "Removed changelog entries for set %s: %s"),
53700Sstevel@tonic-gate 			    sp->setname,
53710Sstevel@tonic-gate 			    meta_print_hrtime(gethrtime() - start_time));
53720Sstevel@tonic-gate 		}
53730Sstevel@tonic-gate 		/* Reset new master flag before return */
53740Sstevel@tonic-gate 		(void) memset(&sf, 0, sizeof (sf));
53750Sstevel@tonic-gate 		sf.sf_setno = sp->setno;
53760Sstevel@tonic-gate 		sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
53770Sstevel@tonic-gate 		sf.sf_flags = MDDB_NM_RESET;
53780Sstevel@tonic-gate 		/* Use magic to help protect ioctl against attack. */
53790Sstevel@tonic-gate 		sf.sf_magic = MDDB_SETFLAGS_MAGIC;
53800Sstevel@tonic-gate 		/* Ignore failure, failure to reset flag isn't catastrophic */
53810Sstevel@tonic-gate 		(void) metaioctl(MD_MN_SET_SETFLAGS, &sf,
53820Sstevel@tonic-gate 		    &sf.sf_mde, NULL);
53830Sstevel@tonic-gate 
53840Sstevel@tonic-gate 		meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
53850Sstevel@tonic-gate 		    "Reset new master flag for set %s: %s"),
53860Sstevel@tonic-gate 		    sp->setname, meta_print_hrtime(gethrtime() - start_time));
53870Sstevel@tonic-gate 
53880Sstevel@tonic-gate 		return (0);
53890Sstevel@tonic-gate 	}
53900Sstevel@tonic-gate 
53910Sstevel@tonic-gate 	/*
53920Sstevel@tonic-gate 	 * Is master already joined to STALE set (< 50% mddbs avail)?
53930Sstevel@tonic-gate 	 * If so, can make no config changes to mddbs so don't check or play
53940Sstevel@tonic-gate 	 * changelog and don't sync master node to ondisk mddbs.
53950Sstevel@tonic-gate 	 * To get out of the stale state all nodes must be withdrawn
53960Sstevel@tonic-gate 	 * from set.  Then as nodes are re-joined, all nodes will
53970Sstevel@tonic-gate 	 * have same view of mddbs since all nodes are reading the
53980Sstevel@tonic-gate 	 * mddbs from disk.
53990Sstevel@tonic-gate 	 */
54000Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
54010Sstevel@tonic-gate 	c.c_id = 0;
54020Sstevel@tonic-gate 	c.c_setno = sp->setno;
54030Sstevel@tonic-gate 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
54040Sstevel@tonic-gate 		(void) mdstealerror(ep, &c.c_mde);
54050Sstevel@tonic-gate 		return (-1);
54060Sstevel@tonic-gate 	}
54070Sstevel@tonic-gate 	if (c.c_flags & MDDB_C_STALE) {
54080Sstevel@tonic-gate 		return (0);
54090Sstevel@tonic-gate 	}
54100Sstevel@tonic-gate 
54110Sstevel@tonic-gate 	/*
54120Sstevel@tonic-gate 	 * If this node is NOT a newly chosen master, then there's
54130Sstevel@tonic-gate 	 * nothing else to do since the change log should be empty and
54140Sstevel@tonic-gate 	 * the ondisk and incore mddbs are already consistent.
54150Sstevel@tonic-gate 	 *
54160Sstevel@tonic-gate 	 * A newly chosen master is a node that was not the master
54170Sstevel@tonic-gate 	 * at the beginning of the reconfig cycle.  If a node is a new
54180Sstevel@tonic-gate 	 * master, then the new master state is reset after the ondisk
54190Sstevel@tonic-gate 	 * and incore mddbs are consistent and the change log has
54200Sstevel@tonic-gate 	 * been replayed.
54210Sstevel@tonic-gate 	 */
54220Sstevel@tonic-gate 	(void) memset(&sf, 0, sizeof (sf));
54230Sstevel@tonic-gate 	sf.sf_setno = sp->setno;
54240Sstevel@tonic-gate 	sf.sf_flags = MDDB_NM_GET;
54250Sstevel@tonic-gate 	/* Use magic to help protect ioctl against attack. */
54260Sstevel@tonic-gate 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
54270Sstevel@tonic-gate 	if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) &&
54280Sstevel@tonic-gate 	    ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) {
54290Sstevel@tonic-gate 		return (0);
54300Sstevel@tonic-gate 	}
54310Sstevel@tonic-gate 
54320Sstevel@tonic-gate 	/*
54330Sstevel@tonic-gate 	 * Now, sync up incore master view to ondisk mddbs.
54340Sstevel@tonic-gate 	 * This is needed in the case where a master node
54350Sstevel@tonic-gate 	 * had made a change to the mddb, but this change
54360Sstevel@tonic-gate 	 * may not have been relayed to the slaves yet.
54370Sstevel@tonic-gate 	 * So, the new master needs to verify that the ondisk
54380Sstevel@tonic-gate 	 * mddbs match what the new master has incore -
54390Sstevel@tonic-gate 	 * if different, new master rewrites all of the mddbs.
54400Sstevel@tonic-gate 	 * Then the new master will replay the changelog and the
54410Sstevel@tonic-gate 	 * new master will then execute what the old master had
54420Sstevel@tonic-gate 	 * done.
54430Sstevel@tonic-gate 	 *
54440Sstevel@tonic-gate 	 * Block all I/Os to disks in this diskset on all nodes in
54450Sstevel@tonic-gate 	 * the diskset.  This will allow the rewriting of the mddbs
54460Sstevel@tonic-gate 	 * (if needed), to proceed in a timely manner.
54470Sstevel@tonic-gate 	 *
54480Sstevel@tonic-gate 	 * If block of I/Os fail, return a -1.
54490Sstevel@tonic-gate 	 */
54500Sstevel@tonic-gate 
54510Sstevel@tonic-gate 	nd = sd->sd_nodelist;
54520Sstevel@tonic-gate 	while (nd) {
54530Sstevel@tonic-gate 		/* Skip non-alive and non-owner nodes  */
54540Sstevel@tonic-gate 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
54550Sstevel@tonic-gate 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
54560Sstevel@tonic-gate 			nd = nd->nd_next;
54570Sstevel@tonic-gate 			continue;
54580Sstevel@tonic-gate 		}
54590Sstevel@tonic-gate 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
54600Sstevel@tonic-gate 		    MN_SUSP_IO, ep)) {
54610Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
54620Sstevel@tonic-gate 			    "Unable to suspend I/O on node %s in set %s"),
54630Sstevel@tonic-gate 			    nd->nd_nodename, sp->setname);
54640Sstevel@tonic-gate 
54650Sstevel@tonic-gate 			/*
54660Sstevel@tonic-gate 			 * Resume all other nodes that had been suspended.
54670Sstevel@tonic-gate 			 * (Reconfig return step also resumes I/Os
54680Sstevel@tonic-gate 			 * for all sets.)
54690Sstevel@tonic-gate 			 */
54700Sstevel@tonic-gate 			nd2 = sd->sd_nodelist;
54710Sstevel@tonic-gate 			while (nd2) {
54720Sstevel@tonic-gate 				/* Stop when reaching failed node */
54730Sstevel@tonic-gate 				if (nd2->nd_nodeid == nd->nd_nodeid)
54740Sstevel@tonic-gate 					break;
54750Sstevel@tonic-gate 				/* Skip non-alive and non-owner nodes  */
54760Sstevel@tonic-gate 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
54770Sstevel@tonic-gate 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
54780Sstevel@tonic-gate 					nd2 = nd2->nd_next;
54790Sstevel@tonic-gate 					continue;
54800Sstevel@tonic-gate 				}
54810Sstevel@tonic-gate 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
54820Sstevel@tonic-gate 					sp->setno, MN_RES_IO, &xep));
54830Sstevel@tonic-gate 				nd2 = nd2->nd_next;
54840Sstevel@tonic-gate 			}
54850Sstevel@tonic-gate 
54860Sstevel@tonic-gate 			/*
54870Sstevel@tonic-gate 			 * If an RPC failure on another node, return a 205.
54880Sstevel@tonic-gate 			 * Otherwise, exit with failure.
54890Sstevel@tonic-gate 			 */
54900Sstevel@tonic-gate 			if ((mdanyrpcerror(ep)) &&
54910Sstevel@tonic-gate 			    (sd->sd_mn_mynode->nd_nodeid !=
54920Sstevel@tonic-gate 			    nd->nd_nodeid)) {
54930Sstevel@tonic-gate 				return (205);
54940Sstevel@tonic-gate 			} else {
54950Sstevel@tonic-gate 				return (-1);
54960Sstevel@tonic-gate 			}
54970Sstevel@tonic-gate 
54980Sstevel@tonic-gate 		}
54990Sstevel@tonic-gate 		nd = nd->nd_next;
55000Sstevel@tonic-gate 	}
55010Sstevel@tonic-gate 
55020Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
55030Sstevel@tonic-gate 	c.c_id = 0;
55040Sstevel@tonic-gate 	c.c_setno = sp->setno;
55050Sstevel@tonic-gate 	/* Master can't sync up to ondisk mddbs?  Kick it out of cluster */
55060Sstevel@tonic-gate 	if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0)
55070Sstevel@tonic-gate 		return (-1);
55080Sstevel@tonic-gate 
55090Sstevel@tonic-gate 	/*
55100Sstevel@tonic-gate 	 * Resume I/Os that were suspended above.
55110Sstevel@tonic-gate 	 */
55120Sstevel@tonic-gate 	nd = sd->sd_nodelist;
55130Sstevel@tonic-gate 	while (nd) {
55140Sstevel@tonic-gate 		/* Skip non-alive and non-owner nodes  */
55150Sstevel@tonic-gate 		if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
55160Sstevel@tonic-gate 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
55170Sstevel@tonic-gate 			nd = nd->nd_next;
55180Sstevel@tonic-gate 			continue;
55190Sstevel@tonic-gate 		}
55200Sstevel@tonic-gate 		if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
55210Sstevel@tonic-gate 		    MN_RES_IO, ep)) {
55220Sstevel@tonic-gate 			mde_perror(ep, dgettext(TEXT_DOMAIN,
55230Sstevel@tonic-gate 			    "Unable to resume I/O on node %s in set %s"),
55240Sstevel@tonic-gate 			    nd->nd_nodename, sp->setname);
55250Sstevel@tonic-gate 
55260Sstevel@tonic-gate 			/*
55270Sstevel@tonic-gate 			 * If an RPC failure then don't do any
55280Sstevel@tonic-gate 			 * more RPC calls, since one timeout is enough
55290Sstevel@tonic-gate 			 * to endure.  If RPC failure to another node, return
55300Sstevel@tonic-gate 			 * 205.  If RPC failure to my node, return -1.
55310Sstevel@tonic-gate 			 * If not an RPC failure, continue resuming the
55320Sstevel@tonic-gate 			 * rest of the nodes and then return -1.
55330Sstevel@tonic-gate 			 */
55340Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
55350Sstevel@tonic-gate 				if (sd->sd_mn_mynode->nd_nodeid ==
55360Sstevel@tonic-gate 				    nd->nd_nodeid) {
55370Sstevel@tonic-gate 					return (-1);
55380Sstevel@tonic-gate 				} else {
55390Sstevel@tonic-gate 					return (205);
55400Sstevel@tonic-gate 				}
55410Sstevel@tonic-gate 			}
55420Sstevel@tonic-gate 
55430Sstevel@tonic-gate 			/*
55440Sstevel@tonic-gate 			 * If not an RPC error, continue resuming rest of
55450Sstevel@tonic-gate 			 * nodes, ignoring any failures except for an
55460Sstevel@tonic-gate 			 * RPC failure which constitutes an immediate exit.
55470Sstevel@tonic-gate 			 * Start in middle of list with failing node.
55480Sstevel@tonic-gate 			 */
55490Sstevel@tonic-gate 			nd2 = nd->nd_next;
55500Sstevel@tonic-gate 			while (nd2) {
55510Sstevel@tonic-gate 				/* Skip non-alive and non-owner nodes  */
55520Sstevel@tonic-gate 				if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) ||
55530Sstevel@tonic-gate 				    (!(nd2->nd_flags & MD_MN_NODE_OWN))) {
55540Sstevel@tonic-gate 					nd2 = nd2->nd_next;
55550Sstevel@tonic-gate 					continue;
55560Sstevel@tonic-gate 				}
55570Sstevel@tonic-gate 				(void) (clnt_mn_susp_res_io(nd2->nd_nodename,
55580Sstevel@tonic-gate 					sp->setno, MN_RES_IO, &xep));
55590Sstevel@tonic-gate 				if (mdanyrpcerror(&xep)) {
55600Sstevel@tonic-gate 					return (-1);
55610Sstevel@tonic-gate 				}
55620Sstevel@tonic-gate 				nd2 = nd2->nd_next;
55630Sstevel@tonic-gate 			}
55640Sstevel@tonic-gate 		}
55650Sstevel@tonic-gate 		nd = nd->nd_next;
55660Sstevel@tonic-gate 	}
55670Sstevel@tonic-gate 
55680Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed "
55690Sstevel@tonic-gate 	    "checking/writing the mddb for set %s: %s"), sp->setname,
55700Sstevel@tonic-gate 	    meta_print_hrtime(gethrtime() - start_time));
55710Sstevel@tonic-gate 
55720Sstevel@tonic-gate 	/*
55730Sstevel@tonic-gate 	 * Send (aka replay) all messages we find in the changelog.
55740Sstevel@tonic-gate 	 * Flag the messages with
55750Sstevel@tonic-gate 	 *   MD_MSGF_REPLAY_MSG, so no new message ID is generated for them
55760Sstevel@tonic-gate 	 *   MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd.
55770Sstevel@tonic-gate 	 */
55780Sstevel@tonic-gate 	for (class = MD_MN_NCLASSES - 1; class > 0; class--) {
55790Sstevel@tonic-gate 		mdmn_changelog_record_t	*lr;
55800Sstevel@tonic-gate 		md_error_t	xep = mdnullerror;
55810Sstevel@tonic-gate 		md_mn_result_t	*resultp = NULL;
55820Sstevel@tonic-gate 		int		ret;
55830Sstevel@tonic-gate 
55840Sstevel@tonic-gate 		lr = mdmn_get_changelogrec(sp->setno, class);
55850Sstevel@tonic-gate 		if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) {
55860Sstevel@tonic-gate 			/* no entry for this class */
55870Sstevel@tonic-gate 			continue;
55880Sstevel@tonic-gate 		}
55890Sstevel@tonic-gate 
55900Sstevel@tonic-gate 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
55910Sstevel@tonic-gate 		    "replaying message ID=(%d, 0x%llx-%d)\n"),
55920Sstevel@tonic-gate 		    MSGID_ELEMS(lr->lr_msg.msg_msgid));
55930Sstevel@tonic-gate 
55940Sstevel@tonic-gate 		ret = mdmn_send_message_with_msgid(
55950Sstevel@tonic-gate 			lr->lr_msg.msg_setno,
55960Sstevel@tonic-gate 			lr->lr_msg.msg_type,
55970Sstevel@tonic-gate 			lr->lr_msg.msg_flags |  MD_MSGF_REPLAY_MSG |
55980Sstevel@tonic-gate 						MD_MSGF_OVERRIDE_SUSPEND,
55990Sstevel@tonic-gate 			lr->lr_msg.msg_event_data,
56000Sstevel@tonic-gate 			lr->lr_msg.msg_event_size,
56010Sstevel@tonic-gate 			&resultp,
56020Sstevel@tonic-gate 			&lr->lr_msg.msg_msgid,
56030Sstevel@tonic-gate 			&xep);
56040Sstevel@tonic-gate 
56050Sstevel@tonic-gate 		meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN,
56060Sstevel@tonic-gate 		    "mdmn_send_message returned %d\n"), ret);
56070Sstevel@tonic-gate 
56080Sstevel@tonic-gate 		if (resultp)
56090Sstevel@tonic-gate 			free_result(resultp);
56100Sstevel@tonic-gate 	}
56110Sstevel@tonic-gate 
56120Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
56130Sstevel@tonic-gate 	    "Playing changelog completed for set %s: %s"),
56140Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
56150Sstevel@tonic-gate 
56160Sstevel@tonic-gate 	/*
56170Sstevel@tonic-gate 	 * Now that new master has ondisk and incore mddbs in sync, reset
56180Sstevel@tonic-gate 	 * this node's new master kernel flag (for this set).  If this node
56190Sstevel@tonic-gate 	 * re-enters another reconfig cycle before the completion of this
56200Sstevel@tonic-gate 	 * reconfig cycle, this master node won't need to check if the ondisk
56210Sstevel@tonic-gate 	 * and incore mddbs are in sync since this node won't be considered
56220Sstevel@tonic-gate 	 * a new master (since this flag is being reset here in the middle of
56230Sstevel@tonic-gate 	 * step2).  This will save time during any subsequent reconfig
56240Sstevel@tonic-gate 	 * cycles as long as this node continues to be master.
56250Sstevel@tonic-gate 	 */
56260Sstevel@tonic-gate 	(void) memset(&sf, 0, sizeof (sf));
56270Sstevel@tonic-gate 	sf.sf_setno = sp->setno;
56280Sstevel@tonic-gate 	sf.sf_setflags = MD_SET_MN_NEWMAS_RC;
56290Sstevel@tonic-gate 	sf.sf_flags = MDDB_NM_RESET;
56300Sstevel@tonic-gate 	/* Use magic to help protect ioctl against attack. */
56310Sstevel@tonic-gate 	sf.sf_magic = MDDB_SETFLAGS_MAGIC;
56320Sstevel@tonic-gate 	/* Ignore failure, since failure to reset flag isn't catastrophic */
56330Sstevel@tonic-gate 	(void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL);
56340Sstevel@tonic-gate 
56350Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
56360Sstevel@tonic-gate 	    "Reset new master flag for set %s: %s"),
56370Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
56380Sstevel@tonic-gate 
56390Sstevel@tonic-gate 	return (0);
56400Sstevel@tonic-gate }
56410Sstevel@tonic-gate 
56420Sstevel@tonic-gate /*
56430Sstevel@tonic-gate  * meta_mnjoin_all will join all starting nodes in the diskset.
56440Sstevel@tonic-gate  * A starting node is considered to be any node that is not
56450Sstevel@tonic-gate  * an owner of the set but is a member of the cluster.
56460Sstevel@tonic-gate  * Master node is already joined to set (done in meta_mnsync_diskset_mddbs).
56470Sstevel@tonic-gate  *
56480Sstevel@tonic-gate  * Caller is the Master node.
56490Sstevel@tonic-gate  *
56500Sstevel@tonic-gate  * Returns	 0 - Success
56510Sstevel@tonic-gate  *		205 - Failure during RPC to another node
56520Sstevel@tonic-gate  *		-1 - Any other failure and ep is filled in.
56530Sstevel@tonic-gate  */
56540Sstevel@tonic-gate int
56550Sstevel@tonic-gate meta_mnjoin_all(
56560Sstevel@tonic-gate 	mdsetname_t	*sp,
56570Sstevel@tonic-gate 	md_error_t	*ep
56580Sstevel@tonic-gate )
56590Sstevel@tonic-gate {
56600Sstevel@tonic-gate 	md_set_desc		*sd;
56610Sstevel@tonic-gate 	md_mnnode_desc		*nd, *nd2;
56620Sstevel@tonic-gate 	int			rval = 0;
56630Sstevel@tonic-gate 	int			stale_flag = 0;
56640Sstevel@tonic-gate 	mddb_config_t		c;
56650Sstevel@tonic-gate 	int			susp_res_flag = 0;
56660Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
56670Sstevel@tonic-gate 
56680Sstevel@tonic-gate 	/* If setname is there, set desc should exist. */
56690Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
56700Sstevel@tonic-gate 		mde_perror(ep, dgettext(TEXT_DOMAIN,
56710Sstevel@tonic-gate 		    "Unable to get set %s desc information"), sp->setname);
56720Sstevel@tonic-gate 		return (-1);
56730Sstevel@tonic-gate 	}
56740Sstevel@tonic-gate 
56750Sstevel@tonic-gate 	/* Are there drives in the set? */
56760Sstevel@tonic-gate 	if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
56770Sstevel@tonic-gate 	    ep) == NULL) {
56780Sstevel@tonic-gate 		if (! mdisok(ep)) {
56790Sstevel@tonic-gate 			return (-1);
56800Sstevel@tonic-gate 		}
56810Sstevel@tonic-gate 		/* No drives in set -- nothing to join */
56820Sstevel@tonic-gate 		return (0);
56830Sstevel@tonic-gate 	}
56840Sstevel@tonic-gate 
56850Sstevel@tonic-gate 	/*
56860Sstevel@tonic-gate 	 * Is set currently stale?
56870Sstevel@tonic-gate 	 */
56880Sstevel@tonic-gate 	(void) memset(&c, 0, sizeof (c));
56890Sstevel@tonic-gate 	c.c_id = 0;
56900Sstevel@tonic-gate 	c.c_setno = sp->setno;
56910Sstevel@tonic-gate 	/* Ignore failure since master node may not be joined yet */
56920Sstevel@tonic-gate 	(void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL);
56930Sstevel@tonic-gate 	if (c.c_flags & MDDB_C_STALE) {
56940Sstevel@tonic-gate 		stale_flag = MNSET_IS_STALE;
56950Sstevel@tonic-gate 	}
56960Sstevel@tonic-gate 
56970Sstevel@tonic-gate 	/*
56980Sstevel@tonic-gate 	 * If any nodes are going to be joined to diskset, then
56990Sstevel@tonic-gate 	 * suspend I/O to all disks in diskset so that nodes can join
57000Sstevel@tonic-gate 	 * (read in mddbs) in a reasonable amount of time even under
57010Sstevel@tonic-gate 	 * high I/O load.  Don't need to do this if set is STALE since
57020Sstevel@tonic-gate 	 * no I/O can be occurring to a STALE set.
57030Sstevel@tonic-gate 	 */
57040Sstevel@tonic-gate 	if (stale_flag != MNSET_IS_STALE) {
57050Sstevel@tonic-gate 		nd = sd->sd_nodelist;
57060Sstevel@tonic-gate 		while (nd) {
57070Sstevel@tonic-gate 			/* Found a node that will be joined to diskset */
57080Sstevel@tonic-gate 			if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
57090Sstevel@tonic-gate 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
57100Sstevel@tonic-gate 				/* Set flag that diskset should be suspended */
57110Sstevel@tonic-gate 				susp_res_flag = 1;
57120Sstevel@tonic-gate 				break;
57130Sstevel@tonic-gate 			}
57140Sstevel@tonic-gate 			nd = nd->nd_next;
57150Sstevel@tonic-gate 		}
57160Sstevel@tonic-gate 	}
57170Sstevel@tonic-gate 
57180Sstevel@tonic-gate 	if (susp_res_flag) {
57190Sstevel@tonic-gate 		/*
57200Sstevel@tonic-gate 		 * Block all I/Os to disks in this diskset on all joined
57210Sstevel@tonic-gate 		 * nodes in the diskset.
57220Sstevel@tonic-gate 		 * If block of I/Os fails due to an RPC failure on another
57230Sstevel@tonic-gate 		 * node, return 205; otherwise, return -1.
57240Sstevel@tonic-gate 		 */
57250Sstevel@tonic-gate 		nd = sd->sd_nodelist;
57260Sstevel@tonic-gate 		while (nd) {
57270Sstevel@tonic-gate 			/* Skip non-alive and non-owner nodes  */
57280Sstevel@tonic-gate 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
57290Sstevel@tonic-gate 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
57300Sstevel@tonic-gate 				nd = nd->nd_next;
57310Sstevel@tonic-gate 				continue;
57320Sstevel@tonic-gate 			}
57330Sstevel@tonic-gate 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
57340Sstevel@tonic-gate 			    MN_SUSP_IO, ep)) {
57350Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
57360Sstevel@tonic-gate 				    "Unable to suspend I/O on node %s"
57370Sstevel@tonic-gate 				    " in set %s"), nd->nd_nodename,
57380Sstevel@tonic-gate 				    sp->setname);
57390Sstevel@tonic-gate 				/*
57400Sstevel@tonic-gate 				 * Resume other nodes that had been suspended.
57410Sstevel@tonic-gate 				 * (Reconfig return step also resumes I/Os
57420Sstevel@tonic-gate 				 * for all sets.)
57430Sstevel@tonic-gate 				 */
57440Sstevel@tonic-gate 				nd2 = sd->sd_nodelist;
57450Sstevel@tonic-gate 				while (nd2) {
57460Sstevel@tonic-gate 					/* Stop when reaching failed node */
57470Sstevel@tonic-gate 					if (nd2->nd_nodeid == nd->nd_nodeid)
57480Sstevel@tonic-gate 						break;
57490Sstevel@tonic-gate 					/* Skip non-alive/non-owner nodes  */
57500Sstevel@tonic-gate 					if ((!(nd2->nd_flags &
57510Sstevel@tonic-gate 					    MD_MN_NODE_ALIVE)) ||
57520Sstevel@tonic-gate 					    (!(nd2->nd_flags &
57530Sstevel@tonic-gate 					    MD_MN_NODE_OWN))) {
57540Sstevel@tonic-gate 						nd2 = nd2->nd_next;
57550Sstevel@tonic-gate 						continue;
57560Sstevel@tonic-gate 					}
57570Sstevel@tonic-gate 					(void) (clnt_mn_susp_res_io(
57580Sstevel@tonic-gate 					    nd2->nd_nodename, sp->setno,
57590Sstevel@tonic-gate 					    MN_RES_IO, &xep));
57600Sstevel@tonic-gate 					nd2 = nd2->nd_next;
57610Sstevel@tonic-gate 				}
57620Sstevel@tonic-gate 
57630Sstevel@tonic-gate 				/*
57640Sstevel@tonic-gate 				 * If the suspend failed due to an
57650Sstevel@tonic-gate 				 * RPC failure on another node, return
57660Sstevel@tonic-gate 				 * a 205.
57670Sstevel@tonic-gate 				 * Otherwise, exit with failure.
57680Sstevel@tonic-gate 				 * The return reconfig step will resume
57690Sstevel@tonic-gate 				 * I/Os for all disksets.
57700Sstevel@tonic-gate 				 */
57710Sstevel@tonic-gate 				if ((mdanyrpcerror(ep)) &&
57720Sstevel@tonic-gate 				    (sd->sd_mn_mynode->nd_nodeid !=
57730Sstevel@tonic-gate 				    nd->nd_nodeid)) {
57740Sstevel@tonic-gate 					return (205);
57750Sstevel@tonic-gate 				} else {
57760Sstevel@tonic-gate 					return (-1);
57770Sstevel@tonic-gate 				}
57780Sstevel@tonic-gate 			}
57790Sstevel@tonic-gate 			nd = nd->nd_next;
57800Sstevel@tonic-gate 		}
57810Sstevel@tonic-gate 	}
57820Sstevel@tonic-gate 
57830Sstevel@tonic-gate 	nd = sd->sd_nodelist;
57840Sstevel@tonic-gate 	while (nd) {
57850Sstevel@tonic-gate 		/*
57860Sstevel@tonic-gate 		 * If a node is in the membership list but isn't joined
57870Sstevel@tonic-gate 		 * to the set, try to join the node.
57880Sstevel@tonic-gate 		 */
57890Sstevel@tonic-gate 		if ((nd->nd_flags & MD_MN_NODE_ALIVE) &&
57900Sstevel@tonic-gate 		    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
57910Sstevel@tonic-gate 			if (clnt_joinset(nd->nd_nodename, sp,
57920Sstevel@tonic-gate 			    (MNSET_IN_RECONFIG | stale_flag), ep)) {
57930Sstevel@tonic-gate 				/*
57940Sstevel@tonic-gate 				 * If RPC failure to another node
57950Sstevel@tonic-gate 				 * then exit without attempting anything else.
57960Sstevel@tonic-gate 				 * (Reconfig return step will resume I/Os
57970Sstevel@tonic-gate 				 * for all sets.)
57980Sstevel@tonic-gate 				 */
57990Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
58000Sstevel@tonic-gate 					mde_perror(ep, "");
58010Sstevel@tonic-gate 					return (205);
58020Sstevel@tonic-gate 				}
58030Sstevel@tonic-gate 				/*
58040Sstevel@tonic-gate 				 * STALE and ACCOK failures aren't true
58050Sstevel@tonic-gate 				 * failures.  STALE means that <50% mddbs
58060Sstevel@tonic-gate 				 * are available. ACCOK means that the
58070Sstevel@tonic-gate 				 * mediator provided the extra vote.
58080Sstevel@tonic-gate 				 * If a true failure, then print messasge
58090Sstevel@tonic-gate 				 * and withdraw node from set in order to
58100Sstevel@tonic-gate 				 * cleanup from failed join attempt.
58110Sstevel@tonic-gate 				 */
58120Sstevel@tonic-gate 				if ((!mdismddberror(ep, MDE_DB_STALE)) &&
58130Sstevel@tonic-gate 				    (!mdismddberror(ep, MDE_DB_ACCOK))) {
58140Sstevel@tonic-gate 					mde_perror(ep,
58150Sstevel@tonic-gate 					    "WARNING: Unable to join node %s "
58160Sstevel@tonic-gate 					    "to set %s", nd->nd_nodename,
58170Sstevel@tonic-gate 					    sp->setname);
58180Sstevel@tonic-gate 					mdclrerror(ep);
58190Sstevel@tonic-gate 					if (clnt_withdrawset(nd->nd_nodename,
58200Sstevel@tonic-gate 					    sp, &xep))
58210Sstevel@tonic-gate 						mdclrerror(&xep);
58220Sstevel@tonic-gate 					nd = nd->nd_next;
58230Sstevel@tonic-gate 					continue;
58240Sstevel@tonic-gate 				}
58250Sstevel@tonic-gate 			}
58260Sstevel@tonic-gate 			/* Set owner flag even if STALE or ACCOK */
58270Sstevel@tonic-gate 			nd->nd_flags |= MD_MN_NODE_OWN;
58280Sstevel@tonic-gate 		}
58290Sstevel@tonic-gate 		nd = nd->nd_next;
58300Sstevel@tonic-gate 	}
58310Sstevel@tonic-gate 	/*
58320Sstevel@tonic-gate 	 * Resume I/Os if suspended above.
58330Sstevel@tonic-gate 	 */
58340Sstevel@tonic-gate 	if (susp_res_flag) {
58350Sstevel@tonic-gate 		nd = sd->sd_nodelist;
58360Sstevel@tonic-gate 		while (nd) {
58370Sstevel@tonic-gate 			/*
58380Sstevel@tonic-gate 			 * Skip non-alive and non-owner nodes
58390Sstevel@tonic-gate 			 * (this list doesn't include any of
58400Sstevel@tonic-gate 			 * the nodes that were joined).
58410Sstevel@tonic-gate 			 */
58420Sstevel@tonic-gate 			if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) ||
58430Sstevel@tonic-gate 			    (!(nd->nd_flags & MD_MN_NODE_OWN))) {
58440Sstevel@tonic-gate 				nd = nd->nd_next;
58450Sstevel@tonic-gate 				continue;
58460Sstevel@tonic-gate 			}
58470Sstevel@tonic-gate 			if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno,
58480Sstevel@tonic-gate 			    MN_RES_IO, ep)) {
58490Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
58500Sstevel@tonic-gate 				    "Unable to resume I/O on node %s"
58510Sstevel@tonic-gate 				    " in set %s"), nd->nd_nodename,
58520Sstevel@tonic-gate 				    sp->setname);
58530Sstevel@tonic-gate 
58540Sstevel@tonic-gate 				/*
58550Sstevel@tonic-gate 				 * If an RPC failure then don't do any
58560Sstevel@tonic-gate 				 * more RPC calls, since one timeout is enough
58570Sstevel@tonic-gate 				 * to endure.  If RPC failure to another node,
58580Sstevel@tonic-gate 				 * return 205.  If RPC failure to my node,
58590Sstevel@tonic-gate 				 * return -1.
58600Sstevel@tonic-gate 				 * (Reconfig return step will resume I/Os
58610Sstevel@tonic-gate 				 * for all sets.)
58620Sstevel@tonic-gate 				 * If not an RPC failure, continue resuming the
58630Sstevel@tonic-gate 				 * rest of the nodes and then return -1.
58640Sstevel@tonic-gate 				 */
58650Sstevel@tonic-gate 				if (mdanyrpcerror(ep)) {
58660Sstevel@tonic-gate 					if (sd->sd_mn_mynode->nd_nodeid ==
58670Sstevel@tonic-gate 					    nd->nd_nodeid) {
58680Sstevel@tonic-gate 						return (-1);
58690Sstevel@tonic-gate 					} else {
58700Sstevel@tonic-gate 						return (205);
58710Sstevel@tonic-gate 					}
58720Sstevel@tonic-gate 				}
58730Sstevel@tonic-gate 
58740Sstevel@tonic-gate 				/*
58750Sstevel@tonic-gate 				 * If not an RPC error, continue resuming rest
58760Sstevel@tonic-gate 				 * of nodes, ignoring any failures except for
58770Sstevel@tonic-gate 				 * an RPC failure which constitutes an
58780Sstevel@tonic-gate 				 * immediate exit.
58790Sstevel@tonic-gate 				 * Start in middle of list with failing node.
58800Sstevel@tonic-gate 				 */
58810Sstevel@tonic-gate 				nd2 = nd->nd_next;
58820Sstevel@tonic-gate 				while (nd2) {
58830Sstevel@tonic-gate 					/* Skip non-alive and non-owner nodes */
58840Sstevel@tonic-gate 					if ((!(nd2->nd_flags &
58850Sstevel@tonic-gate 					    MD_MN_NODE_ALIVE)) ||
58860Sstevel@tonic-gate 					    (!(nd2->nd_flags &
58870Sstevel@tonic-gate 					    MD_MN_NODE_OWN))) {
58880Sstevel@tonic-gate 						nd2 = nd2->nd_next;
58890Sstevel@tonic-gate 						continue;
58900Sstevel@tonic-gate 					}
58910Sstevel@tonic-gate 					(void) (clnt_mn_susp_res_io(
58920Sstevel@tonic-gate 					    nd2->nd_nodename, sp->setno,
58930Sstevel@tonic-gate 					    MN_RES_IO, &xep));
58940Sstevel@tonic-gate 					if (mdanyrpcerror(&xep)) {
58950Sstevel@tonic-gate 						return (-1);
58960Sstevel@tonic-gate 					}
58970Sstevel@tonic-gate 					nd2 = nd2->nd_next;
58980Sstevel@tonic-gate 				}
58990Sstevel@tonic-gate 			}
59000Sstevel@tonic-gate 			nd = nd->nd_next;
59010Sstevel@tonic-gate 		}
59020Sstevel@tonic-gate 	}
59030Sstevel@tonic-gate 
59040Sstevel@tonic-gate 	nd = sd->sd_nodelist;
59050Sstevel@tonic-gate 	while (nd) {
59060Sstevel@tonic-gate 		if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
59070Sstevel@tonic-gate 			nd = nd->nd_next;
59080Sstevel@tonic-gate 			continue;
59090Sstevel@tonic-gate 		}
59100Sstevel@tonic-gate 		/*
59110Sstevel@tonic-gate 		 * If 1 node fails - go ahead and update the rest except
59120Sstevel@tonic-gate 		 * in the case of an RPC failure, fail immediately.
59130Sstevel@tonic-gate 		 */
59140Sstevel@tonic-gate 		if (clnt_upd_nr_flags(nd->nd_nodename, sp,
59150Sstevel@tonic-gate 		    sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) {
59160Sstevel@tonic-gate 			/* RPC failure to another node */
59170Sstevel@tonic-gate 			if (mdanyrpcerror(ep)) {
59180Sstevel@tonic-gate 				return (205);
59190Sstevel@tonic-gate 			}
59200Sstevel@tonic-gate 			nd = nd->nd_next;
59210Sstevel@tonic-gate 			rval = -1;
59220Sstevel@tonic-gate 			continue;
59230Sstevel@tonic-gate 		}
59240Sstevel@tonic-gate 		nd = nd->nd_next;
59250Sstevel@tonic-gate 	}
59260Sstevel@tonic-gate 
59270Sstevel@tonic-gate 	meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN,
59280Sstevel@tonic-gate 	    "Join of all nodes completed for set %s: %s"),
59290Sstevel@tonic-gate 	    sp->setname, meta_print_hrtime(gethrtime() - start_time));
59300Sstevel@tonic-gate 
59310Sstevel@tonic-gate 	return (rval);
59320Sstevel@tonic-gate }
5933