xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_set_drv.c (revision 3073:c5251d7eaee3)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
/*
 * Metadevice diskset interfaces
 */
310Sstevel@tonic-gate 
320Sstevel@tonic-gate #include <meta.h>
330Sstevel@tonic-gate #include <mdmn_changelog.h>
340Sstevel@tonic-gate #include "meta_set_prv.h"
350Sstevel@tonic-gate #include "meta_repartition.h"
360Sstevel@tonic-gate 
370Sstevel@tonic-gate static int
check_setnodes_againstdrivelist(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_error_t * ep)380Sstevel@tonic-gate check_setnodes_againstdrivelist(
390Sstevel@tonic-gate 	mdsetname_t		*sp,
400Sstevel@tonic-gate 	mddrivenamelist_t	*dnlp,
410Sstevel@tonic-gate 	md_error_t		*ep
420Sstevel@tonic-gate )
430Sstevel@tonic-gate {
440Sstevel@tonic-gate 	md_set_desc		*sd;
450Sstevel@tonic-gate 	mddrivenamelist_t	*p;
460Sstevel@tonic-gate 	int 			i;
470Sstevel@tonic-gate 	md_mnnode_desc		*nd;
480Sstevel@tonic-gate 
490Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
500Sstevel@tonic-gate 		return (-1);
510Sstevel@tonic-gate 
520Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
530Sstevel@tonic-gate 		nd = sd->sd_nodelist;
540Sstevel@tonic-gate 		while (nd) {
550Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
560Sstevel@tonic-gate 				nd = nd->nd_next;
570Sstevel@tonic-gate 				continue;
580Sstevel@tonic-gate 			}
590Sstevel@tonic-gate 			for (p = dnlp; p != NULL; p = p->next)
600Sstevel@tonic-gate 				if (checkdrive_onnode(sp, p->drivenamep,
610Sstevel@tonic-gate 				    nd->nd_nodename, ep))
620Sstevel@tonic-gate 					return (-1);
630Sstevel@tonic-gate 			nd = nd->nd_next;
640Sstevel@tonic-gate 		}
650Sstevel@tonic-gate 	} else {
660Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
670Sstevel@tonic-gate 			/* Skip empty slots */
680Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
690Sstevel@tonic-gate 				continue;
700Sstevel@tonic-gate 
710Sstevel@tonic-gate 			for (p = dnlp; p != NULL; p = p->next)
720Sstevel@tonic-gate 				if (checkdrive_onnode(sp, p->drivenamep,
730Sstevel@tonic-gate 				    sd->sd_nodes[i], ep))
740Sstevel@tonic-gate 					return (-1);
750Sstevel@tonic-gate 		}
760Sstevel@tonic-gate 	}
770Sstevel@tonic-gate 	return (0);
780Sstevel@tonic-gate }
790Sstevel@tonic-gate 
800Sstevel@tonic-gate static int
drvsuniq(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_error_t * ep)810Sstevel@tonic-gate drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
820Sstevel@tonic-gate {
830Sstevel@tonic-gate 	mddrivenamelist_t *dl1, *dl2;
840Sstevel@tonic-gate 	mddrivename_t *dn1, *dn2;
850Sstevel@tonic-gate 
860Sstevel@tonic-gate 	for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
870Sstevel@tonic-gate 		dn1 = dl1->drivenamep;
880Sstevel@tonic-gate 
890Sstevel@tonic-gate 		for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
900Sstevel@tonic-gate 			dn2 = dl2->drivenamep;
910Sstevel@tonic-gate 			if (strcmp(dn1->cname, dn2->cname) != 0)
920Sstevel@tonic-gate 				continue;
930Sstevel@tonic-gate 
940Sstevel@tonic-gate 			return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
950Sstevel@tonic-gate 			    NULL, dn1->cname, sp->setname));
960Sstevel@tonic-gate 		}
970Sstevel@tonic-gate 	}
980Sstevel@tonic-gate 	return (0);
990Sstevel@tonic-gate }
1000Sstevel@tonic-gate 
1010Sstevel@tonic-gate static md_drive_desc *
metaget_drivedesc_fromdrivelist(mdsetname_t * sp,mddrivenamelist_t * dnlp,uint_t flags,md_error_t * ep)1020Sstevel@tonic-gate metaget_drivedesc_fromdrivelist(
1030Sstevel@tonic-gate 	mdsetname_t		*sp,
1040Sstevel@tonic-gate 	mddrivenamelist_t	*dnlp,
1050Sstevel@tonic-gate 	uint_t			flags,
1060Sstevel@tonic-gate 	md_error_t		*ep
1070Sstevel@tonic-gate )
1080Sstevel@tonic-gate {
1090Sstevel@tonic-gate 	mddrivenamelist_t	*p;
1100Sstevel@tonic-gate 	md_drive_desc		*dd = NULL;
1110Sstevel@tonic-gate 	md_set_desc		*sd;
1120Sstevel@tonic-gate 
1130Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
1140Sstevel@tonic-gate 		return (NULL);
1150Sstevel@tonic-gate 
1160Sstevel@tonic-gate 	for (p = dnlp; p != NULL; p = p->next) {
1170Sstevel@tonic-gate 		(void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
1180Sstevel@tonic-gate 		    sd->sd_ctime, sd->sd_genid, flags);
1190Sstevel@tonic-gate 	}
1200Sstevel@tonic-gate 
1210Sstevel@tonic-gate 	return (dd);
1220Sstevel@tonic-gate }
1230Sstevel@tonic-gate 
/*
 * Exported Entry Points
 */
1270Sstevel@tonic-gate 
1280Sstevel@tonic-gate int
meta_make_sidenmlist(mdsetname_t * sp,mddrivename_t * dnp,int import_flag,md_im_drive_info_t * midp,md_error_t * ep)1290Sstevel@tonic-gate meta_make_sidenmlist(
1301945Sjeanm 	mdsetname_t		*sp,
1311945Sjeanm 	mddrivename_t		*dnp,
1321945Sjeanm 	int			import_flag, /* flags partial import */
1331945Sjeanm 	md_im_drive_info_t	*midp,	/* import drive information */
1341945Sjeanm 	md_error_t		*ep
1350Sstevel@tonic-gate )
1360Sstevel@tonic-gate {
1371945Sjeanm 	mdsidenames_t		*sn, **sn_next;
1381945Sjeanm 	mdname_t		*np;
1391945Sjeanm 	int			done;
1401945Sjeanm 	side_t			sideno = MD_SIDEWILD;
1411945Sjeanm 	uint_t			rep_slice;
1421945Sjeanm 	char			*bname;
1430Sstevel@tonic-gate 
1441945Sjeanm 	if (!import_flag) {
1451945Sjeanm 		/*
1461945Sjeanm 		 * Normal (aka NOT partial import) code path.
1471945Sjeanm 		 */
1481945Sjeanm 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
1491945Sjeanm 			return (-1);
1501945Sjeanm 		}
1511945Sjeanm 
1521945Sjeanm 		dnp->side_names_key = MD_KEYWILD;
1530Sstevel@tonic-gate 
1541945Sjeanm 		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1551945Sjeanm 			return (-1);
1561945Sjeanm 		bname = Strdup(np->bname);
1571945Sjeanm 	} else {
1581945Sjeanm 		/*
1591945Sjeanm 		 * When doing a partial import, we'll get the needed
1601945Sjeanm 		 * information from somewhere other than the system.
1611945Sjeanm 		 */
1621945Sjeanm 		dnp->side_names_key = MD_KEYWILD;
1631945Sjeanm 		bname = Strdup(midp->mid_devname);
1641945Sjeanm 	}
1650Sstevel@tonic-gate 	metaflushsidenames(dnp);
1660Sstevel@tonic-gate 	sn_next = &dnp->side_names;
1670Sstevel@tonic-gate 	/*CONSTCOND*/
1680Sstevel@tonic-gate 	while (1) {
1690Sstevel@tonic-gate 		sn = Zalloc(sizeof (*sn));
1700Sstevel@tonic-gate 
1711945Sjeanm 		if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
1721945Sjeanm 		    &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
1731945Sjeanm 			if (import_flag) {
1741945Sjeanm 				mdclrerror(ep);
1751945Sjeanm 				sn->dname = Strdup(midp->mid_driver_name);
1761945Sjeanm 				sn->mnum = midp->mid_mnum;
1771945Sjeanm 			} else {
1781945Sjeanm 				Free(sn);
1791945Sjeanm 				Free(bname);
1801945Sjeanm 				return (-1);
1811945Sjeanm 			}
1820Sstevel@tonic-gate 		}
1830Sstevel@tonic-gate 
1840Sstevel@tonic-gate 		if (done == 0) {
1850Sstevel@tonic-gate 			Free(sn);
1861945Sjeanm 			Free(bname);
1870Sstevel@tonic-gate 			return (0);
1880Sstevel@tonic-gate 		}
1890Sstevel@tonic-gate 
1900Sstevel@tonic-gate 		sn->sideno = sideno;
1910Sstevel@tonic-gate 
1920Sstevel@tonic-gate 		/* Add to the end of the linked list */
1930Sstevel@tonic-gate 		assert(*sn_next == NULL);
1940Sstevel@tonic-gate 		*sn_next = sn;
1950Sstevel@tonic-gate 		sn_next = &sn->next;
1960Sstevel@tonic-gate 	}
1970Sstevel@tonic-gate 	/*NOTREACHED*/
1980Sstevel@tonic-gate }
1990Sstevel@tonic-gate 
2000Sstevel@tonic-gate int
meta_set_adddrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,daddr_t dbsize,int force_label,md_error_t * ep)2010Sstevel@tonic-gate meta_set_adddrives(
2020Sstevel@tonic-gate 	mdsetname_t		*sp,
2030Sstevel@tonic-gate 	mddrivenamelist_t	*dnlp,
2040Sstevel@tonic-gate 	daddr_t			dbsize,
2050Sstevel@tonic-gate 	int			force_label,
2060Sstevel@tonic-gate 	md_error_t		*ep
2070Sstevel@tonic-gate )
2080Sstevel@tonic-gate {
2090Sstevel@tonic-gate 	md_set_desc		*sd;
2100Sstevel@tonic-gate 	md_drive_desc		*dd = NULL, *curdd = NULL, *ddp;
2110Sstevel@tonic-gate 	int			i;
2120Sstevel@tonic-gate 	mddrivenamelist_t	*p;
2130Sstevel@tonic-gate 	mhd_mhiargs_t		mhiargs;
2140Sstevel@tonic-gate 	int			rval = 0;
2150Sstevel@tonic-gate 	md_timeval32_t		now;
2160Sstevel@tonic-gate 	sigset_t		oldsigs;
2170Sstevel@tonic-gate 	ulong_t			genid;
2180Sstevel@tonic-gate 	ulong_t			max_genid = 0;
2190Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
2200Sstevel@tonic-gate 	int			rb_level = 0;
2210Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
2220Sstevel@tonic-gate 	md_mnnode_desc		*nd;
2230Sstevel@tonic-gate 	int			suspendall_flag = 0;
2240Sstevel@tonic-gate 	int			suspend1_flag = 0;
2250Sstevel@tonic-gate 	int			lock_flag = 0;
2260Sstevel@tonic-gate 	int			flush_set_onerr = 0;
2272150Sjeanm 	md_replicalist_t	*rlp = NULL, *rl;
2280Sstevel@tonic-gate 
2290Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
2300Sstevel@tonic-gate 		return (-1);
2310Sstevel@tonic-gate 
2320Sstevel@tonic-gate 	/* Make sure we own the set */
2330Sstevel@tonic-gate 	if (meta_check_ownership(sp, ep) != 0)
2340Sstevel@tonic-gate 		return (-1);
2350Sstevel@tonic-gate 
2360Sstevel@tonic-gate 	/*
2370Sstevel@tonic-gate 	 * The drive and node records are stored in the local mddbs of each
2380Sstevel@tonic-gate 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
2390Sstevel@tonic-gate 	 * drive and node records from that node's local mddb and caches them
2400Sstevel@tonic-gate 	 * internally. Any process needing diskset information contacts its
2410Sstevel@tonic-gate 	 * local rpc.metad to get this information.  Since each node in the
2420Sstevel@tonic-gate 	 * diskset is independently reading the set information from its local
2430Sstevel@tonic-gate 	 * mddb, the set, drive and node records in the local mddbs must stay
2440Sstevel@tonic-gate 	 * in-sync, so that all nodes have a consistent view of the diskset.
2450Sstevel@tonic-gate 	 *
2460Sstevel@tonic-gate 	 * For a multinode diskset, explicitly verify that all nodes in the
2470Sstevel@tonic-gate 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
2480Sstevel@tonic-gate 	 * fail this operation since all nodes must be ALIVE in order to add
2490Sstevel@tonic-gate 	 * the new drive record to their local mddb.  If a panic of this node
2500Sstevel@tonic-gate 	 * leaves the local mddbs set, node and drive records out-of-sync, the
2510Sstevel@tonic-gate 	 * reconfig cycle will fix the local mddbs and force them back into
2520Sstevel@tonic-gate 	 * synchronization.
2530Sstevel@tonic-gate 	 */
2540Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
2550Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2560Sstevel@tonic-gate 		while (nd) {
2570Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2580Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2590Sstevel@tonic-gate 					sp->setno,
2600Sstevel@tonic-gate 					nd->nd_nodename, NULL, sp->setname);
2610Sstevel@tonic-gate 				return (-1);
2620Sstevel@tonic-gate 			}
2630Sstevel@tonic-gate 			nd = nd->nd_next;
2640Sstevel@tonic-gate 		}
2650Sstevel@tonic-gate 	}
2660Sstevel@tonic-gate 
2670Sstevel@tonic-gate 	if (drvsuniq(sp, dnlp, ep) == -1)
2680Sstevel@tonic-gate 		return (-1);
2690Sstevel@tonic-gate 
2700Sstevel@tonic-gate 	/*
2710Sstevel@tonic-gate 	 * Lock the set on current set members.
2720Sstevel@tonic-gate 	 * Set locking done much earlier for MN diskset than for traditional
2730Sstevel@tonic-gate 	 * diskset since lock_set and SUSPEND are used to protect against
2740Sstevel@tonic-gate 	 * other meta* commands running on the other nodes.
2750Sstevel@tonic-gate 	 */
2760Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
2770Sstevel@tonic-gate 		/* Make sure we are blocking all signals */
2780Sstevel@tonic-gate 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
2790Sstevel@tonic-gate 			mdclrerror(&xep);
2800Sstevel@tonic-gate 
2810Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2820Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
2830Sstevel@tonic-gate 		while (nd) {
2840Sstevel@tonic-gate 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2850Sstevel@tonic-gate 				rval = -1;
2860Sstevel@tonic-gate 				goto out;
2870Sstevel@tonic-gate 			}
2880Sstevel@tonic-gate 			lock_flag = 1;
2890Sstevel@tonic-gate 			nd = nd->nd_next;
2900Sstevel@tonic-gate 		}
2910Sstevel@tonic-gate 		/*
2920Sstevel@tonic-gate 		 * Lock out other meta* commands by suspending
2930Sstevel@tonic-gate 		 * class 1 messages across the diskset.
2940Sstevel@tonic-gate 		 */
2950Sstevel@tonic-gate 		nd = sd->sd_nodelist;
2960Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
2970Sstevel@tonic-gate 		while (nd) {
2980Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename,
2990Sstevel@tonic-gate 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
3000Sstevel@tonic-gate 			    MD_MSCF_NO_FLAGS, ep)) {
3010Sstevel@tonic-gate 				rval = -1;
3020Sstevel@tonic-gate 				goto out;
3030Sstevel@tonic-gate 			}
3040Sstevel@tonic-gate 			suspend1_flag = 1;
3050Sstevel@tonic-gate 			nd = nd->nd_next;
3060Sstevel@tonic-gate 		}
3070Sstevel@tonic-gate 	}
3080Sstevel@tonic-gate 
3090Sstevel@tonic-gate 	if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
3100Sstevel@tonic-gate 		rval = -1;
3110Sstevel@tonic-gate 		goto out;
3120Sstevel@tonic-gate 	}
3130Sstevel@tonic-gate 
3140Sstevel@tonic-gate 	for (p = dnlp; p != NULL; p = p->next) {
3150Sstevel@tonic-gate 		mdsetname_t	*tmp;
3160Sstevel@tonic-gate 
3170Sstevel@tonic-gate 		if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
3180Sstevel@tonic-gate 		    ep) == -1) {
3190Sstevel@tonic-gate 			rval = -1;
3200Sstevel@tonic-gate 			goto out;
3210Sstevel@tonic-gate 		}
3220Sstevel@tonic-gate 
3230Sstevel@tonic-gate 		if (tmp != NULL) {
3240Sstevel@tonic-gate 			(void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
3250Sstevel@tonic-gate 			    tmp->setname, p->drivenamep->cname, sp->setname);
3260Sstevel@tonic-gate 			rval = -1;
3270Sstevel@tonic-gate 			goto out;
3280Sstevel@tonic-gate 		}
3290Sstevel@tonic-gate 	}
3300Sstevel@tonic-gate 
3310Sstevel@tonic-gate 	/* END CHECK CODE */
3320Sstevel@tonic-gate 
3330Sstevel@tonic-gate 	/*
3340Sstevel@tonic-gate 	 * This is a separate loop (from above) so that we validate all the
3350Sstevel@tonic-gate 	 * drives handed to us before we repartition any one drive.
3360Sstevel@tonic-gate 	 */
3370Sstevel@tonic-gate 	for (p = dnlp; p != NULL; p = p->next) {
3380Sstevel@tonic-gate 		if (meta_repartition_drive(sp,
3391945Sjeanm 		    p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
3400Sstevel@tonic-gate 		    NULL, /* Don't return the VTOC. */
3410Sstevel@tonic-gate 		    ep) != 0) {
3420Sstevel@tonic-gate 			rval = -1;
3430Sstevel@tonic-gate 			goto out;
3440Sstevel@tonic-gate 		}
3450Sstevel@tonic-gate 		/*
3460Sstevel@tonic-gate 		 * Create the names for the drives we are adding per side.
3470Sstevel@tonic-gate 		 */
3481945Sjeanm 		if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
3491945Sjeanm 		    ep) == -1) {
3500Sstevel@tonic-gate 			rval = -1;
3510Sstevel@tonic-gate 			goto out;
3520Sstevel@tonic-gate 		}
3530Sstevel@tonic-gate 	}
3540Sstevel@tonic-gate 
3550Sstevel@tonic-gate 	/*
3560Sstevel@tonic-gate 	 * Get the list of drives descriptors that we are adding.
3570Sstevel@tonic-gate 	 */
3580Sstevel@tonic-gate 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
3590Sstevel@tonic-gate 
3600Sstevel@tonic-gate 	if (! mdisok(ep)) {
3610Sstevel@tonic-gate 		rval = -1;
3620Sstevel@tonic-gate 		goto out;
3630Sstevel@tonic-gate 	}
3640Sstevel@tonic-gate 
3650Sstevel@tonic-gate 	/*
3660Sstevel@tonic-gate 	 * Get the set timeout information.
3670Sstevel@tonic-gate 	 */
3680Sstevel@tonic-gate 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
3690Sstevel@tonic-gate 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
3700Sstevel@tonic-gate 		rval = -1;
3710Sstevel@tonic-gate 		goto out;
3720Sstevel@tonic-gate 	}
3730Sstevel@tonic-gate 
3740Sstevel@tonic-gate 	/*
3750Sstevel@tonic-gate 	 * Get timestamp and generation id for new records
3760Sstevel@tonic-gate 	 */
3770Sstevel@tonic-gate 	now = sd->sd_ctime;
3780Sstevel@tonic-gate 	genid = sd->sd_genid;
3790Sstevel@tonic-gate 
3800Sstevel@tonic-gate 
3810Sstevel@tonic-gate 	/* At this point, in case of error, set should be flushed. */
3820Sstevel@tonic-gate 	flush_set_onerr = 1;
3830Sstevel@tonic-gate 
3840Sstevel@tonic-gate 	/* Lock the set on current set members */
3850Sstevel@tonic-gate 	if (!(MD_MNSET_DESC(sd))) {
3860Sstevel@tonic-gate 		md_rb_sig_handling_on();
3870Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
3880Sstevel@tonic-gate 			/* Skip empty slots */
3890Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
3900Sstevel@tonic-gate 				continue;
3910Sstevel@tonic-gate 
3920Sstevel@tonic-gate 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
3930Sstevel@tonic-gate 				rval = -1;
3940Sstevel@tonic-gate 				goto out;
3950Sstevel@tonic-gate 			}
3960Sstevel@tonic-gate 			lock_flag = 1;
3970Sstevel@tonic-gate 		}
3980Sstevel@tonic-gate 	}
3990Sstevel@tonic-gate 
4000Sstevel@tonic-gate 	/*
4010Sstevel@tonic-gate 	 * Get drive descriptors for the drives that are currently in the set.
4020Sstevel@tonic-gate 	 */
4030Sstevel@tonic-gate 	curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
4040Sstevel@tonic-gate 	if (! mdisok(ep))
4050Sstevel@tonic-gate 		goto rollback;
4060Sstevel@tonic-gate 
4070Sstevel@tonic-gate 	/*
4080Sstevel@tonic-gate 	 * If first drive being added to set, set the mastership
4090Sstevel@tonic-gate 	 * of the multinode diskset to be this node.
4100Sstevel@tonic-gate 	 * Only set it on this node.  If all goes well
4110Sstevel@tonic-gate 	 * and there are no errors, the mastership of this node will be set
4120Sstevel@tonic-gate 	 * on all nodes in user space and in the kernel.
4130Sstevel@tonic-gate 	 */
4140Sstevel@tonic-gate 	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
4150Sstevel@tonic-gate 		if (clnt_mnsetmaster(mynode(), sp,
4160Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodename,
4170Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodeid, ep)) {
4180Sstevel@tonic-gate 			goto rollback;
4190Sstevel@tonic-gate 		}
4200Sstevel@tonic-gate 		/*
4210Sstevel@tonic-gate 		 * Set this up in my local cache of the set desc so that
4220Sstevel@tonic-gate 		 * the set descriptor won't have to be gotten again from
4230Sstevel@tonic-gate 		 * rpc.metad.  If it is flushed and gotten again, these
4240Sstevel@tonic-gate 		 * values will be set in sr2setdesc.
4250Sstevel@tonic-gate 		 */
4260Sstevel@tonic-gate 		sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
4270Sstevel@tonic-gate 		(void) strcpy(sd->sd_mn_master_nodenm,
4280Sstevel@tonic-gate 		    sd->sd_mn_mynode->nd_nodename);
4290Sstevel@tonic-gate 		sd->sd_mn_am_i_master = 1;
4300Sstevel@tonic-gate 	}
4310Sstevel@tonic-gate 
4320Sstevel@tonic-gate 	RB_TEST(1, "adddrives", ep)
4330Sstevel@tonic-gate 
4340Sstevel@tonic-gate 	RB_PREEMPT;
4350Sstevel@tonic-gate 	rb_level = 1;	/* level 1 */
4360Sstevel@tonic-gate 
4370Sstevel@tonic-gate 	RB_TEST(2, "adddrives", ep)
4380Sstevel@tonic-gate 
4390Sstevel@tonic-gate 	/*
4400Sstevel@tonic-gate 	 * Add the drive records for the drives that we are adding to
4410Sstevel@tonic-gate 	 * each host in the set.  Marks the drive as MD_DR_ADD.
4420Sstevel@tonic-gate 	 */
4430Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
4440Sstevel@tonic-gate 		nd = sd->sd_nodelist;
4450Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
4460Sstevel@tonic-gate 		while (nd) {
4470Sstevel@tonic-gate 			if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
4480Sstevel@tonic-gate 			    ep) == -1)
4490Sstevel@tonic-gate 				goto rollback;
4500Sstevel@tonic-gate 
4510Sstevel@tonic-gate 			RB_TEST(3, "adddrives", ep)
4520Sstevel@tonic-gate 			nd = nd->nd_next;
4530Sstevel@tonic-gate 		}
4540Sstevel@tonic-gate 	} else {
4550Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
4560Sstevel@tonic-gate 			/* Skip empty slots */
4570Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
4580Sstevel@tonic-gate 				continue;
4590Sstevel@tonic-gate 
4600Sstevel@tonic-gate 			if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
4610Sstevel@tonic-gate 			    ep) == -1)
4620Sstevel@tonic-gate 				goto rollback;
4630Sstevel@tonic-gate 
4640Sstevel@tonic-gate 			RB_TEST(3, "adddrives", ep)
4650Sstevel@tonic-gate 		}
4660Sstevel@tonic-gate 	}
4670Sstevel@tonic-gate 
4680Sstevel@tonic-gate 	RB_TEST(4, "adddrives", ep)
4690Sstevel@tonic-gate 
4700Sstevel@tonic-gate 	RB_PREEMPT;
4710Sstevel@tonic-gate 	rb_level = 2;	/* level 2 */
4720Sstevel@tonic-gate 
4730Sstevel@tonic-gate 	RB_TEST(5, "adddrives", ep)
4740Sstevel@tonic-gate 
4750Sstevel@tonic-gate 	/*
4760Sstevel@tonic-gate 	 * Take ownership of the added drives.
4770Sstevel@tonic-gate 	 */
4780Sstevel@tonic-gate 	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
4790Sstevel@tonic-gate 		if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
4800Sstevel@tonic-gate 			goto rollback;
4810Sstevel@tonic-gate 	}
4820Sstevel@tonic-gate 
483*3073Sjkennedy 	/*
484*3073Sjkennedy 	 * If this is not a MN set and the state flags do not indicate the
485*3073Sjkennedy 	 * presence of devids, update the set records on all nodes.
486*3073Sjkennedy 	 */
487*3073Sjkennedy 	if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
488*3073Sjkennedy 		if (meta_update_mb(sp, dd, ep) == 0) {
489*3073Sjkennedy 			mdclrerror(ep);
490*3073Sjkennedy 
491*3073Sjkennedy 			/* update the sr_flags on all hosts */
492*3073Sjkennedy 			for (i = 0; i < MD_MAXSIDES; i++) {
493*3073Sjkennedy 				if (sd->sd_nodes[i][0] == '\0')
494*3073Sjkennedy 					continue;
495*3073Sjkennedy 
496*3073Sjkennedy 				if (clnt_upd_sr_flags(sd->sd_nodes[i],
497*3073Sjkennedy 				    sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
498*3073Sjkennedy 					goto rollback;
499*3073Sjkennedy 			}
500*3073Sjkennedy 		}
501*3073Sjkennedy 	}
502*3073Sjkennedy 
5030Sstevel@tonic-gate 	RB_TEST(6, "adddrives", ep)
5040Sstevel@tonic-gate 
5050Sstevel@tonic-gate 	RB_PREEMPT;
5060Sstevel@tonic-gate 	rb_level = 3;	/* level 3 */
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 	RB_TEST(7, "adddrives", ep)
5090Sstevel@tonic-gate 
5100Sstevel@tonic-gate 	/*
5110Sstevel@tonic-gate 	 * Balance the DB's according to the list of existing drives and the
5120Sstevel@tonic-gate 	 * list of added drives.
5130Sstevel@tonic-gate 	 */
5140Sstevel@tonic-gate 	if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
5150Sstevel@tonic-gate 		goto rollback;
5160Sstevel@tonic-gate 
5172150Sjeanm 	/*
5182150Sjeanm 	 * Slam a dummy master block on all the disks that we are adding
5192150Sjeanm 	 * that don't have replicas on them.
5202150Sjeanm 	 * Used by diskset import if the disksets are remotely replicated
5212150Sjeanm 	 */
5222150Sjeanm 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
5232150Sjeanm 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
5242150Sjeanm 			uint_t		rep_slice;
5252150Sjeanm 			int		fd = -1;
5262150Sjeanm 			mdname_t	*np = NULL;
5272150Sjeanm 			char		*drive_name;
5282150Sjeanm 
5292150Sjeanm 			drive_name = ddp->dd_dnp->cname;
5302150Sjeanm 
5312150Sjeanm 			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
5322150Sjeanm 				char	*rep_name;
5332150Sjeanm 
5342150Sjeanm 				rep_name =
5352150Sjeanm 				    rl->rl_repp->r_namep->drivenamep->cname;
5362150Sjeanm 
5372150Sjeanm 				if (strcmp(drive_name, rep_name) == 0) {
5382150Sjeanm 					/*
5392150Sjeanm 					 * Disk has a replica on it so don't
5402150Sjeanm 					 * add dummy master block.
5412150Sjeanm 					 */
5422150Sjeanm 					break;
5432150Sjeanm 				}
5442150Sjeanm 			}
5452150Sjeanm 			if (rl == NULL) {
5462150Sjeanm 				/*
5472150Sjeanm 				 * Drive doesn't have a replica on it so
5482150Sjeanm 				 * we need a dummy master block. Add it.
5492150Sjeanm 				 */
5502150Sjeanm 				if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
5512150Sjeanm 				    &xep) != 0) {
5522150Sjeanm 					mdclrerror(&xep);
5532150Sjeanm 					continue;
5542150Sjeanm 				}
5552150Sjeanm 
5562150Sjeanm 				if ((np = metaslicename(ddp->dd_dnp, rep_slice,
5572150Sjeanm 				    &xep)) == NULL) {
5582150Sjeanm 					mdclrerror(&xep);
5592150Sjeanm 					continue;
5602150Sjeanm 				}
5612150Sjeanm 
5622150Sjeanm 				if ((fd = open(np->rname, O_RDWR)) >= 0) {
5632150Sjeanm 					meta_mkdummymaster(sp, fd, 16);
5642150Sjeanm 					(void) close(fd);
5652150Sjeanm 				}
5662150Sjeanm 			}
5672150Sjeanm 		}
5682150Sjeanm 	}
5692150Sjeanm 
5700Sstevel@tonic-gate 	if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
5710Sstevel@tonic-gate 		/*
5720Sstevel@tonic-gate 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
5730Sstevel@tonic-gate 		 * Start by suspending rpc.mdcommd (which drains it of all
5740Sstevel@tonic-gate 		 * messages), then change the nodelist followed by a reinit
5750Sstevel@tonic-gate 		 * and resume.
5760Sstevel@tonic-gate 		 */
5770Sstevel@tonic-gate 		nd = sd->sd_nodelist;
5780Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
5790Sstevel@tonic-gate 		while (nd) {
5800Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
5810Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
5820Sstevel@tonic-gate 				rval = -1;
5830Sstevel@tonic-gate 				goto out;
5840Sstevel@tonic-gate 			}
5850Sstevel@tonic-gate 			suspendall_flag = 1;
5860Sstevel@tonic-gate 			nd = nd->nd_next;
5870Sstevel@tonic-gate 		}
5880Sstevel@tonic-gate 	}
5890Sstevel@tonic-gate 
5900Sstevel@tonic-gate 	/*
5910Sstevel@tonic-gate 	 * If a MN diskset and this is the first disk(s) being added
5920Sstevel@tonic-gate 	 * to set, then pre-allocate change log records here.
5930Sstevel@tonic-gate 	 * When the other nodes are joined into the MN diskset, the
5940Sstevel@tonic-gate 	 * USER records will just be snarfed in.
5950Sstevel@tonic-gate 	 */
5960Sstevel@tonic-gate 	if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
5970Sstevel@tonic-gate 		if (mdmn_allocate_changelog(sp, ep) != 0)
5980Sstevel@tonic-gate 			goto rollback;
5990Sstevel@tonic-gate 	}
6000Sstevel@tonic-gate 
6010Sstevel@tonic-gate 	/*
6020Sstevel@tonic-gate 	 * Mark the drives MD_DR_OK.
6030Sstevel@tonic-gate 	 * If first drive being added to MN diskset, then set
6040Sstevel@tonic-gate 	 * master on all nodes to be this node and then join
6050Sstevel@tonic-gate 	 * all alive nodes (nodes in membership list) to set.
6060Sstevel@tonic-gate 	 */
6070Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
6080Sstevel@tonic-gate 		nd = sd->sd_nodelist;
6090Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
6100Sstevel@tonic-gate 		while (nd) {
6110Sstevel@tonic-gate 			/* don't set master on this node - done earlier */
6120Sstevel@tonic-gate 			if ((curdd == NULL) && (nd->nd_nodeid !=
6130Sstevel@tonic-gate 			    sd->sd_mn_mynode->nd_nodeid)) {
6140Sstevel@tonic-gate 				/*
6150Sstevel@tonic-gate 				 * Set master on all alive nodes since
6160Sstevel@tonic-gate 				 * all alive nodes will become joined nodes.
6170Sstevel@tonic-gate 				 */
6180Sstevel@tonic-gate 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
6190Sstevel@tonic-gate 				    sd->sd_mn_mynode->nd_nodename,
6200Sstevel@tonic-gate 				    sd->sd_mn_mynode->nd_nodeid, ep)) {
6210Sstevel@tonic-gate 					goto rollback;
6220Sstevel@tonic-gate 				}
6230Sstevel@tonic-gate 			}
6240Sstevel@tonic-gate 
6250Sstevel@tonic-gate 			if (curdd == NULL) {
6260Sstevel@tonic-gate 				/*
6270Sstevel@tonic-gate 				 * No special flags for join set.  Since
6280Sstevel@tonic-gate 				 * all nodes are joining if 1st drive is being
6290Sstevel@tonic-gate 				 * added to set then all nodes will be either
6300Sstevel@tonic-gate 				 * STALE or non-STALE and each node can
6310Sstevel@tonic-gate 				 * determine this on its own.
6320Sstevel@tonic-gate 				 */
6330Sstevel@tonic-gate 				if (clnt_joinset(nd->nd_nodename, sp,
6340Sstevel@tonic-gate 				    NULL, ep)) {
6350Sstevel@tonic-gate 					goto rollback;
6360Sstevel@tonic-gate 				}
6370Sstevel@tonic-gate 				/* Sets join node flag on all nodes in list */
6380Sstevel@tonic-gate 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
6390Sstevel@tonic-gate 				    sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
6400Sstevel@tonic-gate 					goto rollback;
6410Sstevel@tonic-gate 				}
6420Sstevel@tonic-gate 			}
6430Sstevel@tonic-gate 
6440Sstevel@tonic-gate 			/*
6450Sstevel@tonic-gate 			 * Set MD_DR_OK as last thing before unlock.
6460Sstevel@tonic-gate 			 * In case of panic on this node, recovery
6470Sstevel@tonic-gate 			 * code can check for MD_DR_OK to determine
6480Sstevel@tonic-gate 			 * status of diskset.
6490Sstevel@tonic-gate 			 */
6500Sstevel@tonic-gate 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
6510Sstevel@tonic-gate 			    MD_DR_OK, ep) == -1)
6520Sstevel@tonic-gate 				goto rollback;
6530Sstevel@tonic-gate 
6540Sstevel@tonic-gate 
6550Sstevel@tonic-gate 			RB_TEST(8, "adddrives", ep)
6560Sstevel@tonic-gate 			nd = nd->nd_next;
6570Sstevel@tonic-gate 		}
6580Sstevel@tonic-gate 	} else {
6590Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
6600Sstevel@tonic-gate 			/* Skip empty slots */
6610Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
6620Sstevel@tonic-gate 				continue;
6630Sstevel@tonic-gate 
6640Sstevel@tonic-gate 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
6650Sstevel@tonic-gate 			    ep) == -1)
6660Sstevel@tonic-gate 				goto rollback;
6670Sstevel@tonic-gate 
6680Sstevel@tonic-gate 			RB_TEST(8, "adddrives", ep)
6690Sstevel@tonic-gate 		}
6700Sstevel@tonic-gate 	}
6710Sstevel@tonic-gate 
6720Sstevel@tonic-gate 	RB_TEST(9, "adddrives", ep)
6730Sstevel@tonic-gate 
6740Sstevel@tonic-gate out:
6750Sstevel@tonic-gate 	/*
6760Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
6770Sstevel@tonic-gate 	 * Send reinit command to mdcommd which forces it to get
6780Sstevel@tonic-gate 	 * fresh set description.
6790Sstevel@tonic-gate 	 */
6800Sstevel@tonic-gate 	if (suspendall_flag) {
6810Sstevel@tonic-gate 		/* Send reinit */
6820Sstevel@tonic-gate 		nd = sd->sd_nodelist;
6830Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
6840Sstevel@tonic-gate 		while (nd) {
6850Sstevel@tonic-gate 			/* Class is ignored for REINIT */
6860Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
6870Sstevel@tonic-gate 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
6880Sstevel@tonic-gate 				if (rval == 0)
6890Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
6900Sstevel@tonic-gate 				rval = -1;
6910Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
6920Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd.\n"));
6930Sstevel@tonic-gate 			}
6940Sstevel@tonic-gate 			nd = nd->nd_next;
6950Sstevel@tonic-gate 		}
6960Sstevel@tonic-gate 	}
6970Sstevel@tonic-gate 	/*
6980Sstevel@tonic-gate 	 * Unlock diskset by resuming messages across the diskset.
6990Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
7000Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
7010Sstevel@tonic-gate 	 */
7020Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
7030Sstevel@tonic-gate 		nd = sd->sd_nodelist;
7040Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
7050Sstevel@tonic-gate 		while (nd) {
7060Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
7070Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
7080Sstevel@tonic-gate 				if (rval == 0)
7090Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
7100Sstevel@tonic-gate 				rval = -1;
7110Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
7120Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd.\n"));
7130Sstevel@tonic-gate 			}
7140Sstevel@tonic-gate 			nd = nd->nd_next;
7150Sstevel@tonic-gate 		}
7160Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
7170Sstevel@tonic-gate 	}
7180Sstevel@tonic-gate 
7190Sstevel@tonic-gate 	if (lock_flag) {
7200Sstevel@tonic-gate 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
7210Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
7220Sstevel@tonic-gate 			nd = sd->sd_nodelist;
7230Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
7240Sstevel@tonic-gate 			while (nd) {
7250Sstevel@tonic-gate 				if (clnt_unlock_set(nd->nd_nodename,
7260Sstevel@tonic-gate 				    cl_sk, &xep)) {
7270Sstevel@tonic-gate 					if (rval == 0)
7280Sstevel@tonic-gate 						(void) mdstealerror(ep, &xep);
7290Sstevel@tonic-gate 					rval = -1;
7300Sstevel@tonic-gate 				}
7310Sstevel@tonic-gate 				nd = nd->nd_next;
7320Sstevel@tonic-gate 			}
7330Sstevel@tonic-gate 		} else {
7340Sstevel@tonic-gate 			for (i = 0; i < MD_MAXSIDES; i++) {
7350Sstevel@tonic-gate 				/* Skip empty slots */
7360Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] == '\0')
7370Sstevel@tonic-gate 					continue;
7380Sstevel@tonic-gate 
7390Sstevel@tonic-gate 				if (clnt_unlock_set(sd->sd_nodes[i],
7400Sstevel@tonic-gate 				    cl_sk, &xep)) {
7410Sstevel@tonic-gate 					if (rval == 0)
7420Sstevel@tonic-gate 						(void) mdstealerror(ep, &xep);
7430Sstevel@tonic-gate 					rval = -1;
7440Sstevel@tonic-gate 				}
7450Sstevel@tonic-gate 			}
7460Sstevel@tonic-gate 		}
7470Sstevel@tonic-gate 		cl_set_setkey(NULL);
7480Sstevel@tonic-gate 	}
7490Sstevel@tonic-gate 
7500Sstevel@tonic-gate 	metafreedrivedesc(&dd);
7510Sstevel@tonic-gate 
7520Sstevel@tonic-gate 	if (flush_set_onerr) {
7530Sstevel@tonic-gate 		metaflushsetname(sp);
7540Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd))) {
7550Sstevel@tonic-gate 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
7560Sstevel@tonic-gate 		}
7570Sstevel@tonic-gate 	}
7580Sstevel@tonic-gate 
7590Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
7600Sstevel@tonic-gate 		/* release signals back to what they were on entry */
7610Sstevel@tonic-gate 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
7620Sstevel@tonic-gate 			mdclrerror(&xep);
7630Sstevel@tonic-gate 	}
7640Sstevel@tonic-gate 
7650Sstevel@tonic-gate 	return (rval);
7660Sstevel@tonic-gate 
7670Sstevel@tonic-gate rollback:
7680Sstevel@tonic-gate 	/* all signals already blocked for MN diskset */
7690Sstevel@tonic-gate 	if (!(MD_MNSET_DESC(sd))) {
7700Sstevel@tonic-gate 		/* Make sure we are blocking all signals */
7710Sstevel@tonic-gate 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
7720Sstevel@tonic-gate 			mdclrerror(&xep);
7730Sstevel@tonic-gate 	}
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate 	rval = -1;
7760Sstevel@tonic-gate 
7770Sstevel@tonic-gate 	max_genid = sd->sd_genid;
7780Sstevel@tonic-gate 
7790Sstevel@tonic-gate 	/* level 3 */
7800Sstevel@tonic-gate 	if (rb_level > 2) {
7810Sstevel@tonic-gate 		/*
7820Sstevel@tonic-gate 		 * Since the add drive operation is failing, need
7830Sstevel@tonic-gate 		 * to reset config back to the way it was
7840Sstevel@tonic-gate 		 * before the add drive operation.
7850Sstevel@tonic-gate 		 * If a MN diskset and this is the first drive being added,
7860Sstevel@tonic-gate 		 * then reset master on all ALIVE nodes (which is all nodes)
7870Sstevel@tonic-gate 		 * since the master would have not been set previously.
7880Sstevel@tonic-gate 		 * Don't reset master on this node, since this
7890Sstevel@tonic-gate 		 * is done later.
7900Sstevel@tonic-gate 		 * This is ok to fail since next node to add first
7910Sstevel@tonic-gate 		 * disk to diskset will also set the master on all nodes.
7920Sstevel@tonic-gate 		 *
7930Sstevel@tonic-gate 		 * Also, if this is the first drive being added,
7940Sstevel@tonic-gate 		 * need to have each node withdraw itself from the set.
7950Sstevel@tonic-gate 		 */
7960Sstevel@tonic-gate 		if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
7970Sstevel@tonic-gate 			nd = sd->sd_nodelist;
7980Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
7990Sstevel@tonic-gate 			while (nd) {
8000Sstevel@tonic-gate 				/*
8010Sstevel@tonic-gate 				 * Be careful with ordering in case of
8020Sstevel@tonic-gate 				 * panic between the steps and the
8030Sstevel@tonic-gate 				 * effect on recovery during reconfig.
8040Sstevel@tonic-gate 				 */
8050Sstevel@tonic-gate 				if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
8060Sstevel@tonic-gate 					mdclrerror(&xep);
8070Sstevel@tonic-gate 
8080Sstevel@tonic-gate 				/* Sets withdraw flag on all nodes in list */
8090Sstevel@tonic-gate 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
8100Sstevel@tonic-gate 				    sd->sd_nodelist, MD_NR_WITHDRAW,
8110Sstevel@tonic-gate 				    NULL, &xep)) {
8120Sstevel@tonic-gate 					mdclrerror(&xep);
8130Sstevel@tonic-gate 				}
8140Sstevel@tonic-gate 
8150Sstevel@tonic-gate 				/* Skip this node */
8160Sstevel@tonic-gate 				if (nd->nd_nodeid ==
8170Sstevel@tonic-gate 				    sd->sd_mn_mynode->nd_nodeid) {
8180Sstevel@tonic-gate 					nd = nd->nd_next;
8190Sstevel@tonic-gate 					continue;
8200Sstevel@tonic-gate 				}
8210Sstevel@tonic-gate 				/* Reset master on all of the other nodes. */
8220Sstevel@tonic-gate 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
8230Sstevel@tonic-gate 				    "", MD_MN_INVALID_NID, &xep))
8240Sstevel@tonic-gate 					mdclrerror(&xep);
8250Sstevel@tonic-gate 				nd = nd->nd_next;
8260Sstevel@tonic-gate 			}
8270Sstevel@tonic-gate 		}
8280Sstevel@tonic-gate 	}
8290Sstevel@tonic-gate 
8300Sstevel@tonic-gate 	/*
8310Sstevel@tonic-gate 	 * Send resume command to mdcommd.  Don't send reinit command
8320Sstevel@tonic-gate 	 * since nodelist should not have changed.
8330Sstevel@tonic-gate 	 * If suspendall_flag is set, then user would have been adding
8340Sstevel@tonic-gate 	 * first drives to set.  Since this failed, there is certainly
8350Sstevel@tonic-gate 	 * no reinit message to send to rpc.commd since no nodes will
8360Sstevel@tonic-gate 	 * be joined to set at the end of this metaset command.
8370Sstevel@tonic-gate 	 */
8380Sstevel@tonic-gate 	if (suspendall_flag) {
8390Sstevel@tonic-gate 		/* Send resume */
8400Sstevel@tonic-gate 		nd = sd->sd_nodelist;
8410Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
8420Sstevel@tonic-gate 		while (nd) {
8430Sstevel@tonic-gate 			/*
8440Sstevel@tonic-gate 			 * Resume all classes but class 1 so that lock is held
8450Sstevel@tonic-gate 			 * against meta* commands.
8460Sstevel@tonic-gate 			 * To later resume class1, must issue a class0 resume.
8470Sstevel@tonic-gate 			 */
8480Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
8490Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0,
8500Sstevel@tonic-gate 			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
8510Sstevel@tonic-gate 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
8520Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd.\n"));
8530Sstevel@tonic-gate 				mdclrerror(&xep);
8540Sstevel@tonic-gate 			}
8550Sstevel@tonic-gate 			nd = nd->nd_next;
8560Sstevel@tonic-gate 		}
8570Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
8580Sstevel@tonic-gate 	}
8590Sstevel@tonic-gate 
8600Sstevel@tonic-gate 	/* level 3 */
8610Sstevel@tonic-gate 	if (rb_level > 2) {
8620Sstevel@tonic-gate 		mdnamelist_t	*nlp;
8630Sstevel@tonic-gate 		mdname_t	*np;
8640Sstevel@tonic-gate 
8650Sstevel@tonic-gate 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
8660Sstevel@tonic-gate 			uint_t	rep_slice;
8670Sstevel@tonic-gate 
8680Sstevel@tonic-gate 			if ((meta_replicaslice(ddp->dd_dnp,
8690Sstevel@tonic-gate 			    &rep_slice, &xep) != 0) ||
8700Sstevel@tonic-gate 			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
8710Sstevel@tonic-gate 				&xep)) == NULL)) {
8720Sstevel@tonic-gate 				mdclrerror(&xep);
8730Sstevel@tonic-gate 				continue;
8740Sstevel@tonic-gate 			}
8750Sstevel@tonic-gate 			nlp = NULL;
8760Sstevel@tonic-gate 			(void) metanamelist_append(&nlp, np);
8770Sstevel@tonic-gate 
8780Sstevel@tonic-gate 			if (meta_db_detach(sp, nlp,
8790Sstevel@tonic-gate 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
8800Sstevel@tonic-gate 				mdclrerror(&xep);
8810Sstevel@tonic-gate 
8820Sstevel@tonic-gate 			metafreenamelist(nlp);
8830Sstevel@tonic-gate 		}
8840Sstevel@tonic-gate 
8850Sstevel@tonic-gate 		/* Re-balance */
8860Sstevel@tonic-gate 		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
8870Sstevel@tonic-gate 			mdclrerror(&xep);
8880Sstevel@tonic-gate 
8890Sstevel@tonic-gate 		/* Only if we are adding the first drive */
8900Sstevel@tonic-gate 		/* Handled MN diskset above. */
8910Sstevel@tonic-gate 		if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
8920Sstevel@tonic-gate 			if (clnt_stimeout(mynode(), sp, &defmhiargs,
8930Sstevel@tonic-gate 			    &xep) == -1)
8940Sstevel@tonic-gate 				mdclrerror(&xep);
8950Sstevel@tonic-gate 
8960Sstevel@tonic-gate 			/* This is needed because of a corner case */
8970Sstevel@tonic-gate 			if (halt_set(sp, &xep))
8980Sstevel@tonic-gate 				mdclrerror(&xep);
8990Sstevel@tonic-gate 		}
9000Sstevel@tonic-gate 		max_genid++;
9010Sstevel@tonic-gate 	}
9020Sstevel@tonic-gate 
9030Sstevel@tonic-gate 	/* level 2 */
9040Sstevel@tonic-gate 	if (rb_level > 1) {
9050Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
9060Sstevel@tonic-gate 			if (rel_own_bydd(sp, dd, TRUE, &xep))
9070Sstevel@tonic-gate 				mdclrerror(&xep);
9080Sstevel@tonic-gate 		}
9090Sstevel@tonic-gate 	}
9100Sstevel@tonic-gate 
9110Sstevel@tonic-gate 	/* level 1 */
9120Sstevel@tonic-gate 	if (rb_level > 0) {
9130Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
9140Sstevel@tonic-gate 			nd = sd->sd_nodelist;
9150Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
9160Sstevel@tonic-gate 			while (nd) {
9170Sstevel@tonic-gate 				if (clnt_deldrvs(nd->nd_nodename, sp, dd,
9180Sstevel@tonic-gate 				    &xep) == -1)
9190Sstevel@tonic-gate 					mdclrerror(&xep);
9200Sstevel@tonic-gate 				nd = nd->nd_next;
9210Sstevel@tonic-gate 			}
9220Sstevel@tonic-gate 		} else {
9230Sstevel@tonic-gate 			for (i = 0; i < MD_MAXSIDES; i++) {
9240Sstevel@tonic-gate 				/* Skip empty slots */
9250Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] == '\0')
9260Sstevel@tonic-gate 					continue;
9270Sstevel@tonic-gate 
9280Sstevel@tonic-gate 				if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
9290Sstevel@tonic-gate 				    &xep) == -1)
9300Sstevel@tonic-gate 					mdclrerror(&xep);
9310Sstevel@tonic-gate 			}
9320Sstevel@tonic-gate 		}
9330Sstevel@tonic-gate 		max_genid += 2;
9340Sstevel@tonic-gate 		resync_genid(sp, sd, max_genid, 0, NULL);
9350Sstevel@tonic-gate 	}
9360Sstevel@tonic-gate 
9370Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
9380Sstevel@tonic-gate 		/* Send resume */
9390Sstevel@tonic-gate 		nd = sd->sd_nodelist;
9400Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
9410Sstevel@tonic-gate 		while (nd) {
9420Sstevel@tonic-gate 			/*
9430Sstevel@tonic-gate 			 * Just resume all classes so that resume is the
9440Sstevel@tonic-gate 			 * same whether just one class was locked or all
9450Sstevel@tonic-gate 			 * classes were locked.
9460Sstevel@tonic-gate 			 */
9470Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
9480Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
9490Sstevel@tonic-gate 				mdclrerror(&xep);
9500Sstevel@tonic-gate 			}
9510Sstevel@tonic-gate 			nd = nd->nd_next;
9520Sstevel@tonic-gate 		}
9530Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
9540Sstevel@tonic-gate 	}
9550Sstevel@tonic-gate 
9560Sstevel@tonic-gate 	/* level 0 */
9570Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
9580Sstevel@tonic-gate 	/* Don't test lock flag since guaranteed to be set if in rollback */
9590Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
9600Sstevel@tonic-gate 		/*
9610Sstevel@tonic-gate 		 * Since the add drive operation is failing, need
9620Sstevel@tonic-gate 		 * to reset config back to the way it was
9630Sstevel@tonic-gate 		 * before the add drive operation.
9640Sstevel@tonic-gate 		 * If a MN diskset and this is the first drive being
9650Sstevel@tonic-gate 		 * added, then reset master on this node since
9660Sstevel@tonic-gate 		 * the master would have not been set previously.
9670Sstevel@tonic-gate 		 * This is ok to fail since next node to add first
9680Sstevel@tonic-gate 		 * disk to diskset will also set the master on all nodes.
9690Sstevel@tonic-gate 		 */
9700Sstevel@tonic-gate 		if (curdd == NULL) {
9710Sstevel@tonic-gate 			/* Reset master on mynode */
9720Sstevel@tonic-gate 			if (clnt_mnsetmaster(mynode(), sp, "",
9730Sstevel@tonic-gate 			    MD_MN_INVALID_NID, &xep))
9740Sstevel@tonic-gate 				mdclrerror(&xep);
9750Sstevel@tonic-gate 		}
9760Sstevel@tonic-gate 		nd = sd->sd_nodelist;
9770Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
9780Sstevel@tonic-gate 		while (nd) {
9790Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
9800Sstevel@tonic-gate 				mdclrerror(&xep);
9810Sstevel@tonic-gate 			nd = nd->nd_next;
9820Sstevel@tonic-gate 		}
9830Sstevel@tonic-gate 	} else {
9840Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
9850Sstevel@tonic-gate 			/* Skip empty slots */
9860Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
9870Sstevel@tonic-gate 				continue;
9880Sstevel@tonic-gate 
9890Sstevel@tonic-gate 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
9900Sstevel@tonic-gate 				mdclrerror(&xep);
9910Sstevel@tonic-gate 		}
9920Sstevel@tonic-gate 	}
9930Sstevel@tonic-gate 	cl_set_setkey(NULL);
9940Sstevel@tonic-gate 
9950Sstevel@tonic-gate 	/* release signals back to what they were on entry */
9960Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
9970Sstevel@tonic-gate 		mdclrerror(&xep);
9980Sstevel@tonic-gate 
9990Sstevel@tonic-gate 	metafreedrivedesc(&dd);
10000Sstevel@tonic-gate 
10010Sstevel@tonic-gate 	if (flush_set_onerr) {
10020Sstevel@tonic-gate 		metaflushsetname(sp);
10030Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd))) {
10040Sstevel@tonic-gate 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
10050Sstevel@tonic-gate 		}
10060Sstevel@tonic-gate 	}
10070Sstevel@tonic-gate 
10080Sstevel@tonic-gate 	return (rval);
10090Sstevel@tonic-gate }
10100Sstevel@tonic-gate 
10111945Sjeanm /*
10121945Sjeanm  * Add drives routine used during import of a diskset.
10131945Sjeanm  */
10141945Sjeanm int
meta_imp_set_adddrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,md_im_set_desc_t * misp,md_error_t * ep)10151945Sjeanm meta_imp_set_adddrives(
10161945Sjeanm 	mdsetname_t		*sp,
10171945Sjeanm 	mddrivenamelist_t	*dnlp,
10181945Sjeanm 	md_im_set_desc_t	*misp,
10191945Sjeanm 	md_error_t		*ep
10201945Sjeanm )
10211945Sjeanm {
10221945Sjeanm 	md_set_desc		*sd;
10231945Sjeanm 	mddrivenamelist_t	*p;
10241945Sjeanm 	md_drive_desc		*dd = NULL, *ddp;
10251945Sjeanm 	int			flush_set_onerr = 0;
10261945Sjeanm 	md_timeval32_t		now;
10271945Sjeanm 	ulong_t			genid;
10281945Sjeanm 	mhd_mhiargs_t		mhiargs;
10291945Sjeanm 	md_im_replica_info_t	*mirp;
10301945Sjeanm 	md_im_drive_info_t	*midp;
10311945Sjeanm 	int			rval = 0;
10321945Sjeanm 	sigset_t		oldsigs;
10331945Sjeanm 	ulong_t			max_genid = 0;
10341945Sjeanm 	int			rb_level = 0;
10351945Sjeanm 	md_error_t		xep = mdnullerror;
10361945Sjeanm 
10371945Sjeanm 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
10381945Sjeanm 		return (-1);
10391945Sjeanm 
10401945Sjeanm 	for (p = dnlp; p != NULL; p = p->next) {
10411945Sjeanm 		int		imp_flag = 0;
10421945Sjeanm 
10431945Sjeanm 		/*
10441945Sjeanm 		 * If we have a partial diskset, meta_make_sidenmlist will
10451945Sjeanm 		 * need information from midp to complete making the
10461945Sjeanm 		 * side name structure.
10471945Sjeanm 		 */
10481945Sjeanm 		if (misp->mis_partial) {
10491945Sjeanm 			imp_flag = MDDB_C_IMPORT;
10501945Sjeanm 			for (midp = misp->mis_drives; midp != NULL;
10511945Sjeanm 			    midp = midp->mid_next) {
10521945Sjeanm 				if (midp->mid_dnp == p->drivenamep)
10531945Sjeanm 					break;
10541945Sjeanm 			}
10551945Sjeanm 			if (midp == NULL) {
10561945Sjeanm 				(void) mddserror(ep, MDE_DS_SETNOTIMP,
10571945Sjeanm 				    MD_SET_BAD, mynode(), NULL, sp->setname);
10581945Sjeanm 				rval = -1;
10591945Sjeanm 				goto out;
10601945Sjeanm 			}
10611945Sjeanm 		}
10621945Sjeanm 		/*
10631945Sjeanm 		 * Create the names for the drives we are adding per side.
10641945Sjeanm 		 */
10651945Sjeanm 		if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
10661945Sjeanm 		    midp, ep) == -1) {
10671945Sjeanm 			rval = -1;
10681945Sjeanm 			goto out;
10691945Sjeanm 		}
10701945Sjeanm 	}
10711945Sjeanm 
10721945Sjeanm 	/*
10731945Sjeanm 	 * Get the list of drives descriptors that we are adding.
10741945Sjeanm 	 */
10751945Sjeanm 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
10761945Sjeanm 
10771945Sjeanm 	if (! mdisok(ep)) {
10781945Sjeanm 		rval = -1;
10791945Sjeanm 		goto out;
10801945Sjeanm 	}
10811945Sjeanm 
10821945Sjeanm 	/*
10831945Sjeanm 	 * Get the set timeout information.
10841945Sjeanm 	 */
10851945Sjeanm 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
10861945Sjeanm 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
10871945Sjeanm 		rval = -1;
10881945Sjeanm 		goto out;
10891945Sjeanm 	}
10901945Sjeanm 
10911945Sjeanm 	/*
10921945Sjeanm 	 * Get timestamp and generation id for new records
10931945Sjeanm 	 */
10941945Sjeanm 	now = sd->sd_ctime;
10951945Sjeanm 	genid = sd->sd_genid;
10961945Sjeanm 
10971945Sjeanm 	/* At this point, in case of error, set should be flushed. */
10981945Sjeanm 	flush_set_onerr = 1;
10991945Sjeanm 
11001945Sjeanm 	rb_level = 1;   /* level 1 */
11011945Sjeanm 
11021945Sjeanm 	for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
11031945Sjeanm 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
11041945Sjeanm 			if (ddp->dd_dnp == midp->mid_dnp) {
11051945Sjeanm 				/* same disk */
11061945Sjeanm 				ddp->dd_dnp->devid =
11071945Sjeanm 				    devid_str_encode(midp->mid_devid,
11081945Sjeanm 				    midp->mid_minor_name);
11091945Sjeanm 
11101945Sjeanm 				ddp->dd_dbcnt = 0;
11111945Sjeanm 				mirp = midp->mid_replicas;
11121945Sjeanm 				if (mirp) {
11131945Sjeanm 					ddp->dd_dbsize = mirp->mir_length;
11141945Sjeanm 					for (; mirp != NULL;
11151945Sjeanm 					    mirp = mirp->mir_next) {
11161945Sjeanm 						ddp->dd_dbcnt++;
11171945Sjeanm 					}
11181945Sjeanm 				}
11191945Sjeanm 				if ((midp->mid_available &
11201945Sjeanm 				    MD_IM_DISK_NOT_AVAILABLE) &&
11211945Sjeanm 				    (misp->mis_flags & MD_IM_SET_REPLICATED)) {
11221945Sjeanm 					ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
11231945Sjeanm 				}
11241945Sjeanm 			}
11251945Sjeanm 		}
11261945Sjeanm 	}
11271945Sjeanm 
11281945Sjeanm 	/*
11291945Sjeanm 	 * Add the drive records for the drives that we are adding to
11301945Sjeanm 	 * each host in the set.  Marks the drive records as MD_DR_ADD.
11311945Sjeanm 	 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
11321945Sjeanm 	 * this flag was set in the dd_flags for that drive.
11331945Sjeanm 	 */
11341945Sjeanm 	if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
11351945Sjeanm 		goto rollback;
11361945Sjeanm 
11371945Sjeanm 	rb_level = 2;   /* level 2 */
11381945Sjeanm 
11391945Sjeanm 	/*
11401945Sjeanm 	 * Take ownership of the added drives.
11411945Sjeanm 	 */
11421945Sjeanm 	if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
11431945Sjeanm 		goto rollback;
11441945Sjeanm 
11451945Sjeanm out:
11461945Sjeanm 	metafreedrivedesc(&dd);
11471945Sjeanm 
11481945Sjeanm 	if (flush_set_onerr) {
11491945Sjeanm 		metaflushsetname(sp);
11501945Sjeanm 	}
11511945Sjeanm 
11521945Sjeanm 	return (rval);
11531945Sjeanm 
11541945Sjeanm rollback:
11551945Sjeanm 	/* Make sure we are blocking all signals */
11561945Sjeanm 	if (procsigs(TRUE, &oldsigs, &xep) < 0)
11571945Sjeanm 		mdclrerror(&xep);
11581945Sjeanm 
11591945Sjeanm 	rval = -1;
11601945Sjeanm 
11611945Sjeanm 	max_genid = sd->sd_genid;
11621945Sjeanm 
11631945Sjeanm 	/* level 2 */
11641945Sjeanm 	if (rb_level > 1) {
11651945Sjeanm 		if (!MD_ATSET_DESC(sd)) {
11661945Sjeanm 			if (rel_own_bydd(sp, dd, TRUE, &xep)) {
11671945Sjeanm 				mdclrerror(&xep);
11681945Sjeanm 			}
11691945Sjeanm 		}
11701945Sjeanm 	}
11711945Sjeanm 
11721945Sjeanm 	/* level 1 */
11731945Sjeanm 	if (rb_level > 0) {
11741945Sjeanm 		if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
11751945Sjeanm 			mdclrerror(&xep);
11761945Sjeanm 		}
11771945Sjeanm 		max_genid += 2;
11781945Sjeanm 		resync_genid(sp, sd, max_genid, 0, NULL);
11791945Sjeanm 	}
11801945Sjeanm 
11811945Sjeanm 	/* level 0 */
11821945Sjeanm 
11831945Sjeanm 	/* release signals back to what they were on entry */
11841945Sjeanm 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
11851945Sjeanm 		mdclrerror(&xep);
11861945Sjeanm 
11871945Sjeanm 	metafreedrivedesc(&dd);
11881945Sjeanm 
11891945Sjeanm 	if (flush_set_onerr) {
11901945Sjeanm 		metaflushsetname(sp);
11911945Sjeanm 		md_rb_sig_handling_off(md_got_sig(), md_which_sig());
11921945Sjeanm 	}
11931945Sjeanm 
11941945Sjeanm 	return (rval);
11951945Sjeanm }
11961945Sjeanm 
11970Sstevel@tonic-gate int
meta_set_deletedrives(mdsetname_t * sp,mddrivenamelist_t * dnlp,int forceflg,md_error_t * ep)11980Sstevel@tonic-gate meta_set_deletedrives(
11990Sstevel@tonic-gate 	mdsetname_t		*sp,
12000Sstevel@tonic-gate 	mddrivenamelist_t	*dnlp,
12010Sstevel@tonic-gate 	int			forceflg,
12020Sstevel@tonic-gate 	md_error_t		*ep
12030Sstevel@tonic-gate )
12040Sstevel@tonic-gate {
12050Sstevel@tonic-gate 	md_set_desc		*sd;
12060Sstevel@tonic-gate 	md_drive_desc		*ddp, *dd = NULL, *curdd = NULL;
12070Sstevel@tonic-gate 	md_replicalist_t	*rlp = NULL, *rl;
12080Sstevel@tonic-gate 	mddrivenamelist_t	*p;
12090Sstevel@tonic-gate 	int			deldrvcnt = 0;
12100Sstevel@tonic-gate 	int			rval = 0;
12110Sstevel@tonic-gate 	mhd_mhiargs_t		mhiargs;
12120Sstevel@tonic-gate 	int			i;
12130Sstevel@tonic-gate 	sigset_t		oldsigs;
12140Sstevel@tonic-gate 	md_setkey_t		*cl_sk;
12150Sstevel@tonic-gate 	ulong_t			max_genid = 0;
12160Sstevel@tonic-gate 	int			rb_level = 0;
12170Sstevel@tonic-gate 	md_error_t		xep = mdnullerror;
12180Sstevel@tonic-gate 	md_mnnode_desc		*nd;
12190Sstevel@tonic-gate 	int			has_set;
12200Sstevel@tonic-gate 	int			current_drv_cnt = 0;
12210Sstevel@tonic-gate 	int			suspendall_flag = 0, suspendall_flag_rb = 0;
12220Sstevel@tonic-gate 	int			suspend1_flag = 0;
12230Sstevel@tonic-gate 	int			lock_flag = 0;
12240Sstevel@tonic-gate 	bool_t			stale_bool = FALSE;
12250Sstevel@tonic-gate 	int			flush_set_onerr = 0;
12260Sstevel@tonic-gate 	mdnamelist_t		*nlp;
12270Sstevel@tonic-gate 	mdname_t		*np;
12280Sstevel@tonic-gate 
12290Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
12300Sstevel@tonic-gate 		return (-1);
12310Sstevel@tonic-gate 
12320Sstevel@tonic-gate 	/* Make sure we own the set */
12330Sstevel@tonic-gate 	if (meta_check_ownership(sp, ep) != 0)
12340Sstevel@tonic-gate 		return (-1);
12350Sstevel@tonic-gate 
12360Sstevel@tonic-gate 	if (drvsuniq(sp, dnlp, ep) == -1)
12370Sstevel@tonic-gate 		return (-1);
12380Sstevel@tonic-gate 
12390Sstevel@tonic-gate 	/*
12400Sstevel@tonic-gate 	 * Check and see if all the nodes have the set.
12410Sstevel@tonic-gate 	 *
12420Sstevel@tonic-gate 	 * The drive and node records are stored in the local mddbs of each
12430Sstevel@tonic-gate 	 * node in the diskset.  Each node's rpc.metad daemon reads in the set,
12440Sstevel@tonic-gate 	 * drive and node records from that node's local mddb and caches them
12450Sstevel@tonic-gate 	 * internally. Any process needing diskset information contacts its
12460Sstevel@tonic-gate 	 * local rpc.metad to get this information.  Since each node in the
12470Sstevel@tonic-gate 	 * diskset is independently reading the set information from its local
12480Sstevel@tonic-gate 	 * mddb, the set, drive and node records in the local mddbs must stay
12490Sstevel@tonic-gate 	 * in-sync, so that all nodes have a consistent view of the diskset.
12500Sstevel@tonic-gate 	 *
12510Sstevel@tonic-gate 	 * For a multinode diskset, explicitly verify that all nodes in the
12520Sstevel@tonic-gate 	 * diskset are ALIVE (i.e. are in the API membership list).  Otherwise,
12530Sstevel@tonic-gate 	 * fail this operation since all nodes must be ALIVE in order to delete
12540Sstevel@tonic-gate 	 * a drive record from their local mddb.  If a panic of this node
12550Sstevel@tonic-gate 	 * leaves the local mddbs set, node and drive records out-of-sync, the
12560Sstevel@tonic-gate 	 * reconfig cycle will fix the local mddbs and force them back into
12570Sstevel@tonic-gate 	 * synchronization.
12580Sstevel@tonic-gate 	 */
12590Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
12600Sstevel@tonic-gate 		nd = sd->sd_nodelist;
12610Sstevel@tonic-gate 		while (nd) {
12620Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
12630Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
12640Sstevel@tonic-gate 					sp->setno,
12650Sstevel@tonic-gate 					nd->nd_nodename, NULL, sp->setname);
12660Sstevel@tonic-gate 				return (-1);
12670Sstevel@tonic-gate 			}
12680Sstevel@tonic-gate 			nd = nd->nd_next;
12690Sstevel@tonic-gate 		}
12700Sstevel@tonic-gate 
12710Sstevel@tonic-gate 		/* Make sure we are blocking all signals */
12720Sstevel@tonic-gate 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
12730Sstevel@tonic-gate 			mdclrerror(&xep);
12740Sstevel@tonic-gate 
12750Sstevel@tonic-gate 		/*
12760Sstevel@tonic-gate 		 * Lock the set on current set members.
12770Sstevel@tonic-gate 		 * Set locking done much earlier for MN diskset than for
12780Sstevel@tonic-gate 		 * traditional diskset since lock_set and SUSPEND are used
12790Sstevel@tonic-gate 		 * to protect against other meta* commands running on the
12800Sstevel@tonic-gate 		 * other nodes.
12810Sstevel@tonic-gate 		 */
12820Sstevel@tonic-gate 		nd = sd->sd_nodelist;
12830Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
12840Sstevel@tonic-gate 		while (nd) {
12850Sstevel@tonic-gate 			if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
12860Sstevel@tonic-gate 				rval = -1;
12870Sstevel@tonic-gate 				goto out;
12880Sstevel@tonic-gate 			}
12890Sstevel@tonic-gate 			lock_flag = 1;
12900Sstevel@tonic-gate 			nd = nd->nd_next;
12910Sstevel@tonic-gate 		}
12920Sstevel@tonic-gate 		/*
12930Sstevel@tonic-gate 		 * Lock out other meta* commands by suspending
12940Sstevel@tonic-gate 		 * class 1 messages across the diskset.
12950Sstevel@tonic-gate 		 */
12960Sstevel@tonic-gate 		nd = sd->sd_nodelist;
12970Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
12980Sstevel@tonic-gate 		while (nd) {
12990Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename,
13000Sstevel@tonic-gate 			    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
13010Sstevel@tonic-gate 			    MD_MSCF_NO_FLAGS, ep)) {
13020Sstevel@tonic-gate 				rval = -1;
13030Sstevel@tonic-gate 				goto out;
13040Sstevel@tonic-gate 			}
13050Sstevel@tonic-gate 			suspend1_flag = 1;
13060Sstevel@tonic-gate 			nd = nd->nd_next;
13070Sstevel@tonic-gate 		}
13080Sstevel@tonic-gate 
13090Sstevel@tonic-gate 		nd = sd->sd_nodelist;
13100Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
13110Sstevel@tonic-gate 		while (nd) {
13120Sstevel@tonic-gate 			if (strcmp(nd->nd_nodename, mynode()) == 0) {
13130Sstevel@tonic-gate 				nd = nd->nd_next;
13140Sstevel@tonic-gate 				continue;
13150Sstevel@tonic-gate 			}
13160Sstevel@tonic-gate 
13170Sstevel@tonic-gate 			has_set = nodehasset(sp, nd->nd_nodename,
13180Sstevel@tonic-gate 				    NHS_NSTG_EQ, ep);
13190Sstevel@tonic-gate 			if (has_set < 0) {
13200Sstevel@tonic-gate 				rval = -1;
13210Sstevel@tonic-gate 				goto out;
13220Sstevel@tonic-gate 			}
13230Sstevel@tonic-gate 
13240Sstevel@tonic-gate 			if (! has_set) {
13250Sstevel@tonic-gate 				(void) mddserror(ep, MDE_DS_NODENOSET,
13260Sstevel@tonic-gate 					sp->setno, nd->nd_nodename,
13270Sstevel@tonic-gate 					NULL, sp->setname);
13280Sstevel@tonic-gate 				rval = -1;
13290Sstevel@tonic-gate 				goto out;
13300Sstevel@tonic-gate 			}
13310Sstevel@tonic-gate 			nd = nd->nd_next;
13320Sstevel@tonic-gate 		}
13330Sstevel@tonic-gate 	} else {
13340Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
13350Sstevel@tonic-gate 			/* Skip empty slots */
13360Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
13370Sstevel@tonic-gate 				continue;
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate 			if (strcmp(sd->sd_nodes[i], mynode()) == 0)
13400Sstevel@tonic-gate 				continue;
13410Sstevel@tonic-gate 
13420Sstevel@tonic-gate 			has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
13430Sstevel@tonic-gate 				ep);
13440Sstevel@tonic-gate 			if (has_set < 0) {
13450Sstevel@tonic-gate 				/*
13460Sstevel@tonic-gate 				 * Can directly return since !MN diskset;
13470Sstevel@tonic-gate 				 * nothing to unlock.
13480Sstevel@tonic-gate 				 */
13490Sstevel@tonic-gate 				return (-1);
13500Sstevel@tonic-gate 			}
13510Sstevel@tonic-gate 
13520Sstevel@tonic-gate 			if (! has_set) {
13530Sstevel@tonic-gate 				/*
13540Sstevel@tonic-gate 				 * Can directly return since !MN diskset;
13550Sstevel@tonic-gate 				 * nothing to unlock.
13560Sstevel@tonic-gate 				 */
13570Sstevel@tonic-gate 				return (mddserror(ep, MDE_DS_NODENOSET,
13580Sstevel@tonic-gate 				    sp->setno, sd->sd_nodes[i], NULL,
13590Sstevel@tonic-gate 				    sp->setname));
13600Sstevel@tonic-gate 			}
13610Sstevel@tonic-gate 		}
13620Sstevel@tonic-gate 	}
13630Sstevel@tonic-gate 
13640Sstevel@tonic-gate 	for (p = dnlp; p != NULL; p = p->next) {
13650Sstevel@tonic-gate 		int		is_it;
13660Sstevel@tonic-gate 		mddrivename_t	*dnp;
13670Sstevel@tonic-gate 
13680Sstevel@tonic-gate 		dnp = p->drivenamep;
13690Sstevel@tonic-gate 
13700Sstevel@tonic-gate 		if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
13710Sstevel@tonic-gate 		    == -1) {
13720Sstevel@tonic-gate 			rval = -1;
13730Sstevel@tonic-gate 			goto out;
13740Sstevel@tonic-gate 		}
13750Sstevel@tonic-gate 
13760Sstevel@tonic-gate 		if (! is_it) {
13770Sstevel@tonic-gate 			(void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
13780Sstevel@tonic-gate 			    NULL, dnp->cname, sp->setname);
13790Sstevel@tonic-gate 			rval = -1;
13800Sstevel@tonic-gate 			goto out;
13810Sstevel@tonic-gate 		}
13820Sstevel@tonic-gate 
13830Sstevel@tonic-gate 		if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
13840Sstevel@tonic-gate 			rval = -1;
13850Sstevel@tonic-gate 			goto out;
13860Sstevel@tonic-gate 		}
13870Sstevel@tonic-gate 
13880Sstevel@tonic-gate 		deldrvcnt++;
13890Sstevel@tonic-gate 	}
13900Sstevel@tonic-gate 	current_drv_cnt = deldrvcnt;
13910Sstevel@tonic-gate 
13920Sstevel@tonic-gate 	/*
13930Sstevel@tonic-gate 	 * Get drive descriptors for the drives that are currently in the set.
13940Sstevel@tonic-gate 	 */
13950Sstevel@tonic-gate 	curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
13960Sstevel@tonic-gate 	if (! mdisok(ep)) {
13970Sstevel@tonic-gate 		rval = -1;
13980Sstevel@tonic-gate 		goto out;
13990Sstevel@tonic-gate 	}
14000Sstevel@tonic-gate 
14010Sstevel@tonic-gate 	/*
14020Sstevel@tonic-gate 	 * Decrement the delete drive count for each drive currently in the
14030Sstevel@tonic-gate 	 * set.
14040Sstevel@tonic-gate 	 */
14050Sstevel@tonic-gate 	for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
14060Sstevel@tonic-gate 		deldrvcnt--;
14070Sstevel@tonic-gate 
14080Sstevel@tonic-gate 	/*
14090Sstevel@tonic-gate 	 * If the count of drives we are deleting is equal to the drives in the
14100Sstevel@tonic-gate 	 * set, and we haven't specified forceflg, return an error
14110Sstevel@tonic-gate 	 */
14120Sstevel@tonic-gate 	if (deldrvcnt == 0 && forceflg == FALSE) {
14130Sstevel@tonic-gate 		(void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
14140Sstevel@tonic-gate 		rval = -1;
14150Sstevel@tonic-gate 		goto out;
14160Sstevel@tonic-gate 	}
14170Sstevel@tonic-gate 
14180Sstevel@tonic-gate 	/*
14190Sstevel@tonic-gate 	 * Get the list of drive descriptors that we are deleting.
14200Sstevel@tonic-gate 	 */
14210Sstevel@tonic-gate 	dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
14220Sstevel@tonic-gate 	if (! mdisok(ep)) {
14230Sstevel@tonic-gate 		rval = -1;
14240Sstevel@tonic-gate 		goto out;
14250Sstevel@tonic-gate 	}
14260Sstevel@tonic-gate 
14270Sstevel@tonic-gate 	/*
14280Sstevel@tonic-gate 	 * Get the set timeout information in case we have to roll back.
14290Sstevel@tonic-gate 	 */
14300Sstevel@tonic-gate 	(void) memset(&mhiargs, '\0', sizeof (mhiargs));
14310Sstevel@tonic-gate 	if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
14320Sstevel@tonic-gate 		rval = -1;
14330Sstevel@tonic-gate 		goto out;
14340Sstevel@tonic-gate 	}
14350Sstevel@tonic-gate 
14360Sstevel@tonic-gate 	/* At this point, in case of error, set should be flushed. */
14370Sstevel@tonic-gate 	flush_set_onerr = 1;
14380Sstevel@tonic-gate 
14390Sstevel@tonic-gate 	/* END CHECK CODE */
14400Sstevel@tonic-gate 
14410Sstevel@tonic-gate 	/* Lock the set on current set members */
14420Sstevel@tonic-gate 	if (!(MD_MNSET_DESC(sd))) {
14430Sstevel@tonic-gate 		md_rb_sig_handling_on();
14440Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
14450Sstevel@tonic-gate 			/* Skip empty slots */
14460Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
14470Sstevel@tonic-gate 				continue;
14480Sstevel@tonic-gate 
14490Sstevel@tonic-gate 			if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
14500Sstevel@tonic-gate 				rval = -1;
14510Sstevel@tonic-gate 				goto out;
14520Sstevel@tonic-gate 			}
14530Sstevel@tonic-gate 			lock_flag = 1;
14540Sstevel@tonic-gate 		}
14550Sstevel@tonic-gate 	}
14560Sstevel@tonic-gate 
14570Sstevel@tonic-gate 	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
14580Sstevel@tonic-gate 		mddb_config_t		c;
14590Sstevel@tonic-gate 		/*
14600Sstevel@tonic-gate 		 * Is current set STALE?
14610Sstevel@tonic-gate 		 */
14620Sstevel@tonic-gate 		(void) memset(&c, 0, sizeof (c));
14630Sstevel@tonic-gate 		c.c_id = 0;
14640Sstevel@tonic-gate 		c.c_setno = sp->setno;
14650Sstevel@tonic-gate 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
14660Sstevel@tonic-gate 			(void) mdstealerror(ep, &c.c_mde);
14670Sstevel@tonic-gate 			rval = -1;
14680Sstevel@tonic-gate 			goto out;
14690Sstevel@tonic-gate 		}
14700Sstevel@tonic-gate 		if (c.c_flags & MDDB_C_STALE) {
14710Sstevel@tonic-gate 			stale_bool = TRUE;
14720Sstevel@tonic-gate 		}
14730Sstevel@tonic-gate 	}
14740Sstevel@tonic-gate 
14750Sstevel@tonic-gate 	RB_TEST(1, "deletedrives", ep)
14760Sstevel@tonic-gate 
14770Sstevel@tonic-gate 	RB_PREEMPT;
14780Sstevel@tonic-gate 	rb_level = 1;	/* level 1 */
14790Sstevel@tonic-gate 
14800Sstevel@tonic-gate 	RB_TEST(2, "deletedrives", ep)
14810Sstevel@tonic-gate 
14820Sstevel@tonic-gate 	/*
14830Sstevel@tonic-gate 	 * Mark the drives MD_DR_DEL
14840Sstevel@tonic-gate 	 */
14850Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
14860Sstevel@tonic-gate 		nd = sd->sd_nodelist;
14870Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
14880Sstevel@tonic-gate 		while (nd) {
14890Sstevel@tonic-gate 			if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
14900Sstevel@tonic-gate 			    MD_DR_DEL, ep) == -1)
14910Sstevel@tonic-gate 				goto rollback;
14920Sstevel@tonic-gate 
14930Sstevel@tonic-gate 			RB_TEST(3, "deletedrives", ep)
14940Sstevel@tonic-gate 			nd = nd->nd_next;
14950Sstevel@tonic-gate 		}
14960Sstevel@tonic-gate 	} else {
14970Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
14980Sstevel@tonic-gate 			/* Skip empty slots */
14990Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
15000Sstevel@tonic-gate 				continue;
15010Sstevel@tonic-gate 
15020Sstevel@tonic-gate 			if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
15030Sstevel@tonic-gate 			    MD_DR_DEL, ep) == -1)
15040Sstevel@tonic-gate 				goto rollback;
15050Sstevel@tonic-gate 
15060Sstevel@tonic-gate 			RB_TEST(3, "deletedrives", ep)
15070Sstevel@tonic-gate 		}
15080Sstevel@tonic-gate 	}
15090Sstevel@tonic-gate 
15100Sstevel@tonic-gate 	RB_TEST(4, "deletedrives", ep)
15110Sstevel@tonic-gate 
15120Sstevel@tonic-gate 	RB_PREEMPT;
15130Sstevel@tonic-gate 	rb_level = 2;	/* level 2 */
15140Sstevel@tonic-gate 
15150Sstevel@tonic-gate 	RB_TEST(5, "deletedrives", ep)
15160Sstevel@tonic-gate 
15170Sstevel@tonic-gate 	/*
15180Sstevel@tonic-gate 	 * Balance the DB's according to the list of existing drives and the
15190Sstevel@tonic-gate 	 * list of deleted drives.
15200Sstevel@tonic-gate 	 */
15210Sstevel@tonic-gate 	if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
15220Sstevel@tonic-gate 		goto rollback;
15230Sstevel@tonic-gate 
15240Sstevel@tonic-gate 	/*
15250Sstevel@tonic-gate 	 * If the drive(s) to be deleted cannot be accessed,
15260Sstevel@tonic-gate 	 * they haven't really been deleted yet. Check and delete now
15270Sstevel@tonic-gate 	 * if need be.
15280Sstevel@tonic-gate 	 */
15290Sstevel@tonic-gate 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
15300Sstevel@tonic-gate 		nlp = NULL;
15310Sstevel@tonic-gate 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
15320Sstevel@tonic-gate 			char	*delete_name;
15330Sstevel@tonic-gate 
15340Sstevel@tonic-gate 			delete_name = ddp->dd_dnp->cname;
15350Sstevel@tonic-gate 
15360Sstevel@tonic-gate 			for (rl = rlp; rl != NULL; rl = rl->rl_next) {
15370Sstevel@tonic-gate 				char	*cur_name;
15380Sstevel@tonic-gate 
15390Sstevel@tonic-gate 				cur_name =
15400Sstevel@tonic-gate 				    rl->rl_repp->r_namep->drivenamep->cname;
15410Sstevel@tonic-gate 
15420Sstevel@tonic-gate 				if (strcmp(delete_name, cur_name) == 0) {
15430Sstevel@tonic-gate 					/* put it on the delete list */
15440Sstevel@tonic-gate 					np = rl->rl_repp->r_namep;
15450Sstevel@tonic-gate 					(void) metanamelist_append(&nlp, np);
15460Sstevel@tonic-gate 
15470Sstevel@tonic-gate 				}
15480Sstevel@tonic-gate 			}
15490Sstevel@tonic-gate 		}
15500Sstevel@tonic-gate 
15510Sstevel@tonic-gate 		if (nlp != NULL) {
15520Sstevel@tonic-gate 			if (meta_db_detach(sp, nlp,
15530Sstevel@tonic-gate 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
15540Sstevel@tonic-gate 			    ep) == -1) {
15550Sstevel@tonic-gate 				metafreenamelist(nlp);
15560Sstevel@tonic-gate 				goto rollback;
15570Sstevel@tonic-gate 			}
15580Sstevel@tonic-gate 			metafreenamelist(nlp);
15590Sstevel@tonic-gate 		}
15600Sstevel@tonic-gate 	}
15610Sstevel@tonic-gate 
15620Sstevel@tonic-gate 	RB_TEST(6, "deletedrives", ep)
15630Sstevel@tonic-gate 
15640Sstevel@tonic-gate 	RB_PREEMPT;
15650Sstevel@tonic-gate 	rb_level = 3;	/* level 3 */
15660Sstevel@tonic-gate 
15670Sstevel@tonic-gate 	RB_TEST(7, "deletedrives", ep)
15680Sstevel@tonic-gate 
15690Sstevel@tonic-gate 	/*
15700Sstevel@tonic-gate 	 * Cannot suspend set until after meta_db_balance since
15710Sstevel@tonic-gate 	 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
15720Sstevel@tonic-gate 	 */
15730Sstevel@tonic-gate 	if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
15740Sstevel@tonic-gate 		/*
15750Sstevel@tonic-gate 		 * Notify rpc.mdcommd on all nodes of a nodelist change.
15760Sstevel@tonic-gate 		 * Start by suspending rpc.mdcommd (which drains it of all
15770Sstevel@tonic-gate 		 * messages), then change the nodelist followed by a reinit
15780Sstevel@tonic-gate 		 * and resume.
15790Sstevel@tonic-gate 		 */
15800Sstevel@tonic-gate 		nd = sd->sd_nodelist;
15810Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
15820Sstevel@tonic-gate 		while (nd) {
15830Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
15840Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
15850Sstevel@tonic-gate 				rval = -1;
15860Sstevel@tonic-gate 				goto out;
15870Sstevel@tonic-gate 			}
15880Sstevel@tonic-gate 			suspendall_flag = 1;
15890Sstevel@tonic-gate 			nd = nd->nd_next;
15900Sstevel@tonic-gate 		}
15910Sstevel@tonic-gate 	}
15920Sstevel@tonic-gate 
15930Sstevel@tonic-gate 	/*
15940Sstevel@tonic-gate 	 * Remove the drive records for the drives that were deleted from
15950Sstevel@tonic-gate 	 * each host in the set.  This removes the record and dr_flags.
15960Sstevel@tonic-gate 	 */
15970Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
15980Sstevel@tonic-gate 		nd = sd->sd_nodelist;
15990Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
16000Sstevel@tonic-gate 		while (nd) {
16010Sstevel@tonic-gate 			if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
16020Sstevel@tonic-gate 				goto rollback;
16030Sstevel@tonic-gate 
16040Sstevel@tonic-gate 			RB_TEST(8, "deletedrives", ep)
16050Sstevel@tonic-gate 			nd = nd->nd_next;
16060Sstevel@tonic-gate 		}
16070Sstevel@tonic-gate 	} else {
16080Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
16090Sstevel@tonic-gate 			/* Skip empty slots */
16100Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
16110Sstevel@tonic-gate 				continue;
16120Sstevel@tonic-gate 
16130Sstevel@tonic-gate 			if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
16140Sstevel@tonic-gate 				goto rollback;
16150Sstevel@tonic-gate 
16160Sstevel@tonic-gate 			RB_TEST(8, "deletedrives", ep)
16170Sstevel@tonic-gate 		}
16180Sstevel@tonic-gate 	}
16190Sstevel@tonic-gate 
16200Sstevel@tonic-gate 	RB_TEST(9, "deletedrives", ep)
16210Sstevel@tonic-gate 
16220Sstevel@tonic-gate 	RB_PREEMPT;
16230Sstevel@tonic-gate 	rb_level = 4;	/* level 4 */
16240Sstevel@tonic-gate 
16250Sstevel@tonic-gate 	RB_TEST(10, "deletedrives", ep)
16260Sstevel@tonic-gate 
16270Sstevel@tonic-gate 	if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
16280Sstevel@tonic-gate 		if (rel_own_bydd(sp, dd, TRUE, ep))
16290Sstevel@tonic-gate 			goto rollback;
16300Sstevel@tonic-gate 	}
16310Sstevel@tonic-gate 
16320Sstevel@tonic-gate 	/* If we deleted all the drives, then we need to halt the set. */
16330Sstevel@tonic-gate 	if (deldrvcnt == 0) {
16340Sstevel@tonic-gate 		RB_TEST(11, "deletedrives", ep)
16350Sstevel@tonic-gate 
16360Sstevel@tonic-gate 		RB_PREEMPT;
16370Sstevel@tonic-gate 		rb_level = 5;	/* level 5 */
16380Sstevel@tonic-gate 
16390Sstevel@tonic-gate 		RB_TEST(12, "deletedrives", ep)
16400Sstevel@tonic-gate 
16410Sstevel@tonic-gate 		if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
16420Sstevel@tonic-gate 			goto rollback;
16430Sstevel@tonic-gate 
16440Sstevel@tonic-gate 		RB_TEST(13, "deletedrives", ep)
16450Sstevel@tonic-gate 
16460Sstevel@tonic-gate 		RB_PREEMPT;
16470Sstevel@tonic-gate 		rb_level = 6;	/* level 6 */
16480Sstevel@tonic-gate 
16490Sstevel@tonic-gate 		RB_TEST(14, "deletedrives", ep)
16500Sstevel@tonic-gate 
16510Sstevel@tonic-gate 		/* Halt MN diskset on all nodes by having node withdraw */
16520Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
16530Sstevel@tonic-gate 			nd = sd->sd_nodelist;
16540Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
16550Sstevel@tonic-gate 			while (nd) {
16560Sstevel@tonic-gate 				/* Only withdraw nodes that are joined */
16570Sstevel@tonic-gate 				if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
16580Sstevel@tonic-gate 					nd = nd->nd_next;
16590Sstevel@tonic-gate 					continue;
16600Sstevel@tonic-gate 				}
16610Sstevel@tonic-gate 				/*
16620Sstevel@tonic-gate 				 * Going to set locally cached node flags to
16630Sstevel@tonic-gate 				 * rollback join so in case of error, the
16640Sstevel@tonic-gate 				 * rollback code knows which nodes to re-join.
16650Sstevel@tonic-gate 				 */
16660Sstevel@tonic-gate 				nd->nd_flags |= MD_MN_NODE_RB_JOIN;
16670Sstevel@tonic-gate 
16680Sstevel@tonic-gate 				/*
16690Sstevel@tonic-gate 				 * Be careful in ordering of following steps
16700Sstevel@tonic-gate 				 * so that recovery from a panic between
16710Sstevel@tonic-gate 				 * the steps is viable.
16720Sstevel@tonic-gate 				 * Only reset master info in rpc.metad -
16730Sstevel@tonic-gate 				 * don't reset local cached information
16740Sstevel@tonic-gate 				 * which will be used to set master information
16750Sstevel@tonic-gate 				 * back in case of failure (rollback).
16760Sstevel@tonic-gate 				 */
16770Sstevel@tonic-gate 				if (clnt_withdrawset(nd->nd_nodename, sp, ep))
16780Sstevel@tonic-gate 					goto rollback;
16790Sstevel@tonic-gate 				/* Sets withdraw flag on all nodes in list */
16800Sstevel@tonic-gate 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
16810Sstevel@tonic-gate 				    sd->sd_nodelist, MD_NR_WITHDRAW,
16820Sstevel@tonic-gate 				    NULL, ep)) {
16830Sstevel@tonic-gate 					goto rollback;
16840Sstevel@tonic-gate 				}
16850Sstevel@tonic-gate 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
16860Sstevel@tonic-gate 				    "", MD_MN_INVALID_NID, ep)) {
16870Sstevel@tonic-gate 					goto rollback;
16880Sstevel@tonic-gate 				}
16890Sstevel@tonic-gate 				nd = nd->nd_next;
16900Sstevel@tonic-gate 			}
16910Sstevel@tonic-gate 		} else {
16920Sstevel@tonic-gate 			if (halt_set(sp, ep))
16930Sstevel@tonic-gate 				goto rollback;
16940Sstevel@tonic-gate 		}
16950Sstevel@tonic-gate 
16960Sstevel@tonic-gate 		RB_TEST(15, "deletedrives", ep)
16970Sstevel@tonic-gate 	}
16980Sstevel@tonic-gate 
16990Sstevel@tonic-gate 	RB_TEST(16, "deletedrives", ep)
17000Sstevel@tonic-gate 
17010Sstevel@tonic-gate out:
17020Sstevel@tonic-gate 	/*
17030Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
17040Sstevel@tonic-gate 	 * Send reinit command to mdcommd which forces it to get
17050Sstevel@tonic-gate 	 * fresh set description.
17060Sstevel@tonic-gate 	 */
17070Sstevel@tonic-gate 	if (suspendall_flag) {
17080Sstevel@tonic-gate 		/* Send reinit */
17090Sstevel@tonic-gate 		nd = sd->sd_nodelist;
17100Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
17110Sstevel@tonic-gate 		while (nd) {
17120Sstevel@tonic-gate 			/* Class is ignored for REINIT */
17130Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
17140Sstevel@tonic-gate 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
17150Sstevel@tonic-gate 				if (rval == 0)
17160Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
17170Sstevel@tonic-gate 				rval = -1;
17180Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
17190Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd.\n"));
17200Sstevel@tonic-gate 			}
17210Sstevel@tonic-gate 			nd = nd->nd_next;
17220Sstevel@tonic-gate 		}
17230Sstevel@tonic-gate 	}
17240Sstevel@tonic-gate 
17250Sstevel@tonic-gate 	/*
17260Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
17270Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
17280Sstevel@tonic-gate 	 */
17290Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag)) {
17300Sstevel@tonic-gate 		/* Send resume */
17310Sstevel@tonic-gate 		nd = sd->sd_nodelist;
17320Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
17330Sstevel@tonic-gate 		while (nd) {
17340Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
17350Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
17360Sstevel@tonic-gate 				if (rval == 0)
17370Sstevel@tonic-gate 					(void) mdstealerror(ep, &xep);
17380Sstevel@tonic-gate 				rval = -1;
17390Sstevel@tonic-gate 				mde_perror(ep, dgettext(TEXT_DOMAIN,
17400Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd.\n"));
17410Sstevel@tonic-gate 			}
17420Sstevel@tonic-gate 			nd = nd->nd_next;
17430Sstevel@tonic-gate 		}
17440Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
17450Sstevel@tonic-gate 	}
17460Sstevel@tonic-gate 	if (lock_flag) {
17470Sstevel@tonic-gate 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
17480Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
17490Sstevel@tonic-gate 			nd = sd->sd_nodelist;
17500Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
17510Sstevel@tonic-gate 			while (nd) {
17520Sstevel@tonic-gate 				if (clnt_unlock_set(nd->nd_nodename,
17530Sstevel@tonic-gate 				    cl_sk, &xep)) {
17540Sstevel@tonic-gate 					if (rval == 0)
17550Sstevel@tonic-gate 						(void) mdstealerror(ep, &xep);
17560Sstevel@tonic-gate 					rval = -1;
17570Sstevel@tonic-gate 				}
17580Sstevel@tonic-gate 				nd = nd->nd_next;
17590Sstevel@tonic-gate 			}
17600Sstevel@tonic-gate 		} else {
17610Sstevel@tonic-gate 			for (i = 0; i < MD_MAXSIDES; i++) {
17620Sstevel@tonic-gate 				/* Skip empty slots */
17630Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] == '\0')
17640Sstevel@tonic-gate 					continue;
17650Sstevel@tonic-gate 
17660Sstevel@tonic-gate 				if (clnt_unlock_set(sd->sd_nodes[i],
17670Sstevel@tonic-gate 				    cl_sk, &xep)) {
17680Sstevel@tonic-gate 					if (rval == 0)
17690Sstevel@tonic-gate 						(void) mdstealerror(ep, &xep);
17700Sstevel@tonic-gate 					rval = -1;
17710Sstevel@tonic-gate 				}
17720Sstevel@tonic-gate 			}
17730Sstevel@tonic-gate 		}
17740Sstevel@tonic-gate 		cl_set_setkey(NULL);
17750Sstevel@tonic-gate 	}
17760Sstevel@tonic-gate 
17770Sstevel@tonic-gate 	metafreedrivedesc(&dd);
17780Sstevel@tonic-gate 
17790Sstevel@tonic-gate 	if (flush_set_onerr) {
17800Sstevel@tonic-gate 		metaflushsetname(sp);
17810Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd))) {
17820Sstevel@tonic-gate 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
17830Sstevel@tonic-gate 		}
17840Sstevel@tonic-gate 	}
17850Sstevel@tonic-gate 
17860Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
17870Sstevel@tonic-gate 		/* release signals back to what they were on entry */
17880Sstevel@tonic-gate 		if (procsigs(FALSE, &oldsigs, &xep) < 0)
17890Sstevel@tonic-gate 			mdclrerror(&xep);
17900Sstevel@tonic-gate 	}
17910Sstevel@tonic-gate 
17920Sstevel@tonic-gate 	return (rval);
17930Sstevel@tonic-gate 
17940Sstevel@tonic-gate rollback:
17950Sstevel@tonic-gate 	/* all signals already blocked for MN diskset */
17960Sstevel@tonic-gate 	if (!(MD_MNSET_DESC(sd))) {
17970Sstevel@tonic-gate 		/* Make sure we are blocking all signals */
17980Sstevel@tonic-gate 		if (procsigs(TRUE, &oldsigs, &xep) < 0)
17990Sstevel@tonic-gate 			mdclrerror(&xep);
18000Sstevel@tonic-gate 	}
18010Sstevel@tonic-gate 
18020Sstevel@tonic-gate 	rval = -1;
18030Sstevel@tonic-gate 
18040Sstevel@tonic-gate 	max_genid = sd->sd_genid;
18050Sstevel@tonic-gate 
18060Sstevel@tonic-gate 	/* Set the master on all nodes first thing */
18070Sstevel@tonic-gate 	if (rb_level > 5) {
18080Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
18090Sstevel@tonic-gate 			nd = sd->sd_nodelist;
18100Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
18110Sstevel@tonic-gate 			while (nd) {
18120Sstevel@tonic-gate 				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
18130Sstevel@tonic-gate 					continue;
18140Sstevel@tonic-gate 				}
18150Sstevel@tonic-gate 				/*
18160Sstevel@tonic-gate 				 * Set master on all re-joining nodes to be
18170Sstevel@tonic-gate 				 * my cached view of master.
18180Sstevel@tonic-gate 				 */
18190Sstevel@tonic-gate 				if (clnt_mnsetmaster(nd->nd_nodename, sp,
18200Sstevel@tonic-gate 				    sd->sd_mn_master_nodenm,
18210Sstevel@tonic-gate 				    sd->sd_mn_master_nodeid, &xep)) {
18220Sstevel@tonic-gate 					mdclrerror(&xep);
18230Sstevel@tonic-gate 				}
18240Sstevel@tonic-gate 			}
18250Sstevel@tonic-gate 		}
18260Sstevel@tonic-gate 	}
18270Sstevel@tonic-gate 
18280Sstevel@tonic-gate 	/* level 3 */
18290Sstevel@tonic-gate 	if (rb_level > 2) {
18300Sstevel@tonic-gate 		md_set_record		*sr;
18310Sstevel@tonic-gate 		md_mnset_record		*mnsr;
18320Sstevel@tonic-gate 		md_drive_record		*dr;
18330Sstevel@tonic-gate 		int			sr_drive_cnt;
18340Sstevel@tonic-gate 
18350Sstevel@tonic-gate 		/*
18360Sstevel@tonic-gate 		 * See if we have to re-add the drives specified.
18370Sstevel@tonic-gate 		 */
18380Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
18390Sstevel@tonic-gate 			nd = sd->sd_nodelist;
18400Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
18410Sstevel@tonic-gate 			while (nd) {
18420Sstevel@tonic-gate 				/*
18430Sstevel@tonic-gate 				 * Must get current set record from each
18440Sstevel@tonic-gate 				 * node to see what else must be done
18450Sstevel@tonic-gate 				 * to recover.
18460Sstevel@tonic-gate 				 * Record should be for a multi-node diskset.
18470Sstevel@tonic-gate 				 */
18480Sstevel@tonic-gate 				if (clnt_mngetset(nd->nd_nodename, sp->setname,
18490Sstevel@tonic-gate 				    MD_SET_BAD, &mnsr, &xep) == -1) {
18500Sstevel@tonic-gate 					mdclrerror(&xep);
18510Sstevel@tonic-gate 					nd = nd->nd_next;
18520Sstevel@tonic-gate 					continue;
18530Sstevel@tonic-gate 				}
18540Sstevel@tonic-gate 
18550Sstevel@tonic-gate 				/*
18560Sstevel@tonic-gate 				 * If all drives are already there, skip
18570Sstevel@tonic-gate 				 * to next node.
18580Sstevel@tonic-gate 				 */
18590Sstevel@tonic-gate 				sr_drive_cnt = 0;
18600Sstevel@tonic-gate 				dr = mnsr->sr_drivechain;
18610Sstevel@tonic-gate 				while (dr) {
18620Sstevel@tonic-gate 					sr_drive_cnt++;
18630Sstevel@tonic-gate 					dr = dr->dr_next;
18640Sstevel@tonic-gate 				}
18650Sstevel@tonic-gate 				if (sr_drive_cnt == current_drv_cnt) {
18660Sstevel@tonic-gate 					free_sr((md_set_record *)mnsr);
18670Sstevel@tonic-gate 					nd = nd->nd_next;
18680Sstevel@tonic-gate 					continue;
18690Sstevel@tonic-gate 				}
18700Sstevel@tonic-gate 
18710Sstevel@tonic-gate 				/* Readd all drives */
18720Sstevel@tonic-gate 				if (clnt_adddrvs(nd->nd_nodename, sp, dd,
18730Sstevel@tonic-gate 				    mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
18740Sstevel@tonic-gate 					mdclrerror(&xep);
18750Sstevel@tonic-gate 
18760Sstevel@tonic-gate 				free_sr((struct md_set_record *)mnsr);
18770Sstevel@tonic-gate 				nd = nd->nd_next;
18780Sstevel@tonic-gate 			}
18790Sstevel@tonic-gate 		} else {
18800Sstevel@tonic-gate 			for (i = 0; i < MD_MAXSIDES; i++) {
18810Sstevel@tonic-gate 				/* Skip empty slots */
18820Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] == '\0')
18830Sstevel@tonic-gate 					continue;
18840Sstevel@tonic-gate 
18850Sstevel@tonic-gate 				/* Record should be for a non-multi-node set */
18860Sstevel@tonic-gate 				if (clnt_getset(sd->sd_nodes[i], sp->setname,
18870Sstevel@tonic-gate 				    MD_SET_BAD, &sr, &xep) == -1) {
18880Sstevel@tonic-gate 					mdclrerror(&xep);
18890Sstevel@tonic-gate 					continue;
18900Sstevel@tonic-gate 				}
18910Sstevel@tonic-gate 
18920Sstevel@tonic-gate 				/*
18930Sstevel@tonic-gate 				 * Set record structure was allocated from RPC
18940Sstevel@tonic-gate 				 * routine getset so this structure is only of
18950Sstevel@tonic-gate 				 * size md_set_record even if the MN flag is
18960Sstevel@tonic-gate 				 * set.  So, clear the flag so that the free
18970Sstevel@tonic-gate 				 * code doesn't attempt to free a structure
18980Sstevel@tonic-gate 				 * the size of md_mnset_record.
18990Sstevel@tonic-gate 				 */
19000Sstevel@tonic-gate 				if (MD_MNSET_REC(sr)) {
19010Sstevel@tonic-gate 					sr->sr_flags &= ~MD_SR_MN;
19020Sstevel@tonic-gate 					free_sr(sr);
19030Sstevel@tonic-gate 					continue;
19040Sstevel@tonic-gate 				}
19050Sstevel@tonic-gate 
19060Sstevel@tonic-gate 				/* Drive already added, skip to next node */
19070Sstevel@tonic-gate 				if (sr->sr_drivechain != NULL) {
19080Sstevel@tonic-gate 					free_sr(sr);
19090Sstevel@tonic-gate 					continue;
19100Sstevel@tonic-gate 				}
19110Sstevel@tonic-gate 
19120Sstevel@tonic-gate 				if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
19130Sstevel@tonic-gate 				    sr->sr_ctime, sr->sr_genid, &xep) == -1)
19140Sstevel@tonic-gate 					mdclrerror(&xep);
19150Sstevel@tonic-gate 
19160Sstevel@tonic-gate 				free_sr(sr);
19170Sstevel@tonic-gate 			}
19180Sstevel@tonic-gate 		}
19190Sstevel@tonic-gate 		max_genid += 2;
19200Sstevel@tonic-gate 	}
19210Sstevel@tonic-gate 
19220Sstevel@tonic-gate 	/*
19230Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
19240Sstevel@tonic-gate 	 * At this point in time, don't know which nodes are joined
19250Sstevel@tonic-gate 	 * to the set.  So, send a reinit command to mdcommd
19260Sstevel@tonic-gate 	 * which forces it to get fresh set description.  Then send resume.
19270Sstevel@tonic-gate 	 *
19280Sstevel@tonic-gate 	 * Later, this code will use rpc.mdcommd messages to reattach disks
19290Sstevel@tonic-gate 	 * and then rpc.mdcommd may be suspended again, rest of the nodes
19300Sstevel@tonic-gate 	 * joined, rpc.mdcommd reinited and then resumed.
19310Sstevel@tonic-gate 	 */
19320Sstevel@tonic-gate 	if (suspendall_flag) {
19330Sstevel@tonic-gate 		/* Send reinit */
19340Sstevel@tonic-gate 		nd = sd->sd_nodelist;
19350Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
19360Sstevel@tonic-gate 		while (nd) {
19370Sstevel@tonic-gate 			/* Class is ignored for REINIT */
19380Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
19390Sstevel@tonic-gate 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
19400Sstevel@tonic-gate 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
19410Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd.\n"));
19420Sstevel@tonic-gate 				mdclrerror(&xep);
19430Sstevel@tonic-gate 			}
19440Sstevel@tonic-gate 			nd = nd->nd_next;
19450Sstevel@tonic-gate 		}
19460Sstevel@tonic-gate 
19470Sstevel@tonic-gate 		/* Send resume */
19480Sstevel@tonic-gate 		nd = sd->sd_nodelist;
19490Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
19500Sstevel@tonic-gate 		while (nd) {
19510Sstevel@tonic-gate 			/*
19520Sstevel@tonic-gate 			 * Resume all classes but class 1 so that lock is held
19530Sstevel@tonic-gate 			 * against meta* commands.
19540Sstevel@tonic-gate 			 * To later resume class1, must issue a class0 resume.
19550Sstevel@tonic-gate 			 */
19560Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
19570Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0,
19580Sstevel@tonic-gate 			    MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
19590Sstevel@tonic-gate 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
19600Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd.\n"));
19610Sstevel@tonic-gate 				mdclrerror(&xep);
19620Sstevel@tonic-gate 			}
19630Sstevel@tonic-gate 			nd = nd->nd_next;
19640Sstevel@tonic-gate 		}
19650Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
19660Sstevel@tonic-gate 	}
19670Sstevel@tonic-gate 
19680Sstevel@tonic-gate 	/* level 2 */
19690Sstevel@tonic-gate 	if (rb_level > 1) {
19700Sstevel@tonic-gate 		mdnamelist_t	*nlp;
19710Sstevel@tonic-gate 		mdname_t	*np;
19720Sstevel@tonic-gate 
19730Sstevel@tonic-gate 		for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
19740Sstevel@tonic-gate 			uint_t	rep_slice;
19750Sstevel@tonic-gate 
19760Sstevel@tonic-gate 			if ((meta_replicaslice(ddp->dd_dnp,
19770Sstevel@tonic-gate 			    &rep_slice, &xep) != 0) ||
19780Sstevel@tonic-gate 			    ((np = metaslicename(ddp->dd_dnp, rep_slice,
19790Sstevel@tonic-gate 				&xep)) == NULL)) {
19800Sstevel@tonic-gate 				mdclrerror(&xep);
19810Sstevel@tonic-gate 				continue;
19820Sstevel@tonic-gate 			}
19830Sstevel@tonic-gate 			nlp = NULL;
19840Sstevel@tonic-gate 			(void) metanamelist_append(&nlp, np);
19850Sstevel@tonic-gate 
19860Sstevel@tonic-gate 			if (meta_db_attach(sp, nlp,
19870Sstevel@tonic-gate 			    (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
19880Sstevel@tonic-gate 			    &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
19890Sstevel@tonic-gate 			    NULL, &xep) == -1)
19900Sstevel@tonic-gate 				mdclrerror(&xep);
19910Sstevel@tonic-gate 
19920Sstevel@tonic-gate 			metafreenamelist(nlp);
19930Sstevel@tonic-gate 		}
19940Sstevel@tonic-gate 		/* Re-balance */
19950Sstevel@tonic-gate 		if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
19960Sstevel@tonic-gate 			mdclrerror(&xep);
19970Sstevel@tonic-gate 	}
19980Sstevel@tonic-gate 
19990Sstevel@tonic-gate 	/* level 4 */
20000Sstevel@tonic-gate 	if (rb_level > 3) {
20010Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
20020Sstevel@tonic-gate 			if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
20030Sstevel@tonic-gate 				mdclrerror(&xep);
20040Sstevel@tonic-gate 		}
20050Sstevel@tonic-gate 	}
20060Sstevel@tonic-gate 
20070Sstevel@tonic-gate 	/* level 5 */
20080Sstevel@tonic-gate 	if (rb_level > 4) {
20090Sstevel@tonic-gate 		if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
20100Sstevel@tonic-gate 			mdclrerror(&xep);
20110Sstevel@tonic-gate 	}
20120Sstevel@tonic-gate 
20130Sstevel@tonic-gate 	/*
20140Sstevel@tonic-gate 	 * If at least one node needs to be rejoined to MN diskset,
20150Sstevel@tonic-gate 	 * then suspend commd again.
20160Sstevel@tonic-gate 	 */
20170Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
20180Sstevel@tonic-gate 		nd = sd->sd_nodelist;
20190Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
20200Sstevel@tonic-gate 		while (nd) {
20210Sstevel@tonic-gate 			if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
20220Sstevel@tonic-gate 				nd = nd->nd_next;
20230Sstevel@tonic-gate 				continue;
20240Sstevel@tonic-gate 			}
20250Sstevel@tonic-gate 			break;
20260Sstevel@tonic-gate 		}
20270Sstevel@tonic-gate 		if (nd) {
20280Sstevel@tonic-gate 			/*
20290Sstevel@tonic-gate 			 * Found node that will be rejoined so
20300Sstevel@tonic-gate 			 * notify rpc.mdcommd on all nodes of a nodelist change.
20310Sstevel@tonic-gate 			 * Start by suspending rpc.mdcommd (which drains it of
20320Sstevel@tonic-gate 			 * all messages), then change the nodelist followed by
20330Sstevel@tonic-gate 			 * a reinit and resume.
20340Sstevel@tonic-gate 			 */
20350Sstevel@tonic-gate 			nd = sd->sd_nodelist;
20360Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
20370Sstevel@tonic-gate 			while (nd) {
20380Sstevel@tonic-gate 				if (clnt_mdcommdctl(nd->nd_nodename,
20390Sstevel@tonic-gate 				    COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
20400Sstevel@tonic-gate 				    MD_MSCF_NO_FLAGS, &xep)) {
20410Sstevel@tonic-gate 					mdclrerror(&xep);
20420Sstevel@tonic-gate 				}
20430Sstevel@tonic-gate 				suspendall_flag_rb = 1;
20440Sstevel@tonic-gate 				nd = nd->nd_next;
20450Sstevel@tonic-gate 			}
20460Sstevel@tonic-gate 		}
20470Sstevel@tonic-gate 	}
20480Sstevel@tonic-gate 
20490Sstevel@tonic-gate 
20500Sstevel@tonic-gate 
20510Sstevel@tonic-gate 	/* level 6 */
20520Sstevel@tonic-gate 	if (rb_level > 5) {
20530Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
20540Sstevel@tonic-gate 			int	join_flags = 0;
20550Sstevel@tonic-gate 
20560Sstevel@tonic-gate 			nd = sd->sd_nodelist;
20570Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
20580Sstevel@tonic-gate 			while (nd) {
20590Sstevel@tonic-gate 				/* Only rejoin nodes that were joined before */
20600Sstevel@tonic-gate 				if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
20610Sstevel@tonic-gate 					nd = nd->nd_next;
20620Sstevel@tonic-gate 					continue;
20630Sstevel@tonic-gate 				}
20640Sstevel@tonic-gate 				/*
20650Sstevel@tonic-gate 				 * Rejoin nodes to same state as before -
20660Sstevel@tonic-gate 				 * either STALE or non-STALE.
20670Sstevel@tonic-gate 				 */
20680Sstevel@tonic-gate 				if (stale_bool == TRUE)
20690Sstevel@tonic-gate 					join_flags = MNSET_IS_STALE;
20700Sstevel@tonic-gate 				if (clnt_joinset(nd->nd_nodename, sp,
20710Sstevel@tonic-gate 				    join_flags, &xep))
20720Sstevel@tonic-gate 					mdclrerror(&xep);
20730Sstevel@tonic-gate 				/* Sets OWN flag on all nodes in list */
20740Sstevel@tonic-gate 				if (clnt_upd_nr_flags(nd->nd_nodename, sp,
20750Sstevel@tonic-gate 				    sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
20760Sstevel@tonic-gate 					mdclrerror(&xep);
20770Sstevel@tonic-gate 				}
20780Sstevel@tonic-gate 				nd = nd->nd_next;
20790Sstevel@tonic-gate 			}
20800Sstevel@tonic-gate 		} else {
20810Sstevel@tonic-gate 			if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
20820Sstevel@tonic-gate 				mdclrerror(&xep);
20830Sstevel@tonic-gate 
20840Sstevel@tonic-gate 			/* No special flag for traditional diskset */
20850Sstevel@tonic-gate 			if (snarf_set(sp, NULL, &xep))
20860Sstevel@tonic-gate 				mdclrerror(&xep);
20870Sstevel@tonic-gate 		}
20880Sstevel@tonic-gate 	}
20890Sstevel@tonic-gate 
20900Sstevel@tonic-gate 	/* level 1 */
20910Sstevel@tonic-gate 	if (rb_level > 0) {
20920Sstevel@tonic-gate 		/*
20930Sstevel@tonic-gate 		 * Mark the drives as OK.
20940Sstevel@tonic-gate 		 */
20950Sstevel@tonic-gate 		if (MD_MNSET_DESC(sd)) {
20960Sstevel@tonic-gate 			nd = sd->sd_nodelist;
20970Sstevel@tonic-gate 			/* All nodes are guaranteed to be ALIVE */
20980Sstevel@tonic-gate 			while (nd) {
20990Sstevel@tonic-gate 				/*
21000Sstevel@tonic-gate 				 * Must be last action before unlock.
21010Sstevel@tonic-gate 				 * In case of panic, recovery code checks
21020Sstevel@tonic-gate 				 * for MD_DR_OK to know that drive
21030Sstevel@tonic-gate 				 * and possible master are fully added back.
21040Sstevel@tonic-gate 				 */
21050Sstevel@tonic-gate 				if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
21060Sstevel@tonic-gate 				    MD_DR_OK, &xep) == -1)
21070Sstevel@tonic-gate 					mdclrerror(&xep);
21080Sstevel@tonic-gate 				nd = nd->nd_next;
21090Sstevel@tonic-gate 			}
21100Sstevel@tonic-gate 		} else {
21110Sstevel@tonic-gate 			for (i = 0; i < MD_MAXSIDES; i++) {
21120Sstevel@tonic-gate 				/* Skip empty slots */
21130Sstevel@tonic-gate 				if (sd->sd_nodes[i][0] == '\0')
21140Sstevel@tonic-gate 					continue;
21150Sstevel@tonic-gate 
21160Sstevel@tonic-gate 				if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
21170Sstevel@tonic-gate 				    MD_DR_OK, &xep) == -1)
21180Sstevel@tonic-gate 					mdclrerror(&xep);
21190Sstevel@tonic-gate 
21200Sstevel@tonic-gate 			}
21210Sstevel@tonic-gate 		}
21220Sstevel@tonic-gate 		max_genid += 2;
21230Sstevel@tonic-gate 		resync_genid(sp, sd, max_genid, 0, NULL);
21240Sstevel@tonic-gate 	}
21250Sstevel@tonic-gate 	/*
21260Sstevel@tonic-gate 	 * Notify rpc.mdcommd on all nodes of a nodelist change.
21270Sstevel@tonic-gate 	 * Send a reinit command to mdcommd which forces it to get
21280Sstevel@tonic-gate 	 * a fresh set description.
21290Sstevel@tonic-gate 	 */
21300Sstevel@tonic-gate 	if (suspendall_flag_rb) {
21310Sstevel@tonic-gate 		/* Send reinit */
21320Sstevel@tonic-gate 		nd = sd->sd_nodelist;
21330Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
21340Sstevel@tonic-gate 		while (nd) {
21350Sstevel@tonic-gate 			/* Class is ignored for REINIT */
21360Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
21370Sstevel@tonic-gate 			    sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
21380Sstevel@tonic-gate 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
21390Sstevel@tonic-gate 				    "Unable to reinit rpc.mdcommd.\n"));
21400Sstevel@tonic-gate 				mdclrerror(&xep);
21410Sstevel@tonic-gate 			}
21420Sstevel@tonic-gate 			nd = nd->nd_next;
21430Sstevel@tonic-gate 		}
21440Sstevel@tonic-gate 	}
21450Sstevel@tonic-gate 
21460Sstevel@tonic-gate 	/*
21470Sstevel@tonic-gate 	 * Just resume all classes so that resume is the same whether
21480Sstevel@tonic-gate 	 * just one class was locked or all classes were locked.
21490Sstevel@tonic-gate 	 */
21500Sstevel@tonic-gate 	if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
21510Sstevel@tonic-gate 		/* Send resume */
21520Sstevel@tonic-gate 		nd = sd->sd_nodelist;
21530Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
21540Sstevel@tonic-gate 		while (nd) {
21550Sstevel@tonic-gate 			if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
21560Sstevel@tonic-gate 			    sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
21570Sstevel@tonic-gate 				mde_perror(&xep, dgettext(TEXT_DOMAIN,
21580Sstevel@tonic-gate 				    "Unable to resume rpc.mdcommd.\n"));
21590Sstevel@tonic-gate 				mdclrerror(&xep);
21600Sstevel@tonic-gate 			}
21610Sstevel@tonic-gate 			nd = nd->nd_next;
21620Sstevel@tonic-gate 		}
21630Sstevel@tonic-gate 		meta_ping_mnset(sp->setno);
21640Sstevel@tonic-gate 	}
21650Sstevel@tonic-gate 
21660Sstevel@tonic-gate 
21670Sstevel@tonic-gate 	/* level 0 */
21680Sstevel@tonic-gate 	cl_sk = cl_get_setkey(sp->setno, sp->setname);
21690Sstevel@tonic-gate 	/* Don't test lock flag since guaranteed to be set if in rollback */
21700Sstevel@tonic-gate 	if (MD_MNSET_DESC(sd)) {
21710Sstevel@tonic-gate 		nd = sd->sd_nodelist;
21720Sstevel@tonic-gate 		/* All nodes are guaranteed to be ALIVE */
21730Sstevel@tonic-gate 		while (nd) {
21740Sstevel@tonic-gate 			if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
21750Sstevel@tonic-gate 				mdclrerror(&xep);
21760Sstevel@tonic-gate 			nd = nd->nd_next;
21770Sstevel@tonic-gate 		}
21780Sstevel@tonic-gate 	} else {
21790Sstevel@tonic-gate 		for (i = 0; i < MD_MAXSIDES; i++) {
21800Sstevel@tonic-gate 			/* Skip empty slots */
21810Sstevel@tonic-gate 			if (sd->sd_nodes[i][0] == '\0')
21820Sstevel@tonic-gate 				continue;
21830Sstevel@tonic-gate 
21840Sstevel@tonic-gate 			if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
21850Sstevel@tonic-gate 				mdclrerror(&xep);
21860Sstevel@tonic-gate 		}
21870Sstevel@tonic-gate 	}
21880Sstevel@tonic-gate 	cl_set_setkey(NULL);
21890Sstevel@tonic-gate 
21900Sstevel@tonic-gate 	/* release signals back to what they were on entry */
21910Sstevel@tonic-gate 	if (procsigs(FALSE, &oldsigs, &xep) < 0)
21920Sstevel@tonic-gate 		mdclrerror(&xep);
21930Sstevel@tonic-gate 
21940Sstevel@tonic-gate 	metafreedrivedesc(&dd);
21950Sstevel@tonic-gate 
21960Sstevel@tonic-gate 	if (flush_set_onerr) {
21970Sstevel@tonic-gate 		metaflushsetname(sp);
21980Sstevel@tonic-gate 		if (!(MD_MNSET_DESC(sd))) {
21990Sstevel@tonic-gate 			md_rb_sig_handling_off(md_got_sig(), md_which_sig());
22000Sstevel@tonic-gate 		}
22010Sstevel@tonic-gate 	}
22020Sstevel@tonic-gate 
22030Sstevel@tonic-gate 	return (rval);
22040Sstevel@tonic-gate }
2205