xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_db_balance.c (revision 7779:6063aac63621)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
250Sstevel@tonic-gate 
/*
 * Database location balancing code.
 */
290Sstevel@tonic-gate 
300Sstevel@tonic-gate #include <meta.h>
310Sstevel@tonic-gate #include <sys/lvm/md_mddb.h>
320Sstevel@tonic-gate #include <sdssc.h>
330Sstevel@tonic-gate 
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably the minimum number of replicas the balancer aims for -
 * confirm against its users.
 */
#define	MD_MINBALREP	2

/*
 * Stuff for DB balancing.
 *
 * md_ctlr_ops_t is the operation recorded for a drive on the controller
 * list.
 */
typedef enum md_ctlr_ops_t {
	DRV_NOP = 0,	/* drive already in the diskset, nothing pending */
	DRV_ADD = 1,	/* drive is being added */
	DRV_DEL = 2	/* drive is being deleted */
} md_ctlr_ops_t;
450Sstevel@tonic-gate 
460Sstevel@tonic-gate /* drive flag fields */
470Sstevel@tonic-gate #define	DRV_F_ERROR	0x1
480Sstevel@tonic-gate #define	DRV_F_INDISKSET	0x2
490Sstevel@tonic-gate 
500Sstevel@tonic-gate struct md_ctlr_drv_t {
510Sstevel@tonic-gate 	md_ctlr_ops_t drv_op;
520Sstevel@tonic-gate 	int drv_flags;
530Sstevel@tonic-gate 	int drv_dbcnt;
540Sstevel@tonic-gate 	int drv_new_dbcnt;
550Sstevel@tonic-gate 	daddr_t drv_dbsize;
560Sstevel@tonic-gate 	mddrivename_t *drv_dnp;
570Sstevel@tonic-gate 	struct md_ctlr_drv_t *drv_next;
580Sstevel@tonic-gate };
590Sstevel@tonic-gate typedef struct md_ctlr_drv_t md_ctlr_drv_t;
600Sstevel@tonic-gate 
610Sstevel@tonic-gate struct md_ctlr_ctl_t {
620Sstevel@tonic-gate 	mdcinfo_t *ctl_cinfop;
630Sstevel@tonic-gate 	int ctl_dbcnt;
640Sstevel@tonic-gate 	int ctl_drcnt;
650Sstevel@tonic-gate 	md_ctlr_drv_t *ctl_drvs;
660Sstevel@tonic-gate 	struct md_ctlr_ctl_t *ctl_next;
670Sstevel@tonic-gate };
680Sstevel@tonic-gate typedef struct md_ctlr_ctl_t md_ctlr_ctl_t;
690Sstevel@tonic-gate 
700Sstevel@tonic-gate static int
add_replica(mdsetname_t * sp,mddrivename_t * dnp,int dbcnt,daddr_t dbsize,md_error_t * ep)710Sstevel@tonic-gate add_replica(
720Sstevel@tonic-gate 	mdsetname_t		*sp,
730Sstevel@tonic-gate 	mddrivename_t		*dnp,
740Sstevel@tonic-gate 	int			dbcnt,
750Sstevel@tonic-gate 	daddr_t			dbsize,
760Sstevel@tonic-gate 	md_error_t		*ep
770Sstevel@tonic-gate )
780Sstevel@tonic-gate {
790Sstevel@tonic-gate 	mdnamelist_t		*nlp = NULL;
800Sstevel@tonic-gate 	mdname_t		*np;
810Sstevel@tonic-gate 	md_set_desc		*sd;
820Sstevel@tonic-gate 	uint_t			rep_slice;
830Sstevel@tonic-gate 
840Sstevel@tonic-gate 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
850Sstevel@tonic-gate 		return (-1);
860Sstevel@tonic-gate 
870Sstevel@tonic-gate 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
880Sstevel@tonic-gate 		return (-1);
890Sstevel@tonic-gate 
900Sstevel@tonic-gate 	(void) metanamelist_append(&nlp, np);
910Sstevel@tonic-gate 
920Sstevel@tonic-gate 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
930Sstevel@tonic-gate 		metafreenamelist(nlp);
940Sstevel@tonic-gate 		return (-1);
950Sstevel@tonic-gate 	}
960Sstevel@tonic-gate 
970Sstevel@tonic-gate 	if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
980Sstevel@tonic-gate 	    (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) {
990Sstevel@tonic-gate 		metafreenamelist(nlp);
1000Sstevel@tonic-gate 		return (-1);
1010Sstevel@tonic-gate 	}
1020Sstevel@tonic-gate 
1030Sstevel@tonic-gate 	metafreenamelist(nlp);
1040Sstevel@tonic-gate 	return (0);
1050Sstevel@tonic-gate }
1060Sstevel@tonic-gate 
1070Sstevel@tonic-gate static int
del_replica(mdsetname_t * sp,mddrivename_t * dnp,md_error_t * ep)1080Sstevel@tonic-gate del_replica(
1090Sstevel@tonic-gate 	mdsetname_t		*sp,
1100Sstevel@tonic-gate 	mddrivename_t		*dnp,
1110Sstevel@tonic-gate 	md_error_t		*ep
1120Sstevel@tonic-gate )
1130Sstevel@tonic-gate {
1140Sstevel@tonic-gate 	mdnamelist_t		*nlp = NULL;
1150Sstevel@tonic-gate 	mdname_t		*np;
1160Sstevel@tonic-gate 	uint_t			rep_slice;
1170Sstevel@tonic-gate 
1180Sstevel@tonic-gate 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
1190Sstevel@tonic-gate 		return (-1);
1200Sstevel@tonic-gate 
1210Sstevel@tonic-gate 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1220Sstevel@tonic-gate 		return (-1);
1230Sstevel@tonic-gate 
1240Sstevel@tonic-gate 	(void) metanamelist_append(&nlp, np);
1250Sstevel@tonic-gate 
1260Sstevel@tonic-gate 	if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED),
1270Sstevel@tonic-gate 	    NULL, ep) == -1) {
1280Sstevel@tonic-gate 		metafreenamelist(nlp);
1290Sstevel@tonic-gate 		return (-1);
1300Sstevel@tonic-gate 	}
1310Sstevel@tonic-gate 
1320Sstevel@tonic-gate 	metafreenamelist(nlp);
1330Sstevel@tonic-gate 	return (0);
1340Sstevel@tonic-gate }
1350Sstevel@tonic-gate 
1360Sstevel@tonic-gate static int
rep_has_err(md_replicalist_t * rlp,mdname_t * np)1370Sstevel@tonic-gate rep_has_err(md_replicalist_t *rlp, mdname_t *np)
1380Sstevel@tonic-gate {
1390Sstevel@tonic-gate 	md_replicalist_t	*rl;
1400Sstevel@tonic-gate 
1410Sstevel@tonic-gate 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1420Sstevel@tonic-gate 		md_replica_t	*r = rl->rl_repp;
1430Sstevel@tonic-gate 
1440Sstevel@tonic-gate 		if (strcmp(r->r_namep->cname, np->cname) != 0)
1450Sstevel@tonic-gate 			continue;
1460Sstevel@tonic-gate 
1470Sstevel@tonic-gate 		if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA |
1480Sstevel@tonic-gate 		    MDDB_F_EMASTER | MDDB_F_EWRITE))
1490Sstevel@tonic-gate 			return (1);
1500Sstevel@tonic-gate 
1510Sstevel@tonic-gate 	}
1520Sstevel@tonic-gate 	return (0);
1530Sstevel@tonic-gate }
1540Sstevel@tonic-gate 
1550Sstevel@tonic-gate static int
add_drv_to_ctl_lst(md_ctlr_ctl_t ** clpp,md_replicalist_t * rlp,mddrivename_t * dnp,int dbcnt,daddr_t dbsize,mdcinfo_t * cinfop,int indiskset,int with_bus,int errored,md_error_t * ep)1560Sstevel@tonic-gate add_drv_to_ctl_lst(
1570Sstevel@tonic-gate 	md_ctlr_ctl_t		**clpp,
1580Sstevel@tonic-gate 	md_replicalist_t	*rlp,
1590Sstevel@tonic-gate 	mddrivename_t		*dnp,
1600Sstevel@tonic-gate 	int			dbcnt,
1610Sstevel@tonic-gate 	daddr_t			dbsize,
1620Sstevel@tonic-gate 	mdcinfo_t		*cinfop,
1630Sstevel@tonic-gate 	int			indiskset,
1640Sstevel@tonic-gate 	int			with_bus,
1650Sstevel@tonic-gate 	int			errored,
1660Sstevel@tonic-gate 	md_error_t		*ep
1670Sstevel@tonic-gate )
1680Sstevel@tonic-gate {
1690Sstevel@tonic-gate 	md_ctlr_drv_t		**dpp;
1700Sstevel@tonic-gate 	mdname_t		*np;
1710Sstevel@tonic-gate 	mdcinfo_t		*tcinfop;
1726698Ssk102515 	char			*cmp_name_1, *cmp_name_2;
1730Sstevel@tonic-gate 	int			not_found;
1740Sstevel@tonic-gate 
1750Sstevel@tonic-gate 	/*
1760Sstevel@tonic-gate 	 * The user must pass in a list head.
1770Sstevel@tonic-gate 	 */
1780Sstevel@tonic-gate 	assert(clpp != NULL);
1790Sstevel@tonic-gate 
1800Sstevel@tonic-gate 	if (cinfop == NULL) {
1810Sstevel@tonic-gate 		uint_t	rep_slice;
1820Sstevel@tonic-gate 
1830Sstevel@tonic-gate 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
1840Sstevel@tonic-gate 			/*
1850Sstevel@tonic-gate 			 * A failure to get the slice information can occur
1860Sstevel@tonic-gate 			 * because the drive has failed, if this is the
1870Sstevel@tonic-gate 			 * case then there is nothing that can be done
1880Sstevel@tonic-gate 			 * with this drive, so do not include it in the
1890Sstevel@tonic-gate 			 * list of drives. Clear the error and return.
1900Sstevel@tonic-gate 			 */
1910Sstevel@tonic-gate 			mdclrerror(ep);
1920Sstevel@tonic-gate 			return (0);
1930Sstevel@tonic-gate 		}
1940Sstevel@tonic-gate 
1950Sstevel@tonic-gate 		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1960Sstevel@tonic-gate 			return (-1);
1970Sstevel@tonic-gate 
1980Sstevel@tonic-gate 		if ((tcinfop = metagetcinfo(np, ep)) == NULL)
1990Sstevel@tonic-gate 			return (-1);
2000Sstevel@tonic-gate 
2010Sstevel@tonic-gate 		if (metagetvtoc(np, FALSE, NULL, ep) == NULL)
2020Sstevel@tonic-gate 			errored = 1;
2030Sstevel@tonic-gate 
2040Sstevel@tonic-gate 		if (rep_has_err(rlp, np))
2050Sstevel@tonic-gate 			errored = 1;
2060Sstevel@tonic-gate 	} else
2070Sstevel@tonic-gate 		tcinfop = cinfop;
2080Sstevel@tonic-gate 
2090Sstevel@tonic-gate 	for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) {
2100Sstevel@tonic-gate 		/*
2110Sstevel@tonic-gate 		 * Try to locate ctlr.
2120Sstevel@tonic-gate 		 */
2130Sstevel@tonic-gate 		(void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1);
2140Sstevel@tonic-gate 		(void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname,
2150Sstevel@tonic-gate 		    &cmp_name_2);
2160Sstevel@tonic-gate 
2170Sstevel@tonic-gate 		if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype ||
2180Sstevel@tonic-gate 		    tcinfop->cnum != (*clpp)->ctl_cinfop->cnum ||
2190Sstevel@tonic-gate 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0 ||
2200Sstevel@tonic-gate 		    (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) {
2210Sstevel@tonic-gate 			not_found = 1;
2220Sstevel@tonic-gate 		} else
2230Sstevel@tonic-gate 			not_found = 0;
2240Sstevel@tonic-gate 
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate 		sdssc_convert_path_free(cmp_name_1);
2270Sstevel@tonic-gate 		sdssc_convert_path_free(cmp_name_2);
2280Sstevel@tonic-gate 
2290Sstevel@tonic-gate 		if (not_found)
2300Sstevel@tonic-gate 			continue;
2310Sstevel@tonic-gate 
2320Sstevel@tonic-gate 		/*
2330Sstevel@tonic-gate 		 * Found ctlr, try to locate the drive.
2340Sstevel@tonic-gate 		 */
2350Sstevel@tonic-gate 		for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL;
2360Sstevel@tonic-gate 		    dpp = &(*dpp)->drv_next) {
2370Sstevel@tonic-gate 			(void) sdssc_convert_cluster_path(
2380Sstevel@tonic-gate 			    (*dpp)->drv_dnp->cname, &cmp_name_1);
2390Sstevel@tonic-gate 			(void) sdssc_convert_cluster_path(dnp->cname,
2400Sstevel@tonic-gate 			    &cmp_name_2);
2410Sstevel@tonic-gate 
2420Sstevel@tonic-gate 			not_found = strcmp(cmp_name_1, cmp_name_2);
2430Sstevel@tonic-gate 
2440Sstevel@tonic-gate 			sdssc_convert_path_free(cmp_name_1);
2450Sstevel@tonic-gate 			sdssc_convert_path_free(cmp_name_2);
2460Sstevel@tonic-gate 
2470Sstevel@tonic-gate 			if (not_found)
2486698Ssk102515 				continue;
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate 			/*
2510Sstevel@tonic-gate 			 * Found drive, must be deleting.
2520Sstevel@tonic-gate 			 */
2530Sstevel@tonic-gate 			(*dpp)->drv_op = DRV_DEL;
2540Sstevel@tonic-gate 			if (indiskset)
2550Sstevel@tonic-gate 				(*dpp)->drv_flags |= DRV_F_INDISKSET;
2560Sstevel@tonic-gate 			if (errored) {
2570Sstevel@tonic-gate 				mdclrerror(ep);
2580Sstevel@tonic-gate 				(*dpp)->drv_flags |= DRV_F_ERROR;
2590Sstevel@tonic-gate 			}
2600Sstevel@tonic-gate 			(*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt;
2610Sstevel@tonic-gate 			(*clpp)->ctl_drcnt--;
2620Sstevel@tonic-gate 			return (0);
2630Sstevel@tonic-gate 		}
2640Sstevel@tonic-gate 		/*
2650Sstevel@tonic-gate 		 * The ctlr was found, but not the drive, so add
2660Sstevel@tonic-gate 		 * the drive
2670Sstevel@tonic-gate 		 */
2680Sstevel@tonic-gate 		(*dpp) = Zalloc(sizeof (**dpp));
2690Sstevel@tonic-gate 
2700Sstevel@tonic-gate 
2710Sstevel@tonic-gate 		if (indiskset) {
2720Sstevel@tonic-gate 			(*dpp)->drv_op = DRV_NOP;
2730Sstevel@tonic-gate 			(*dpp)->drv_flags |= DRV_F_INDISKSET;
2740Sstevel@tonic-gate 			if (errored) {
2750Sstevel@tonic-gate 				mdclrerror(ep);
2760Sstevel@tonic-gate 				(*dpp)->drv_flags |= DRV_F_ERROR;
2770Sstevel@tonic-gate 			}
2780Sstevel@tonic-gate 		} else {
2790Sstevel@tonic-gate 			(*dpp)->drv_op = DRV_ADD;
2800Sstevel@tonic-gate 			if (errored) {
2810Sstevel@tonic-gate 				(*dpp)->drv_flags |= DRV_F_ERROR;
2820Sstevel@tonic-gate 				return (-1);
2830Sstevel@tonic-gate 			}
2840Sstevel@tonic-gate 			assert(dbsize != 0);
2850Sstevel@tonic-gate 		}
2860Sstevel@tonic-gate 		(*dpp)->drv_dbcnt = dbcnt;
2870Sstevel@tonic-gate 		(*dpp)->drv_dbsize = dbsize;
2880Sstevel@tonic-gate 		(*dpp)->drv_dnp = dnp;
2890Sstevel@tonic-gate 		(*clpp)->ctl_dbcnt += dbcnt;
2900Sstevel@tonic-gate 		(*clpp)->ctl_drcnt++;
2910Sstevel@tonic-gate 		return (0);
2920Sstevel@tonic-gate 	}
2930Sstevel@tonic-gate 	/*
2940Sstevel@tonic-gate 	 * No ctlr was located, so add the ctlr, then recurse to add the
2950Sstevel@tonic-gate 	 * drive to the ctlr.
2960Sstevel@tonic-gate 	 */
2970Sstevel@tonic-gate 	(*clpp) = Zalloc(sizeof (**clpp));
2980Sstevel@tonic-gate 
2990Sstevel@tonic-gate 	(*clpp)->ctl_cinfop = tcinfop;
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate 	return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop,
3020Sstevel@tonic-gate 	    indiskset, with_bus, errored, ep));
3030Sstevel@tonic-gate }
3040Sstevel@tonic-gate 
3050Sstevel@tonic-gate static int
add_replica_to_ctl(mdsetname_t * sp,md_ctlr_ctl_t * c,int minimum_replicas,md_error_t * ep)3060Sstevel@tonic-gate add_replica_to_ctl(
3070Sstevel@tonic-gate 	mdsetname_t		*sp,
3080Sstevel@tonic-gate 	md_ctlr_ctl_t		*c,
3090Sstevel@tonic-gate 	int			minimum_replicas,
3100Sstevel@tonic-gate 	md_error_t		*ep
3110Sstevel@tonic-gate )
3120Sstevel@tonic-gate {
3130Sstevel@tonic-gate 	md_ctlr_drv_t		*d;
3140Sstevel@tonic-gate 	int			maxdb = 0;
3150Sstevel@tonic-gate 
3160Sstevel@tonic-gate 	/*
3170Sstevel@tonic-gate 	 * If this ctrl has no "usable" drives, assert() or just return if
3180Sstevel@tonic-gate 	 * assert()'s are turned off.
3190Sstevel@tonic-gate 	 */
3200Sstevel@tonic-gate 	if (c->ctl_drcnt == 0) {
3210Sstevel@tonic-gate 		assert(0);
3220Sstevel@tonic-gate 		return (0);
3230Sstevel@tonic-gate 	}
3240Sstevel@tonic-gate 
3250Sstevel@tonic-gate 	/*
3260Sstevel@tonic-gate 	 * Determine the largest DB count on a drive.
3270Sstevel@tonic-gate 	 */
3280Sstevel@tonic-gate 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
3290Sstevel@tonic-gate 		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
3300Sstevel@tonic-gate 			maxdb = d->drv_dbcnt;
3310Sstevel@tonic-gate 
3320Sstevel@tonic-gate 	/*
3330Sstevel@tonic-gate 	 * Make sure we start at a reasonable number
3340Sstevel@tonic-gate 	 */
3350Sstevel@tonic-gate 	if (maxdb == 0)
3360Sstevel@tonic-gate 		maxdb = 1;
3370Sstevel@tonic-gate 
3380Sstevel@tonic-gate 	/*
3390Sstevel@tonic-gate 	 * Add a replica to a drive on this ctrl.
3400Sstevel@tonic-gate 	 */
3410Sstevel@tonic-gate 	/*CONSTCOND*/
3420Sstevel@tonic-gate 	while (1) {
3430Sstevel@tonic-gate 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
3440Sstevel@tonic-gate 			/*
3450Sstevel@tonic-gate 			 * If this drive is being deleted, skip it.
3460Sstevel@tonic-gate 			 */
3470Sstevel@tonic-gate 			if (d->drv_op == DRV_DEL)
3480Sstevel@tonic-gate 				continue;
3490Sstevel@tonic-gate 
3500Sstevel@tonic-gate 			if (d->drv_flags & DRV_F_ERROR)
3510Sstevel@tonic-gate 				continue;
3520Sstevel@tonic-gate 			/*
3530Sstevel@tonic-gate 			 * Make sure that the replicas are distributed across
3540Sstevel@tonic-gate 			 * the drives.
3550Sstevel@tonic-gate 			 */
3560Sstevel@tonic-gate 			if (d->drv_dbcnt >= maxdb)
3570Sstevel@tonic-gate 				continue;
3580Sstevel@tonic-gate 			/*
3590Sstevel@tonic-gate 			 * See if the drive already has replicas,
3600Sstevel@tonic-gate 			 * if it does, then delete the exisiting
3610Sstevel@tonic-gate 			 * replica(s) and re-add n+1 replicas to the drive.
3620Sstevel@tonic-gate 			 */
3630Sstevel@tonic-gate 			/* ==== Vulnerability - no DB's start ==== */
3640Sstevel@tonic-gate 			if (d->drv_dbcnt > 0) {
3650Sstevel@tonic-gate 				if (del_replica(sp, d->drv_dnp, ep) == -1) {
3660Sstevel@tonic-gate 					d->drv_flags |= DRV_F_ERROR;
3670Sstevel@tonic-gate 					if (! (d->drv_flags & DRV_F_INDISKSET))
3680Sstevel@tonic-gate 						return (-1);
3690Sstevel@tonic-gate 					mdclrerror(ep);
3700Sstevel@tonic-gate 					continue;
3710Sstevel@tonic-gate 				}
3720Sstevel@tonic-gate 			}
3730Sstevel@tonic-gate 			if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1),
3740Sstevel@tonic-gate 			    d->drv_dbsize, ep) == -1) {
375*7779SPeter.Dennis@Sun.COM 				md_error_t nep = mdnullerror;
3766698Ssk102515 
3770Sstevel@tonic-gate 				if (d->drv_dbcnt) {
3786698Ssk102515 					/*
3796698Ssk102515 					 * We have to to bring the replica
3806698Ssk102515 					 * in the drive to the previous
3816698Ssk102515 					 * status by adding the original no
3826698Ssk102515 					 * of replicas to the drive since
3836698Ssk102515 					 * the addition of (drv_dbcnt+1) no
3846698Ssk102515 					 * of replicas has failed. If we
3856698Ssk102515 					 * leave it at this state, we might
3866698Ssk102515 					 * end up having no replicas at
3876698Ssk102515 					 * all for the diskset.
3886698Ssk102515 					 */
3896698Ssk102515 					if (add_replica(sp, d->drv_dnp,
3906698Ssk102515 					    d->drv_dbcnt, d->drv_dbsize,
3916698Ssk102515 					    &nep) == -1) {
3926698Ssk102515 						c->ctl_dbcnt -= d->drv_dbcnt;
3936698Ssk102515 						d->drv_dbcnt = 0;
394*7779SPeter.Dennis@Sun.COM 						mdclrerror(&nep);
3956698Ssk102515 					}
3960Sstevel@tonic-gate 				}
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate 				if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
3990Sstevel@tonic-gate 					return (-1);
4000Sstevel@tonic-gate 
4010Sstevel@tonic-gate 				if (mdismddberror(ep, MDE_REPLICA_TOOSMALL))
4026698Ssk102515 					continue;
4030Sstevel@tonic-gate 
4040Sstevel@tonic-gate 				d->drv_flags |= DRV_F_ERROR;
4050Sstevel@tonic-gate 				if (! (d->drv_flags & DRV_F_INDISKSET))
4060Sstevel@tonic-gate 					return (-1);
4070Sstevel@tonic-gate 				mdclrerror(ep);
4080Sstevel@tonic-gate 				continue;
4090Sstevel@tonic-gate 			}
4100Sstevel@tonic-gate 
4110Sstevel@tonic-gate 			d->drv_dbcnt++;
4120Sstevel@tonic-gate 			c->ctl_dbcnt++;
4130Sstevel@tonic-gate 			/* ==== Vulnerability - no DB's end ==== */
4140Sstevel@tonic-gate 			return (1);
4150Sstevel@tonic-gate 		}
4160Sstevel@tonic-gate 		maxdb++;
4170Sstevel@tonic-gate 		if (maxdb > minimum_replicas)
4180Sstevel@tonic-gate 			return (0);
4190Sstevel@tonic-gate 	}
4200Sstevel@tonic-gate 	/*NOTREACHED*/
4210Sstevel@tonic-gate }
4220Sstevel@tonic-gate 
4230Sstevel@tonic-gate static int
del_replica_from_ctl(mdsetname_t * sp,md_ctlr_ctl_t * c,md_error_t * ep)4240Sstevel@tonic-gate del_replica_from_ctl(
4250Sstevel@tonic-gate 	mdsetname_t		*sp,
4260Sstevel@tonic-gate 	md_ctlr_ctl_t		*c,
4270Sstevel@tonic-gate 	md_error_t		*ep
4280Sstevel@tonic-gate )
4290Sstevel@tonic-gate {
4300Sstevel@tonic-gate 	md_ctlr_drv_t		*d;
4310Sstevel@tonic-gate 	int			maxdb = 0;
4320Sstevel@tonic-gate 
4330Sstevel@tonic-gate 	/*
4340Sstevel@tonic-gate 	 * If this ctrl has no "usable" drives, assert() or just return if
4350Sstevel@tonic-gate 	 * assert()'s are turned off.
4360Sstevel@tonic-gate 	 */
4370Sstevel@tonic-gate 	if (c->ctl_drcnt == 0) {
4380Sstevel@tonic-gate 		assert(0);
4390Sstevel@tonic-gate 		return (0);
4400Sstevel@tonic-gate 	}
4410Sstevel@tonic-gate 
4420Sstevel@tonic-gate 	/*
4430Sstevel@tonic-gate 	 * Determine the largest DB count on a drive.
4440Sstevel@tonic-gate 	 */
4450Sstevel@tonic-gate 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
4460Sstevel@tonic-gate 		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
4470Sstevel@tonic-gate 			maxdb = d->drv_dbcnt;
4480Sstevel@tonic-gate 
4490Sstevel@tonic-gate 	if (maxdb == 0)
4500Sstevel@tonic-gate 		return (0);
4510Sstevel@tonic-gate 
4520Sstevel@tonic-gate 	/*
4530Sstevel@tonic-gate 	 * Delete a replica from a drive on this ctrl.
4540Sstevel@tonic-gate 	 */
4550Sstevel@tonic-gate 	/*CONSTCOND*/
4560Sstevel@tonic-gate 	while (1) {
4570Sstevel@tonic-gate 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
4580Sstevel@tonic-gate 			/*
4590Sstevel@tonic-gate 			 * If this drive is being deleted, skip it.
4600Sstevel@tonic-gate 			 */
4610Sstevel@tonic-gate 			if (d->drv_op == DRV_DEL)
4620Sstevel@tonic-gate 				continue;
4630Sstevel@tonic-gate 
4640Sstevel@tonic-gate 			/*
4650Sstevel@tonic-gate 			 * Make sure that there are replicas on this drive to
4660Sstevel@tonic-gate 			 * delete.
4670Sstevel@tonic-gate 			 */
4680Sstevel@tonic-gate 			if (d->drv_dbcnt == 0)
4690Sstevel@tonic-gate 				continue;
4700Sstevel@tonic-gate 
4710Sstevel@tonic-gate 			if (d->drv_flags & DRV_F_ERROR)
4720Sstevel@tonic-gate 				continue;
4730Sstevel@tonic-gate 
4740Sstevel@tonic-gate 			/*
4750Sstevel@tonic-gate 			 * We need to keep the DB's distributed across the
4760Sstevel@tonic-gate 			 * drives.
4770Sstevel@tonic-gate 			 */
4780Sstevel@tonic-gate 			if (d->drv_dbcnt < maxdb)
4790Sstevel@tonic-gate 				continue;
4800Sstevel@tonic-gate 
4810Sstevel@tonic-gate 			/*
4820Sstevel@tonic-gate 			 * Delete all the replicas on the drive.
4830Sstevel@tonic-gate 			 */
4840Sstevel@tonic-gate 			/* ==== Vulnerability - no DB's start ==== */
4850Sstevel@tonic-gate 			if (del_replica(sp, d->drv_dnp, ep) == -1) {
4860Sstevel@tonic-gate 				d->drv_flags |= DRV_F_ERROR;
4870Sstevel@tonic-gate 				if (! (d->drv_flags & DRV_F_INDISKSET))
4880Sstevel@tonic-gate 					return (-1);
4890Sstevel@tonic-gate 				mdclrerror(ep);
4900Sstevel@tonic-gate 				continue;
4910Sstevel@tonic-gate 			}
4920Sstevel@tonic-gate 			d->drv_dbcnt--;
4930Sstevel@tonic-gate 			c->ctl_dbcnt--;
4940Sstevel@tonic-gate 			/*
4950Sstevel@tonic-gate 			 * If there is still a dbcnt for this drive, then add
4960Sstevel@tonic-gate 			 * back the needed DB's.
4970Sstevel@tonic-gate 			 */
4980Sstevel@tonic-gate 			if (d->drv_dbcnt > 0) {
4990Sstevel@tonic-gate 				if (add_replica(sp, d->drv_dnp, d->drv_dbcnt,
5000Sstevel@tonic-gate 				    d->drv_dbsize, ep) == -1) {
5010Sstevel@tonic-gate 					c->ctl_dbcnt -= d->drv_dbcnt;
5020Sstevel@tonic-gate 					d->drv_dbcnt = 0;
5030Sstevel@tonic-gate 
5040Sstevel@tonic-gate 					if (mdismddberror(ep,
5050Sstevel@tonic-gate 					    MDE_TOOMANY_REPLICAS))
5060Sstevel@tonic-gate 						return (-1);
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 					d->drv_flags |= DRV_F_ERROR;
5090Sstevel@tonic-gate 					if (! (d->drv_flags & DRV_F_INDISKSET))
5100Sstevel@tonic-gate 						return (-1);
5110Sstevel@tonic-gate 					mdclrerror(ep);
5120Sstevel@tonic-gate 					continue;
5130Sstevel@tonic-gate 				}
5140Sstevel@tonic-gate 			}
5150Sstevel@tonic-gate 			/* ==== Vulnerability - no DB's end ==== */
5160Sstevel@tonic-gate 			return (1);
5170Sstevel@tonic-gate 		}
5180Sstevel@tonic-gate 		maxdb--;
5190Sstevel@tonic-gate 		if (maxdb <= 0)
5200Sstevel@tonic-gate 			return (0);
5210Sstevel@tonic-gate 	}
5220Sstevel@tonic-gate 	/*NOTREACHED*/
5230Sstevel@tonic-gate }
5240Sstevel@tonic-gate 
5250Sstevel@tonic-gate static int
del_replicas(mdsetname_t * sp,md_ctlr_ctl_t * clp,md_error_t * ep)5260Sstevel@tonic-gate del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep)
5270Sstevel@tonic-gate {
5280Sstevel@tonic-gate 	md_ctlr_ctl_t		*c;
5290Sstevel@tonic-gate 	md_ctlr_drv_t		*d;
5300Sstevel@tonic-gate 	mdnamelist_t		*nlp;
5310Sstevel@tonic-gate 	mdname_t		*np;
5320Sstevel@tonic-gate 
5330Sstevel@tonic-gate 	for (c = clp; c != NULL; c = c->ctl_next) {
5340Sstevel@tonic-gate 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
5350Sstevel@tonic-gate 			uint_t	rep_slice;
5360Sstevel@tonic-gate 
5370Sstevel@tonic-gate 			if (! (d->drv_flags & DRV_F_ERROR) &&
5380Sstevel@tonic-gate 			    (d->drv_op != DRV_DEL))
5390Sstevel@tonic-gate 				continue;
5400Sstevel@tonic-gate 
5410Sstevel@tonic-gate 			if (d->drv_dbcnt == 0)
5420Sstevel@tonic-gate 				continue;
5430Sstevel@tonic-gate 
5440Sstevel@tonic-gate 			if (meta_replicaslice(d->drv_dnp,
5450Sstevel@tonic-gate 			    &rep_slice, ep) != 0)
5460Sstevel@tonic-gate 				return (-1);
5470Sstevel@tonic-gate 
5480Sstevel@tonic-gate 			np = metaslicename(d->drv_dnp, rep_slice, ep);
5490Sstevel@tonic-gate 			if (np == NULL)
5500Sstevel@tonic-gate 				return (-1);
5510Sstevel@tonic-gate 
5520Sstevel@tonic-gate 			nlp = NULL;
5530Sstevel@tonic-gate 			(void) metanamelist_append(&nlp, np);
5540Sstevel@tonic-gate 
5550Sstevel@tonic-gate 			/*
5560Sstevel@tonic-gate 			 * Delete the replicas listed.
5570Sstevel@tonic-gate 			 */
5580Sstevel@tonic-gate 			if (meta_db_detach(sp, nlp,
5590Sstevel@tonic-gate 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
5600Sstevel@tonic-gate 			    ep) == -1) {
5610Sstevel@tonic-gate 				metafreenamelist(nlp);
5620Sstevel@tonic-gate 				if (d->drv_flags & DRV_F_INDISKSET) {
5630Sstevel@tonic-gate 					mdclrerror(ep);
5640Sstevel@tonic-gate 					continue;
5650Sstevel@tonic-gate 				}
5660Sstevel@tonic-gate 				return (-1);
5670Sstevel@tonic-gate 			}
5680Sstevel@tonic-gate 			metafreenamelist(nlp);
5690Sstevel@tonic-gate 		}
5700Sstevel@tonic-gate 	}
5710Sstevel@tonic-gate 
5720Sstevel@tonic-gate 	return (0);
5730Sstevel@tonic-gate }
5740Sstevel@tonic-gate 
5750Sstevel@tonic-gate static void
free_ctlr_lst(md_ctlr_ctl_t ** clpp)5760Sstevel@tonic-gate free_ctlr_lst(md_ctlr_ctl_t **clpp)
5770Sstevel@tonic-gate {
5780Sstevel@tonic-gate 	md_ctlr_ctl_t		*c, *tc = NULL;
5790Sstevel@tonic-gate 	md_ctlr_drv_t		*d, *td = NULL;
5800Sstevel@tonic-gate 
5810Sstevel@tonic-gate 	for (c = *clpp; c != NULL; c = tc) {
5820Sstevel@tonic-gate 		tc = c->ctl_next;
5830Sstevel@tonic-gate 		for (d = c->ctl_drvs; d != NULL; d = td) {
5840Sstevel@tonic-gate 			td = d->drv_next;
5850Sstevel@tonic-gate 			Free(d);
5860Sstevel@tonic-gate 		}
5870Sstevel@tonic-gate 		Free(c);
5880Sstevel@tonic-gate 	}
5890Sstevel@tonic-gate 	*clpp = NULL;
5900Sstevel@tonic-gate }
5910Sstevel@tonic-gate 
5920Sstevel@tonic-gate static int
build_ctlr_lst(mdsetname_t * sp,md_ctlr_ctl_t ** clpp,md_drive_desc * opdd,md_drive_desc * curdd,int with_bus,daddr_t dbsize,md_error_t * ep)5930Sstevel@tonic-gate build_ctlr_lst(
5940Sstevel@tonic-gate 	mdsetname_t		*sp,
5950Sstevel@tonic-gate 	md_ctlr_ctl_t		**clpp,
5960Sstevel@tonic-gate 	md_drive_desc		*opdd,
5970Sstevel@tonic-gate 	md_drive_desc		*curdd,
5980Sstevel@tonic-gate 	int			with_bus,
5990Sstevel@tonic-gate 	daddr_t			dbsize,
6000Sstevel@tonic-gate 	md_error_t		*ep
6010Sstevel@tonic-gate )
6020Sstevel@tonic-gate {
6030Sstevel@tonic-gate 	md_drive_desc			*d;
6040Sstevel@tonic-gate 	md_set_desc			*sd;
6050Sstevel@tonic-gate 	daddr_t				nblks;
6060Sstevel@tonic-gate 	md_replicalist_t		*rlp = NULL;
6070Sstevel@tonic-gate 	static	daddr_t			min_dbsize = 0;
6080Sstevel@tonic-gate 
6090Sstevel@tonic-gate 	if (min_dbsize == 0) {
6100Sstevel@tonic-gate 		if ((nblks = meta_db_minreplica(sp, ep)) < 0) {
6110Sstevel@tonic-gate 			min_dbsize = MD_DBSIZE;
6120Sstevel@tonic-gate 
6130Sstevel@tonic-gate 			if (! metaislocalset(sp)) {
6140Sstevel@tonic-gate 				if ((sd = metaget_setdesc(sp, ep)) == NULL)
6150Sstevel@tonic-gate 					return (-1);
6160Sstevel@tonic-gate 
6170Sstevel@tonic-gate 				if (MD_MNSET_DESC(sd))
6180Sstevel@tonic-gate 					min_dbsize = MD_MN_DBSIZE;
6190Sstevel@tonic-gate 			}
6200Sstevel@tonic-gate 			mdclrerror(ep);
6210Sstevel@tonic-gate 		} else
6220Sstevel@tonic-gate 			min_dbsize = nblks;
6230Sstevel@tonic-gate 	}
6240Sstevel@tonic-gate 
6250Sstevel@tonic-gate 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
6260Sstevel@tonic-gate 		if (! mdismddberror(ep, MDE_DB_NODB) &&
6270Sstevel@tonic-gate 		    ! mdismddberror(ep, MDE_DB_NOTOWNER))
6280Sstevel@tonic-gate 			return (-1);
6290Sstevel@tonic-gate 		mdclrerror(ep);
6300Sstevel@tonic-gate 	}
6310Sstevel@tonic-gate 
6320Sstevel@tonic-gate 	/*
6330Sstevel@tonic-gate 	 * Add drives currently in the set to the ctlr list.
6340Sstevel@tonic-gate 	 */
6350Sstevel@tonic-gate 	for (d = curdd; d != NULL; d = d->dd_next) {
6360Sstevel@tonic-gate 		daddr_t	this_dbsize = d->dd_dbsize;
6370Sstevel@tonic-gate 
6380Sstevel@tonic-gate 		if (this_dbsize == 0)
6390Sstevel@tonic-gate 			this_dbsize = min_dbsize;
6400Sstevel@tonic-gate 
6410Sstevel@tonic-gate 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt,
6420Sstevel@tonic-gate 		    this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1)
6430Sstevel@tonic-gate 			return (-1);
6440Sstevel@tonic-gate 	}
6450Sstevel@tonic-gate 
6460Sstevel@tonic-gate 	/*
6470Sstevel@tonic-gate 	 * Add the drives that are being operated on to the ctlr list.
6480Sstevel@tonic-gate 	 */
6490Sstevel@tonic-gate 	for (d = opdd; d != NULL; d = d->dd_next)
6500Sstevel@tonic-gate 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL,
6510Sstevel@tonic-gate 		    FALSE, with_bus, 0, ep) == -1)
6520Sstevel@tonic-gate 			return (-1);
6530Sstevel@tonic-gate 
6540Sstevel@tonic-gate 	metafreereplicalist(rlp);
6550Sstevel@tonic-gate 	return (0);
6560Sstevel@tonic-gate }
6570Sstevel@tonic-gate 
6580Sstevel@tonic-gate static int
count_replica_on_ctl(md_ctlr_ctl_t * c,int adding,int * db_cnt,int minimum_replicas)6590Sstevel@tonic-gate count_replica_on_ctl(
6600Sstevel@tonic-gate 	md_ctlr_ctl_t		*c,
6610Sstevel@tonic-gate 	int			adding,
6620Sstevel@tonic-gate 	int			*db_cnt,
6630Sstevel@tonic-gate 	int			minimum_replicas
6640Sstevel@tonic-gate )
6650Sstevel@tonic-gate {
6660Sstevel@tonic-gate 	md_ctlr_drv_t		*d;
6670Sstevel@tonic-gate 	int			maxdb = 0;
6680Sstevel@tonic-gate 
6690Sstevel@tonic-gate 	/*
6700Sstevel@tonic-gate 	 * If this ctrl has no "usable" drives, nothing to do.
6710Sstevel@tonic-gate 	 */
6720Sstevel@tonic-gate 	if (c->ctl_drcnt == 0)
6730Sstevel@tonic-gate 		return (0);
6740Sstevel@tonic-gate 
6750Sstevel@tonic-gate 	/*
6760Sstevel@tonic-gate 	 * Determine the largest DB count on a drive.
6770Sstevel@tonic-gate 	 */
6780Sstevel@tonic-gate 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
6790Sstevel@tonic-gate 		if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL)
6800Sstevel@tonic-gate 			maxdb = d->drv_new_dbcnt;
6810Sstevel@tonic-gate 
6820Sstevel@tonic-gate 	/*
6830Sstevel@tonic-gate 	 * Make sure we start at a reasonable number
6840Sstevel@tonic-gate 	 */
6850Sstevel@tonic-gate 	if (maxdb == 0) {
6860Sstevel@tonic-gate 		if (!adding)
6870Sstevel@tonic-gate 			return (0);
6880Sstevel@tonic-gate 		maxdb = 1;
6890Sstevel@tonic-gate 	}
6900Sstevel@tonic-gate 
6910Sstevel@tonic-gate 	/*
6920Sstevel@tonic-gate 	 * Count or Un-Count replicas that would be
6930Sstevel@tonic-gate 	 * added or deleted respectively.
6940Sstevel@tonic-gate 	 */
6950Sstevel@tonic-gate 	/*CONSTCOND*/
6960Sstevel@tonic-gate 	while (1) {
6970Sstevel@tonic-gate 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
6980Sstevel@tonic-gate 			/*
6990Sstevel@tonic-gate 			 * If this drive is being deleted, skip it.
7000Sstevel@tonic-gate 			 */
7010Sstevel@tonic-gate 			if (d->drv_op == DRV_DEL)
7020Sstevel@tonic-gate 				continue;
7030Sstevel@tonic-gate 
7040Sstevel@tonic-gate 			/*
7050Sstevel@tonic-gate 			 * If the drive is errored and adding, skip it.
7060Sstevel@tonic-gate 			 */
7070Sstevel@tonic-gate 			if (adding && (d->drv_flags & DRV_F_ERROR))
7080Sstevel@tonic-gate 				continue;
7090Sstevel@tonic-gate 
7100Sstevel@tonic-gate 			/*
7110Sstevel@tonic-gate 			 * Make sure that the replicas are distributed across
7120Sstevel@tonic-gate 			 * the drives.
7130Sstevel@tonic-gate 			 */
7140Sstevel@tonic-gate 			if (adding) {
7150Sstevel@tonic-gate 				if (d->drv_new_dbcnt >= maxdb)
7160Sstevel@tonic-gate 					continue;
7170Sstevel@tonic-gate 			} else {
7180Sstevel@tonic-gate 				if (d->drv_new_dbcnt == 0)
7190Sstevel@tonic-gate 					continue;
7200Sstevel@tonic-gate 				if (d->drv_new_dbcnt < maxdb)
7210Sstevel@tonic-gate 					continue;
7220Sstevel@tonic-gate 			}
7230Sstevel@tonic-gate 
7240Sstevel@tonic-gate 			/*
7250Sstevel@tonic-gate 			 * Count or Un-Count replicas here.
7260Sstevel@tonic-gate 			 */
7270Sstevel@tonic-gate 			if (adding) {
7280Sstevel@tonic-gate 				mdpart_t	*partp;
7290Sstevel@tonic-gate 				uint_t		rep_slice;
730*7779SPeter.Dennis@Sun.COM 				md_error_t	mde = mdnullerror;
7310Sstevel@tonic-gate 
7320Sstevel@tonic-gate 				if (meta_replicaslice(d->drv_dnp,
733*7779SPeter.Dennis@Sun.COM 				    &rep_slice, &mde) != 0) {
734*7779SPeter.Dennis@Sun.COM 					mdclrerror(&mde);
7350Sstevel@tonic-gate 					continue;
736*7779SPeter.Dennis@Sun.COM 				}
7370Sstevel@tonic-gate 
7380Sstevel@tonic-gate 				partp = &d->drv_dnp->vtoc.parts[rep_slice];
7390Sstevel@tonic-gate 				if (! partp)
7400Sstevel@tonic-gate 					continue;
7410Sstevel@tonic-gate 
7420Sstevel@tonic-gate 				if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) >
7430Sstevel@tonic-gate 				    (partp->size - 16))
7440Sstevel@tonic-gate 					continue;
7450Sstevel@tonic-gate 				(*db_cnt)++;
7460Sstevel@tonic-gate 				d->drv_new_dbcnt++;
7470Sstevel@tonic-gate 			} else {
7480Sstevel@tonic-gate 				(*db_cnt)--;
7490Sstevel@tonic-gate 				d->drv_new_dbcnt--;
7500Sstevel@tonic-gate 			}
7510Sstevel@tonic-gate 			return (0);
7520Sstevel@tonic-gate 		}
7530Sstevel@tonic-gate 
7540Sstevel@tonic-gate 		/*
7550Sstevel@tonic-gate 		 * This should make sure they get spread
7560Sstevel@tonic-gate 		 * around.  This is to emulate the {add,del}_replica
7570Sstevel@tonic-gate 		 * routines.
7580Sstevel@tonic-gate 		 */
7590Sstevel@tonic-gate 		if (adding) {
7600Sstevel@tonic-gate 			maxdb++;
7610Sstevel@tonic-gate 			if (maxdb > minimum_replicas)
7620Sstevel@tonic-gate 				return (-1);
7630Sstevel@tonic-gate 		} else {
7640Sstevel@tonic-gate 			maxdb--;
7650Sstevel@tonic-gate 			if (maxdb <= 0)
7660Sstevel@tonic-gate 				return (-1);
7670Sstevel@tonic-gate 		}
7680Sstevel@tonic-gate 	}
7690Sstevel@tonic-gate 	/*NOTREACHED*/
7700Sstevel@tonic-gate }
7710Sstevel@tonic-gate 
7720Sstevel@tonic-gate static int
count_replicas(md_ctlr_ctl_t * clp,int min_reps)7730Sstevel@tonic-gate count_replicas(
7740Sstevel@tonic-gate 	md_ctlr_ctl_t		*clp,
7750Sstevel@tonic-gate 	int			min_reps
7760Sstevel@tonic-gate )
7770Sstevel@tonic-gate {
7780Sstevel@tonic-gate 	md_ctlr_ctl_t		*c;
7790Sstevel@tonic-gate 	md_ctlr_drv_t		*d;
7800Sstevel@tonic-gate 	int			db_cnt;
7810Sstevel@tonic-gate 	int			uctlrs = 0;
7820Sstevel@tonic-gate 	int			total_cnt = 0;
7830Sstevel@tonic-gate 
7840Sstevel@tonic-gate 	/*
7850Sstevel@tonic-gate 	 * Count the number of controllers,
7860Sstevel@tonic-gate 	 * counting the replicas is slightly different based
7870Sstevel@tonic-gate 	 * on the controller count.
7880Sstevel@tonic-gate 	 */
7890Sstevel@tonic-gate 	for (c = clp; c != NULL; c = c->ctl_next)
7900Sstevel@tonic-gate 		if (c->ctl_drcnt > 0) {
7910Sstevel@tonic-gate 			uctlrs++;
7920Sstevel@tonic-gate 			for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
7930Sstevel@tonic-gate 				d->drv_new_dbcnt = d->drv_dbcnt;
7940Sstevel@tonic-gate 		}
7950Sstevel@tonic-gate 
7960Sstevel@tonic-gate 	if (uctlrs > 2) {
7970Sstevel@tonic-gate 		for (c = clp; c != NULL; c = c->ctl_next) {
7980Sstevel@tonic-gate 			if (c->ctl_drcnt == 0)
7990Sstevel@tonic-gate 				continue;
8000Sstevel@tonic-gate 
8010Sstevel@tonic-gate 			db_cnt = c->ctl_dbcnt;
8020Sstevel@tonic-gate 			/*
8030Sstevel@tonic-gate 			 * Count the replicas that would be added.
8040Sstevel@tonic-gate 			 */
8050Sstevel@tonic-gate 			while (db_cnt < min_reps)
8060Sstevel@tonic-gate 				if (count_replica_on_ctl(c, TRUE,
8070Sstevel@tonic-gate 				    &db_cnt, min_reps))
8080Sstevel@tonic-gate 					return (-1);
8090Sstevel@tonic-gate 
8100Sstevel@tonic-gate 			/*
8110Sstevel@tonic-gate 			 * Un-Count the replicas that would be deleted.
8120Sstevel@tonic-gate 			 */
8130Sstevel@tonic-gate 			while (db_cnt > min_reps)
8140Sstevel@tonic-gate 				if (count_replica_on_ctl(c, FALSE,
8150Sstevel@tonic-gate 				    &db_cnt, min_reps))
8160Sstevel@tonic-gate 					return (-1);
8170Sstevel@tonic-gate 			total_cnt += db_cnt;
8180Sstevel@tonic-gate 		}
8190Sstevel@tonic-gate 	} else {
8200Sstevel@tonic-gate 		for (c = clp; c != NULL; c = c->ctl_next) {
8210Sstevel@tonic-gate 			if (c->ctl_drcnt == 0)
8220Sstevel@tonic-gate 				continue;
8230Sstevel@tonic-gate 
8240Sstevel@tonic-gate 			db_cnt = c->ctl_dbcnt;
8250Sstevel@tonic-gate 			/*
8260Sstevel@tonic-gate 			 * Count the replicas that woud be added.
8270Sstevel@tonic-gate 			 */
8280Sstevel@tonic-gate 			while (db_cnt < (min_reps * c->ctl_drcnt))
8290Sstevel@tonic-gate 				if (count_replica_on_ctl(c, TRUE,
8300Sstevel@tonic-gate 				    &db_cnt, min_reps))
8310Sstevel@tonic-gate 					return (-1);
8320Sstevel@tonic-gate 
8330Sstevel@tonic-gate 			total_cnt += db_cnt;
8340Sstevel@tonic-gate 		}
8350Sstevel@tonic-gate 	}
8360Sstevel@tonic-gate 
8370Sstevel@tonic-gate 	return (total_cnt);
8380Sstevel@tonic-gate }
8390Sstevel@tonic-gate 
8400Sstevel@tonic-gate static int
balance_replicas(mdsetname_t * sp,md_ctlr_ctl_t ** clpp,md_drive_desc * opdd,md_drive_desc * curdd,daddr_t dbsize,int * minimum_replicas,md_error_t * ep)8410Sstevel@tonic-gate balance_replicas(
8420Sstevel@tonic-gate 	mdsetname_t		*sp,
8430Sstevel@tonic-gate 	md_ctlr_ctl_t		**clpp,
8440Sstevel@tonic-gate 	md_drive_desc		*opdd,
8450Sstevel@tonic-gate 	md_drive_desc		*curdd,
8460Sstevel@tonic-gate 	daddr_t			dbsize,
8470Sstevel@tonic-gate 	int			*minimum_replicas,
8480Sstevel@tonic-gate 	md_error_t		*ep
8490Sstevel@tonic-gate )
8500Sstevel@tonic-gate {
8510Sstevel@tonic-gate 	int			n;
8520Sstevel@tonic-gate 	int			rctlrs = 0;
8530Sstevel@tonic-gate 	int			uctlrs;
8540Sstevel@tonic-gate 	int			ructlrs;
8550Sstevel@tonic-gate 	int			octlrs;
8560Sstevel@tonic-gate 	int			save_done;
8570Sstevel@tonic-gate 	int			prevcnt = 0, issame = 1;
8580Sstevel@tonic-gate 	uint_t			drvcnt = ~0U;
8590Sstevel@tonic-gate 	uint_t			save_cnum;
8600Sstevel@tonic-gate 	mhd_ctlrtype_t		save_ctype;
8616698Ssk102515 	char			save_cname[16];
8626698Ssk102515 	char			*cmp_name_1, *cmp_name_2;
8630Sstevel@tonic-gate 	int			reps;
8640Sstevel@tonic-gate 	md_ctlr_ctl_t		*c;
8650Sstevel@tonic-gate 
8660Sstevel@tonic-gate 	/*
8670Sstevel@tonic-gate 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
8680Sstevel@tonic-gate 	 */
8690Sstevel@tonic-gate 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
8700Sstevel@tonic-gate 		return (-1);
8710Sstevel@tonic-gate 
8720Sstevel@tonic-gate 	/*
8730Sstevel@tonic-gate 	 * Determine what controllers are usable in the sense of being able to
8740Sstevel@tonic-gate 	 * add a replica to a drive on the controller.
8750Sstevel@tonic-gate 	 * Also find the minimum number of drives on a controller.
8760Sstevel@tonic-gate 	 */
8770Sstevel@tonic-gate 	for (c = *clpp; c != NULL; c = c->ctl_next) {
8780Sstevel@tonic-gate 		if (c->ctl_drcnt > 0) {
8790Sstevel@tonic-gate 			rctlrs++;
8800Sstevel@tonic-gate 			drvcnt = min(drvcnt, c->ctl_drcnt);
8810Sstevel@tonic-gate 			if (prevcnt == 0)
8820Sstevel@tonic-gate 				prevcnt = c->ctl_drcnt;
8830Sstevel@tonic-gate 			else if (prevcnt != c->ctl_drcnt)
8840Sstevel@tonic-gate 				issame = 0;
8850Sstevel@tonic-gate 		}
8860Sstevel@tonic-gate 	}
8870Sstevel@tonic-gate 
8880Sstevel@tonic-gate 	if ((rctlrs <= 2) || (issame && (drvcnt >= 30)))
8890Sstevel@tonic-gate 		goto cont;
8900Sstevel@tonic-gate 
8910Sstevel@tonic-gate 	/*
8920Sstevel@tonic-gate 	 * If here: Handling 3 or more controllers most
8930Sstevel@tonic-gate 	 *	    likely with non-symmetrical number of
8940Sstevel@tonic-gate 	 *	    disks. The number of replicas will be
8950Sstevel@tonic-gate 	 *	    the minimum number of disks on a controller.
8960Sstevel@tonic-gate 	 *
8970Sstevel@tonic-gate 	 *	    The main point is to insure that a
8980Sstevel@tonic-gate 	 *	    controller does not have more than half
8990Sstevel@tonic-gate 	 *	    of the replicas.
9000Sstevel@tonic-gate 	 */
9010Sstevel@tonic-gate 	drvcnt = min(drvcnt, 12);
9020Sstevel@tonic-gate 	drvcnt = max(drvcnt, MD_MINBALREP);
9030Sstevel@tonic-gate 
9040Sstevel@tonic-gate 	/*
9050Sstevel@tonic-gate 	 * Can we find fewer than the maximum replicas by reducing the
9060Sstevel@tonic-gate 	 * number of replicas per drive.
9070Sstevel@tonic-gate 	 */
9080Sstevel@tonic-gate 	for (n = drvcnt; n > 0; n--) {
9090Sstevel@tonic-gate 		reps = count_replicas(*clpp, n);
9100Sstevel@tonic-gate 		if (reps > 0 && reps <= MDDB_NLB) {
9110Sstevel@tonic-gate 			*minimum_replicas = n;
9120Sstevel@tonic-gate 			return (0);
9130Sstevel@tonic-gate 		}
9140Sstevel@tonic-gate 	}
9150Sstevel@tonic-gate 
9160Sstevel@tonic-gate cont:
9170Sstevel@tonic-gate 	free_ctlr_lst(clpp);
9180Sstevel@tonic-gate 
9190Sstevel@tonic-gate 	/*
9200Sstevel@tonic-gate 	 * Build a ctlr list with SSA-100 busses as separate controllers.
9210Sstevel@tonic-gate 	 *
9220Sstevel@tonic-gate 	 * If Here: Try to put 2 replicas per controller/bus
9230Sstevel@tonic-gate 	 *	    If that doesn't work put 1 replica per controller/bus
9240Sstevel@tonic-gate 	 */
9250Sstevel@tonic-gate 	if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1)
9260Sstevel@tonic-gate 		return (-1);
9270Sstevel@tonic-gate 
9280Sstevel@tonic-gate 	/*
9290Sstevel@tonic-gate 	 * If the number of "real" controllers is 2, special handling may be
9300Sstevel@tonic-gate 	 * needed.
9310Sstevel@tonic-gate 	 */
9320Sstevel@tonic-gate 	if (rctlrs != 2) {
9330Sstevel@tonic-gate 		drvcnt = MD_MINBALREP;
9340Sstevel@tonic-gate 		goto other;
9350Sstevel@tonic-gate 	}
9360Sstevel@tonic-gate 
9370Sstevel@tonic-gate 	/*
9380Sstevel@tonic-gate 	 * Determine what controllers are usable in the sense of being able to
9390Sstevel@tonic-gate 	 * add a replica to a drive on the controller.
9400Sstevel@tonic-gate 	 * Also find the minimum number of drives on a controller.
9410Sstevel@tonic-gate 	 */
9420Sstevel@tonic-gate 	drvcnt = ~0U;
9430Sstevel@tonic-gate 	uctlrs = 0;
9440Sstevel@tonic-gate 	for (c = *clpp; c != NULL; c = c->ctl_next) {
9450Sstevel@tonic-gate 		if (c->ctl_drcnt > 0) {
9460Sstevel@tonic-gate 			uctlrs++;
9470Sstevel@tonic-gate 			drvcnt = min(drvcnt, c->ctl_drcnt);
9480Sstevel@tonic-gate 		}
9490Sstevel@tonic-gate 	}
9500Sstevel@tonic-gate 
9510Sstevel@tonic-gate 	/*
9520Sstevel@tonic-gate 	 * If the number of controllers is not changed, continue with original
9530Sstevel@tonic-gate 	 * strategy.
9540Sstevel@tonic-gate 	 */
9550Sstevel@tonic-gate 	if (uctlrs == rctlrs) {
9560Sstevel@tonic-gate 		drvcnt = MD_MINBALREP;
9570Sstevel@tonic-gate 		goto other;
9580Sstevel@tonic-gate 	}
9590Sstevel@tonic-gate 
9600Sstevel@tonic-gate 	/*
9610Sstevel@tonic-gate 	 * Check the distribution of bus ctlrs across real controllers.
9620Sstevel@tonic-gate 	 */
9630Sstevel@tonic-gate 	ructlrs = 0;
9640Sstevel@tonic-gate 	octlrs = 0;
9650Sstevel@tonic-gate 	save_done = 0;
9660Sstevel@tonic-gate 	for (c = *clpp; c != NULL; c = c->ctl_next) {
9670Sstevel@tonic-gate 		if (c->ctl_drcnt == 0)
9680Sstevel@tonic-gate 			continue;
9690Sstevel@tonic-gate 
9700Sstevel@tonic-gate 		if (! save_done) {
9710Sstevel@tonic-gate 			save_cnum = c->ctl_cinfop->cnum;
9720Sstevel@tonic-gate 			save_ctype = c->ctl_cinfop->ctype;
9730Sstevel@tonic-gate 			(void) strncpy(save_cname, c->ctl_cinfop->cname, 16);
9740Sstevel@tonic-gate 			save_done = 1;
9750Sstevel@tonic-gate 		}
9760Sstevel@tonic-gate 
9770Sstevel@tonic-gate 		(void) sdssc_convert_cluster_path(c->ctl_cinfop->cname,
9780Sstevel@tonic-gate 		    &cmp_name_1);
9790Sstevel@tonic-gate 		(void) sdssc_convert_cluster_path(save_cname, &cmp_name_2);
9800Sstevel@tonic-gate 
9810Sstevel@tonic-gate 		if (save_ctype != c->ctl_cinfop->ctype ||
9820Sstevel@tonic-gate 		    save_cnum != c->ctl_cinfop->cnum ||
9830Sstevel@tonic-gate 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0)
9840Sstevel@tonic-gate 			octlrs++;
9850Sstevel@tonic-gate 		else
9860Sstevel@tonic-gate 			ructlrs++;
9870Sstevel@tonic-gate 
9880Sstevel@tonic-gate 		sdssc_convert_path_free(cmp_name_1);
9890Sstevel@tonic-gate 		sdssc_convert_path_free(cmp_name_2);
9900Sstevel@tonic-gate 	}
9910Sstevel@tonic-gate 
9920Sstevel@tonic-gate 	/*
9930Sstevel@tonic-gate 	 * Take the largest of the counts
9940Sstevel@tonic-gate 	 */
9950Sstevel@tonic-gate 	ructlrs = max(ructlrs, octlrs);
9960Sstevel@tonic-gate 
9970Sstevel@tonic-gate 	/*
9980Sstevel@tonic-gate 	 * If the distribution of bus controlers is half of the total, then
9990Sstevel@tonic-gate 	 * this layout strategy will work, doit.
10000Sstevel@tonic-gate 	 */
10010Sstevel@tonic-gate 	if ((uctlrs / 2) == ructlrs) {
10020Sstevel@tonic-gate 		drvcnt = MD_MINBALREP;
10030Sstevel@tonic-gate 		goto other;
10040Sstevel@tonic-gate 	}
10050Sstevel@tonic-gate 
10060Sstevel@tonic-gate 	/*
10070Sstevel@tonic-gate 	 * If here, there is a distribution of bus controllers that will cause
10080Sstevel@tonic-gate 	 * the real controller distribution to be unbalanced, so a different
10090Sstevel@tonic-gate 	 * strategy is used.
10100Sstevel@tonic-gate 	 */
10110Sstevel@tonic-gate 	free_ctlr_lst(clpp);
10120Sstevel@tonic-gate 
10130Sstevel@tonic-gate 	/*
10140Sstevel@tonic-gate 	 * Build the ctlr list with SSA-100 busses NOT as separate controllers.
10150Sstevel@tonic-gate 	 */
10160Sstevel@tonic-gate 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
10170Sstevel@tonic-gate 		return (-1);
10180Sstevel@tonic-gate 
10190Sstevel@tonic-gate 	/*
10200Sstevel@tonic-gate 	 * Make ctl_drcnt limit the number of replicas
10210Sstevel@tonic-gate 	 */
10220Sstevel@tonic-gate 	for (c = *clpp; c != NULL; c = c->ctl_next)
10230Sstevel@tonic-gate 		c->ctl_drcnt = min(drvcnt, c->ctl_drcnt);
10240Sstevel@tonic-gate 
10250Sstevel@tonic-gate 	/*
10260Sstevel@tonic-gate 	 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt
10270Sstevel@tonic-gate 	 */
10280Sstevel@tonic-gate 	drvcnt = MD_MINBALREP;
10290Sstevel@tonic-gate 
10300Sstevel@tonic-gate other:
10310Sstevel@tonic-gate 	/*
10320Sstevel@tonic-gate 	 * Can we find fewer than the maximum replicas by reducing the number
10330Sstevel@tonic-gate 	 * of replicas per drive.
10340Sstevel@tonic-gate 	 */
10350Sstevel@tonic-gate 	for (n = drvcnt; n > 0; n--) {
10360Sstevel@tonic-gate 		reps = count_replicas(*clpp, n);
10370Sstevel@tonic-gate 		if (reps > 0 && reps <= MDDB_NLB) {
10380Sstevel@tonic-gate 			*minimum_replicas = n;
10390Sstevel@tonic-gate 			return (0);
10400Sstevel@tonic-gate 		}
10410Sstevel@tonic-gate 	}
10420Sstevel@tonic-gate 
10430Sstevel@tonic-gate 	free_ctlr_lst(clpp);
10440Sstevel@tonic-gate 
10450Sstevel@tonic-gate 	/*
10460Sstevel@tonic-gate 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
10470Sstevel@tonic-gate 	 *
10480Sstevel@tonic-gate 	 * If Here: Try to put 2 replicas per controller (not on busses)
10490Sstevel@tonic-gate 	 *	    If that doesn't work put 1 replica per controller
10500Sstevel@tonic-gate 	 */
10510Sstevel@tonic-gate 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
10520Sstevel@tonic-gate 		return (-1);
10530Sstevel@tonic-gate 
10540Sstevel@tonic-gate 	/*
10550Sstevel@tonic-gate 	 * Can we find fewer than the maximum replicas by reducing the
10560Sstevel@tonic-gate 	 * number of replicas per drive.
10570Sstevel@tonic-gate 	 */
10580Sstevel@tonic-gate 	for (n = MD_MINBALREP; n > 0; n--) {
10590Sstevel@tonic-gate 		reps = count_replicas(*clpp, n);
10600Sstevel@tonic-gate 		if (reps > 0 && reps <= MDDB_NLB) {
10610Sstevel@tonic-gate 			*minimum_replicas = n;
10620Sstevel@tonic-gate 			return (0);
10630Sstevel@tonic-gate 		}
10640Sstevel@tonic-gate 	}
10650Sstevel@tonic-gate 
10660Sstevel@tonic-gate 	/*
10670Sstevel@tonic-gate 	 * Return a ctrl list that does not include the SSA-100 buses as
10680Sstevel@tonic-gate 	 * separate controllers.  This will create fewer separate controllers.
10690Sstevel@tonic-gate 	 */
10700Sstevel@tonic-gate 	*minimum_replicas = 1;
10710Sstevel@tonic-gate 	return (0);
10720Sstevel@tonic-gate }
10730Sstevel@tonic-gate 
10740Sstevel@tonic-gate static int
morethan2_ctl_balance(mdsetname_t * sp,md_ctlr_ctl_t * clp,int min_reps,md_error_t * ep)10750Sstevel@tonic-gate morethan2_ctl_balance(
10760Sstevel@tonic-gate 	mdsetname_t		*sp,
10770Sstevel@tonic-gate 	md_ctlr_ctl_t		*clp,
10780Sstevel@tonic-gate 	int			min_reps,
10790Sstevel@tonic-gate 	md_error_t		*ep
10800Sstevel@tonic-gate )
10810Sstevel@tonic-gate {
10820Sstevel@tonic-gate 	md_ctlr_ctl_t		*c;
10830Sstevel@tonic-gate 	int			err;
10842150Sjeanm 	int			multiple_reps = 0;
10852150Sjeanm 	md_ctlr_drv_t		*d;
10860Sstevel@tonic-gate 
10870Sstevel@tonic-gate 	for (c = clp; c != NULL; c = c->ctl_next) {
10880Sstevel@tonic-gate 		if (c->ctl_drcnt == 0)
10890Sstevel@tonic-gate 			continue;
10900Sstevel@tonic-gate 
10912150Sjeanm 		/*
10922150Sjeanm 		 * check for multiple databases on a disk and compensate
10932150Sjeanm 		 */
10942150Sjeanm 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
10952150Sjeanm 			if (d->drv_dbcnt)
10962150Sjeanm 				multiple_reps += d->drv_dbcnt - 1;
10972150Sjeanm 		}
10982150Sjeanm 
10992150Sjeanm 		/*
11002150Sjeanm 		 * remove the number of multiple databases count from the
11012150Sjeanm 		 * total db count. This enables us to rebalance if one of
11022150Sjeanm 		 * the disks has a large enough slice for 2 metadb's. If we
11032150Sjeanm 		 * then add a disk with a smaller slice into the set, we want
11042150Sjeanm 		 * that disk to get a replica on it. If we just compare to
11052150Sjeanm 		 * ctl_dbcnt, it won't.
11062150Sjeanm 		 */
11072150Sjeanm 		while ((c->ctl_dbcnt - multiple_reps) <
11082150Sjeanm 		    min_reps) {
11090Sstevel@tonic-gate 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
11100Sstevel@tonic-gate 				return (-1);
11110Sstevel@tonic-gate 			if (err == 0)
11120Sstevel@tonic-gate 				break;
11130Sstevel@tonic-gate 		}
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate 		while (c->ctl_dbcnt > min_reps) {
11160Sstevel@tonic-gate 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
11170Sstevel@tonic-gate 				return (-1);
11180Sstevel@tonic-gate 			if (err == 0)
11190Sstevel@tonic-gate 				break;
11200Sstevel@tonic-gate 		}
11210Sstevel@tonic-gate 	}
11220Sstevel@tonic-gate 
11230Sstevel@tonic-gate 	return (0);
11240Sstevel@tonic-gate }
11250Sstevel@tonic-gate 
11260Sstevel@tonic-gate static int
lessthan3_ctl_balance(mdsetname_t * sp,md_ctlr_ctl_t * clp,int min_reps,md_error_t * ep)11270Sstevel@tonic-gate lessthan3_ctl_balance(
11280Sstevel@tonic-gate 	mdsetname_t		*sp,
11290Sstevel@tonic-gate 	md_ctlr_ctl_t		*clp,
11300Sstevel@tonic-gate 	int			min_reps,
11310Sstevel@tonic-gate 	md_error_t		*ep
11320Sstevel@tonic-gate )
11330Sstevel@tonic-gate {
11340Sstevel@tonic-gate 	md_ctlr_ctl_t		*c;
11350Sstevel@tonic-gate 	int			err;
11362150Sjeanm 	int			multiple_reps = 0;
11372150Sjeanm 	md_ctlr_drv_t		*d;
11380Sstevel@tonic-gate 
11390Sstevel@tonic-gate 	for (c = clp; c != NULL; c = c->ctl_next) {
11400Sstevel@tonic-gate 		if (c->ctl_drcnt == 0)
11410Sstevel@tonic-gate 			continue;
11420Sstevel@tonic-gate 
11432150Sjeanm 		/*
11442150Sjeanm 		 * check for multiple databases on a disk and compensate
11452150Sjeanm 		 */
11462150Sjeanm 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
11472150Sjeanm 			if (d->drv_dbcnt)
11482150Sjeanm 				multiple_reps += d->drv_dbcnt - 1;
11492150Sjeanm 		}
11502150Sjeanm 
11512150Sjeanm 		/*
11522150Sjeanm 		 * remove the number of multiple databases count from the
11532150Sjeanm 		 * total db count. This enables us to rebalance if one of
11542150Sjeanm 		 * the disks has a large enough slice for 2 metadb's. If we
11552150Sjeanm 		 * then add a disk with a smaller slice into the set, we want
11562150Sjeanm 		 * that disk to get a replica on it. If we just compare to
11572150Sjeanm 		 * ctl_dbcnt, it won't.
11582150Sjeanm 		 */
11592150Sjeanm 		while ((c->ctl_dbcnt - multiple_reps) <
11602150Sjeanm 		    (min_reps * c->ctl_drcnt)) {
11610Sstevel@tonic-gate 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
11620Sstevel@tonic-gate 				return (-1);
11630Sstevel@tonic-gate 			if (err == 0)
11640Sstevel@tonic-gate 				break;
11650Sstevel@tonic-gate 		}
11660Sstevel@tonic-gate 
11670Sstevel@tonic-gate 		while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) {
11680Sstevel@tonic-gate 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
11690Sstevel@tonic-gate 				return (-1);
11700Sstevel@tonic-gate 			if (err == 0)
11710Sstevel@tonic-gate 				break;
11720Sstevel@tonic-gate 		}
11730Sstevel@tonic-gate 	}
11740Sstevel@tonic-gate 
11750Sstevel@tonic-gate 	return (0);
11760Sstevel@tonic-gate }
11770Sstevel@tonic-gate 
11780Sstevel@tonic-gate static int
try_again(md_ctlr_ctl_t * clp,md_error_t * ep)11790Sstevel@tonic-gate try_again(
11800Sstevel@tonic-gate 	md_ctlr_ctl_t	*clp,
11810Sstevel@tonic-gate 	md_error_t	*ep
11820Sstevel@tonic-gate )
11830Sstevel@tonic-gate {
11840Sstevel@tonic-gate 	md_ctlr_ctl_t	*c;
11850Sstevel@tonic-gate 	md_ctlr_drv_t	*d;
11860Sstevel@tonic-gate 
11870Sstevel@tonic-gate 	if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
11880Sstevel@tonic-gate 		return (TRUE);
11890Sstevel@tonic-gate 
11900Sstevel@tonic-gate 	/*
11910Sstevel@tonic-gate 	 * retry if all the errored drives are already in the diskset.
11920Sstevel@tonic-gate 	 */
11930Sstevel@tonic-gate 	for (c = clp; c != NULL; c = c->ctl_next) {
11940Sstevel@tonic-gate 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
11950Sstevel@tonic-gate 			if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR))
11960Sstevel@tonic-gate 			    == DRV_F_ERROR)
11970Sstevel@tonic-gate 				return (FALSE);
11980Sstevel@tonic-gate 		}
11990Sstevel@tonic-gate 	}
12000Sstevel@tonic-gate 	return (TRUE);
12010Sstevel@tonic-gate }
12020Sstevel@tonic-gate 
12030Sstevel@tonic-gate int
meta_db_balance(mdsetname_t * sp,md_drive_desc * opdd,md_drive_desc * curdd,daddr_t dbsize,md_error_t * ep)12040Sstevel@tonic-gate meta_db_balance(
12050Sstevel@tonic-gate 	mdsetname_t		*sp,
12060Sstevel@tonic-gate 	md_drive_desc		*opdd,
12070Sstevel@tonic-gate 	md_drive_desc		*curdd,
12080Sstevel@tonic-gate 	daddr_t			dbsize,
12090Sstevel@tonic-gate 	md_error_t		*ep
12100Sstevel@tonic-gate )
12110Sstevel@tonic-gate {
12120Sstevel@tonic-gate 	int			min_reps;
12130Sstevel@tonic-gate 	md_ctlr_ctl_t		*c, *cl = NULL;
12140Sstevel@tonic-gate 	int			uctlrs = 0;
12150Sstevel@tonic-gate 	int			retry = 0;
12160Sstevel@tonic-gate 	int			rval = 0;
12170Sstevel@tonic-gate 
12180Sstevel@tonic-gate 	if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1)
12190Sstevel@tonic-gate 		return (-1);
12200Sstevel@tonic-gate 
12210Sstevel@tonic-gate 	/*
12220Sstevel@tonic-gate 	 * Determine what controllers are usable in the sense of being able to
12230Sstevel@tonic-gate 	 * add a replica to a drive on the controller.
12240Sstevel@tonic-gate 	 */
12250Sstevel@tonic-gate 	for (c = cl; c != NULL; c = c->ctl_next)
12260Sstevel@tonic-gate 		if (c->ctl_drcnt > 0)
12270Sstevel@tonic-gate 			uctlrs++;
12280Sstevel@tonic-gate 
12290Sstevel@tonic-gate 	/*
12300Sstevel@tonic-gate 	 * Add replicas to achieve a balance.
12310Sstevel@tonic-gate 	 */
12320Sstevel@tonic-gate 	if (uctlrs > 2)
12330Sstevel@tonic-gate 		rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
12340Sstevel@tonic-gate 	else
12350Sstevel@tonic-gate 		rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
12360Sstevel@tonic-gate 
12370Sstevel@tonic-gate 	if (rval) {
12380Sstevel@tonic-gate 		if ((retry = try_again(cl, ep)) == TRUE) {
12390Sstevel@tonic-gate 			mdclrerror(ep);
12400Sstevel@tonic-gate 			rval = 0;
12410Sstevel@tonic-gate 		}
12420Sstevel@tonic-gate 	}
12430Sstevel@tonic-gate 
12440Sstevel@tonic-gate 	/*
12450Sstevel@tonic-gate 	 * Delete all the replicas from drives that are so marked.
12460Sstevel@tonic-gate 	 */
12470Sstevel@tonic-gate 	if (! rval)
12480Sstevel@tonic-gate 		rval = del_replicas(sp, cl, ep);
12490Sstevel@tonic-gate 
12500Sstevel@tonic-gate 	if (retry) {
12510Sstevel@tonic-gate 		if (uctlrs > 2)
12520Sstevel@tonic-gate 			rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
12530Sstevel@tonic-gate 		else
12540Sstevel@tonic-gate 			rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
12550Sstevel@tonic-gate 
12560Sstevel@tonic-gate 		if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) {
12570Sstevel@tonic-gate 			mdclrerror(ep);
12580Sstevel@tonic-gate 			rval = 0;
12590Sstevel@tonic-gate 		}
12600Sstevel@tonic-gate 	}
12610Sstevel@tonic-gate 
12620Sstevel@tonic-gate 	/*
12630Sstevel@tonic-gate 	 * Free up the ctlr list.
12640Sstevel@tonic-gate 	 */
12650Sstevel@tonic-gate 	free_ctlr_lst(&cl);
12660Sstevel@tonic-gate 
12670Sstevel@tonic-gate 	return (rval);
12680Sstevel@tonic-gate }
1269