xref: /onnv-gate/usr/src/lib/lvm/libmeta/common/meta_db_balance.c (revision 2150:e99313126b1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Database location balancing code.
30  */
31 
32 #include <meta.h>
33 #include <sys/lvm/md_mddb.h>
34 #include <sdssc.h>
35 
/*
 * Baseline number of replicas per controller used by balance_replicas().
 */
#define	MD_MINBALREP	2

/*
 * Operation pending on a drive tracked by the balancing code.
 */
typedef enum md_ctlr_ops_t {
	DRV_NOP = 0,	/* drive already in the set, nothing pending */
	DRV_ADD = 1,	/* drive is being added */
	DRV_DEL = 2	/* drive was listed a second time: being deleted */
} md_ctlr_ops_t;
47 
48 /* drive flag fields */
49 #define	DRV_F_ERROR	0x1
50 #define	DRV_F_INDISKSET	0x2
51 
52 struct md_ctlr_drv_t {
53 	md_ctlr_ops_t drv_op;
54 	int drv_flags;
55 	int drv_dbcnt;
56 	int drv_new_dbcnt;
57 	daddr_t drv_dbsize;
58 	mddrivename_t *drv_dnp;
59 	struct md_ctlr_drv_t *drv_next;
60 };
61 typedef struct md_ctlr_drv_t md_ctlr_drv_t;
62 
63 struct md_ctlr_ctl_t {
64 	mdcinfo_t *ctl_cinfop;
65 	int ctl_dbcnt;
66 	int ctl_drcnt;
67 	md_ctlr_drv_t *ctl_drvs;
68 	struct md_ctlr_ctl_t *ctl_next;
69 };
70 typedef struct md_ctlr_ctl_t md_ctlr_ctl_t;
71 
72 static int
73 add_replica(
74 	mdsetname_t		*sp,
75 	mddrivename_t		*dnp,
76 	int			dbcnt,
77 	daddr_t			dbsize,
78 	md_error_t		*ep
79 )
80 {
81 	mdnamelist_t		*nlp = NULL;
82 	mdname_t		*np;
83 	md_set_desc		*sd;
84 	uint_t			rep_slice;
85 
86 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
87 		return (-1);
88 
89 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
90 		return (-1);
91 
92 	(void) metanamelist_append(&nlp, np);
93 
94 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
95 		metafreenamelist(nlp);
96 		return (-1);
97 	}
98 
99 	if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
100 	    (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) {
101 		metafreenamelist(nlp);
102 		return (-1);
103 	}
104 
105 	metafreenamelist(nlp);
106 	return (0);
107 }
108 
109 static int
110 del_replica(
111 	mdsetname_t		*sp,
112 	mddrivename_t		*dnp,
113 	md_error_t		*ep
114 )
115 {
116 	mdnamelist_t		*nlp = NULL;
117 	mdname_t		*np;
118 	uint_t			rep_slice;
119 
120 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
121 		return (-1);
122 
123 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
124 		return (-1);
125 
126 	(void) metanamelist_append(&nlp, np);
127 
128 	if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED),
129 	    NULL, ep) == -1) {
130 		metafreenamelist(nlp);
131 		return (-1);
132 	}
133 
134 	metafreenamelist(nlp);
135 	return (0);
136 }
137 
138 static int
139 rep_has_err(md_replicalist_t *rlp, mdname_t *np)
140 {
141 	md_replicalist_t	*rl;
142 
143 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
144 		md_replica_t	*r = rl->rl_repp;
145 
146 		if (strcmp(r->r_namep->cname, np->cname) != 0)
147 			continue;
148 
149 		if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA |
150 		    MDDB_F_EMASTER | MDDB_F_EWRITE))
151 			return (1);
152 
153 	}
154 	return (0);
155 }
156 
157 static int
158 add_drv_to_ctl_lst(
159 	md_ctlr_ctl_t		**clpp,
160 	md_replicalist_t	*rlp,
161 	mddrivename_t		*dnp,
162 	int			dbcnt,
163 	daddr_t			dbsize,
164 	mdcinfo_t		*cinfop,
165 	int			indiskset,
166 	int			with_bus,
167 	int			errored,
168 	md_error_t		*ep
169 )
170 {
171 	md_ctlr_drv_t		**dpp;
172 	mdname_t		*np;
173 	mdcinfo_t		*tcinfop;
174 	char			*cmp_name_1,
175 				*cmp_name_2;
176 	int			not_found;
177 
178 	/*
179 	 * The user must pass in a list head.
180 	 */
181 	assert(clpp != NULL);
182 
183 	if (cinfop == NULL) {
184 		uint_t	rep_slice;
185 
186 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
187 			/*
188 			 * A failure to get the slice information can occur
189 			 * because the drive has failed, if this is the
190 			 * case then there is nothing that can be done
191 			 * with this drive, so do not include it in the
192 			 * list of drives. Clear the error and return.
193 			 */
194 			mdclrerror(ep);
195 			return (0);
196 		}
197 
198 		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
199 			return (-1);
200 
201 		if ((tcinfop = metagetcinfo(np, ep)) == NULL)
202 			return (-1);
203 
204 		if (metagetvtoc(np, FALSE, NULL, ep) == NULL)
205 			errored = 1;
206 
207 		if (rep_has_err(rlp, np))
208 			errored = 1;
209 	} else
210 		tcinfop = cinfop;
211 
212 	for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) {
213 		/*
214 		 * Try to locate ctlr.
215 		 */
216 		(void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1);
217 		(void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname,
218 		    &cmp_name_2);
219 
220 		if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype ||
221 		    tcinfop->cnum != (*clpp)->ctl_cinfop->cnum ||
222 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0 ||
223 		    (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) {
224 			not_found = 1;
225 		} else
226 			not_found = 0;
227 
228 
229 		sdssc_convert_path_free(cmp_name_1);
230 		sdssc_convert_path_free(cmp_name_2);
231 
232 		if (not_found)
233 			continue;
234 
235 		/*
236 		 * Found ctlr, try to locate the drive.
237 		 */
238 		for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL;
239 		    dpp = &(*dpp)->drv_next) {
240 			(void) sdssc_convert_cluster_path(
241 			    (*dpp)->drv_dnp->cname, &cmp_name_1);
242 			(void) sdssc_convert_cluster_path(dnp->cname,
243 			    &cmp_name_2);
244 
245 			not_found = strcmp(cmp_name_1, cmp_name_2);
246 
247 			sdssc_convert_path_free(cmp_name_1);
248 			sdssc_convert_path_free(cmp_name_2);
249 
250 			if (not_found)
251 			    continue;
252 
253 			/*
254 			 * Found drive, must be deleting.
255 			 */
256 			(*dpp)->drv_op = DRV_DEL;
257 			if (indiskset)
258 				(*dpp)->drv_flags |= DRV_F_INDISKSET;
259 			if (errored) {
260 				mdclrerror(ep);
261 				(*dpp)->drv_flags |= DRV_F_ERROR;
262 			}
263 			(*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt;
264 			(*clpp)->ctl_drcnt--;
265 			return (0);
266 		}
267 		/*
268 		 * The ctlr was found, but not the drive, so add
269 		 * the drive
270 		 */
271 		(*dpp) = Zalloc(sizeof (**dpp));
272 
273 
274 		if (indiskset) {
275 			(*dpp)->drv_op = DRV_NOP;
276 			(*dpp)->drv_flags |= DRV_F_INDISKSET;
277 			if (errored) {
278 				mdclrerror(ep);
279 				(*dpp)->drv_flags |= DRV_F_ERROR;
280 			}
281 		} else {
282 			(*dpp)->drv_op = DRV_ADD;
283 			if (errored) {
284 				(*dpp)->drv_flags |= DRV_F_ERROR;
285 				return (-1);
286 			}
287 			assert(dbsize != 0);
288 		}
289 		(*dpp)->drv_dbcnt = dbcnt;
290 		(*dpp)->drv_dbsize = dbsize;
291 		(*dpp)->drv_dnp = dnp;
292 		(*clpp)->ctl_dbcnt += dbcnt;
293 		(*clpp)->ctl_drcnt++;
294 		return (0);
295 	}
296 	/*
297 	 * No ctlr was located, so add the ctlr, then recurse to add the
298 	 * drive to the ctlr.
299 	 */
300 	(*clpp) = Zalloc(sizeof (**clpp));
301 
302 	(*clpp)->ctl_cinfop = tcinfop;
303 
304 	return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop,
305 	    indiskset, with_bus, errored, ep));
306 }
307 
308 static int
309 add_replica_to_ctl(
310 	mdsetname_t		*sp,
311 	md_ctlr_ctl_t		*c,
312 	int			minimum_replicas,
313 	md_error_t		*ep
314 )
315 {
316 	md_ctlr_drv_t		*d;
317 	int			maxdb = 0;
318 
319 	/*
320 	 * If this ctrl has no "usable" drives, assert() or just return if
321 	 * assert()'s are turned off.
322 	 */
323 	if (c->ctl_drcnt == 0) {
324 		assert(0);
325 		return (0);
326 	}
327 
328 	/*
329 	 * Determine the largest DB count on a drive.
330 	 */
331 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
332 		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
333 			maxdb = d->drv_dbcnt;
334 
335 	/*
336 	 * Make sure we start at a reasonable number
337 	 */
338 	if (maxdb == 0)
339 		maxdb = 1;
340 
341 	/*
342 	 * Add a replica to a drive on this ctrl.
343 	 */
344 	/*CONSTCOND*/
345 	while (1) {
346 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
347 			/*
348 			 * If this drive is being deleted, skip it.
349 			 */
350 			if (d->drv_op == DRV_DEL)
351 				continue;
352 
353 			if (d->drv_flags & DRV_F_ERROR)
354 				continue;
355 			/*
356 			 * Make sure that the replicas are distributed across
357 			 * the drives.
358 			 */
359 			if (d->drv_dbcnt >= maxdb)
360 				continue;
361 			/*
362 			 * See if the drive already has replicas,
363 			 * if it does, then delete the exisiting
364 			 * replica(s) and re-add n+1 replicas to the drive.
365 			 */
366 			/* ==== Vulnerability - no DB's start ==== */
367 			if (d->drv_dbcnt > 0) {
368 				if (del_replica(sp, d->drv_dnp, ep) == -1) {
369 					d->drv_flags |= DRV_F_ERROR;
370 					if (! (d->drv_flags & DRV_F_INDISKSET))
371 						return (-1);
372 					mdclrerror(ep);
373 					continue;
374 				}
375 			}
376 			if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1),
377 			    d->drv_dbsize, ep) == -1) {
378 				if (d->drv_dbcnt) {
379 					c->ctl_dbcnt -= d->drv_dbcnt;
380 					d->drv_dbcnt = 0;
381 				}
382 
383 				if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
384 					return (-1);
385 
386 				if (mdismddberror(ep, MDE_REPLICA_TOOSMALL))
387 					return (-1);
388 
389 				d->drv_flags |= DRV_F_ERROR;
390 				if (! (d->drv_flags & DRV_F_INDISKSET))
391 					return (-1);
392 				mdclrerror(ep);
393 				continue;
394 			}
395 
396 			d->drv_dbcnt++;
397 			c->ctl_dbcnt++;
398 			/* ==== Vulnerability - no DB's end ==== */
399 			return (1);
400 		}
401 		maxdb++;
402 		if (maxdb > minimum_replicas)
403 			return (0);
404 	}
405 	/*NOTREACHED*/
406 }
407 
408 static int
409 del_replica_from_ctl(
410 	mdsetname_t		*sp,
411 	md_ctlr_ctl_t		*c,
412 	md_error_t		*ep
413 )
414 {
415 	md_ctlr_drv_t		*d;
416 	int			maxdb = 0;
417 
418 	/*
419 	 * If this ctrl has no "usable" drives, assert() or just return if
420 	 * assert()'s are turned off.
421 	 */
422 	if (c->ctl_drcnt == 0) {
423 		assert(0);
424 		return (0);
425 	}
426 
427 	/*
428 	 * Determine the largest DB count on a drive.
429 	 */
430 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
431 		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
432 			maxdb = d->drv_dbcnt;
433 
434 	if (maxdb == 0)
435 		return (0);
436 
437 	/*
438 	 * Delete a replica from a drive on this ctrl.
439 	 */
440 	/*CONSTCOND*/
441 	while (1) {
442 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
443 			/*
444 			 * If this drive is being deleted, skip it.
445 			 */
446 			if (d->drv_op == DRV_DEL)
447 				continue;
448 
449 			/*
450 			 * Make sure that there are replicas on this drive to
451 			 * delete.
452 			 */
453 			if (d->drv_dbcnt == 0)
454 				continue;
455 
456 			if (d->drv_flags & DRV_F_ERROR)
457 				continue;
458 
459 			/*
460 			 * We need to keep the DB's distributed across the
461 			 * drives.
462 			 */
463 			if (d->drv_dbcnt < maxdb)
464 				continue;
465 
466 			/*
467 			 * Delete all the replicas on the drive.
468 			 */
469 			/* ==== Vulnerability - no DB's start ==== */
470 			if (del_replica(sp, d->drv_dnp, ep) == -1) {
471 				d->drv_flags |= DRV_F_ERROR;
472 				if (! (d->drv_flags & DRV_F_INDISKSET))
473 					return (-1);
474 				mdclrerror(ep);
475 				continue;
476 			}
477 			d->drv_dbcnt--;
478 			c->ctl_dbcnt--;
479 			/*
480 			 * If there is still a dbcnt for this drive, then add
481 			 * back the needed DB's.
482 			 */
483 			if (d->drv_dbcnt > 0) {
484 				if (add_replica(sp, d->drv_dnp, d->drv_dbcnt,
485 				    d->drv_dbsize, ep) == -1) {
486 					c->ctl_dbcnt -= d->drv_dbcnt;
487 					d->drv_dbcnt = 0;
488 
489 					if (mdismddberror(ep,
490 					    MDE_TOOMANY_REPLICAS))
491 						return (-1);
492 
493 					d->drv_flags |= DRV_F_ERROR;
494 					if (! (d->drv_flags & DRV_F_INDISKSET))
495 						return (-1);
496 					mdclrerror(ep);
497 					continue;
498 				}
499 			}
500 			/* ==== Vulnerability - no DB's end ==== */
501 			return (1);
502 		}
503 		maxdb--;
504 		if (maxdb <= 0)
505 			return (0);
506 	}
507 	/*NOTREACHED*/
508 }
509 
510 static int
511 del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep)
512 {
513 	md_ctlr_ctl_t		*c;
514 	md_ctlr_drv_t		*d;
515 	mdnamelist_t		*nlp;
516 	mdname_t		*np;
517 
518 	for (c = clp; c != NULL; c = c->ctl_next) {
519 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
520 			uint_t	rep_slice;
521 
522 			if (! (d->drv_flags & DRV_F_ERROR) &&
523 			    (d->drv_op != DRV_DEL))
524 				continue;
525 
526 			if (d->drv_dbcnt == 0)
527 				continue;
528 
529 			if (meta_replicaslice(d->drv_dnp,
530 			    &rep_slice, ep) != 0)
531 				return (-1);
532 
533 			np = metaslicename(d->drv_dnp, rep_slice, ep);
534 			if (np == NULL)
535 				return (-1);
536 
537 			nlp = NULL;
538 			(void) metanamelist_append(&nlp, np);
539 
540 			/*
541 			 * Delete the replicas listed.
542 			 */
543 			if (meta_db_detach(sp, nlp,
544 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
545 			    ep) == -1) {
546 				metafreenamelist(nlp);
547 				if (d->drv_flags & DRV_F_INDISKSET) {
548 					mdclrerror(ep);
549 					continue;
550 				}
551 				return (-1);
552 			}
553 			metafreenamelist(nlp);
554 		}
555 	}
556 
557 	return (0);
558 }
559 
560 static void
561 free_ctlr_lst(md_ctlr_ctl_t **clpp)
562 {
563 	md_ctlr_ctl_t		*c, *tc = NULL;
564 	md_ctlr_drv_t		*d, *td = NULL;
565 
566 	for (c = *clpp; c != NULL; c = tc) {
567 		tc = c->ctl_next;
568 		for (d = c->ctl_drvs; d != NULL; d = td) {
569 			td = d->drv_next;
570 			Free(d);
571 		}
572 		Free(c);
573 	}
574 	*clpp = NULL;
575 }
576 
577 static int
578 build_ctlr_lst(
579 	mdsetname_t		*sp,
580 	md_ctlr_ctl_t		**clpp,
581 	md_drive_desc		*opdd,
582 	md_drive_desc		*curdd,
583 	int			with_bus,
584 	daddr_t			dbsize,
585 	md_error_t		*ep
586 )
587 {
588 	md_drive_desc			*d;
589 	md_set_desc			*sd;
590 	daddr_t				nblks;
591 	md_replicalist_t		*rlp = NULL;
592 	static	daddr_t			min_dbsize = 0;
593 
594 	if (min_dbsize == 0) {
595 		if ((nblks = meta_db_minreplica(sp, ep)) < 0) {
596 			min_dbsize = MD_DBSIZE;
597 
598 			if (! metaislocalset(sp)) {
599 				if ((sd = metaget_setdesc(sp, ep)) == NULL)
600 					return (-1);
601 
602 				if (MD_MNSET_DESC(sd))
603 					min_dbsize = MD_MN_DBSIZE;
604 			}
605 			mdclrerror(ep);
606 		} else
607 			min_dbsize = nblks;
608 	}
609 
610 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
611 		if (! mdismddberror(ep, MDE_DB_NODB) &&
612 		    ! mdismddberror(ep, MDE_DB_NOTOWNER))
613 			return (-1);
614 		mdclrerror(ep);
615 	}
616 
617 	/*
618 	 * Add drives currently in the set to the ctlr list.
619 	 */
620 	for (d = curdd; d != NULL; d = d->dd_next) {
621 		daddr_t	this_dbsize = d->dd_dbsize;
622 
623 		if (this_dbsize == 0)
624 			this_dbsize = min_dbsize;
625 
626 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt,
627 		    this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1)
628 			return (-1);
629 	}
630 
631 	/*
632 	 * Add the drives that are being operated on to the ctlr list.
633 	 */
634 	for (d = opdd; d != NULL; d = d->dd_next)
635 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL,
636 		    FALSE, with_bus, 0, ep) == -1)
637 			return (-1);
638 
639 	metafreereplicalist(rlp);
640 	return (0);
641 }
642 
643 static int
644 count_replica_on_ctl(
645 	md_ctlr_ctl_t		*c,
646 	int			adding,
647 	int			*db_cnt,
648 	int			minimum_replicas
649 )
650 {
651 	md_ctlr_drv_t		*d;
652 	int			maxdb = 0;
653 
654 	/*
655 	 * If this ctrl has no "usable" drives, nothing to do.
656 	 */
657 	if (c->ctl_drcnt == 0)
658 		return (0);
659 
660 	/*
661 	 * Determine the largest DB count on a drive.
662 	 */
663 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
664 		if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL)
665 			maxdb = d->drv_new_dbcnt;
666 
667 	/*
668 	 * Make sure we start at a reasonable number
669 	 */
670 	if (maxdb == 0) {
671 		if (!adding)
672 			return (0);
673 		maxdb = 1;
674 	}
675 
676 	/*
677 	 * Count or Un-Count replicas that would be
678 	 * added or deleted respectively.
679 	 */
680 	/*CONSTCOND*/
681 	while (1) {
682 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
683 			/*
684 			 * If this drive is being deleted, skip it.
685 			 */
686 			if (d->drv_op == DRV_DEL)
687 				continue;
688 
689 			/*
690 			 * If the drive is errored and adding, skip it.
691 			 */
692 			if (adding && (d->drv_flags & DRV_F_ERROR))
693 				continue;
694 
695 			/*
696 			 * Make sure that the replicas are distributed across
697 			 * the drives.
698 			 */
699 			if (adding) {
700 				if (d->drv_new_dbcnt >= maxdb)
701 					continue;
702 			} else {
703 				if (d->drv_new_dbcnt == 0)
704 					continue;
705 				if (d->drv_new_dbcnt < maxdb)
706 					continue;
707 			}
708 
709 			/*
710 			 * Count or Un-Count replicas here.
711 			 */
712 			if (adding) {
713 				mdpart_t	*partp;
714 				uint_t		rep_slice;
715 				md_error_t	mde;
716 
717 				if (meta_replicaslice(d->drv_dnp,
718 				    &rep_slice, &mde) != 0)
719 					continue;
720 
721 				partp = &d->drv_dnp->vtoc.parts[rep_slice];
722 				if (! partp)
723 					continue;
724 
725 				if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) >
726 				    (partp->size - 16))
727 					continue;
728 				(*db_cnt)++;
729 				d->drv_new_dbcnt++;
730 			} else {
731 				(*db_cnt)--;
732 				d->drv_new_dbcnt--;
733 			}
734 			return (0);
735 		}
736 
737 		/*
738 		 * This should make sure they get spread
739 		 * around.  This is to emulate the {add,del}_replica
740 		 * routines.
741 		 */
742 		if (adding) {
743 			maxdb++;
744 			if (maxdb > minimum_replicas)
745 				return (-1);
746 		} else {
747 			maxdb--;
748 			if (maxdb <= 0)
749 				return (-1);
750 		}
751 	}
752 	/*NOTREACHED*/
753 }
754 
755 static int
756 count_replicas(
757 	md_ctlr_ctl_t		*clp,
758 	int			min_reps
759 )
760 {
761 	md_ctlr_ctl_t		*c;
762 	md_ctlr_drv_t		*d;
763 	int			db_cnt;
764 	int			uctlrs = 0;
765 	int			total_cnt = 0;
766 
767 	/*
768 	 * Count the number of controllers,
769 	 * counting the replicas is slightly different based
770 	 * on the controller count.
771 	 */
772 	for (c = clp; c != NULL; c = c->ctl_next)
773 		if (c->ctl_drcnt > 0) {
774 			uctlrs++;
775 			for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
776 				d->drv_new_dbcnt = d->drv_dbcnt;
777 		}
778 
779 	if (uctlrs > 2) {
780 		for (c = clp; c != NULL; c = c->ctl_next) {
781 			if (c->ctl_drcnt == 0)
782 				continue;
783 
784 			db_cnt = c->ctl_dbcnt;
785 			/*
786 			 * Count the replicas that would be added.
787 			 */
788 			while (db_cnt < min_reps)
789 				if (count_replica_on_ctl(c, TRUE,
790 				    &db_cnt, min_reps))
791 					return (-1);
792 
793 			/*
794 			 * Un-Count the replicas that would be deleted.
795 			 */
796 			while (db_cnt > min_reps)
797 				if (count_replica_on_ctl(c, FALSE,
798 				    &db_cnt, min_reps))
799 					return (-1);
800 			total_cnt += db_cnt;
801 		}
802 	} else {
803 		for (c = clp; c != NULL; c = c->ctl_next) {
804 			if (c->ctl_drcnt == 0)
805 				continue;
806 
807 			db_cnt = c->ctl_dbcnt;
808 			/*
809 			 * Count the replicas that woud be added.
810 			 */
811 			while (db_cnt < (min_reps * c->ctl_drcnt))
812 				if (count_replica_on_ctl(c, TRUE,
813 				    &db_cnt, min_reps))
814 					return (-1);
815 
816 			total_cnt += db_cnt;
817 		}
818 	}
819 
820 	return (total_cnt);
821 }
822 
823 static int
824 balance_replicas(
825 	mdsetname_t		*sp,
826 	md_ctlr_ctl_t		**clpp,
827 	md_drive_desc		*opdd,
828 	md_drive_desc		*curdd,
829 	daddr_t			dbsize,
830 	int			*minimum_replicas,
831 	md_error_t		*ep
832 )
833 {
834 	int			n;
835 	int			rctlrs = 0;
836 	int			uctlrs;
837 	int			ructlrs;
838 	int			octlrs;
839 	int			save_done;
840 	int			prevcnt = 0, issame = 1;
841 	uint_t			drvcnt = ~0U;
842 	uint_t			save_cnum;
843 	mhd_ctlrtype_t		save_ctype;
844 	char			save_cname[16],
845 				*cmp_name_1,
846 				*cmp_name_2;
847 	int			reps;
848 	md_ctlr_ctl_t		*c;
849 
850 	/*
851 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
852 	 */
853 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
854 		return (-1);
855 
856 	/*
857 	 * Determine what controllers are usable in the sense of being able to
858 	 * add a replica to a drive on the controller.
859 	 * Also find the minimum number of drives on a controller.
860 	 */
861 	for (c = *clpp; c != NULL; c = c->ctl_next) {
862 		if (c->ctl_drcnt > 0) {
863 			rctlrs++;
864 			drvcnt = min(drvcnt, c->ctl_drcnt);
865 			if (prevcnt == 0)
866 				prevcnt = c->ctl_drcnt;
867 			else if (prevcnt != c->ctl_drcnt)
868 				issame = 0;
869 		}
870 	}
871 
872 	if ((rctlrs <= 2) || (issame && (drvcnt >= 30)))
873 		goto cont;
874 
875 	/*
876 	 * If here: Handling 3 or more controllers most
877 	 *	    likely with non-symmetrical number of
878 	 *	    disks. The number of replicas will be
879 	 *	    the minimum number of disks on a controller.
880 	 *
881 	 *	    The main point is to insure that a
882 	 *	    controller does not have more than half
883 	 *	    of the replicas.
884 	 */
885 	drvcnt = min(drvcnt, 12);
886 	drvcnt = max(drvcnt, MD_MINBALREP);
887 
888 	/*
889 	 * Can we find fewer than the maximum replicas by reducing the
890 	 * number of replicas per drive.
891 	 */
892 	for (n = drvcnt; n > 0; n--) {
893 		reps = count_replicas(*clpp, n);
894 		if (reps > 0 && reps <= MDDB_NLB) {
895 			*minimum_replicas = n;
896 			return (0);
897 		}
898 	}
899 
900 cont:
901 	free_ctlr_lst(clpp);
902 
903 	/*
904 	 * Build a ctlr list with SSA-100 busses as separate controllers.
905 	 *
906 	 * If Here: Try to put 2 replicas per controller/bus
907 	 *	    If that doesn't work put 1 replica per controller/bus
908 	 */
909 	if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1)
910 		return (-1);
911 
912 	/*
913 	 * If the number of "real" controllers is 2, special handling may be
914 	 * needed.
915 	 */
916 	if (rctlrs != 2) {
917 		drvcnt = MD_MINBALREP;
918 		goto other;
919 	}
920 
921 	/*
922 	 * Determine what controllers are usable in the sense of being able to
923 	 * add a replica to a drive on the controller.
924 	 * Also find the minimum number of drives on a controller.
925 	 */
926 	drvcnt = ~0U;
927 	uctlrs = 0;
928 	for (c = *clpp; c != NULL; c = c->ctl_next) {
929 		if (c->ctl_drcnt > 0) {
930 			uctlrs++;
931 			drvcnt = min(drvcnt, c->ctl_drcnt);
932 		}
933 	}
934 
935 	/*
936 	 * If the number of controllers is not changed, continue with original
937 	 * strategy.
938 	 */
939 	if (uctlrs == rctlrs) {
940 		drvcnt = MD_MINBALREP;
941 		goto other;
942 	}
943 
944 	/*
945 	 * Check the distribution of bus ctlrs across real controllers.
946 	 */
947 	ructlrs = 0;
948 	octlrs = 0;
949 	save_done = 0;
950 	for (c = *clpp; c != NULL; c = c->ctl_next) {
951 		if (c->ctl_drcnt == 0)
952 			continue;
953 
954 		if (! save_done) {
955 			save_cnum = c->ctl_cinfop->cnum;
956 			save_ctype = c->ctl_cinfop->ctype;
957 			(void) strncpy(save_cname, c->ctl_cinfop->cname, 16);
958 			save_done = 1;
959 		}
960 
961 		(void) sdssc_convert_cluster_path(c->ctl_cinfop->cname,
962 		    &cmp_name_1);
963 		(void) sdssc_convert_cluster_path(save_cname, &cmp_name_2);
964 
965 		if (save_ctype != c->ctl_cinfop->ctype ||
966 		    save_cnum != c->ctl_cinfop->cnum ||
967 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0)
968 			octlrs++;
969 		else
970 			ructlrs++;
971 
972 		sdssc_convert_path_free(cmp_name_1);
973 		sdssc_convert_path_free(cmp_name_2);
974 	}
975 
976 	/*
977 	 * Take the largest of the counts
978 	 */
979 	ructlrs = max(ructlrs, octlrs);
980 
981 	/*
982 	 * If the distribution of bus controlers is half of the total, then
983 	 * this layout strategy will work, doit.
984 	 */
985 	if ((uctlrs / 2) == ructlrs) {
986 		drvcnt = MD_MINBALREP;
987 		goto other;
988 	}
989 
990 	/*
991 	 * If here, there is a distribution of bus controllers that will cause
992 	 * the real controller distribution to be unbalanced, so a different
993 	 * strategy is used.
994 	 */
995 	free_ctlr_lst(clpp);
996 
997 	/*
998 	 * Build the ctlr list with SSA-100 busses NOT as separate controllers.
999 	 */
1000 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
1001 		return (-1);
1002 
1003 	/*
1004 	 * Make ctl_drcnt limit the number of replicas
1005 	 */
1006 	for (c = *clpp; c != NULL; c = c->ctl_next)
1007 		c->ctl_drcnt = min(drvcnt, c->ctl_drcnt);
1008 
1009 	/*
1010 	 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt
1011 	 */
1012 	drvcnt = MD_MINBALREP;
1013 
1014 other:
1015 	/*
1016 	 * Can we find fewer than the maximum replicas by reducing the number
1017 	 * of replicas per drive.
1018 	 */
1019 	for (n = drvcnt; n > 0; n--) {
1020 		reps = count_replicas(*clpp, n);
1021 		if (reps > 0 && reps <= MDDB_NLB) {
1022 			*minimum_replicas = n;
1023 			return (0);
1024 		}
1025 	}
1026 
1027 	free_ctlr_lst(clpp);
1028 
1029 	/*
1030 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
1031 	 *
1032 	 * If Here: Try to put 2 replicas per controller (not on busses)
1033 	 *	    If that doesn't work put 1 replica per controller
1034 	 */
1035 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
1036 		return (-1);
1037 
1038 	/*
1039 	 * Can we find fewer than the maximum replicas by reducing the
1040 	 * number of replicas per drive.
1041 	 */
1042 	for (n = MD_MINBALREP; n > 0; n--) {
1043 		reps = count_replicas(*clpp, n);
1044 		if (reps > 0 && reps <= MDDB_NLB) {
1045 			*minimum_replicas = n;
1046 			return (0);
1047 		}
1048 	}
1049 
1050 	/*
1051 	 * Return a ctrl list that does not include the SSA-100 buses as
1052 	 * separate controllers.  This will create fewer separate controllers.
1053 	 */
1054 	*minimum_replicas = 1;
1055 	return (0);
1056 }
1057 
1058 static int
1059 morethan2_ctl_balance(
1060 	mdsetname_t		*sp,
1061 	md_ctlr_ctl_t		*clp,
1062 	int			min_reps,
1063 	md_error_t		*ep
1064 )
1065 {
1066 	md_ctlr_ctl_t		*c;
1067 	int			err;
1068 	int			multiple_reps = 0;
1069 	md_ctlr_drv_t		*d;
1070 
1071 	for (c = clp; c != NULL; c = c->ctl_next) {
1072 		if (c->ctl_drcnt == 0)
1073 			continue;
1074 
1075 		/*
1076 		 * check for multiple databases on a disk and compensate
1077 		 */
1078 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1079 			if (d->drv_dbcnt)
1080 				multiple_reps += d->drv_dbcnt - 1;
1081 		}
1082 
1083 		/*
1084 		 * remove the number of multiple databases count from the
1085 		 * total db count. This enables us to rebalance if one of
1086 		 * the disks has a large enough slice for 2 metadb's. If we
1087 		 * then add a disk with a smaller slice into the set, we want
1088 		 * that disk to get a replica on it. If we just compare to
1089 		 * ctl_dbcnt, it won't.
1090 		 */
1091 		while ((c->ctl_dbcnt - multiple_reps) <
1092 		    min_reps) {
1093 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
1094 				return (-1);
1095 			if (err == 0)
1096 				break;
1097 		}
1098 
1099 		while (c->ctl_dbcnt > min_reps) {
1100 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
1101 				return (-1);
1102 			if (err == 0)
1103 				break;
1104 		}
1105 	}
1106 
1107 	return (0);
1108 }
1109 
1110 static int
1111 lessthan3_ctl_balance(
1112 	mdsetname_t		*sp,
1113 	md_ctlr_ctl_t		*clp,
1114 	int			min_reps,
1115 	md_error_t		*ep
1116 )
1117 {
1118 	md_ctlr_ctl_t		*c;
1119 	int			err;
1120 	int			multiple_reps = 0;
1121 	md_ctlr_drv_t		*d;
1122 
1123 	for (c = clp; c != NULL; c = c->ctl_next) {
1124 		if (c->ctl_drcnt == 0)
1125 			continue;
1126 
1127 		/*
1128 		 * check for multiple databases on a disk and compensate
1129 		 */
1130 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1131 			if (d->drv_dbcnt)
1132 				multiple_reps += d->drv_dbcnt - 1;
1133 		}
1134 
1135 		/*
1136 		 * remove the number of multiple databases count from the
1137 		 * total db count. This enables us to rebalance if one of
1138 		 * the disks has a large enough slice for 2 metadb's. If we
1139 		 * then add a disk with a smaller slice into the set, we want
1140 		 * that disk to get a replica on it. If we just compare to
1141 		 * ctl_dbcnt, it won't.
1142 		 */
1143 		while ((c->ctl_dbcnt - multiple_reps) <
1144 		    (min_reps * c->ctl_drcnt)) {
1145 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
1146 				return (-1);
1147 			if (err == 0)
1148 				break;
1149 		}
1150 
1151 		while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) {
1152 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
1153 				return (-1);
1154 			if (err == 0)
1155 				break;
1156 		}
1157 	}
1158 
1159 	return (0);
1160 }
1161 
1162 static int
1163 try_again(
1164 	md_ctlr_ctl_t	*clp,
1165 	md_error_t	*ep
1166 )
1167 {
1168 	md_ctlr_ctl_t	*c;
1169 	md_ctlr_drv_t	*d;
1170 
1171 	if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
1172 		return (TRUE);
1173 
1174 	/*
1175 	 * retry if all the errored drives are already in the diskset.
1176 	 */
1177 	for (c = clp; c != NULL; c = c->ctl_next) {
1178 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1179 			if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR))
1180 			    == DRV_F_ERROR)
1181 				return (FALSE);
1182 		}
1183 	}
1184 	return (TRUE);
1185 }
1186 
1187 int
1188 meta_db_balance(
1189 	mdsetname_t		*sp,
1190 	md_drive_desc		*opdd,
1191 	md_drive_desc		*curdd,
1192 	daddr_t			dbsize,
1193 	md_error_t		*ep
1194 )
1195 {
1196 	int			min_reps;
1197 	md_ctlr_ctl_t		*c, *cl = NULL;
1198 	int			uctlrs = 0;
1199 	int			retry = 0;
1200 	int			rval = 0;
1201 
1202 	if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1)
1203 		return (-1);
1204 
1205 	/*
1206 	 * Determine what controllers are usable in the sense of being able to
1207 	 * add a replica to a drive on the controller.
1208 	 */
1209 	for (c = cl; c != NULL; c = c->ctl_next)
1210 		if (c->ctl_drcnt > 0)
1211 			uctlrs++;
1212 
1213 	/*
1214 	 * Add replicas to achieve a balance.
1215 	 */
1216 	if (uctlrs > 2)
1217 		rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
1218 	else
1219 		rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
1220 
1221 	if (rval) {
1222 		if ((retry = try_again(cl, ep)) == TRUE) {
1223 			mdclrerror(ep);
1224 			rval = 0;
1225 		}
1226 	}
1227 
1228 	/*
1229 	 * Delete all the replicas from drives that are so marked.
1230 	 */
1231 	if (! rval)
1232 		rval = del_replicas(sp, cl, ep);
1233 
1234 	if (retry) {
1235 		if (uctlrs > 2)
1236 			rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
1237 		else
1238 			rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
1239 
1240 		if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) {
1241 			mdclrerror(ep);
1242 			rval = 0;
1243 		}
1244 	}
1245 
1246 	/*
1247 	 * Free up the ctlr list.
1248 	 */
1249 	free_ctlr_lst(&cl);
1250 
1251 	return (rval);
1252 }
1253