/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Database location balancing code.
 */

#include <meta.h>
#include <sys/lvm/md_mddb.h>
#include <sdssc.h>

#define	MD_MINBALREP	2

/*
 * Stuff for DB balancing.
40 */ 41 enum md_ctlr_ops_t { 42 DRV_NOP = 0, 43 DRV_ADD = 1, 44 DRV_DEL = 2 45 }; 46 typedef enum md_ctlr_ops_t md_ctlr_ops_t; 47 48 /* drive flag fields */ 49 #define DRV_F_ERROR 0x1 50 #define DRV_F_INDISKSET 0x2 51 52 struct md_ctlr_drv_t { 53 md_ctlr_ops_t drv_op; 54 int drv_flags; 55 int drv_dbcnt; 56 int drv_new_dbcnt; 57 daddr_t drv_dbsize; 58 mddrivename_t *drv_dnp; 59 struct md_ctlr_drv_t *drv_next; 60 }; 61 typedef struct md_ctlr_drv_t md_ctlr_drv_t; 62 63 struct md_ctlr_ctl_t { 64 mdcinfo_t *ctl_cinfop; 65 int ctl_dbcnt; 66 int ctl_drcnt; 67 md_ctlr_drv_t *ctl_drvs; 68 struct md_ctlr_ctl_t *ctl_next; 69 }; 70 typedef struct md_ctlr_ctl_t md_ctlr_ctl_t; 71 72 static int 73 add_replica( 74 mdsetname_t *sp, 75 mddrivename_t *dnp, 76 int dbcnt, 77 daddr_t dbsize, 78 md_error_t *ep 79 ) 80 { 81 mdnamelist_t *nlp = NULL; 82 mdname_t *np; 83 md_set_desc *sd; 84 uint_t rep_slice; 85 86 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) 87 return (-1); 88 89 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 90 return (-1); 91 92 (void) metanamelist_append(&nlp, np); 93 94 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 95 metafreenamelist(nlp); 96 return (-1); 97 } 98 99 if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED), 100 (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) { 101 metafreenamelist(nlp); 102 return (-1); 103 } 104 105 metafreenamelist(nlp); 106 return (0); 107 } 108 109 static int 110 del_replica( 111 mdsetname_t *sp, 112 mddrivename_t *dnp, 113 md_error_t *ep 114 ) 115 { 116 mdnamelist_t *nlp = NULL; 117 mdname_t *np; 118 uint_t rep_slice; 119 120 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) 121 return (-1); 122 123 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 124 return (-1); 125 126 (void) metanamelist_append(&nlp, np); 127 128 if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED), 129 NULL, ep) == -1) { 130 metafreenamelist(nlp); 131 return (-1); 132 } 133 134 metafreenamelist(nlp); 135 return (0); 136 } 137 
138 static int 139 rep_has_err(md_replicalist_t *rlp, mdname_t *np) 140 { 141 md_replicalist_t *rl; 142 143 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 144 md_replica_t *r = rl->rl_repp; 145 146 if (strcmp(r->r_namep->cname, np->cname) != 0) 147 continue; 148 149 if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA | 150 MDDB_F_EMASTER | MDDB_F_EWRITE)) 151 return (1); 152 153 } 154 return (0); 155 } 156 157 static int 158 add_drv_to_ctl_lst( 159 md_ctlr_ctl_t **clpp, 160 md_replicalist_t *rlp, 161 mddrivename_t *dnp, 162 int dbcnt, 163 daddr_t dbsize, 164 mdcinfo_t *cinfop, 165 int indiskset, 166 int with_bus, 167 int errored, 168 md_error_t *ep 169 ) 170 { 171 md_ctlr_drv_t **dpp; 172 mdname_t *np; 173 mdcinfo_t *tcinfop; 174 char *cmp_name_1, 175 *cmp_name_2; 176 int not_found; 177 178 /* 179 * The user must pass in a list head. 180 */ 181 assert(clpp != NULL); 182 183 if (cinfop == NULL) { 184 uint_t rep_slice; 185 186 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) { 187 /* 188 * A failure to get the slice information can occur 189 * because the drive has failed, if this is the 190 * case then there is nothing that can be done 191 * with this drive, so do not include it in the 192 * list of drives. Clear the error and return. 193 */ 194 mdclrerror(ep); 195 return (0); 196 } 197 198 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) 199 return (-1); 200 201 if ((tcinfop = metagetcinfo(np, ep)) == NULL) 202 return (-1); 203 204 if (metagetvtoc(np, FALSE, NULL, ep) == NULL) 205 errored = 1; 206 207 if (rep_has_err(rlp, np)) 208 errored = 1; 209 } else 210 tcinfop = cinfop; 211 212 for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) { 213 /* 214 * Try to locate ctlr. 
215 */ 216 (void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1); 217 (void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname, 218 &cmp_name_2); 219 220 if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype || 221 tcinfop->cnum != (*clpp)->ctl_cinfop->cnum || 222 strncmp(cmp_name_1, cmp_name_2, 16) != 0 || 223 (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) { 224 not_found = 1; 225 } else 226 not_found = 0; 227 228 229 sdssc_convert_path_free(cmp_name_1); 230 sdssc_convert_path_free(cmp_name_2); 231 232 if (not_found) 233 continue; 234 235 /* 236 * Found ctlr, try to locate the drive. 237 */ 238 for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL; 239 dpp = &(*dpp)->drv_next) { 240 (void) sdssc_convert_cluster_path( 241 (*dpp)->drv_dnp->cname, &cmp_name_1); 242 (void) sdssc_convert_cluster_path(dnp->cname, 243 &cmp_name_2); 244 245 not_found = strcmp(cmp_name_1, cmp_name_2); 246 247 sdssc_convert_path_free(cmp_name_1); 248 sdssc_convert_path_free(cmp_name_2); 249 250 if (not_found) 251 continue; 252 253 /* 254 * Found drive, must be deleting. 
255 */ 256 (*dpp)->drv_op = DRV_DEL; 257 if (indiskset) 258 (*dpp)->drv_flags |= DRV_F_INDISKSET; 259 if (errored) { 260 mdclrerror(ep); 261 (*dpp)->drv_flags |= DRV_F_ERROR; 262 } 263 (*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt; 264 (*clpp)->ctl_drcnt--; 265 return (0); 266 } 267 /* 268 * The ctlr was found, but not the drive, so add 269 * the drive 270 */ 271 (*dpp) = Zalloc(sizeof (**dpp)); 272 273 274 if (indiskset) { 275 (*dpp)->drv_op = DRV_NOP; 276 (*dpp)->drv_flags |= DRV_F_INDISKSET; 277 if (errored) { 278 mdclrerror(ep); 279 (*dpp)->drv_flags |= DRV_F_ERROR; 280 } 281 } else { 282 (*dpp)->drv_op = DRV_ADD; 283 if (errored) { 284 (*dpp)->drv_flags |= DRV_F_ERROR; 285 return (-1); 286 } 287 assert(dbsize != 0); 288 } 289 (*dpp)->drv_dbcnt = dbcnt; 290 (*dpp)->drv_dbsize = dbsize; 291 (*dpp)->drv_dnp = dnp; 292 (*clpp)->ctl_dbcnt += dbcnt; 293 (*clpp)->ctl_drcnt++; 294 return (0); 295 } 296 /* 297 * No ctlr was located, so add the ctlr, then recurse to add the 298 * drive to the ctlr. 299 */ 300 (*clpp) = Zalloc(sizeof (**clpp)); 301 302 (*clpp)->ctl_cinfop = tcinfop; 303 304 return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop, 305 indiskset, with_bus, errored, ep)); 306 } 307 308 static int 309 add_replica_to_ctl( 310 mdsetname_t *sp, 311 md_ctlr_ctl_t *c, 312 int minimum_replicas, 313 md_error_t *ep 314 ) 315 { 316 md_ctlr_drv_t *d; 317 int maxdb = 0; 318 319 /* 320 * If this ctrl has no "usable" drives, assert() or just return if 321 * assert()'s are turned off. 322 */ 323 if (c->ctl_drcnt == 0) { 324 assert(0); 325 return (0); 326 } 327 328 /* 329 * Determine the largest DB count on a drive. 330 */ 331 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 332 if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL) 333 maxdb = d->drv_dbcnt; 334 335 /* 336 * Make sure we start at a reasonable number 337 */ 338 if (maxdb == 0) 339 maxdb = 1; 340 341 /* 342 * Add a replica to a drive on this ctrl. 
343 */ 344 /*CONSTCOND*/ 345 while (1) { 346 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 347 /* 348 * If this drive is being deleted, skip it. 349 */ 350 if (d->drv_op == DRV_DEL) 351 continue; 352 353 if (d->drv_flags & DRV_F_ERROR) 354 continue; 355 /* 356 * Make sure that the replicas are distributed across 357 * the drives. 358 */ 359 if (d->drv_dbcnt >= maxdb) 360 continue; 361 /* 362 * See if the drive already has replicas, 363 * if it does, then delete the exisiting 364 * replica(s) and re-add n+1 replicas to the drive. 365 */ 366 /* ==== Vulnerability - no DB's start ==== */ 367 if (d->drv_dbcnt > 0) { 368 if (del_replica(sp, d->drv_dnp, ep) == -1) { 369 d->drv_flags |= DRV_F_ERROR; 370 if (! (d->drv_flags & DRV_F_INDISKSET)) 371 return (-1); 372 mdclrerror(ep); 373 continue; 374 } 375 } 376 if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1), 377 d->drv_dbsize, ep) == -1) { 378 if (d->drv_dbcnt) { 379 c->ctl_dbcnt -= d->drv_dbcnt; 380 d->drv_dbcnt = 0; 381 } 382 383 if (mdismddberror(ep, MDE_TOOMANY_REPLICAS)) 384 return (-1); 385 386 if (mdismddberror(ep, MDE_REPLICA_TOOSMALL)) 387 return (-1); 388 389 d->drv_flags |= DRV_F_ERROR; 390 if (! (d->drv_flags & DRV_F_INDISKSET)) 391 return (-1); 392 mdclrerror(ep); 393 continue; 394 } 395 396 d->drv_dbcnt++; 397 c->ctl_dbcnt++; 398 /* ==== Vulnerability - no DB's end ==== */ 399 return (1); 400 } 401 maxdb++; 402 if (maxdb > minimum_replicas) 403 return (0); 404 } 405 /*NOTREACHED*/ 406 } 407 408 static int 409 del_replica_from_ctl( 410 mdsetname_t *sp, 411 md_ctlr_ctl_t *c, 412 md_error_t *ep 413 ) 414 { 415 md_ctlr_drv_t *d; 416 int maxdb = 0; 417 418 /* 419 * If this ctrl has no "usable" drives, assert() or just return if 420 * assert()'s are turned off. 421 */ 422 if (c->ctl_drcnt == 0) { 423 assert(0); 424 return (0); 425 } 426 427 /* 428 * Determine the largest DB count on a drive. 
429 */ 430 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 431 if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL) 432 maxdb = d->drv_dbcnt; 433 434 if (maxdb == 0) 435 return (0); 436 437 /* 438 * Delete a replica from a drive on this ctrl. 439 */ 440 /*CONSTCOND*/ 441 while (1) { 442 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 443 /* 444 * If this drive is being deleted, skip it. 445 */ 446 if (d->drv_op == DRV_DEL) 447 continue; 448 449 /* 450 * Make sure that there are replicas on this drive to 451 * delete. 452 */ 453 if (d->drv_dbcnt == 0) 454 continue; 455 456 if (d->drv_flags & DRV_F_ERROR) 457 continue; 458 459 /* 460 * We need to keep the DB's distributed across the 461 * drives. 462 */ 463 if (d->drv_dbcnt < maxdb) 464 continue; 465 466 /* 467 * Delete all the replicas on the drive. 468 */ 469 /* ==== Vulnerability - no DB's start ==== */ 470 if (del_replica(sp, d->drv_dnp, ep) == -1) { 471 d->drv_flags |= DRV_F_ERROR; 472 if (! (d->drv_flags & DRV_F_INDISKSET)) 473 return (-1); 474 mdclrerror(ep); 475 continue; 476 } 477 d->drv_dbcnt--; 478 c->ctl_dbcnt--; 479 /* 480 * If there is still a dbcnt for this drive, then add 481 * back the needed DB's. 482 */ 483 if (d->drv_dbcnt > 0) { 484 if (add_replica(sp, d->drv_dnp, d->drv_dbcnt, 485 d->drv_dbsize, ep) == -1) { 486 c->ctl_dbcnt -= d->drv_dbcnt; 487 d->drv_dbcnt = 0; 488 489 if (mdismddberror(ep, 490 MDE_TOOMANY_REPLICAS)) 491 return (-1); 492 493 d->drv_flags |= DRV_F_ERROR; 494 if (! 
(d->drv_flags & DRV_F_INDISKSET)) 495 return (-1); 496 mdclrerror(ep); 497 continue; 498 } 499 } 500 /* ==== Vulnerability - no DB's end ==== */ 501 return (1); 502 } 503 maxdb--; 504 if (maxdb <= 0) 505 return (0); 506 } 507 /*NOTREACHED*/ 508 } 509 510 static int 511 del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep) 512 { 513 md_ctlr_ctl_t *c; 514 md_ctlr_drv_t *d; 515 mdnamelist_t *nlp; 516 mdname_t *np; 517 518 for (c = clp; c != NULL; c = c->ctl_next) { 519 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 520 uint_t rep_slice; 521 522 if (! (d->drv_flags & DRV_F_ERROR) && 523 (d->drv_op != DRV_DEL)) 524 continue; 525 526 if (d->drv_dbcnt == 0) 527 continue; 528 529 if (meta_replicaslice(d->drv_dnp, 530 &rep_slice, ep) != 0) 531 return (-1); 532 533 np = metaslicename(d->drv_dnp, rep_slice, ep); 534 if (np == NULL) 535 return (-1); 536 537 nlp = NULL; 538 (void) metanamelist_append(&nlp, np); 539 540 /* 541 * Delete the replicas listed. 542 */ 543 if (meta_db_detach(sp, nlp, 544 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, 545 ep) == -1) { 546 metafreenamelist(nlp); 547 if (d->drv_flags & DRV_F_INDISKSET) { 548 mdclrerror(ep); 549 continue; 550 } 551 return (-1); 552 } 553 metafreenamelist(nlp); 554 } 555 } 556 557 return (0); 558 } 559 560 static void 561 free_ctlr_lst(md_ctlr_ctl_t **clpp) 562 { 563 md_ctlr_ctl_t *c, *tc = NULL; 564 md_ctlr_drv_t *d, *td = NULL; 565 566 for (c = *clpp; c != NULL; c = tc) { 567 tc = c->ctl_next; 568 for (d = c->ctl_drvs; d != NULL; d = td) { 569 td = d->drv_next; 570 Free(d); 571 } 572 Free(c); 573 } 574 *clpp = NULL; 575 } 576 577 static int 578 build_ctlr_lst( 579 mdsetname_t *sp, 580 md_ctlr_ctl_t **clpp, 581 md_drive_desc *opdd, 582 md_drive_desc *curdd, 583 int with_bus, 584 daddr_t dbsize, 585 md_error_t *ep 586 ) 587 { 588 md_drive_desc *d; 589 md_set_desc *sd; 590 daddr_t nblks; 591 md_replicalist_t *rlp = NULL; 592 static daddr_t min_dbsize = 0; 593 594 if (min_dbsize == 0) { 595 if ((nblks = 
meta_db_minreplica(sp, ep)) < 0) { 596 min_dbsize = MD_DBSIZE; 597 598 if (! metaislocalset(sp)) { 599 if ((sd = metaget_setdesc(sp, ep)) == NULL) 600 return (-1); 601 602 if (MD_MNSET_DESC(sd)) 603 min_dbsize = MD_MN_DBSIZE; 604 } 605 mdclrerror(ep); 606 } else 607 min_dbsize = nblks; 608 } 609 610 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 611 if (! mdismddberror(ep, MDE_DB_NODB) && 612 ! mdismddberror(ep, MDE_DB_NOTOWNER)) 613 return (-1); 614 mdclrerror(ep); 615 } 616 617 /* 618 * Add drives currently in the set to the ctlr list. 619 */ 620 for (d = curdd; d != NULL; d = d->dd_next) { 621 daddr_t this_dbsize = d->dd_dbsize; 622 623 if (this_dbsize == 0) 624 this_dbsize = min_dbsize; 625 626 if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt, 627 this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1) 628 return (-1); 629 } 630 631 /* 632 * Add the drives that are being operated on to the ctlr list. 633 */ 634 for (d = opdd; d != NULL; d = d->dd_next) 635 if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL, 636 FALSE, with_bus, 0, ep) == -1) 637 return (-1); 638 639 metafreereplicalist(rlp); 640 return (0); 641 } 642 643 static int 644 count_replica_on_ctl( 645 md_ctlr_ctl_t *c, 646 int adding, 647 int *db_cnt, 648 int minimum_replicas 649 ) 650 { 651 md_ctlr_drv_t *d; 652 int maxdb = 0; 653 654 /* 655 * If this ctrl has no "usable" drives, nothing to do. 656 */ 657 if (c->ctl_drcnt == 0) 658 return (0); 659 660 /* 661 * Determine the largest DB count on a drive. 662 */ 663 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 664 if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL) 665 maxdb = d->drv_new_dbcnt; 666 667 /* 668 * Make sure we start at a reasonable number 669 */ 670 if (maxdb == 0) { 671 if (!adding) 672 return (0); 673 maxdb = 1; 674 } 675 676 /* 677 * Count or Un-Count replicas that would be 678 * added or deleted respectively. 
679 */ 680 /*CONSTCOND*/ 681 while (1) { 682 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 683 /* 684 * If this drive is being deleted, skip it. 685 */ 686 if (d->drv_op == DRV_DEL) 687 continue; 688 689 /* 690 * If the drive is errored and adding, skip it. 691 */ 692 if (adding && (d->drv_flags & DRV_F_ERROR)) 693 continue; 694 695 /* 696 * Make sure that the replicas are distributed across 697 * the drives. 698 */ 699 if (adding) { 700 if (d->drv_new_dbcnt >= maxdb) 701 continue; 702 } else { 703 if (d->drv_new_dbcnt == 0) 704 continue; 705 if (d->drv_new_dbcnt < maxdb) 706 continue; 707 } 708 709 /* 710 * Count or Un-Count replicas here. 711 */ 712 if (adding) { 713 mdpart_t *partp; 714 uint_t rep_slice; 715 md_error_t mde; 716 717 if (meta_replicaslice(d->drv_dnp, 718 &rep_slice, &mde) != 0) 719 continue; 720 721 partp = &d->drv_dnp->vtoc.parts[rep_slice]; 722 if (! partp) 723 continue; 724 725 if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) > 726 (partp->size - 16)) 727 continue; 728 (*db_cnt)++; 729 d->drv_new_dbcnt++; 730 } else { 731 (*db_cnt)--; 732 d->drv_new_dbcnt--; 733 } 734 return (0); 735 } 736 737 /* 738 * This should make sure they get spread 739 * around. This is to emulate the {add,del}_replica 740 * routines. 741 */ 742 if (adding) { 743 maxdb++; 744 if (maxdb > minimum_replicas) 745 return (-1); 746 } else { 747 maxdb--; 748 if (maxdb <= 0) 749 return (-1); 750 } 751 } 752 /*NOTREACHED*/ 753 } 754 755 static int 756 count_replicas( 757 md_ctlr_ctl_t *clp, 758 int min_reps 759 ) 760 { 761 md_ctlr_ctl_t *c; 762 md_ctlr_drv_t *d; 763 int db_cnt; 764 int uctlrs = 0; 765 int total_cnt = 0; 766 767 /* 768 * Count the number of controllers, 769 * counting the replicas is slightly different based 770 * on the controller count. 
771 */ 772 for (c = clp; c != NULL; c = c->ctl_next) 773 if (c->ctl_drcnt > 0) { 774 uctlrs++; 775 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) 776 d->drv_new_dbcnt = d->drv_dbcnt; 777 } 778 779 if (uctlrs > 2) { 780 for (c = clp; c != NULL; c = c->ctl_next) { 781 if (c->ctl_drcnt == 0) 782 continue; 783 784 db_cnt = c->ctl_dbcnt; 785 /* 786 * Count the replicas that would be added. 787 */ 788 while (db_cnt < min_reps) 789 if (count_replica_on_ctl(c, TRUE, 790 &db_cnt, min_reps)) 791 return (-1); 792 793 /* 794 * Un-Count the replicas that would be deleted. 795 */ 796 while (db_cnt > min_reps) 797 if (count_replica_on_ctl(c, FALSE, 798 &db_cnt, min_reps)) 799 return (-1); 800 total_cnt += db_cnt; 801 } 802 } else { 803 for (c = clp; c != NULL; c = c->ctl_next) { 804 if (c->ctl_drcnt == 0) 805 continue; 806 807 db_cnt = c->ctl_dbcnt; 808 /* 809 * Count the replicas that woud be added. 810 */ 811 while (db_cnt < (min_reps * c->ctl_drcnt)) 812 if (count_replica_on_ctl(c, TRUE, 813 &db_cnt, min_reps)) 814 return (-1); 815 816 total_cnt += db_cnt; 817 } 818 } 819 820 return (total_cnt); 821 } 822 823 static int 824 balance_replicas( 825 mdsetname_t *sp, 826 md_ctlr_ctl_t **clpp, 827 md_drive_desc *opdd, 828 md_drive_desc *curdd, 829 daddr_t dbsize, 830 int *minimum_replicas, 831 md_error_t *ep 832 ) 833 { 834 int n; 835 int rctlrs = 0; 836 int uctlrs; 837 int ructlrs; 838 int octlrs; 839 int save_done; 840 int prevcnt = 0, issame = 1; 841 uint_t drvcnt = ~0U; 842 uint_t save_cnum; 843 mhd_ctlrtype_t save_ctype; 844 char save_cname[16], 845 *cmp_name_1, 846 *cmp_name_2; 847 int reps; 848 md_ctlr_ctl_t *c; 849 850 /* 851 * Build a ctlr list with SSA-100 busses NOT as separate controllers. 852 */ 853 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) 854 return (-1); 855 856 /* 857 * Determine what controllers are usable in the sense of being able to 858 * add a replica to a drive on the controller. 
859 * Also find the minimum number of drives on a controller. 860 */ 861 for (c = *clpp; c != NULL; c = c->ctl_next) { 862 if (c->ctl_drcnt > 0) { 863 rctlrs++; 864 drvcnt = min(drvcnt, c->ctl_drcnt); 865 if (prevcnt == 0) 866 prevcnt = c->ctl_drcnt; 867 else if (prevcnt != c->ctl_drcnt) 868 issame = 0; 869 } 870 } 871 872 if ((rctlrs <= 2) || (issame && (drvcnt >= 30))) 873 goto cont; 874 875 /* 876 * If here: Handling 3 or more controllers most 877 * likely with non-symmetrical number of 878 * disks. The number of replicas will be 879 * the minimum number of disks on a controller. 880 * 881 * The main point is to insure that a 882 * controller does not have more than half 883 * of the replicas. 884 */ 885 drvcnt = min(drvcnt, 12); 886 drvcnt = max(drvcnt, MD_MINBALREP); 887 888 /* 889 * Can we find fewer than the maximum replicas by reducing the 890 * number of replicas per drive. 891 */ 892 for (n = drvcnt; n > 0; n--) { 893 reps = count_replicas(*clpp, n); 894 if (reps > 0 && reps <= MDDB_NLB) { 895 *minimum_replicas = n; 896 return (0); 897 } 898 } 899 900 cont: 901 free_ctlr_lst(clpp); 902 903 /* 904 * Build a ctlr list with SSA-100 busses as separate controllers. 905 * 906 * If Here: Try to put 2 replicas per controller/bus 907 * If that doesn't work put 1 replica per controller/bus 908 */ 909 if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1) 910 return (-1); 911 912 /* 913 * If the number of "real" controllers is 2, special handling may be 914 * needed. 915 */ 916 if (rctlrs != 2) { 917 drvcnt = MD_MINBALREP; 918 goto other; 919 } 920 921 /* 922 * Determine what controllers are usable in the sense of being able to 923 * add a replica to a drive on the controller. 924 * Also find the minimum number of drives on a controller. 
925 */ 926 drvcnt = ~0U; 927 uctlrs = 0; 928 for (c = *clpp; c != NULL; c = c->ctl_next) { 929 if (c->ctl_drcnt > 0) { 930 uctlrs++; 931 drvcnt = min(drvcnt, c->ctl_drcnt); 932 } 933 } 934 935 /* 936 * If the number of controllers is not changed, continue with original 937 * strategy. 938 */ 939 if (uctlrs == rctlrs) { 940 drvcnt = MD_MINBALREP; 941 goto other; 942 } 943 944 /* 945 * Check the distribution of bus ctlrs across real controllers. 946 */ 947 ructlrs = 0; 948 octlrs = 0; 949 save_done = 0; 950 for (c = *clpp; c != NULL; c = c->ctl_next) { 951 if (c->ctl_drcnt == 0) 952 continue; 953 954 if (! save_done) { 955 save_cnum = c->ctl_cinfop->cnum; 956 save_ctype = c->ctl_cinfop->ctype; 957 (void) strncpy(save_cname, c->ctl_cinfop->cname, 16); 958 save_done = 1; 959 } 960 961 (void) sdssc_convert_cluster_path(c->ctl_cinfop->cname, 962 &cmp_name_1); 963 (void) sdssc_convert_cluster_path(save_cname, &cmp_name_2); 964 965 if (save_ctype != c->ctl_cinfop->ctype || 966 save_cnum != c->ctl_cinfop->cnum || 967 strncmp(cmp_name_1, cmp_name_2, 16) != 0) 968 octlrs++; 969 else 970 ructlrs++; 971 972 sdssc_convert_path_free(cmp_name_1); 973 sdssc_convert_path_free(cmp_name_2); 974 } 975 976 /* 977 * Take the largest of the counts 978 */ 979 ructlrs = max(ructlrs, octlrs); 980 981 /* 982 * If the distribution of bus controlers is half of the total, then 983 * this layout strategy will work, doit. 984 */ 985 if ((uctlrs / 2) == ructlrs) { 986 drvcnt = MD_MINBALREP; 987 goto other; 988 } 989 990 /* 991 * If here, there is a distribution of bus controllers that will cause 992 * the real controller distribution to be unbalanced, so a different 993 * strategy is used. 994 */ 995 free_ctlr_lst(clpp); 996 997 /* 998 * Build the ctlr list with SSA-100 busses NOT as separate controllers. 
999 */ 1000 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) 1001 return (-1); 1002 1003 /* 1004 * Make ctl_drcnt limit the number of replicas 1005 */ 1006 for (c = *clpp; c != NULL; c = c->ctl_next) 1007 c->ctl_drcnt = min(drvcnt, c->ctl_drcnt); 1008 1009 /* 1010 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt 1011 */ 1012 drvcnt = MD_MINBALREP; 1013 1014 other: 1015 /* 1016 * Can we find fewer than the maximum replicas by reducing the number 1017 * of replicas per drive. 1018 */ 1019 for (n = drvcnt; n > 0; n--) { 1020 reps = count_replicas(*clpp, n); 1021 if (reps > 0 && reps <= MDDB_NLB) { 1022 *minimum_replicas = n; 1023 return (0); 1024 } 1025 } 1026 1027 free_ctlr_lst(clpp); 1028 1029 /* 1030 * Build a ctlr list with SSA-100 busses NOT as separate controllers. 1031 * 1032 * If Here: Try to put 2 replicas per controller (not on busses) 1033 * If that doesn't work put 1 replica per controller 1034 */ 1035 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) 1036 return (-1); 1037 1038 /* 1039 * Can we find fewer than the maximum replicas by reducing the 1040 * number of replicas per drive. 1041 */ 1042 for (n = MD_MINBALREP; n > 0; n--) { 1043 reps = count_replicas(*clpp, n); 1044 if (reps > 0 && reps <= MDDB_NLB) { 1045 *minimum_replicas = n; 1046 return (0); 1047 } 1048 } 1049 1050 /* 1051 * Return a ctrl list that does not include the SSA-100 buses as 1052 * separate controllers. This will create fewer separate controllers. 
1053 */ 1054 *minimum_replicas = 1; 1055 return (0); 1056 } 1057 1058 static int 1059 morethan2_ctl_balance( 1060 mdsetname_t *sp, 1061 md_ctlr_ctl_t *clp, 1062 int min_reps, 1063 md_error_t *ep 1064 ) 1065 { 1066 md_ctlr_ctl_t *c; 1067 int err; 1068 int multiple_reps = 0; 1069 md_ctlr_drv_t *d; 1070 1071 for (c = clp; c != NULL; c = c->ctl_next) { 1072 if (c->ctl_drcnt == 0) 1073 continue; 1074 1075 /* 1076 * check for multiple databases on a disk and compensate 1077 */ 1078 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 1079 if (d->drv_dbcnt) 1080 multiple_reps += d->drv_dbcnt - 1; 1081 } 1082 1083 /* 1084 * remove the number of multiple databases count from the 1085 * total db count. This enables us to rebalance if one of 1086 * the disks has a large enough slice for 2 metadb's. If we 1087 * then add a disk with a smaller slice into the set, we want 1088 * that disk to get a replica on it. If we just compare to 1089 * ctl_dbcnt, it won't. 1090 */ 1091 while ((c->ctl_dbcnt - multiple_reps) < 1092 min_reps) { 1093 if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0) 1094 return (-1); 1095 if (err == 0) 1096 break; 1097 } 1098 1099 while (c->ctl_dbcnt > min_reps) { 1100 if ((err = del_replica_from_ctl(sp, c, ep)) < 0) 1101 return (-1); 1102 if (err == 0) 1103 break; 1104 } 1105 } 1106 1107 return (0); 1108 } 1109 1110 static int 1111 lessthan3_ctl_balance( 1112 mdsetname_t *sp, 1113 md_ctlr_ctl_t *clp, 1114 int min_reps, 1115 md_error_t *ep 1116 ) 1117 { 1118 md_ctlr_ctl_t *c; 1119 int err; 1120 int multiple_reps = 0; 1121 md_ctlr_drv_t *d; 1122 1123 for (c = clp; c != NULL; c = c->ctl_next) { 1124 if (c->ctl_drcnt == 0) 1125 continue; 1126 1127 /* 1128 * check for multiple databases on a disk and compensate 1129 */ 1130 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 1131 if (d->drv_dbcnt) 1132 multiple_reps += d->drv_dbcnt - 1; 1133 } 1134 1135 /* 1136 * remove the number of multiple databases count from the 1137 * total db count. 
This enables us to rebalance if one of 1138 * the disks has a large enough slice for 2 metadb's. If we 1139 * then add a disk with a smaller slice into the set, we want 1140 * that disk to get a replica on it. If we just compare to 1141 * ctl_dbcnt, it won't. 1142 */ 1143 while ((c->ctl_dbcnt - multiple_reps) < 1144 (min_reps * c->ctl_drcnt)) { 1145 if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0) 1146 return (-1); 1147 if (err == 0) 1148 break; 1149 } 1150 1151 while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) { 1152 if ((err = del_replica_from_ctl(sp, c, ep)) < 0) 1153 return (-1); 1154 if (err == 0) 1155 break; 1156 } 1157 } 1158 1159 return (0); 1160 } 1161 1162 static int 1163 try_again( 1164 md_ctlr_ctl_t *clp, 1165 md_error_t *ep 1166 ) 1167 { 1168 md_ctlr_ctl_t *c; 1169 md_ctlr_drv_t *d; 1170 1171 if (mdismddberror(ep, MDE_TOOMANY_REPLICAS)) 1172 return (TRUE); 1173 1174 /* 1175 * retry if all the errored drives are already in the diskset. 1176 */ 1177 for (c = clp; c != NULL; c = c->ctl_next) { 1178 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { 1179 if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR)) 1180 == DRV_F_ERROR) 1181 return (FALSE); 1182 } 1183 } 1184 return (TRUE); 1185 } 1186 1187 int 1188 meta_db_balance( 1189 mdsetname_t *sp, 1190 md_drive_desc *opdd, 1191 md_drive_desc *curdd, 1192 daddr_t dbsize, 1193 md_error_t *ep 1194 ) 1195 { 1196 int min_reps; 1197 md_ctlr_ctl_t *c, *cl = NULL; 1198 int uctlrs = 0; 1199 int retry = 0; 1200 int rval = 0; 1201 1202 if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1) 1203 return (-1); 1204 1205 /* 1206 * Determine what controllers are usable in the sense of being able to 1207 * add a replica to a drive on the controller. 1208 */ 1209 for (c = cl; c != NULL; c = c->ctl_next) 1210 if (c->ctl_drcnt > 0) 1211 uctlrs++; 1212 1213 /* 1214 * Add replicas to achieve a balance. 
1215 */ 1216 if (uctlrs > 2) 1217 rval = morethan2_ctl_balance(sp, cl, min_reps, ep); 1218 else 1219 rval = lessthan3_ctl_balance(sp, cl, min_reps, ep); 1220 1221 if (rval) { 1222 if ((retry = try_again(cl, ep)) == TRUE) { 1223 mdclrerror(ep); 1224 rval = 0; 1225 } 1226 } 1227 1228 /* 1229 * Delete all the replicas from drives that are so marked. 1230 */ 1231 if (! rval) 1232 rval = del_replicas(sp, cl, ep); 1233 1234 if (retry) { 1235 if (uctlrs > 2) 1236 rval = morethan2_ctl_balance(sp, cl, min_reps, ep); 1237 else 1238 rval = lessthan3_ctl_balance(sp, cl, min_reps, ep); 1239 1240 if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) { 1241 mdclrerror(ep); 1242 rval = 0; 1243 } 1244 } 1245 1246 /* 1247 * Free up the ctlr list. 1248 */ 1249 free_ctlr_lst(&cl); 1250 1251 return (rval); 1252 } 1253