1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/conf.h> 29 #include <sys/time.h> 30 #include <sys/uio.h> 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/systeminfo.h> 34 #include <sys/sysmacros.h> 35 #include <sys/buf.h> 36 #include <sys/kmem.h> 37 #include <sys/file.h> 38 #include <sys/open.h> 39 #include <sys/debug.h> 40 #include <sys/stat.h> 41 #include <sys/lvm/mdvar.h> 42 #include <sys/lvm/md_crc.h> 43 #include <sys/lvm/md_convert.h> 44 #include <sys/types.h> 45 #include <sys/kmem.h> 46 #include <sys/lvm/mdmn_commd.h> 47 #include <sys/cladm.h> 48 49 mhd_mhiargs_t defmhiargs = { 50 1000, 51 { 6000, 6000, 30000 } 52 }; 53 54 #define MDDB 55 56 #include <sys/lvm/mdvar.h> 57 #include <sys/lvm/mdmed.h> 58 #include <sys/lvm/md_names.h> 59 #include <sys/cred.h> 60 #include <sys/ddi.h> 61 #include <sys/sunddi.h> 62 #include <sys/esunddi.h> 63 64 #include <sys/sysevent/eventdefs.h> 65 #include <sys/sysevent/svm.h> 66 67 extern char svm_bootpath[]; 68 69 int md_maxbootlist = MAXBOOTLIST; 70 static 
ulong_t mddb_maxblocks = 0; /* tune for small records */ 71 static int mddb_maxbufheaders = 50; 72 static uint_t mddb_maxcopies = MDDB_NLB; 73 74 /* 75 * If this is set, more detailed messages about DB init will be given, instead 76 * of just the MDE_DB_NODB. 77 */ 78 static int mddb_db_err_detail = 0; 79 80 /* 81 * This lock is used to single-thread load/unload of all sets 82 */ 83 static kmutex_t mddb_lock; 84 85 /* 86 * You really do NOT want to change this boolean. 87 * It can be VERY dangerous to do so. Loss of 88 * data may occur. USE AT YOUR OWN RISK!!!! 89 */ 90 static int mddb_allow_half = 0; 91 /* 92 * For mirrored root allow reboot with only half the replicas available 93 * Flag inserted for Santa Fe project. 94 */ 95 int mirrored_root_flag; 96 97 #define ISWHITE(c) (((c) == ' ') || ((c) == '\t') || \ 98 ((c) == '\r') || ((c) == '\n')) 99 #define ISNUM(c) (((c) >= '0') && ((c) <= '9')) 100 101 #define SETMUTEX(setno) (&md_set[setno].s_dbmx) 102 103 extern md_krwlock_t md_unit_array_rw; /* md.c */ 104 extern set_t md_nsets; /* md.c */ 105 extern int md_nmedh; /* md.c */ 106 extern md_set_t md_set[]; /* md.c */ 107 extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*); 108 extern dev_info_t *md_devinfo; 109 extern int md_init_debug; 110 extern int md_status; 111 extern md_ops_t *md_opslist; 112 extern md_krwlock_t nm_lock; 113 114 static int update_locatorblock(mddb_set_t *s, md_dev64_t dev, 115 ddi_devid_t didptr, ddi_devid_t old_didptr); 116 117 /* 118 * Defines for crc calculation for records 119 * rec_crcgen generates a crc checksum for a record block 120 * rec_crcchk checks the crc checksum for a record block 121 */ 122 #define REC_CRCGEN 0 123 #define REC_CRCCHK 1 124 #define rec_crcgen(s, dep, rbp) \ 125 (void) rec_crcfunc(s, dep, rbp, REC_CRCGEN) 126 #define rec_crcchk(s, dep, rbp) \ 127 rec_crcfunc(s, dep, rbp, REC_CRCCHK) 128 129 /* 130 * During upgrade, SVM basically runs with the devt from the target 131 * being upgraded. 
Translations are made from the target devt to the 132 * miniroot devt when writing data out to the disk. This is done by 133 * the following routines: 134 * wrtblklst 135 * writeblks 136 * readblklst 137 * readblks 138 * dt_read 139 * 140 * The following routines are used by the routines listed above and 141 * expect a translated (aka miniroot) devt: 142 * getblks 143 * getmasters 144 * 145 * Also, when calling any system routines, such as ddi_lyr_get_devid, 146 * the translated (aka miniroot) devt must be used. 147 * 148 * By the same token, the major number and major name conversion operations 149 * need to use the name_to_major file from the target system instead 150 * of the name_to_major file on the miniroot. So, calls to 151 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major 152 * when running on an upgrade. Same is true with calls to 153 * ddi_major_to_name. 154 */ 155 156 157 #ifndef MDDB_FAKE 158 159 static int 160 mddb_rwdata( 161 mddb_set_t *s, /* incore db set structure */ 162 int flag, /* B_ASYNC, B_FAILFAST or 0 passed in here */ 163 buf_t *bp 164 ) 165 { 166 int err = 0; 167 168 bp->b_flags = (flag | B_BUSY) & (~B_ASYNC); 169 170 mutex_exit(SETMUTEX(s->s_setno)); 171 if (mdv_strategy_tstpnt == NULL || 172 (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0) 173 (void) bdev_strategy(bp); 174 175 if (flag & B_ASYNC) { 176 mutex_enter(SETMUTEX(s->s_setno)); 177 return (0); 178 } 179 180 err = biowait(bp); 181 mutex_enter(SETMUTEX(s->s_setno)); 182 return (err); 183 } 184 185 static void 186 setidentifier( 187 mddb_set_t *s, 188 identifier_t *ident 189 ) 190 { 191 if (s->s_setno == MD_LOCAL_SET) 192 (void) strcpy(&ident->serial[0], s->s_ident.serial); 193 else 194 ident->createtime = s->s_ident.createtime; 195 } 196 197 static int 198 cmpidentifier( 199 mddb_set_t *s, 200 identifier_t *ident 201 ) 202 { 203 if (s->s_setno == MD_LOCAL_SET) 204 return (strcmp(ident->serial, s->s_ident.serial)); 205 else 206 return (timercmp(&ident->createtime, 
207 /*CSTYLED*/ 208 &s->s_ident.createtime, !=)); 209 } 210 211 static int 212 mddb_devopen( 213 md_dev64_t dev 214 ) 215 { 216 dev_t ddi_dev = md_dev64_to_dev(dev); 217 218 if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0) 219 return (0); 220 return (1); 221 } 222 223 static void 224 mddb_devclose( 225 md_dev64_t dev 226 ) 227 { 228 (void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred); 229 } 230 231 /* 232 * stripe_skip_ts 233 * 234 * Returns a list of fields to be skipped in the stripe record structure. 235 * These fields are ms_timestamp in the component structure. 236 * Used to skip these fields when calculating the checksum. 237 */ 238 static crc_skip_t * 239 stripe_skip_ts(void *un, uint_t revision) 240 { 241 struct ms_row32_od *small_mdr; 242 struct ms_row *big_mdr; 243 uint_t row, comp, ncomps, compoff; 244 crc_skip_t *skip; 245 crc_skip_t *skip_prev; 246 crc_skip_t skip_start = {0, 0, 0}; 247 ms_unit_t *big_un; 248 ms_unit32_od_t *small_un; 249 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 250 251 switch (revision) { 252 case MDDB_REV_RB: 253 case MDDB_REV_RBFN: 254 small_un = (ms_unit32_od_t *)un; 255 skip_prev = &skip_start; 256 257 if (small_un->un_nrows == 0) 258 return (NULL); 259 /* 260 * walk through all rows to find the total number 261 * of components 262 */ 263 small_mdr = &small_un->un_row[0]; 264 ncomps = 0; 265 for (row = 0; (row < small_un->un_nrows); row++) { 266 ncomps += small_mdr[row].un_ncomp; 267 } 268 269 /* Now walk through the components */ 270 compoff = small_un->un_ocomp + rb_off; 271 for (comp = 0; (comp < ncomps); ++comp) { 272 uint_t mdcp = compoff + 273 (comp * sizeof (ms_comp32_od_t)); 274 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 275 KM_SLEEP); 276 skip->skip_offset = mdcp + 277 offsetof(ms_comp32_od_t, un_mirror.ms_timestamp); 278 skip->skip_size = sizeof (md_timeval32_t); 279 skip_prev->skip_next = skip; 280 skip_prev = skip; 281 } 282 break; 283 case MDDB_REV_RB64: 284 case 
MDDB_REV_RB64FN: 285 big_un = (ms_unit_t *)un; 286 skip_prev = &skip_start; 287 288 if (big_un->un_nrows == 0) 289 return (NULL); 290 /* 291 * walk through all rows to find the total number 292 * of components 293 */ 294 big_mdr = &big_un->un_row[0]; 295 ncomps = 0; 296 for (row = 0; (row < big_un->un_nrows); row++) { 297 ncomps += big_mdr[row].un_ncomp; 298 } 299 300 /* Now walk through the components */ 301 compoff = big_un->un_ocomp + rb_off; 302 for (comp = 0; (comp < ncomps); ++comp) { 303 uint_t mdcp = compoff + 304 (comp * sizeof (ms_comp_t)); 305 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 306 KM_SLEEP); 307 skip->skip_offset = mdcp + 308 offsetof(ms_comp_t, un_mirror.ms_timestamp); 309 skip->skip_size = sizeof (md_timeval32_t); 310 skip_prev->skip_next = skip; 311 skip_prev = skip; 312 } 313 break; 314 } 315 /* Return the start of the list of fields to skip */ 316 return (skip_start.skip_next); 317 } 318 319 /* 320 * mirror_skip_ts 321 * 322 * Returns a list of fields to be skipped in the mirror record structure. 323 * This includes un_last_read and sm_timestamp for each submirror 324 * Used to skip these fields when calculating the checksum. 
325 */ 326 static crc_skip_t * 327 mirror_skip_ts(uint_t revision) 328 { 329 int i; 330 crc_skip_t *skip; 331 crc_skip_t *skip_prev; 332 crc_skip_t skip_start = {0, 0, 0}; 333 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 334 335 skip_prev = &skip_start; 336 337 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 338 switch (revision) { 339 case MDDB_REV_RB: 340 case MDDB_REV_RBFN: 341 skip->skip_offset = offsetof(mm_unit32_od_t, 342 un_last_read) + rb_off; 343 break; 344 case MDDB_REV_RB64: 345 case MDDB_REV_RB64FN: 346 skip->skip_offset = offsetof(mm_unit_t, 347 un_last_read) + rb_off; 348 break; 349 } 350 skip->skip_size = sizeof (int); 351 skip_prev->skip_next = skip; 352 skip_prev = skip; 353 354 for (i = 0; i < NMIRROR; i++) { 355 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 356 switch (revision) { 357 case MDDB_REV_RB: 358 case MDDB_REV_RBFN: 359 skip->skip_offset = offsetof(mm_unit32_od_t, 360 un_sm[i].sm_timestamp) + rb_off; 361 break; 362 case MDDB_REV_RB64: 363 case MDDB_REV_RB64FN: 364 skip->skip_offset = offsetof(mm_unit_t, 365 un_sm[i].sm_timestamp) + rb_off; 366 break; 367 } 368 skip->skip_size = sizeof (md_timeval32_t); 369 skip_prev->skip_next = skip; 370 skip_prev = skip; 371 } 372 /* Return the start of the list of fields to skip */ 373 return (skip_start.skip_next); 374 } 375 376 /* 377 * hotspare_skip_ts 378 * 379 * Returns a list of the timestamp fields in the hotspare record structure. 380 * Used to skip these fields when calculating the checksum. 
381 */ 382 static crc_skip_t * 383 hotspare_skip_ts(uint_t revision) 384 { 385 crc_skip_t *skip; 386 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 387 388 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 389 switch (revision) { 390 case MDDB_REV_RB: 391 case MDDB_REV_RBFN: 392 skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) + 393 rb_off; 394 break; 395 case MDDB_REV_RB64: 396 case MDDB_REV_RB64FN: 397 skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) + 398 rb_off; 399 break; 400 } 401 skip->skip_size = sizeof (md_timeval32_t); 402 return (skip); 403 } 404 405 /* 406 * rec_crcfunc 407 * 408 * Calculate or check the checksum for a record 409 * Calculate the crc if check == 0, Check the crc if check == 1 410 * 411 * Record block may be written by different nodes in a multi-owner diskset 412 * (in case of master change), the function rec_crcchk excludes timestamp 413 * fields in crc computation of record data. 414 * Otherwise, timestamp fields will cause each node to have a different 415 * checksum for same record block causing the exclusive-or of all record block 416 * checksums and data block record sums to be non-zero after new master writes 417 * at least one record block. 418 */ 419 static uint_t 420 rec_crcfunc( 421 mddb_set_t *s, 422 mddb_de_ic_t *dep, 423 mddb_rb32_t *rbp, 424 int check 425 ) 426 { 427 crc_skip_t *skip; 428 crc_skip_t *skip_tail; 429 mddb_type_t type = dep->de_type1; 430 uint_t ret; 431 432 /* 433 * Generate a list of the areas to be skipped when calculating 434 * the checksum. 435 * First skip rb_checksum, rb_private and rb_userdata. 
436 */ 437 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 438 skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle); 439 skip->skip_size = 3 * sizeof (uint_t); 440 skip_tail = skip; 441 if (MD_MNSET_SETNO(s->s_setno)) { 442 /* For a MN set, skip rb_timestamp */ 443 skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 444 KM_SLEEP); 445 skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp); 446 skip_tail->skip_size = sizeof (md_timeval32_t); 447 skip->skip_next = skip_tail; 448 449 /* Now add a list of timestamps to be skipped */ 450 if (type >= MDDB_FIRST_MODID) { 451 switch (dep->de_flags) { 452 case MDDB_F_STRIPE: 453 skip_tail->skip_next = 454 stripe_skip_ts((void *)rbp->rb_data, 455 rbp->rb_revision); 456 break; 457 case MDDB_F_MIRROR: 458 skip_tail->skip_next = 459 mirror_skip_ts(rbp->rb_revision); 460 break; 461 case MDDB_F_HOTSPARE: 462 skip_tail->skip_next = 463 hotspare_skip_ts(rbp->rb_revision); 464 break; 465 default: 466 break; 467 } 468 } 469 } 470 471 if (check) { 472 ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 473 } else { 474 crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 475 ret = rbp->rb_checksum; 476 } 477 while (skip) { 478 crc_skip_t *skip_save = skip; 479 480 skip = skip->skip_next; 481 kmem_free(skip_save, sizeof (crc_skip_t)); 482 } 483 return (ret); 484 } 485 486 static mddb_bf_t * 487 allocbuffer( 488 mddb_set_t *s, 489 int sleepflag 490 ) 491 { 492 mddb_bf_t *bfp; 493 494 while ((bfp = s->s_freebufhead) == NULL) { 495 if (sleepflag == MDDB_NOSLEEP) 496 return ((mddb_bf_t *)NULL); 497 ++s->s_bufmisses; 498 #ifdef DEBUG 499 if (s->s_bufmisses == 1) 500 cmn_err(CE_NOTE, 501 "md: mddb: set %u sleeping for buffer", s->s_setno); 502 #endif 503 s->s_bufwakeup = 1; 504 cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno)); 505 } 506 s->s_freebufhead = bfp->bf_next; 507 bzero((caddr_t)bfp, sizeof (*bfp)); 508 bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf; 509 bfp->bf_buf.b_flags = 
B_BUSY; /* initialize flags */ 510 return (bfp); 511 } 512 513 static void 514 freebuffer( 515 mddb_set_t *s, 516 mddb_bf_t *bfp 517 ) 518 { 519 bfp->bf_next = s->s_freebufhead; 520 s->s_freebufhead = bfp; 521 if (s->s_bufwakeup) { 522 cv_broadcast(&s->s_buf_cv); 523 s->s_bufwakeup = 0; 524 } 525 } 526 527 528 static void 529 blkbusy( 530 mddb_set_t *s, 531 mddb_block_t blk 532 ) 533 { 534 int bit, byte; 535 536 s->s_freeblkcnt--; 537 byte = blk / 8; 538 bit = 1 << (blk & 7); 539 ASSERT(! (s->s_freebitmap[byte] & bit)); 540 s->s_freebitmap[byte] |= bit; 541 } 542 543 static void 544 blkfree( 545 mddb_set_t *s, 546 mddb_block_t blk 547 ) 548 { 549 int bit, byte; 550 551 s->s_freeblkcnt++; 552 byte = blk / 8; 553 bit = 1 << (blk & 7); 554 ASSERT(s->s_freebitmap[byte] & bit); 555 s->s_freebitmap[byte] &= ~bit; 556 } 557 558 static int 559 blkcheck( 560 mddb_set_t *s, 561 mddb_block_t blk 562 ) 563 { 564 int bit, byte; 565 566 byte = blk / 8; 567 bit = 1 << (blk & 7); 568 return (s->s_freebitmap[byte] & bit); 569 } 570 571 /* 572 * not fast but simple 573 */ 574 static mddb_block_t 575 getfreeblks( 576 mddb_set_t *s, 577 size_t count 578 ) 579 { 580 int i; 581 size_t contig; 582 583 contig = 0; 584 for (i = 0; i < s->s_totalblkcnt; i++) { 585 if (blkcheck(s, i)) { 586 contig = 0; 587 } else { 588 contig++; 589 if (contig == count) { 590 contig = i - count + 1; 591 for (i = (int)contig; i < contig + count; i++) 592 blkbusy(s, i); 593 return ((mddb_block_t)contig); 594 } 595 } 596 } 597 return (0); 598 } 599 600 static void 601 computefreeblks( 602 mddb_set_t *s 603 ) 604 { 605 mddb_db_t *dbp; 606 mddb_de_ic_t *dep; 607 int i; 608 int minblks; 609 int freeblks; 610 mddb_mb_ic_t *mbip; 611 mddb_lb_t *lbp; 612 mddb_block_t maxblk; 613 mddb_did_db_t *did_dbp; 614 int nblks; 615 616 minblks = 0; 617 lbp = s->s_lbp; 618 maxblk = 0; 619 620 /* 621 * Determine the max number of blocks. 622 */ 623 nblks = (lbp->lb_flags & MDDB_MNSET) ? 
MDDB_MN_MAXBLKS : MDDB_MAXBLKS; 624 /* 625 * go through and find highest logical block 626 */ 627 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 628 if (dbp->db_blknum > maxblk) 629 maxblk = dbp->db_blknum; 630 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 631 for (i = 0; i < dep->de_blkcount; i++) 632 if (dep->de_blks[i] > maxblk) 633 maxblk = dep->de_blks[i]; 634 } 635 636 for (i = 0; i < lbp->lb_loccnt; i++) { 637 mddb_locator_t *lp = &lbp->lb_locators[i]; 638 639 if ((lp->l_flags & MDDB_F_DELETED) || 640 (lp->l_flags & MDDB_F_EMASTER)) 641 continue; 642 643 freeblks = 0; 644 for (mbip = s->s_mbiarray[i]; mbip != NULL; 645 mbip = mbip->mbi_next) { 646 freeblks += mbip->mbi_mddb_mb.mb_blkcnt; 647 } 648 if (freeblks == 0) /* this happen when there is no */ 649 continue; /* master blk */ 650 651 if (freeblks <= maxblk) { 652 lp->l_flags |= MDDB_F_TOOSMALL; 653 lp->l_flags &= ~MDDB_F_ACTIVE; 654 } 655 656 if (freeblks < minblks || minblks == 0) 657 minblks = freeblks; 658 } 659 /* 660 * set up reasonable freespace if no 661 * data bases exist 662 */ 663 if (minblks == 0) 664 minblks = 100; 665 if (minblks > nblks) 666 minblks = nblks; 667 s->s_freeblkcnt = minblks; 668 s->s_totalblkcnt = minblks; 669 if (! 
s->s_freebitmapsize) { 670 s->s_freebitmapsize = nblks / 8; 671 s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize, 672 KM_SLEEP); 673 } 674 bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 675 676 /* locator block sectors */ 677 for (i = 0; i < s->s_lbp->lb_blkcnt; i++) 678 blkbusy(s, i); 679 680 /* locator name sectors */ 681 for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++) 682 blkbusy(s, (s->s_lbp->lb_lnfirstblk + i)); 683 684 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 685 /* locator block device id information */ 686 for (i = 0; i < s->s_lbp->lb_didblkcnt; i++) 687 blkbusy(s, (s->s_lbp->lb_didfirstblk + i)); 688 689 /* disk blocks containing actual device ids */ 690 did_dbp = s->s_did_icp->did_ic_dbp; 691 while (did_dbp) { 692 for (i = 0; i < did_dbp->db_blkcnt; i++) { 693 blkbusy(s, did_dbp->db_firstblk + i); 694 } 695 did_dbp = did_dbp->db_next; 696 } 697 } 698 699 /* Only use data tags if not a MN set */ 700 if (!(lbp->lb_flags & MDDB_MNSET)) { 701 /* Found a bad tag, do NOT mark the data tag blks busy here */ 702 if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) { 703 for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++) 704 blkbusy(s, (s->s_lbp->lb_dtfirstblk + i)); 705 } 706 } 707 708 /* directory block/entry sectors */ 709 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 710 blkbusy(s, dbp->db_blknum); 711 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 712 for (i = 0; i < dep->de_blkcount; i++) 713 blkbusy(s, dep->de_blks[i]); 714 } 715 } 716 717 /* 718 * Add free space to the device id incore free list. 719 * Called: 720 * - During startup when all devid blocks are temporarily placed on the 721 * free list 722 * - After a devid has been deleted via the metadb command. 
723 * - When mddb_devid_free_get adds unused space from a disk block 724 * to free list 725 */ 726 static int 727 mddb_devid_free_add( 728 mddb_set_t *s, 729 uint_t firstblk, 730 uint_t offset, 731 uint_t length 732 ) 733 { 734 mddb_did_free_t *did_freep; 735 736 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 737 return (0); 738 } 739 740 did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t), 741 KM_SLEEP); 742 did_freep->free_blk = firstblk; 743 did_freep->free_offset = offset; 744 did_freep->free_length = length; 745 did_freep->free_next = s->s_did_icp->did_ic_freep; 746 s->s_did_icp->did_ic_freep = did_freep; 747 748 return (0); 749 } 750 751 /* 752 * Remove specific free space from the device id incore free list. 753 * Called at startup (after all devid blocks have been placed on 754 * free list) in order to remove the free space from the list that 755 * contains actual devids. 756 * Returns 0 if area successfully removed. 757 * Returns 1 if no matching area is found - so nothing removed. 758 */ 759 static int 760 mddb_devid_free_delete( 761 mddb_set_t *s, 762 uint_t firstblk, 763 uint_t offset, 764 uint_t length 765 ) 766 { 767 int block_found = 0; 768 mddb_did_free_t *did_freep1; /* next free block */ 769 mddb_did_free_t *did_freep2 = 0; /* previous free block */ 770 mddb_did_free_t *did_freep_before; /* area before offset, len */ 771 mddb_did_free_t *did_freep_after; /* area after offset, len */ 772 uint_t old_length; 773 774 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 775 return (1); 776 } 777 778 /* find free block for this devid */ 779 did_freep1 = s->s_did_icp->did_ic_freep; 780 while (did_freep1) { 781 /* 782 * Look through free list of <block, offset, length> to 783 * find our entry in the free list. Our entry should 784 * exist since the entire devid block was placed into 785 * this free list at startup. 
This code is just removing 786 * the non-free (in-use) portions of the devid block so 787 * that the remaining linked list does indeed just 788 * contain a free list. 789 * 790 * Our entry has been found if 791 * - the blocks match, 792 * - the offset (starting address) in the free list is 793 * less than the offset of our entry and 794 * - the length+offset (ending address) in the free list is 795 * greater than the length+offset of our entry. 796 */ 797 if ((did_freep1->free_blk == firstblk) && 798 (did_freep1->free_offset <= offset) && 799 ((did_freep1->free_length + did_freep1->free_offset) >= 800 (length + offset))) { 801 /* Have found our entry - remove from list */ 802 block_found = 1; 803 did_freep_before = did_freep1; 804 old_length = did_freep1->free_length; 805 /* did_freep1 - pts to next free block */ 806 did_freep1 = did_freep1->free_next; 807 if (did_freep2) { 808 did_freep2->free_next = did_freep1; 809 } else { 810 s->s_did_icp->did_ic_freep = did_freep1; 811 } 812 813 /* 814 * did_freep_before points to area in block before 815 * offset, length. 816 */ 817 did_freep_before->free_length = offset - 818 did_freep_before->free_offset; 819 /* 820 * did_freep_after points to area in block after 821 * offset, length. 822 */ 823 did_freep_after = (mddb_did_free_t *)kmem_zalloc 824 (sizeof (mddb_did_free_t), KM_SLEEP); 825 did_freep_after->free_blk = did_freep_before->free_blk; 826 did_freep_after->free_offset = offset + length; 827 did_freep_after->free_length = old_length - length - 828 did_freep_before->free_length; 829 /* 830 * Add before and after areas to free list 831 * If area before or after offset, length has length 832 * of 0, that entry is not added. 
833 */ 834 if (did_freep_after->free_length) { 835 did_freep_after->free_next = did_freep1; 836 if (did_freep2) { 837 did_freep2->free_next = 838 did_freep_after; 839 } else { 840 s->s_did_icp->did_ic_freep = 841 did_freep_after; 842 } 843 did_freep1 = did_freep_after; 844 } else { 845 kmem_free(did_freep_after, 846 sizeof (mddb_did_free_t)); 847 } 848 849 if (did_freep_before->free_length) { 850 did_freep_before->free_next = did_freep1; 851 if (did_freep2) { 852 did_freep2->free_next = 853 did_freep_before; 854 } else { 855 s->s_did_icp->did_ic_freep = 856 did_freep_before; 857 } 858 } else { 859 kmem_free(did_freep_before, 860 sizeof (mddb_did_free_t)); 861 } 862 break; 863 } else { 864 did_freep2 = did_freep1; 865 did_freep1 = did_freep1->free_next; 866 } 867 } 868 if (block_found == 0) { 869 return (1); 870 } else { 871 return (0); 872 } 873 } 874 875 /* 876 * Find free space of devid length and remove free space from list. 877 * Return a pointer to the previously free area. 878 * 879 * If there's not enough free space on the free list, get an empty 880 * disk block, put the empty disk block on the did_ic_dbp linked list, 881 * and add the disk block space not used for devid to the free list. 882 * 883 * Return pointer to address (inside disk block) of free area for devid. 884 * Return 0 if error. 
885 */ 886 static caddr_t 887 mddb_devid_free_get( 888 mddb_set_t *s, 889 uint_t len, 890 uint_t *blk, 891 uint_t *cnt, 892 uint_t *offset 893 ) 894 { 895 mddb_did_free_t *freep, *freep2; 896 mddb_did_db_t *dbp; 897 uint_t blk_cnt, blk_num; 898 ddi_devid_t devid_ptr = NULL; 899 900 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 901 return (0); 902 } 903 904 freep = s->s_did_icp->did_ic_freep; 905 freep2 = (mddb_did_free_t *)NULL; 906 while (freep) { 907 /* found a free area - remove from free list */ 908 if (len <= freep->free_length) { 909 *blk = freep->free_blk; 910 *offset = freep->free_offset; 911 /* find disk block pointer that contains free area */ 912 dbp = s->s_did_icp->did_ic_dbp; 913 while (dbp) { 914 if (dbp->db_firstblk == *blk) 915 break; 916 else 917 dbp = dbp->db_next; 918 } 919 /* 920 * If a disk block pointer can't be found - something 921 * is wrong, so don't use this free space. 922 */ 923 if (dbp == NULL) { 924 freep2 = freep; 925 freep = freep->free_next; 926 continue; 927 } 928 929 devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset); 930 *cnt = dbp->db_blkcnt; 931 932 /* Update free list information */ 933 freep->free_offset += len; 934 freep->free_length -= len; 935 if (freep->free_length == 0) { 936 if (freep2) { 937 freep2->free_next = 938 freep->free_next; 939 } else { 940 s->s_did_icp->did_ic_freep = 941 freep->free_next; 942 } 943 kmem_free(freep, sizeof (mddb_did_free_t)); 944 } 945 break; 946 } 947 freep2 = freep; 948 freep = freep->free_next; 949 } 950 951 /* Didn't find a free spot */ 952 if (freep == NULL) { 953 /* get free logical disk blk in replica */ 954 blk_cnt = btodb(len + (MDDB_BSIZE - 1)); 955 blk_num = getfreeblks(s, blk_cnt); 956 if (blk_num == 0) 957 return (0); 958 959 /* Add disk block to disk block linked list */ 960 dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP); 961 dbp->db_firstblk = blk_num; 962 dbp->db_blkcnt = blk_cnt; 963 dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP); 964 dbp->db_next = 
s->s_did_icp->did_ic_dbp; 965 s->s_did_icp->did_ic_dbp = dbp; 966 devid_ptr = (ddi_devid_t)dbp->db_ptr; 967 968 /* Update return values */ 969 *blk = blk_num; 970 *offset = 0; 971 *cnt = blk_cnt; 972 973 /* Add unused part of block to free list */ 974 (void) mddb_devid_free_add(s, blk_num, 975 len, (dbtob(blk_cnt) - len)); 976 } 977 978 return ((caddr_t)devid_ptr); 979 } 980 981 /* 982 * Add device id information for locator index to device id area in set. 983 * Get free area to store device id from free list. Update checksum 984 * for mddb_did_blk. 985 * 986 * This routine does not write any data out to disk. 987 * After this routine has been called, the routine, writelocall, should 988 * be called to write both the locator block and device id area out 989 * to disk. 990 */ 991 static int 992 mddb_devid_add( 993 mddb_set_t *s, 994 uint_t index, 995 ddi_devid_t devid, 996 char *minor_name 997 ) 998 { 999 uint_t devid_len; 1000 uint_t blk, offset; 1001 ddi_devid_t devid_ptr; 1002 mddb_did_info_t *did_info; 1003 uint_t blkcnt, i; 1004 mddb_did_blk_t *did_blk; 1005 1006 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1007 return (1); 1008 } 1009 if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1)) 1010 return (1); 1011 1012 /* Check if device id has already been added */ 1013 did_blk = s->s_did_icp->did_ic_blkp; 1014 did_info = &(did_blk->blk_info[index]); 1015 if (did_info->info_flags & MDDB_DID_EXISTS) 1016 return (0); 1017 1018 devid_len = ddi_devid_sizeof(devid); 1019 devid_ptr = (ddi_devid_t)mddb_devid_free_get(s, 1020 devid_len, &blk, &blkcnt, &offset); 1021 1022 if (devid_ptr == NULL) { 1023 return (1); 1024 } 1025 1026 /* Copy devid into devid free area */ 1027 for (i = 0; i < devid_len; i++) 1028 ((char *)devid_ptr)[i] = ((char *)devid)[i]; 1029 1030 /* Update mddb_did_info area for new device id */ 1031 did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID; 1032 1033 /* 1034 * Only set UPDATED flag for non-replicated import cases. 
1035 * This allows the side locator driver name index to get 1036 * updated in load_old_replicas. 1037 */ 1038 if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) 1039 did_info->info_flags |= MDDB_DID_UPDATED; 1040 1041 did_info->info_firstblk = blk; 1042 did_info->info_blkcnt = blkcnt; 1043 did_info->info_offset = offset; 1044 did_info->info_length = devid_len; 1045 (void) strcpy(did_info->info_minor_name, minor_name); 1046 crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL); 1047 1048 /* Add device id pointer to did_ic_devid array */ 1049 s->s_did_icp->did_ic_devid[index] = devid_ptr; 1050 1051 return (0); 1052 } 1053 1054 1055 /* 1056 * Delete device id information for locator index from device id area in set. 1057 * Add device id space to free area. 1058 * 1059 * This routine does not write any data out to disk. 1060 * After this routine has been called, the routine, writelocall, should 1061 * be called to write both the locator block and device id area out 1062 * to disk. 1063 */ 1064 static int 1065 mddb_devid_delete(mddb_set_t *s, uint_t index) 1066 { 1067 mddb_did_info_t *did_info; 1068 mddb_did_blk_t *did_blk; 1069 1070 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1071 return (1); 1072 } 1073 1074 /* Get device id information from mddb_did_blk */ 1075 did_blk = s->s_did_icp->did_ic_blkp; 1076 did_info = &(did_blk->blk_info[index]); 1077 1078 /* 1079 * Ensure that the underlying device supports device ids 1080 * before arbitrarily removing them. 
1081 */ 1082 if (!(did_info->info_flags & MDDB_DID_EXISTS)) { 1083 return (1); 1084 } 1085 1086 /* Remove device id information from mddb_did_blk */ 1087 did_info->info_flags = 0; 1088 1089 /* Remove device id from incore area */ 1090 s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL; 1091 1092 /* Add new free space in disk block to free list */ 1093 (void) mddb_devid_free_add(s, did_info->info_firstblk, 1094 did_info->info_offset, did_info->info_length); 1095 1096 return (0); 1097 } 1098 1099 /* 1100 * Check if there is a device id for a locator index. 1101 * 1102 * Caller of this routine should not free devid or minor_name since 1103 * these will point to internal data structures that should not 1104 * be freed. 1105 */ 1106 static int 1107 mddb_devid_get( 1108 mddb_set_t *s, 1109 uint_t index, 1110 ddi_devid_t *devid, 1111 char **minor_name 1112 ) 1113 { 1114 mddb_did_info_t *did_info; 1115 1116 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1117 return (0); 1118 } 1119 did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]); 1120 1121 if (did_info->info_flags & MDDB_DID_EXISTS) { 1122 *devid = s->s_did_icp->did_ic_devid[index]; 1123 *minor_name = 1124 s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name; 1125 return (1); 1126 } else 1127 return (0); 1128 1129 1130 } 1131 1132 /* 1133 * Check if device id is valid on current system. 1134 * Needs devid, previously known dev_t and current minor_name. 1135 * 1136 * Success: 1137 * Returns 0 if valid device id is found and updates 1138 * dev_t if the dev_t associated with the device id is 1139 * different than dev_t. 1140 * Failure: 1141 * Returns 1 if device id not valid on current system. 1142 */ 1143 static int 1144 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name) 1145 { 1146 int retndevs; 1147 dev_t *ddi_devs; 1148 int devid_flag = 0; 1149 int cnt; 1150 1151 if (dev == 0) 1152 return (1); 1153 /* 1154 * See if devid is valid in the current system. 
1155 * If so, set dev to match the devid. 1156 */ 1157 if (ddi_lyr_devid_to_devlist(devid, minor_name, 1158 &retndevs, &ddi_devs) == DDI_SUCCESS) { 1159 if (retndevs > 0) { 1160 /* devid is valid to use */ 1161 devid_flag = 1; 1162 /* does dev_t in list match dev */ 1163 cnt = 0; 1164 while (cnt < retndevs) { 1165 if (*dev == md_expldev(ddi_devs[cnt])) 1166 break; 1167 cnt++; 1168 } 1169 /* 1170 * If a different dev_t, then setup 1171 * new dev and new major name 1172 */ 1173 if (cnt == retndevs) { 1174 *dev = md_expldev(ddi_devs[0]); 1175 } 1176 ddi_lyr_free_devlist(ddi_devs, retndevs); 1177 } 1178 } 1179 if (devid_flag) 1180 return (0); 1181 else 1182 return (1); 1183 } 1184 1185 1186 /* 1187 * Free the devid incore data areas 1188 */ 1189 static void 1190 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp) 1191 { 1192 mddb_did_free_t *did_freep1, *did_freep2; 1193 mddb_did_db_t *did_dbp1, *did_dbp2; 1194 mddb_did_ic_t *icp = *did_icp; 1195 1196 if (icp) { 1197 if (icp->did_ic_blkp) { 1198 kmem_free((caddr_t)icp->did_ic_blkp, 1199 dbtob(lbp->lb_didblkcnt)); 1200 icp->did_ic_blkp = (mddb_did_blk_t *)NULL; 1201 } 1202 1203 if (icp->did_ic_dbp) { 1204 did_dbp1 = icp->did_ic_dbp; 1205 while (did_dbp1) { 1206 did_dbp2 = did_dbp1->db_next; 1207 kmem_free((caddr_t)did_dbp1->db_ptr, 1208 dbtob(did_dbp1->db_blkcnt)); 1209 kmem_free((caddr_t)did_dbp1, 1210 sizeof (mddb_did_db_t)); 1211 did_dbp1 = did_dbp2; 1212 } 1213 } 1214 1215 if (icp->did_ic_freep) { 1216 did_freep1 = icp->did_ic_freep; 1217 while (did_freep1) { 1218 did_freep2 = did_freep1->free_next; 1219 kmem_free((caddr_t)did_freep1, 1220 sizeof (mddb_did_free_t)); 1221 did_freep1 = did_freep2; 1222 } 1223 } 1224 1225 kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t)); 1226 *did_icp = (mddb_did_ic_t *)NULL; 1227 } 1228 1229 } 1230 1231 static daddr_t 1232 getphysblk( 1233 mddb_block_t blk, 1234 mddb_mb_ic_t *mbip 1235 ) 1236 { 1237 mddb_mb_t *mbp = &(mbip->mbi_mddb_mb); 1238 1239 while (blk >= mbp->mb_blkcnt) 
{ 1240 if (! mbip->mbi_next) 1241 return ((daddr_t)-1); /* no such block */ 1242 blk -= mbp->mb_blkcnt; 1243 mbip = mbip->mbi_next; 1244 mbp = &(mbip->mbi_mddb_mb); 1245 } 1246 1247 if (blk >= mbp->mb_blkmap.m_consecutive) 1248 return ((daddr_t)-1); /* no such block */ 1249 1250 return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk)); 1251 } 1252 1253 /* 1254 * when a buf header is passed in the new buffer must be 1255 * put on the front of the chain. writerec counts on it 1256 */ 1257 static int 1258 putblks( 1259 mddb_set_t *s, /* incore db set structure */ 1260 caddr_t buffer, /* adr of buffer to be written */ 1261 daddr_t blk, /* block number for first block */ 1262 int cnt, /* number of blocks to be written */ 1263 md_dev64_t device, /* device to be written to */ 1264 mddb_bf_t **bufhead /* if non-zero then ASYNC I/O */ 1265 /* and put buf address here */ 1266 ) 1267 { 1268 buf_t *bp; 1269 mddb_bf_t *bfp; 1270 int err = 0; 1271 1272 bfp = allocbuffer(s, MDDB_SLEEPOK); 1273 bp = &bfp->bf_buf; 1274 bp->b_bcount = MDDB_BSIZE * cnt; 1275 bp->b_un.b_addr = buffer; 1276 bp->b_blkno = blk; 1277 bp->b_edev = md_dev64_to_dev(device); 1278 /* 1279 * if a header for a buf chain is passed in this is async io. 1280 * currently only done for optimize records 1281 */ 1282 if (bufhead) { 1283 bfp->bf_next = *bufhead; 1284 *bufhead = bfp; 1285 (void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp); 1286 return (0); 1287 } 1288 err = mddb_rwdata(s, B_WRITE, bp); 1289 freebuffer(s, bfp); 1290 if (err) { 1291 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1292 s->s_setno, device); 1293 return (MDDB_F_EWRITE); 1294 } 1295 return (0); 1296 } 1297 1298 /* 1299 * wrtblklst - takes an array of logical block numbers 1300 * and writes the buffer to those blocks (scatter). 1301 * If called during upgrade, this routine expects a 1302 * non-translated (aka target) dev. 
1303 */ 1304 static int 1305 wrtblklst( 1306 mddb_set_t *s, /* incore set structure */ 1307 caddr_t buffer, /* buffer to be written (record blk) */ 1308 mddb_block_t blka[], /* list of logical blks for record */ 1309 daddr_t cnt, /* number of logical blks */ 1310 const int li, /* locator index */ 1311 mddb_bf_t **bufhead, /* if non-zero then ASYNC I/O */ 1312 /* and put buf address here */ 1313 int master_only /* allow only master node to write */ 1314 ) 1315 { 1316 daddr_t blk; 1317 daddr_t blk1; 1318 int err = 0; 1319 int cons; 1320 mddb_lb_t *lbp = s->s_lbp; 1321 mddb_locator_t *lp = &lbp->lb_locators[li]; 1322 md_dev64_t dev; 1323 mddb_mb_ic_t *mbip = s->s_mbiarray[li]; 1324 1325 /* 1326 * If a MN diskset and only the master can write, 1327 * then a non-master node will just return success. 1328 */ 1329 if (lbp->lb_flags & MDDB_MNSET) { 1330 if (master_only == MDDB_WR_ONLY_MASTER) { 1331 /* return successfully if we aren't the master */ 1332 if (!(md_set[s->s_setno].s_am_i_master)) { 1333 return (0); 1334 } 1335 } 1336 if (mbip == NULL) 1337 return (MDDB_F_EWRITE); 1338 } 1339 1340 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 1341 if (dev == NODEV64) { 1342 return (1); 1343 } 1344 1345 blk = getphysblk(blka[0], mbip); 1346 ASSERT(blk >= 0); 1347 1348 cons = 1; 1349 while (cnt) { 1350 if (cons != cnt) { 1351 blk1 = getphysblk(blka[cons], mbip); 1352 ASSERT(blk1 >= 0); 1353 if ((blk + cons) == blk1) { 1354 cons++; 1355 continue; 1356 } 1357 } 1358 if (err = putblks(s, buffer, blk, cons, dev, bufhead)) { 1359 /* 1360 * If an MN diskset and any_node_can_write 1361 * then this request is coming from writeoptrecord 1362 * and l_flags field should not be updated. 1363 * l_flags will be updated as a result of sending 1364 * a class1 message to the master. Setting l_flags 1365 * here will cause slave to be out of sync with 1366 * master. 
1367 * 1368 * Otherwise, set the error in l_flags 1369 * (this occurs if this is not a MN diskset or 1370 * only_master_can_write is set). 1371 */ 1372 if ((!(lbp->lb_flags & MDDB_MNSET)) || 1373 (master_only == MDDB_WR_ONLY_MASTER)) { 1374 lp->l_flags |= MDDB_F_EWRITE; 1375 } 1376 return (err); 1377 } 1378 if (bufhead) 1379 (*bufhead)->bf_locator = lp; 1380 1381 buffer += MDDB_BSIZE * cons; 1382 cnt -= cons; 1383 blka += cons; 1384 if (cnt) { 1385 blk = getphysblk(blka[0], mbip); 1386 ASSERT(blk >= 0); 1387 } 1388 cons = 1; 1389 } 1390 1391 return (0); 1392 } 1393 1394 /* 1395 * writeblks - takes a logical block number/block count pair 1396 * and writes the buffer to those contiguous logical blocks. 1397 * If called during upgrade, this routine expects a non-translated 1398 * (aka target) dev. 1399 */ 1400 static int 1401 writeblks( 1402 mddb_set_t *s, /* incore set structure */ 1403 caddr_t buffer, /* buffer to be written */ 1404 mddb_block_t blk, /* starting logical block number */ 1405 int cnt, /* number of log blocks to be written */ 1406 const int li, /* locator index */ 1407 int master_only /* allow only master node to write */ 1408 ) 1409 { 1410 daddr_t physblk; 1411 int err = 0; 1412 int i; 1413 mddb_lb_t *lbp = s->s_lbp; 1414 mddb_locator_t *lp = &lbp->lb_locators[li]; 1415 md_dev64_t dev; 1416 mddb_block_t *blkarray; 1417 int size; 1418 int ret; 1419 1420 /* 1421 * If a MN diskset and only the master can write, 1422 * then a non-master node will just return success. 
1423 */ 1424 if ((lbp->lb_flags & MDDB_MNSET) && 1425 (master_only == MDDB_WR_ONLY_MASTER)) { 1426 /* return successfully if we aren't the master */ 1427 if (!(md_set[s->s_setno].s_am_i_master)) { 1428 return (0); 1429 } 1430 } 1431 1432 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 1433 if (dev == NODEV64) { 1434 return (1); 1435 } 1436 1437 if (cnt > 1) { 1438 size = sizeof (mddb_block_t) * cnt; 1439 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1440 for (i = 0; i < cnt; i++) 1441 blkarray[i] = blk + i; 1442 ret = wrtblklst(s, buffer, blkarray, cnt, 1443 li, 0, MDDB_WR_ONLY_MASTER); 1444 kmem_free(blkarray, size); 1445 return (ret); 1446 } 1447 physblk = getphysblk(blk, s->s_mbiarray[li]); 1448 ASSERT(physblk > 0); 1449 if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) { 1450 lp->l_flags |= MDDB_F_EWRITE; 1451 return (err); 1452 } 1453 return (0); 1454 } 1455 1456 /* 1457 * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas. 1458 */ 1459 static int 1460 writeall( 1461 mddb_set_t *s, /* incore set structure */ 1462 caddr_t buffer, /* buffer to be written */ 1463 mddb_block_t block, /* starting logical block number */ 1464 int cnt, /* number of log blocks to be written */ 1465 int master_only /* allow only master node to write */ 1466 ) 1467 { 1468 int li; 1469 int err = 0; 1470 mddb_lb_t *lbp = s->s_lbp; 1471 1472 for (li = 0; li < lbp->lb_loccnt; li++) { 1473 mddb_locator_t *lp = &lbp->lb_locators[li]; 1474 1475 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1476 (lp->l_flags & MDDB_F_EWRITE)) 1477 continue; 1478 1479 err |= writeblks(s, buffer, block, cnt, li, master_only); 1480 } 1481 1482 return (err); 1483 } 1484 1485 /* 1486 * writelocall - write the locator block and device id information (if 1487 * replica is in device id format) to all ACTIVE/NON-ERRORER replicas. 1488 * 1489 * Increments the locator block's commitcnt. Updates the device id area's 1490 * commitcnt if the replica is in device id format. 
Regenerates the 1491 * checksums after updating the commitcnt(s). 1492 */ 1493 static int 1494 writelocall( 1495 mddb_set_t *s /* incore set structure */ 1496 ) 1497 { 1498 int li; 1499 int err = 0; 1500 mddb_lb_t *lbp = s->s_lbp; 1501 mddb_did_blk_t *did_blk; 1502 mddb_did_db_t *did_dbp; 1503 1504 s->s_lbp->lb_commitcnt++; 1505 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1506 did_blk = s->s_did_icp->did_ic_blkp; 1507 did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt; 1508 crcgen(did_blk, &did_blk->blk_checksum, 1509 dbtob(lbp->lb_didblkcnt), NULL); 1510 } 1511 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 1512 1513 for (li = 0; li < lbp->lb_loccnt; li++) { 1514 mddb_locator_t *lp = &lbp->lb_locators[li]; 1515 1516 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1517 (lp->l_flags & MDDB_F_EWRITE)) 1518 continue; 1519 1520 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1521 /* write out blocks containing actual device ids */ 1522 did_dbp = s->s_did_icp->did_ic_dbp; 1523 while (did_dbp) { 1524 err |= writeblks(s, (caddr_t)did_dbp->db_ptr, 1525 did_dbp->db_firstblk, 1526 did_dbp->db_blkcnt, li, 1527 MDDB_WR_ONLY_MASTER); 1528 did_dbp = did_dbp->db_next; 1529 } 1530 1531 /* write out device id area block */ 1532 err |= writeblks(s, (caddr_t)did_blk, 1533 lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, 1534 MDDB_WR_ONLY_MASTER); 1535 } 1536 /* write out locator block */ 1537 err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 1538 MDDB_WR_ONLY_MASTER); 1539 } 1540 1541 /* 1542 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag 1543 * in the mddb_set structure to show that the locator block has 1544 * been changed. 1545 */ 1546 1547 if ((lbp->lb_flags & MDDB_MNSET) && 1548 (md_set[s->s_setno].s_am_i_master)) { 1549 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 1550 } 1551 return (err); 1552 } 1553 1554 /* 1555 * If called during upgrade, this routine expects a translated 1556 * (aka miniroot) dev. 
1557 */ 1558 static int 1559 getblks( 1560 mddb_set_t *s, /* incore db set structure */ 1561 caddr_t buffer, /* buffer to read data into */ 1562 md_dev64_t device, /* device to read from */ 1563 daddr_t blk, /* physical block number to read */ 1564 int cnt, /* number of blocks to read */ 1565 int flag /* flags for I/O */ 1566 ) 1567 { 1568 buf_t *bp; 1569 mddb_bf_t *bfp; 1570 int err = 0; 1571 1572 bfp = allocbuffer(s, MDDB_SLEEPOK); /* this will never sleep */ 1573 bp = &bfp->bf_buf; 1574 bp->b_bcount = MDDB_BSIZE * cnt; 1575 bp->b_un.b_addr = buffer; 1576 bp->b_blkno = blk; 1577 bp->b_edev = md_dev64_to_dev(device); 1578 err = mddb_rwdata(s, (B_READ | flag), bp); 1579 freebuffer(s, bfp); 1580 if (err) { 1581 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1582 s->s_setno, device); 1583 return (MDDB_F_EREAD); 1584 } 1585 return (0); 1586 } 1587 1588 /* 1589 * readblklst - takes an array of logical block numbers 1590 * and reads those blocks (gather) into the buffer. 1591 * If called during upgrade, this routine expects a non-translated 1592 * (aka target) dev. 
1593 */ 1594 static int 1595 readblklst( 1596 mddb_set_t *s, /* incore set structure */ 1597 caddr_t buffer, /* buffer to be read (record block) */ 1598 mddb_block_t blka[], /* list of logical blocks to be read */ 1599 daddr_t cnt, /* number of logical blocks */ 1600 int li, /* locator index */ 1601 int flag /* flags for I/O */ 1602 ) 1603 { 1604 daddr_t blk; 1605 daddr_t blk1; 1606 int err = 0; 1607 int cons; 1608 md_dev64_t dev; 1609 mddb_mb_ic_t *mbip; 1610 1611 mbip = s->s_mbiarray[li]; 1612 dev = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1613 dev = md_xlate_targ_2_mini(dev); 1614 if (dev == NODEV64) { 1615 return (1); 1616 } 1617 1618 blk = getphysblk(blka[0], mbip); 1619 ASSERT(blk >= 0); 1620 1621 cons = 1; 1622 while (cnt) { 1623 if (cons != cnt) { 1624 blk1 = getphysblk(blka[cons], mbip); 1625 ASSERT(blk1 >= 0); 1626 if ((blk + cons) == blk1) { 1627 cons++; 1628 continue; 1629 } 1630 } 1631 if (err = getblks(s, buffer, dev, blk, cons, flag)) 1632 return (err); 1633 buffer += MDDB_BSIZE * cons; 1634 cnt -= cons; 1635 blka += cons; 1636 if (cnt) { 1637 blk = getphysblk(blka[0], mbip); 1638 ASSERT(blk >= 0); 1639 } 1640 cons = 1; 1641 } 1642 return (0); 1643 } 1644 1645 /* 1646 * readblks - takes a logical block number/block count pair 1647 * and reads those contiguous logical blocks into the buffer. 1648 * If called during upgrade, this routine expects a non-translated 1649 * (aka target) dev. 
1650 */ 1651 static int 1652 readblks( 1653 mddb_set_t *s, /* incore set structure */ 1654 caddr_t buffer, /* buffer to be read into */ 1655 mddb_block_t blk, /* logical block number to be read */ 1656 int cnt, /* number of logical blocks to be read */ 1657 int li /* locator index */ 1658 ) 1659 { 1660 daddr_t physblk; 1661 md_dev64_t device; 1662 int i; 1663 mddb_block_t *blkarray; 1664 int size; 1665 int ret; 1666 1667 if (cnt > 1) { 1668 size = sizeof (mddb_block_t) * cnt; 1669 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1670 for (i = 0; i < cnt; i++) 1671 blkarray[i] = blk + i; 1672 ret = readblklst(s, buffer, blkarray, cnt, li, 0); 1673 kmem_free(blkarray, size); 1674 return (ret); 1675 } 1676 physblk = getphysblk(blk, s->s_mbiarray[li]); 1677 ASSERT(physblk > 0); 1678 device = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1679 device = md_xlate_targ_2_mini(device); 1680 if (device == NODEV64) { 1681 return (1); 1682 } 1683 return (getblks(s, buffer, device, physblk, 1, 0)); 1684 } 1685 1686 static void 1687 single_thread_start( 1688 mddb_set_t *s 1689 ) 1690 { 1691 while (s->s_singlelockgotten) { 1692 s->s_singlelockwanted++; 1693 cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno)); 1694 } 1695 s->s_singlelockgotten++; 1696 } 1697 1698 static void 1699 single_thread_end( 1700 mddb_set_t *s 1701 ) 1702 { 1703 ASSERT(s->s_singlelockgotten); 1704 s->s_singlelockgotten = 0; 1705 if (s->s_singlelockwanted) { 1706 s->s_singlelockwanted = 0; 1707 cv_broadcast(&s->s_single_thread_cv); 1708 } 1709 } 1710 1711 static size_t 1712 sizeofde( 1713 mddb_de_ic_t *dep 1714 ) 1715 { 1716 size_t size; 1717 1718 size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + 1719 sizeof (mddb_block_t) * dep->de_blkcount; 1720 return (size); 1721 } 1722 1723 static size_t 1724 sizeofde32( 1725 mddb_de32_t *dep 1726 ) 1727 { 1728 size_t size; 1729 1730 size = sizeof (*dep) - sizeof (dep->de32_blks) + 1731 sizeof (mddb_block_t) * dep->de32_blkcount; 1732 return (size); 1733 } 
1734 1735 static mddb_de32_t * 1736 nextentry( 1737 mddb_de32_t *dep 1738 ) 1739 { 1740 mddb_de32_t *ret; 1741 1742 ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep))); 1743 return (ret); 1744 } 1745 1746 static void 1747 create_db32rec( 1748 mddb_db32_t *db32p, 1749 mddb_db_t *dbp 1750 ) 1751 { 1752 mddb_de_ic_t *dep; 1753 mddb_de32_t *de32p; 1754 1755 #if defined(_ILP32) && !defined(lint) 1756 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 1757 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 1758 #endif 1759 1760 dbtodb32(dbp, db32p); 1761 if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0)) 1762 db32p->db32_firstentry = 0x4; 1763 de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry) 1764 + sizeof (db32p->db32_firstentry))); 1765 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 1766 detode32(dep, de32p); 1767 if ((dep->de_next != NULL) && (de32p->de32_next == 0)) 1768 de32p->de32_next = 0x4; 1769 de32p = nextentry(de32p); 1770 } 1771 ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE); 1772 } 1773 1774 /* 1775 * If called during upgrade, this routine expects a translated 1776 * (aka miniroot) dev. 1777 * If master blocks are found, set the mn_set parameter to 1 if the 1778 * the master block revision number is MDDB_REV_MNMB; otherwise, 1779 * set it to 0. 1780 * If master blocks are not found, do not change the mnset parameter. 
1781 */ 1782 static mddb_mb_ic_t * 1783 getmasters( 1784 mddb_set_t *s, 1785 md_dev64_t dev, 1786 daddr_t blkno, 1787 uint_t *flag, 1788 int *mn_set 1789 ) 1790 { 1791 mddb_mb_ic_t *mbi = NULL; 1792 mddb_mb_t *mb; 1793 int error = 0; 1794 ddi_devid_t devid; 1795 1796 1797 if (mddb_devopen(dev)) { 1798 if (flag) 1799 *flag |= MDDB_F_EMASTER; 1800 return ((mddb_mb_ic_t *)NULL); 1801 } 1802 1803 1804 mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP); 1805 mb = &(mbi->mbi_mddb_mb); 1806 if (error = getblks(s, (caddr_t)mb, dev, blkno, 1807 btodb(MDDB_BSIZE), 0)) { 1808 error |= MDDB_F_EMASTER; 1809 } 1810 if (mb->mb_magic != MDDB_MAGIC_MB) { 1811 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1812 } 1813 /* Check for MDDB_REV_MNMB and lower */ 1814 if (revchk(MDDB_REV_MNMB, mb->mb_revision)) { 1815 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1816 } 1817 if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) { 1818 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1819 } 1820 1821 if (!(md_get_setstatus(s->s_setno) & 1822 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && 1823 (mb->mb_setno != s->s_setno)) { 1824 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1825 } 1826 if (mb->mb_blkno != blkno) { 1827 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1828 } 1829 mb->mb_next = NULL; 1830 mbi->mbi_next = NULL; 1831 1832 if (error) 1833 goto out; 1834 1835 /* 1836 * Check the md_devid_destroy and md_keep_repl_state flags 1837 * to see if we need to regen the devid or not. 1838 * 1839 * Don't care about devid in local set since it is not used 1840 * and this should not be part of set importing 1841 */ 1842 if ((s->s_setno != MD_LOCAL_SET) && 1843 !(md_get_setstatus(s->s_setno) & 1844 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) { 1845 /* 1846 * Now check the destroy flag. 
We also need to handle 1847 * the case where the destroy flag is reset after the 1848 * destroy 1849 */ 1850 if (md_devid_destroy || (mb->mb_devid_len == 0)) { 1851 1852 if (md_devid_destroy) { 1853 bzero(mb->mb_devid, mb->mb_devid_len); 1854 mb->mb_devid_len = 0; 1855 } 1856 1857 /* 1858 * Try to regenerate it if the 'keep' flag is not set 1859 */ 1860 if (!md_keep_repl_state) { 1861 if (ddi_lyr_get_devid(md_dev64_to_dev(dev), 1862 &devid) == DDI_SUCCESS) { 1863 mb->mb_devid_len = 1864 ddi_devid_sizeof(devid); 1865 bcopy(devid, mb->mb_devid, 1866 mb->mb_devid_len); 1867 ddi_devid_free(devid); 1868 } else { 1869 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1870 } 1871 } 1872 1873 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 1874 1875 /* 1876 * Push 1877 */ 1878 if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) { 1879 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1880 } 1881 } 1882 } 1883 1884 if (! error) { 1885 /* Set mn_set parameter to 1 if a MN set */ 1886 if (mb->mb_revision == MDDB_REV_MNMB) 1887 *mn_set = 1; 1888 else 1889 *mn_set = 0; 1890 return (mbi); 1891 } 1892 1893 out: 1894 /* Error Out */ 1895 if (flag) 1896 *flag |= error; 1897 1898 kmem_free((caddr_t)mbi, MDDB_IC_BSIZE); 1899 mddb_devclose(dev); 1900 return ((mddb_mb_ic_t *)NULL); 1901 } 1902 1903 static int 1904 getrecord( 1905 mddb_set_t *s, 1906 mddb_de_ic_t *dep, 1907 int li 1908 ) 1909 { 1910 int err = 0; 1911 mddb_rb32_t *rbp; 1912 1913 #if defined(_ILP32) && !defined(lint) 1914 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 1915 #endif 1916 1917 1918 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP); 1919 rbp = dep->de_rb; 1920 1921 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 1922 dep->de_blkcount, li, 0); 1923 if (err) { 1924 return (MDDB_F_EDATA | err); 1925 } 1926 if (rbp->rb_magic != MDDB_MAGIC_RB) { 1927 return (MDDB_F_EFMT | MDDB_F_EDATA); 1928 } 1929 if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) && 1930 (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) && 1931 
(revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) && 1932 (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) { 1933 return (MDDB_F_EFMT | MDDB_F_EDATA); 1934 } 1935 /* Check crc for this record */ 1936 if (rec_crcchk(s, dep, rbp)) { 1937 return (MDDB_F_EFMT | MDDB_F_EDATA); 1938 } 1939 return (0); 1940 } 1941 1942 /* 1943 * Code to read in the locator name information 1944 */ 1945 static int 1946 readlocnames( 1947 mddb_set_t *s, 1948 int li 1949 ) 1950 { 1951 mddb_ln_t *lnp; 1952 int err = 0; 1953 mddb_block_t ln_blkcnt, ln_blkno; 1954 1955 /* 1956 * read in the locator name blocks 1957 */ 1958 s->s_lnp = NULL; 1959 1960 ln_blkno = s->s_lbp->lb_lnfirstblk; 1961 ln_blkcnt = s->s_lbp->lb_lnblkcnt; 1962 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP); 1963 1964 err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li); 1965 if (err) { 1966 err |= MDDB_F_EDATA; 1967 goto out; 1968 } 1969 if (lnp->ln_magic != MDDB_MAGIC_LN) { 1970 err = MDDB_F_EDATA | MDDB_F_EFMT; 1971 goto out; 1972 } 1973 if (s->s_lbp->lb_flags & MDDB_MNSET) { 1974 if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) { 1975 err = MDDB_F_EDATA | MDDB_F_EFMT; 1976 goto out; 1977 } 1978 } else { 1979 if (revchk(MDDB_REV_LN, lnp->ln_revision)) { 1980 err = MDDB_F_EDATA | MDDB_F_EFMT; 1981 goto out; 1982 } 1983 } 1984 if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) { 1985 err = MDDB_F_EDATA | MDDB_F_EFMT; 1986 goto out; 1987 } 1988 out: 1989 /* 1990 * if error occurred in locator name blocks free them 1991 * and return 1992 */ 1993 if (err) { 1994 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); 1995 return (err); 1996 } 1997 s->s_lnp = lnp; 1998 return (0); 1999 } 2000 2001 /* 2002 * code to read in a copy of the database. 
2003 */ 2004 2005 static int 2006 readcopy( 2007 mddb_set_t *s, 2008 int li 2009 ) 2010 { 2011 uint_t blk; 2012 mddb_db_t *dbp, *dbp1, *dbhp; 2013 mddb_db32_t *db32p; 2014 mddb_de_ic_t *dep, *dep2; 2015 mddb_de32_t *de32p, *de32p2; 2016 int err = 0; 2017 uint_t checksum; 2018 2019 2020 #if defined(_ILP32) && !defined(lint) 2021 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2022 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2023 #endif 2024 2025 dbp = NULL; 2026 dbhp = NULL; 2027 /* 2028 * read in all the directory blocks 2029 */ 2030 blk = s->s_lbp->lb_dbfirstblk; 2031 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2032 2033 for (; blk != 0; blk = dbp->db_nextblk) { 2034 dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 2035 if (! dbhp) { 2036 dbhp = dbp1; 2037 } else { 2038 dbp->db_next = dbp1; 2039 } 2040 dbp = dbp1; 2041 2042 err = readblks(s, (caddr_t)db32p, blk, 1, li); 2043 if (err) { 2044 err |= MDDB_F_EDATA; 2045 break; 2046 } 2047 db32todb(db32p, dbp); 2048 if (db32p->db32_magic != MDDB_MAGIC_DB) { 2049 err = MDDB_F_EDATA | MDDB_F_EFMT; 2050 break; 2051 } 2052 if (revchk(MDDB_REV_DB, db32p->db32_revision)) { 2053 err = MDDB_F_EDATA | MDDB_F_EFMT; 2054 break; 2055 } 2056 if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) { 2057 err = MDDB_F_EDATA | MDDB_F_EFMT; 2058 break; 2059 } 2060 /* 2061 * first go through and fix up all de_next pointers 2062 */ 2063 if (dbp->db_firstentry) { 2064 2065 de32p = (mddb_de32_t *) 2066 ((void *) ((caddr_t)(&db32p->db32_firstentry) 2067 + sizeof (db32p->db32_firstentry))); 2068 2069 dep = (mddb_de_ic_t *) 2070 kmem_zalloc(sizeof (mddb_de_ic_t) - 2071 sizeof (mddb_block_t) + 2072 sizeof (mddb_block_t) * de32p->de32_blkcount, 2073 KM_SLEEP); 2074 de32tode(de32p, dep); 2075 2076 dbp->db_firstentry = dep; 2077 while (de32p && de32p->de32_next) { 2078 2079 de32p2 = nextentry(de32p); 2080 2081 dep2 = (mddb_de_ic_t *)kmem_zalloc( 2082 sizeof (mddb_de_ic_t) - 2083 sizeof (mddb_block_t) + 
2084 sizeof (mddb_block_t) * 2085 de32p2->de32_blkcount, KM_SLEEP); 2086 2087 de32tode(de32p2, dep2); 2088 2089 dep->de_next = dep2; 2090 dep = dep2; 2091 de32p = de32p2; 2092 } 2093 } 2094 /* 2095 * go through and make all of the pointer to record blocks 2096 * are null; 2097 */ 2098 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 2099 dep->de_rb = NULL; 2100 } 2101 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2102 dbp->db_next = NULL; 2103 /* 2104 * if error occurred in directory blocks free them 2105 * and return 2106 */ 2107 if (err) { 2108 dbp = dbhp; 2109 while (dbp) { 2110 dep = dbp->db_firstentry; 2111 while (dep) { 2112 /* No mddb_rb32_t structures yet */ 2113 dep2 = dep->de_next; 2114 kmem_free((caddr_t)dep, sizeofde(dep)); 2115 dep = dep2; 2116 } 2117 dbp1 = dbp->db_next; 2118 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2119 dbp = dbp1; 2120 } 2121 s->s_dbp = NULL; 2122 return (err); 2123 2124 } 2125 /* 2126 */ 2127 err = 0; 2128 checksum = MDDB_GLOBAL_XOR; 2129 for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) { 2130 checksum ^= dbp->db_recsum; 2131 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2132 if (dep->de_flags & MDDB_F_OPT) 2133 continue; 2134 err = getrecord(s, dep, li); 2135 if (err) 2136 break; 2137 /* Don't include CHANGELOG in big XOR */ 2138 if (dep->de_flags & MDDB_F_CHANGELOG) 2139 continue; 2140 checksum ^= dep->de_rb->rb_checksum; 2141 checksum ^= dep->de_rb->rb_checksum_fiddle; 2142 } 2143 if (err) 2144 break; 2145 } 2146 if (checksum) { 2147 if (! 
err) 2148 err = MDDB_F_EDATA | MDDB_F_EFMT; 2149 } 2150 if (err) { 2151 dbp = dbhp; 2152 dbhp = NULL; 2153 while (dbp) { 2154 dep = dbp->db_firstentry; 2155 while (dep) { 2156 if (dep->de_rb) 2157 kmem_free((caddr_t)dep->de_rb, 2158 dep->de_recsize); 2159 dep2 = dep->de_next; 2160 kmem_free((caddr_t)dep, sizeofde(dep)); 2161 dep = dep2; 2162 } 2163 dbp1 = dbp->db_next; 2164 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2165 dbp = dbp1; 2166 } 2167 } 2168 s->s_dbp = dbhp; 2169 return (err); 2170 } 2171 2172 static int 2173 getoptcnt( 2174 mddb_set_t *s, 2175 int li) 2176 { 2177 int result; 2178 mddb_de_ic_t *dep; 2179 mddb_db_t *dbp; 2180 2181 #if defined(_ILP32) && !defined(lint) 2182 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2183 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2184 #endif 2185 2186 result = 0; 2187 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2188 dep = dbp->db_firstentry; 2189 for (; dep != NULL; dep = dep->de_next) { 2190 if (! (dep->de_flags & MDDB_F_OPT)) 2191 continue; 2192 if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) && 2193 (li == dep->de_optinfo[0].o_li)) || 2194 ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) && 2195 (li == dep->de_optinfo[1].o_li))) 2196 result++; 2197 } 2198 } 2199 return (result); 2200 } 2201 2202 static void 2203 getoptdev( 2204 mddb_set_t *s, 2205 mddb_de_ic_t *rdep, 2206 int opti 2207 ) 2208 { 2209 mddb_lb_t *lbp; 2210 mddb_locator_t *lp; 2211 mddb_optinfo_t *otherop; 2212 mddb_optinfo_t *resultop; 2213 int li; 2214 dev_t otherdev; 2215 int blkonly = 0; 2216 int mincnt; 2217 int thiscnt; 2218 2219 lbp = s->s_lbp; 2220 2221 resultop = &rdep->de_optinfo[opti]; 2222 otherop = &rdep->de_optinfo[1-opti]; 2223 2224 resultop->o_flags = 0; 2225 2226 /* 2227 * scan through and see if data bases have to vary by only device 2228 */ 2229 2230 if (otherop->o_flags & MDDB_F_ACTIVE) { 2231 blkonly = 1; 2232 otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev); 2233 for (li = 0; li < lbp->lb_loccnt; 
li++) { 2234 lp = &lbp->lb_locators[li]; 2235 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2236 continue; 2237 if (expldev(lp->l_dev) != otherdev) { 2238 blkonly = 0; 2239 break; 2240 } 2241 } 2242 } 2243 2244 mincnt = 999999; 2245 for (li = 0; li < lbp->lb_loccnt; li++) { 2246 dev_info_t *devi; 2247 int removable = 0; 2248 2249 lp = &lbp->lb_locators[li]; 2250 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2251 continue; 2252 if (otherop->o_flags & MDDB_F_ACTIVE) { 2253 if (blkonly) { 2254 if (otherop->o_li == li) 2255 continue; 2256 } else { 2257 if (otherdev == expldev(lp->l_dev)) 2258 continue; 2259 } 2260 } 2261 2262 /* 2263 * Check if this is a removable device. If it is we 2264 * assume it is something like a USB flash disk, a zip disk 2265 * or even a floppy that is being used to help maintain 2266 * mddb quorum. We don't want to put any optimized resync 2267 * records on these kinds of disks since they are usually 2268 * slower or don't have the same read/write lifetimes as 2269 * a regular fixed disk. 
2270 */ 2271 if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) { 2272 int error; 2273 struct cb_ops *cb; 2274 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 2275 int propvalue = 0; 2276 int proplength = sizeof (int); 2277 2278 if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops) 2279 != NULL) { 2280 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, 2281 prop_op, DDI_PROP_NOTPROM | 2282 DDI_PROP_DONTPASS, "removable-media", 2283 (caddr_t)&propvalue, &proplength); 2284 2285 if (error == DDI_PROP_SUCCESS) 2286 removable = 1; 2287 } 2288 2289 ddi_release_devi(devi); 2290 } 2291 2292 if (removable) 2293 continue; 2294 2295 thiscnt = getoptcnt(s, li); 2296 if (thiscnt < mincnt) { 2297 resultop->o_li = li; 2298 mincnt = thiscnt; 2299 resultop->o_flags = MDDB_F_ACTIVE; 2300 } 2301 } 2302 } 2303 2304 static void 2305 allocuserdata( 2306 mddb_de_ic_t *dep 2307 ) 2308 { 2309 mddb_rb32_t *rbp; 2310 2311 #if defined(_ILP32) && !defined(lint) 2312 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2313 #endif 2314 2315 rbp = dep->de_rb; 2316 rbp->rb_private = 0; 2317 dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP); 2318 rbp->rb_userdata = 0x4; /* Make sure this is non-zero */ 2319 bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize); 2320 } 2321 2322 2323 static void 2324 getuserdata( 2325 set_t setno, 2326 mddb_de_ic_t *dep 2327 ) 2328 { 2329 mddb_rb32_t *rbp; 2330 2331 2332 mddb_type_t type = dep->de_type1; 2333 caddr_t data, udata; 2334 2335 #if defined(_ILP32) && !defined(lint) 2336 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2337 #endif 2338 rbp = dep->de_rb; 2339 data = (caddr_t)rbp->rb_data; 2340 udata = (caddr_t)dep->de_rb_userdata; 2341 2342 /* 2343 * If it's a driver record, and an old style record, and not a DRL 2344 * record, we must convert it because it was incore as a 64 bit 2345 * structure but its on disk layout has only 32 bit for block sizes 2346 */ 2347 if (!(md_get_setstatus(setno) & 2348 (MD_SET_IMPORT | 
MD_SET_REPLICATED_IMPORT)) && 2349 (type >= MDDB_FIRST_MODID) && 2350 ((rbp->rb_revision == MDDB_REV_RB) || 2351 (rbp->rb_revision == MDDB_REV_RBFN))) { 2352 2353 switch (dep->de_flags) { 2354 2355 case MDDB_F_STRIPE: 2356 stripe_convert(data, udata, BIG_2_SMALL); 2357 break; 2358 2359 case MDDB_F_MIRROR: 2360 mirror_convert(data, udata, BIG_2_SMALL); 2361 break; 2362 2363 case MDDB_F_RAID: 2364 raid_convert(data, udata, BIG_2_SMALL); 2365 break; 2366 2367 case MDDB_F_SOFTPART: 2368 softpart_convert(data, udata, BIG_2_SMALL); 2369 break; 2370 2371 case MDDB_F_TRANS_MASTER: 2372 trans_master_convert(data, udata, BIG_2_SMALL); 2373 break; 2374 2375 case MDDB_F_TRANS_LOG: 2376 trans_log_convert(data, udata, BIG_2_SMALL); 2377 break; 2378 2379 case MDDB_F_HOTSPARE: 2380 hs_convert(data, udata, BIG_2_SMALL); 2381 break; 2382 2383 case MDDB_F_OPT: 2384 default: 2385 bcopy(udata, data, dep->de_reqsize); 2386 } 2387 } else { 2388 bcopy(udata, data, dep->de_reqsize); 2389 } 2390 } 2391 2392 static void 2393 getoptrecord( 2394 mddb_set_t *s, 2395 mddb_de_ic_t *dep 2396 ) 2397 { 2398 mddb_lb_t *lbp; 2399 mddb_locator_t *lp; 2400 mddb_rb32_t *rbp, *crbp; 2401 int li; 2402 int i; 2403 int err = 0; 2404 size_t recsize; 2405 2406 #if defined(_ILP32) && !defined(lint) 2407 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2408 #endif 2409 2410 lbp = s->s_lbp; 2411 2412 recsize = dep->de_recsize; 2413 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2414 rbp = dep->de_rb; 2415 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2416 2417 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 2418 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2419 2420 for (i = 0; i < 2; i++) { 2421 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2422 continue; 2423 li = dep->de_optinfo[i].o_li; 2424 lp = &lbp->lb_locators[li]; 2425 2426 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 2427 (lp->l_flags & MDDB_F_EMASTER)) 2428 continue; 2429 2430 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 2431 dep->de_blkcount, li, 0); 2432 2433 if (err) 2434 continue; 2435 2436 if (rbp->rb_magic != MDDB_MAGIC_RB) 2437 continue; 2438 2439 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 2440 continue; 2441 2442 /* Check the crc for this record */ 2443 if (rec_crcchk(s, dep, rbp)) { 2444 continue; 2445 } 2446 2447 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 2448 2449 if (rbp == crbp) { 2450 if (rbp->rb_checksum != crbp->rb_checksum) 2451 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2452 break; 2453 } 2454 rbp = crbp; 2455 } 2456 2457 if (rbp == crbp) { 2458 rbp->rb_private = 0; 2459 kmem_free((caddr_t)crbp, recsize); 2460 return; 2461 } 2462 bzero((caddr_t)rbp, recsize); 2463 rbp->rb_magic = MDDB_MAGIC_RB; 2464 rbp->rb_revision = MDDB_REV_RB; 2465 uniqtime32(&rbp->rb_timestamp); 2466 /* Generate the crc for this record */ 2467 rec_crcgen(s, dep, rbp); 2468 kmem_free((caddr_t)crbp, recsize); 2469 } 2470 2471 /* 2472 * writeoptrecord writes out an optimized record. 2473 */ 2474 static int 2475 writeoptrecord( 2476 mddb_set_t *s, 2477 mddb_de_ic_t *dep 2478 ) 2479 { 2480 mddb_rb32_t *rbp; 2481 int li; 2482 int err = 0, wrt_err = 0; 2483 mddb_bf_t *bufhead, *bfp; 2484 mddb_lb_t *lbp = s->s_lbp; 2485 mddb_locator_t *lp; 2486 int i; 2487 2488 #if defined(_ILP32) && !defined(lint) 2489 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2490 #endif 2491 2492 bufhead = NULL; 2493 err = 0; 2494 2495 while (s->s_opthavequeuinglck) { 2496 s->s_optwantqueuinglck++; 2497 cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno)); 2498 } 2499 s->s_opthavequeuinglck++; 2500 rbp = dep->de_rb; 2501 for (i = 0; i < 2; i++) { 2502 /* 2503 * only possible error is xlate. This can 2504 * occur if a replica was off line and came 2505 * back. During the mean time the database grew 2506 * large than the now on line replica can store 2507 */ 2508 if (! 
(dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2509 continue; 2510 li = dep->de_optinfo[i].o_li; 2511 /* 2512 * In a MN diskset, any node can write optimized record(s). 2513 */ 2514 wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 2515 dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); 2516 /* 2517 * For MN diskset, set error in optinfo structure so 2518 * that mddb_commitrec knows which replica failed. 2519 */ 2520 if ((MD_MNSET_SETNO(s->s_setno)) && 2521 (wrt_err & MDDB_F_EWRITE)) { 2522 dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE; 2523 } 2524 err |= wrt_err; 2525 } 2526 s->s_opthavequeuinglck = 0; 2527 if (s->s_optwantqueuinglck) { 2528 s->s_optwantqueuinglck = 0; 2529 cv_broadcast(&s->s_optqueuing_cv); 2530 } 2531 for (bfp = bufhead; bfp; bfp = bufhead) { 2532 mutex_exit(SETMUTEX(s->s_setno)); 2533 (void) biowait(&bfp->bf_buf); 2534 mutex_enter(SETMUTEX(s->s_setno)); 2535 if (bfp->bf_buf.b_flags & B_ERROR) { 2536 /* 2537 * If an MN diskset, don't set replica 2538 * in error since this hasn't been set in master. 2539 * Setting replica in error before master could 2540 * leave the nodes with different views of the 2541 * world since a class 1 configuration change 2542 * could occur in mddb_commitrec as soon as 2543 * all locks are dropped. Must keep this 2544 * node the same as master and can't afford a 2545 * failure from the class 1 config change 2546 * if master succeeded. 2547 */ 2548 if (!(MD_MNSET_SETNO(s->s_setno))) { 2549 bfp->bf_locator->l_flags |= MDDB_F_EWRITE; 2550 } else { 2551 /* 2552 * Find which de_optinfo (which replica) 2553 * had a failure and set the failure in 2554 * the o_flags field. 
2555 */ 2556 lp = &lbp->lb_locators[dep->de_optinfo[0].o_li]; 2557 if (lp == bfp->bf_locator) { 2558 dep->de_optinfo[0].o_flags |= 2559 MDDB_F_EWRITE; 2560 } else { 2561 dep->de_optinfo[1].o_flags |= 2562 MDDB_F_EWRITE; 2563 } 2564 } 2565 err |= MDDB_F_EWRITE; 2566 } 2567 bufhead = bfp->bf_next; 2568 freebuffer(s, bfp); 2569 } 2570 return (err); 2571 } 2572 2573 /* 2574 * Fix up the optimized resync record. Used in the traditional and local 2575 * disksets to move an optimized record from a failed or deleted mddb 2576 * to an active one. 2577 * 2578 * In a MN diskset, the fixing of the optimized record is split between 2579 * the master and slave nodes. If the master node moves the optimized 2580 * resync record, then the master node will send a MDDB_PARSE_OPTRECS 2581 * message to the slave nodes causing the slave nodes to reget the 2582 * directory entry containing the location of the optimized resync record. 2583 * After the record is reread from disk, then writeoptrecord is called 2584 * if the location of the optimized resync record or flags have changed. 2585 * When writeoptrecord is called, the node that is the owner of this record 2586 * will write the optimized record to the location specified in the directory 2587 * entry. Since the master node uses the highest class message (PARSE) 2588 * the record owner node is guaranteed to already have an updated 2589 * directory entry incore. 2590 * 2591 * The other difference between the traditional/local set and MN diskset 2592 * is that the directory entry can be written to disk before the optimized 2593 * record in a MN diskset if the record is owned by a slave node. So, 2594 * the users of an optimized record must handle the failure case when no 2595 * data is available from an optimized record since the master node could 2596 * have failed during the relocation of the optimized record to another mddb. 
 */
static int
fixoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	mddb_db_t	*dbp
)
{
	int		changed;	/* a replica slot was reassigned */
	int		writedata;	/* record data must be rewritten */
	int		err = 0;
	int		i;
	mddb_lb_t	*lbp;
	mddb_optinfo_t	*op;
	mddb_db32_t	*db32p;
	int		rec_owner;	/* Is node owner of record? */

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	lbp = s->s_lbp;
	changed = 0;
	writedata = 0;
	for (i = 0; i < 2; i++) {
		op = &dep->de_optinfo[i];

		/* A slot whose locator is no longer active is dropped */
		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
			op->o_flags = 0;

		/*
		 * If optimized record has seen a replica failure,
		 * assign new replica to record and re-write data
		 * to new record.
		 */
		if (! (op->o_flags & MDDB_F_ACTIVE)) {
			getoptdev(s, dep, i);
			writedata++;
			changed++;
			/* Set flag for slaves to reread dep and write rec */
			if (lbp->lb_flags & MDDB_MNSET) {
				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
			}
		}

		/*
		 * If just an error in the data was seen, set
		 * the optimized record's replica flag to active (ok)
		 * and try again.
		 *
		 * NOTE(review): this resets de_optinfo[0] even when the
		 * slot being examined is i == 1, so MDDB_F_EDATA stays
		 * set on slot 1.  This looks like it was meant to be
		 * op->o_flags -- confirm before changing.
		 */
		if (op->o_flags & MDDB_F_EDATA) {
			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
			writedata++;
		}
	}

	rec_owner = 0;
	if (lbp->lb_flags & MDDB_MNSET) {
		/*
		 * If a MN diskset then check the owner of optimized record.
		 * If the master node owns the record or if there is
		 * no owner of the record, then the master can write the
		 * optimized record to disk.
		 * Master node can write the optimized record now, but
		 * slave nodes write their records during handling of
		 * the MDDB_PARSE_OPTRECS message.
		 */
		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
			rec_owner = 1;
		}
	} else {
		/*
		 * In traditional diskset and local set, this node
		 * is always the record owner and always the master.
		 */
		rec_owner = 1;
	}

	/*
	 * If this node is the record owner, write out record.
	 */
	if ((writedata) && (rec_owner)) {
		if (err = writeoptrecord(s, dep)) {
			return (err);
		}
	}
	/* The directory block only needs rewriting if a slot moved */
	if (! changed)
		return (0);
	uniqtime32(&dbp->db_timestamp);
	dbp->db_revision = MDDB_REV_DB;
	/* Build the 32-bit on-disk image of the directory block */
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
	create_db32rec(db32p, dbp);
	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
	    1, MDDB_WR_ONLY_MASTER);
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	return (err);
}

/*
 * fixoptrecords runs fixoptrecord on every directory entry in the set
 * that is flagged MDDB_F_OPT.
 *
 * Returns 0 on success, or the first non-zero value returned by
 * fixoptrecord.  Returns 0 without doing anything on a non-master node
 * of a MN diskset.
 */
static int
fixoptrecords(
	mddb_set_t	*s
)
{
	mddb_de_ic_t	*dep;
	mddb_db_t	*dbp;
	int		err = 0;
	set_t		setno;

	/*
	 * In a MN diskset, the master node is the only node that runs
	 * fixoptrecords.  If the master node changes anything, then the
	 * master node sends PARSE message to the slave nodes.  The slave
	 * nodes will then re-read in the locator block or re-read in the
	 * directory blocks and re-write the optimized resync records.
	 */
	setno = s->s_setno;
	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[setno].s_am_i_master == 0)) {
		return (0);
	}

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if (! (dep->de_flags & MDDB_F_OPT))
				continue;
			err = fixoptrecord(s, dep, dbp);
			if (err != 0)
				return (err);
		}
	}
	return (0);
}

/*
 * Checks incore version of mddb data to mddb data ondisk.
 *
 * The directory blocks of replica li are compared first, then every
 * record that is neither an optimized record nor a change log record.
 * The set's shared data buffer (s_databuffer) is used as the read
 * buffer and is allocated on first use.
 *
 * Returns:
 *	- 0 if the data was successfully read and is good.
 *	- MDDB_F_EREAD if a read error occurred.
 *	- 1 if the data read is bad (checksum failed, etc)
 */
static int
checkcopy
(
	mddb_set_t	*s,
	int		li
)
{
	mddb_db_t	*dbp;
	mddb_db32_t	*cdb32p;
	mddb_de_ic_t	*dep;
	mddb_de32_t	*cde32p;
	mddb_rb32_t	*rbp, *crbp;
	size_t		size;
	int		i;
	int		retval = 1;	/* "bad data" unless a read fails */

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	/*
	 * First use: size the shared buffer to hold the largest
	 * non-optimized record, and at least one directory block.
	 */
	if (s->s_databuffer_size == 0) {
		size_t maxrecsize = MDDB_BSIZE;

		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
				if (! (dep->de_flags & MDDB_F_OPT) &&
				    dep->de_recsize > maxrecsize)
					maxrecsize = dep->de_recsize;

		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
		s->s_databuffer_size = maxrecsize;
	}

	cdb32p = (mddb_db32_t *)s->s_databuffer;

	/*
	 * first go through and make sure all directory stuff
	 * is the same
	 */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
			retval = MDDB_F_EREAD;
			goto err;
		}
		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
			goto err;
		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
			goto err;
		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
			goto err;
		if (cdb32p->db32_nextblk != dbp->db_nextblk)
			goto err;
		if (cdb32p->db32_recsum != dbp->db_recsum)
			goto err;
		/*
		 * When the on-disk block has entries, the first one
		 * starts immediately after the db32_firstentry field.
		 */
		if (cdb32p->db32_firstentry) {
			cde32p = (mddb_de32_t *)
			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
			    + sizeof (cdb32p->db32_firstentry)));
		} else
			cde32p = NULL;

		dep = dbp->db_firstentry;
		/*
		 * check if all directory entries are identical
		 */
		while (dep && cde32p) {
			if (dep->de_recid != cde32p->de32_recid)
				goto err;
			if (dep->de_type1 != cde32p->de32_type1)
				goto err;
			if (dep->de_type2 != cde32p->de32_type2)
				goto err;
			if (dep->de_reqsize != cde32p->de32_reqsize)
				goto err;
			if (dep->de_flags != cde32p->de32_flags)
				goto err;

			/* Both optimized-record locator indexes must match */
			for (i = 0; i < 2; i++) {
				if (dep->de_optinfo[i].o_li !=
				    cde32p->de32_optinfo[i].o_li)
					break;
			}
			if (i != 2)
				goto err;
			/*
			 * NOTE(review): the block lists are compared using
			 * the incore block count only; the on-disk block
			 * count is not compared here -- confirm that is
			 * intended.
			 */
			size = sizeof (mddb_block_t) * dep->de_blkcount;
			if (bcmp((caddr_t)dep->de_blks,
			    (caddr_t)cde32p->de32_blks, size))
				goto err;
			dep = dep->de_next;
			if (cde32p->de32_next)
				cde32p = nextentry(cde32p);
			else
				cde32p = NULL;
		}
		/* One list ended before the other: entry counts differ */
		if (dep || cde32p)
			goto err;
	}
	/*
	 * If here, all directories are functionally identical
	 * check to make sure all records are identical
	 * the reason the records are not just bcmped is that the
	 * lock flag does not want to be compared.
	 */
	crbp = (mddb_rb32_t *)cdb32p;
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if ((dep->de_flags & MDDB_F_OPT) ||
			    (dep->de_flags & MDDB_F_CHANGELOG))
				continue;
			rbp = (mddb_rb32_t *)dep->de_rb;
			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
			    dep->de_blkcount, li, 0)) {
				retval = MDDB_F_EREAD;
				goto err;
			}
			/* Check the crc for this record */
			if (rec_crcchk(s, dep, crbp))
				goto err;

			if (rbp->rb_checksum != crbp->rb_checksum ||
			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
				goto err;
		}
	}
	return (0);
err:
	return (retval);
}

/*
 * Determine if the location information for two mddbs is the same.
 * The device slice and block offset should match.
If both have devids then 2874 * use that for the comparison, otherwise we compare the dev_ts. 2875 * Comparing with the devid allows us to handle the case where a mddb was 2876 * relocated to a dead mddbs dev_t. The live mddb will have the dev_t of 2877 * the dead mddb but the devid comparison will catch this and not match. 2878 * 2879 * Return 1 if the location of the two mddbs match, 0 if not. 2880 */ 2881 static int 2882 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev, 2883 daddr32_t blkno) 2884 { 2885 if (rip->ri_flags & MDDB_F_EMASTER) { 2886 /* 2887 * If this element is errored then we don't try to match on it. 2888 * If we try to match we could erroneously match on the dev_t 2889 * of a relocated disk. 2890 */ 2891 return (0); 2892 } 2893 2894 if (rip->ri_devid && devid && minor) { 2895 /* 2896 * If old devid exists, then this is a replicated diskset 2897 * and both old and new devids must be checked. 2898 */ 2899 if (rip->ri_old_devid) { 2900 if (((ddi_devid_compare(rip->ri_devid, devid) != 0) && 2901 (ddi_devid_compare(rip->ri_old_devid, 2902 devid) != 0)) || 2903 (strcmp(rip->ri_minor_name, minor) != 0)) 2904 return (0); 2905 } else { 2906 if (ddi_devid_compare(rip->ri_devid, devid) != 0 || 2907 strcmp(rip->ri_minor_name, minor) != 0) 2908 return (0); 2909 } 2910 } else { 2911 if (rip->ri_dev != dev) 2912 return (0); 2913 } 2914 2915 if (rip->ri_blkno != blkno) 2916 return (0); 2917 2918 return (1); 2919 } 2920 2921 static int 2922 ridev( 2923 mddb_ri_t **rip, 2924 mddb_cfg_loc_t *clp, 2925 dev32_t *dev_2b_fixed, 2926 int flag) 2927 { 2928 mddb_ri_t *r, *r1; 2929 md_dev64_t ldev, ndev; 2930 major_t majordev; 2931 int sz; 2932 2933 if (MD_UPGRADE) { 2934 ldev = md_makedevice(md_targ_name_to_major(clp->l_driver), 2935 clp->l_mnum); 2936 } else { 2937 if (ddi_name_to_major(clp->l_driver) == (major_t)-1) 2938 return (EINVAL); 2939 2940 ldev = md_makedevice(ddi_name_to_major(clp->l_driver), 2941 clp->l_mnum); 2942 } 2943 2944 if 
(clp->l_devid != 0) { 2945 /* 2946 * Get dev associated with device id and minor name. 2947 * Setup correct driver name if dev is now different. 2948 * Don't change driver name if during upgrade. 2949 */ 2950 ndev = ldev; 2951 if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid, 2952 &ndev, clp->l_minor_name)) { 2953 if ((ndev != ldev) && (!(MD_UPGRADE))) { 2954 majordev = md_getmajor(ndev); 2955 (void) strcpy(clp->l_driver, 2956 ddi_major_to_name(majordev)); 2957 clp->l_mnum = md_getminor(ndev); 2958 clp->l_devid_flags |= MDDB_DEVID_VALID; 2959 ldev = ndev; 2960 } 2961 } else { 2962 /* Mark as invalid */ 2963 clp->l_devid_flags &= ~MDDB_DEVID_VALID; 2964 } 2965 } 2966 2967 clp->l_dev = md_cmpldev(ldev); 2968 if (dev_2b_fixed) 2969 *dev_2b_fixed = clp->l_dev; 2970 r = *rip; 2971 2972 while (r) { 2973 if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid, 2974 clp->l_minor_name, ldev, clp->l_blkno)) { 2975 if ((clp->l_devid != 0) && 2976 !(clp->l_devid_flags & MDDB_DEVID_VALID)) { 2977 r->ri_flags |= MDDB_F_EMASTER; 2978 } else { 2979 r->ri_flags |= flag; 2980 } 2981 return (0); /* already entered return success */ 2982 } 2983 r = r->ri_next; 2984 } 2985 2986 /* 2987 * This replica not represented in the current rip list, 2988 * so add it to the list. 
2989 */ 2990 r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP); 2991 r->ri_dev = ldev; 2992 r->ri_blkno = clp->l_blkno; 2993 (void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM); 2994 if (strlen(clp->l_driver) >= MD_MAXDRVNM) { 2995 r->ri_driver[(MD_MAXDRVNM -1)] = '\0'; 2996 } 2997 if (clp->l_devname != NULL) { 2998 (void) strcpy(r->ri_devname, clp->l_devname); 2999 } 3000 r->ri_flags |= flag; 3001 if (clp->l_devid != 0) { 3002 sz = clp->l_devid_sz; 3003 r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); 3004 bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz); 3005 3006 if (clp->l_old_devid != NULL) { 3007 sz = clp->l_old_devid_sz; 3008 r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz, 3009 KM_SLEEP); 3010 bcopy((char *)(uintptr_t)clp->l_old_devid, 3011 (char *)r->ri_old_devid, sz); 3012 } else { 3013 r->ri_old_devid = 0; 3014 } 3015 if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX) 3016 (void) strcpy(r->ri_minor_name, clp->l_minor_name); 3017 3018 if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) { 3019 /* 3020 * Devid is present, but not valid. This could 3021 * happen if device has been powered off or if 3022 * the device has been removed. Mark the device in 3023 * error. Don't allow any writes to this device 3024 * based on the dev_t since another device could 3025 * have been placed in its spot and be responding to 3026 * the dev_t accesses. 3027 */ 3028 r->ri_flags |= MDDB_F_EMASTER; 3029 } 3030 } else { 3031 r->ri_devid = 0; 3032 r->ri_old_devid = 0; 3033 } 3034 3035 /* 3036 * If the rip list is empty then this entry 3037 * is the list. 3038 */ 3039 if (*rip == NULL) { 3040 *rip = r; 3041 return (0); 3042 } 3043 3044 /* 3045 * Add this entry to the end of the rip list 3046 */ 3047 r1 = *rip; 3048 while (r1->ri_next) 3049 r1 = r1->ri_next; 3050 r1->ri_next = r; 3051 return (0); 3052 } 3053 3054 /* 3055 * writecopy writes the incore data blocks out to all of the replicas. 
 * This is called from writestart
 *	- when a diskset is started or
 *	- when an error has been encountered during the write to a mddb.
 * and from newdev when a new mddb is being added.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		For MN diskset:
 *			All nodes can call writecopy, but only the
 *			master node actually writes data to the disk
 *			except for optimized resync records.
 *			An optimized resync record can only be written to
 *			by the record owner.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
 */
static int
writecopy(
	mddb_set_t	*s,
	int		li,
	int		flag
)
{
	mddb_db_t	*dbp;
	mddb_db32_t	*db32p;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	uint_t		checksum;
	int		err = 0;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		/* Write the 32-bit on-disk image of this directory block */
		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
		create_db32rec(db32p, dbp);
		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
		    MDDB_WR_ONLY_MASTER);
		kmem_free((caddr_t)db32p, MDDB_BSIZE);
		if (err)
			return (err);
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			/*
			 * In a multinode diskset, when a new master is
			 * chosen the new master may need to write its
			 * incore copy of the mddb to disk.  In this case,
			 * don't want to overwrite the change log records
			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
			 */
			if (flag == MDDB_WRITECOPY_SYNC) {
				if (dep->de_flags & MDDB_F_CHANGELOG)
					continue;
			}
			/*
			 * In a multinode diskset, don't write out optimized
			 * resync records since only the mirror owner node
			 * will have the correct data.  If writecopy is
			 * being called from writestart as a result of
			 * an mddb failure, then writestart will handle
			 * the optimized records when it calls fixoptrecords.
			 */
			if ((MD_MNSET_SETNO(s->s_setno)) &&
			    (dep->de_flags & MDDB_F_OPT)) {
				continue;
			}

			rbp = dep->de_rb;
			/*
			 * Regenerate the record crc and adjust
			 * rb_checksum_fiddle so that the XOR of
			 * rb_checksum and rb_checksum_fiddle is the same
			 * after the update as it was before.
			 */
			checksum = rbp->rb_checksum_fiddle;
			checksum ^= rbp->rb_checksum;
			/* Generate the crc for this record */
			rec_crcgen(s, dep, rbp);
			checksum ^= rbp->rb_checksum;
			rbp->rb_checksum_fiddle = checksum;
			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
			    dep->de_blkcount, li, (mddb_bf_t **)0,
			    MDDB_WR_ONLY_MASTER))
				return (err);
		}
	}
	return (0);
}

/*
 * upd_med pushes the set's current commit count and creation time to
 * the mediator hosts and, when exactly half of the replicas are
 * accessible and enough mediator hosts were updated, marks the mediator
 * data as golden.
 *
 * Panics if fewer than half of the mediator hosts could be updated and
 * no more than half of the existing replicas are accessible.
 *
 * Always returns 0.
 */
static int
upd_med(
	mddb_set_t	*s,
	char		*tag
)
{
	med_data_t	meddb;
	int		medok;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	int		li;
	int		alc;	/* accessible replicas */
	int		lc;	/* existing (not deleted) replicas */


	/* If no mediator hosts, nothing to do */
	if (s->s_med.n_cnt == 0)
		return (0);

	/*
	 * If this is a MN set and we are not the master, then don't
	 * update mediator hosts or mark mediator as golden since
	 * only master node should do that.
	 */
	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[setno].s_am_i_master == 0)) {
		return (0);
	}

	bzero((char *)&meddb, sizeof (med_data_t));
	meddb.med_dat_mag = MED_DATA_MAGIC;
	meddb.med_dat_rev = MED_DATA_REV;
	meddb.med_dat_fl = 0;
	meddb.med_dat_sn = setno;
	meddb.med_dat_cc = lbp->lb_commitcnt;
	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);

	/* count accessible mediators */
	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);

	/* count accessible and existing replicas */
	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		lc++;

		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & MDDB_F_EMASTER) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		alc++;
	}

	/*
	 * Mediator update quorum is >= 50%: check for less than
	 * "mediator update" quorum.
	 */
	if ((medok * 2) < s->s_med.n_cnt) {
		/* panic if <= 50% of all replicas are accessible */
		if ((lc > 0) && ((alc * 2) <= lc)) {
			cmn_err(CE_PANIC,
			    "md: Update of 50%% of the mediator hosts failed");
			/* NOTREACHED */
		}

		cmn_err(CE_WARN,
		    "md: Update of 50%% of the mediator hosts failed");
	}

	/*
	 * If we have mediator update quorum and exactly 50% of the replicas
	 * are accessible then mark the mediator as golden.
	 */
	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
	    ((alc * 2) == lc)) {
		meddb.med_dat_fl = MED_DFL_GOLDEN;
		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
	}

	return (0);
}

/*
 * push_lb stamps the locator block with a new timestamp and the
 * revision that matches the diskset type, then writes it out with
 * writelocall.  Returns the value returned by writelocall.
 */
static int
push_lb(mddb_set_t *s)
{
	mddb_lb_t	*lbp = s->s_lbp;

	/* push the change to all the replicas */
	uniqtime32(&lbp->lb_timestamp);
	if (MD_MNSET_SETNO(s->s_setno)) {
		lbp->lb_revision = MDDB_REV_MNLB;
	} else {
		lbp->lb_revision = MDDB_REV_LB;
	}
	/*
	 * The updates to the mediator hosts are done
	 * by the callers of this function.
	 */
	return (writelocall(s));
}

/*
 * dtl_cmp compares two data tags by set number, serial number, host
 * name and timestamp, in that order.  Returns 0 if they are the same
 * and non-zero otherwise.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
{
	int		diff = 0;

	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
	if (diff)
		return (diff);

	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
	if (diff)
		return (diff);

	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
	if (diff)
		return (diff);

	/*CSTYLED*/
	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
}

/*
 * dtl_addl appends a copy of the data tag ndtp to the set's tag list
 * unless a tag that compares equal is already on it.  The new entry's
 * dt_id is its 1-based position on the list.  Always returns 0.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
{
	int		nextid = 0;
	mddb_dtag_lst_t	**dtlpp = &s->s_dtlp;

	/* Run to the end of the list */
	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
			return (0);
		nextid++;
	}

	/* Add the new member */
	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);

	/* Update the dtag portion of the list */
	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
	    sizeof (mddb_dtag_t));

	/* Fix up the id value */
	(*dtlpp)->dtl_dt.dt_id = ++nextid;

	return (0);
}

/*
 * Even though data tags are not supported in MN disksets, dtl_cntl may
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3308 */ 3309 static int 3310 dtl_cntl(mddb_set_t *s) 3311 { 3312 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3313 int ndt = 0; 3314 3315 while (dtlp != NULL) { 3316 ndt++; 3317 dtlp = dtlp->dtl_nx; 3318 } 3319 3320 return (ndt); 3321 } 3322 3323 /* 3324 * Even though data tags are not supported in MN disksets, dt_cntl may 3325 * be called for a MN diskset since this routine is called even before 3326 * it is known the kind of diskset being read in from disk. 3327 * For a MNdiskset, s_dtlp is 0 so a 0 is returned. 3328 */ 3329 static mddb_dtag_t * 3330 dtl_findl(mddb_set_t *s, int id) 3331 { 3332 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3333 3334 while (dtlp != NULL) { 3335 if (dtlp->dtl_dt.dt_id == id) 3336 return (&dtlp->dtl_dt); 3337 dtlp = dtlp->dtl_nx; 3338 } 3339 return ((mddb_dtag_t *)NULL); 3340 } 3341 3342 /* Should not call for MN diskset since data tags are not supported */ 3343 static void 3344 dtl_freel(mddb_dtag_lst_t **dtlpp) 3345 { 3346 mddb_dtag_lst_t *dtlp; 3347 mddb_dtag_lst_t *tdtlp; 3348 3349 3350 for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) { 3351 dtlp = tdtlp->dtl_nx; 3352 kmem_free(tdtlp, sizeof (mddb_dtag_lst_t)); 3353 } 3354 *dtlpp = (mddb_dtag_lst_t *)NULL; 3355 } 3356 3357 /* 3358 * Even though data tags are not supported in MN disksets, dt_setup will 3359 * be called for a MN diskset since this routine is called even before 3360 * it is known the kind of diskset being read in from disk. 3361 * Once this set is known as a MN diskset, the dtp area will be freed. 
3362 */ 3363 static void 3364 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp) 3365 { 3366 mddb_dt_t *dtp; 3367 set_t setno = s->s_setno; 3368 3369 3370 if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) 3371 md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3372 else if (dtagp == (mddb_dtag_t *)NULL) 3373 bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 3374 3375 /* shorthand */ 3376 dtp = (mddb_dt_t *)md_set[setno].s_dtp; 3377 3378 dtp->dt_mag = MDDB_MAGIC_DT; 3379 dtp->dt_rev = MDDB_REV_DT; 3380 3381 if (dtagp != NULL) 3382 dtp->dt_dtag = *dtagp; /* structure assignment */ 3383 3384 /* Initialize the setno */ 3385 dtp->dt_dtag.dt_setno = setno; 3386 3387 /* Clear the id and flags, this is only used in user land */ 3388 dtp->dt_dtag.dt_id = 0; 3389 3390 /* Checksum it */ 3391 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 3392 } 3393 3394 /* Should not call for MN diskset since data tags are not supported */ 3395 static int 3396 set_dtag(mddb_set_t *s, md_error_t *ep) 3397 { 3398 mddb_lb_t *lbp = s->s_lbp; 3399 mddb_dtag_t tag; 3400 3401 if (lbp->lb_dtblkcnt == 0) { 3402 /* Data tags not used in a MN set - so no failure returned */ 3403 if (lbp->lb_flags & MDDB_MNSET) 3404 return (0); 3405 3406 cmn_err(CE_WARN, 3407 "No tag record allocated, unable to tag data"); 3408 (void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno); 3409 return (1); 3410 } 3411 3412 /* Clear the stack variable */ 3413 bzero((caddr_t)&tag, sizeof (mddb_dtag_t)); 3414 3415 /* Get the HW serial number for this host */ 3416 (void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL)); 3417 tag.dt_sn[MDDB_SN_LEN - 1] = '\0'; 3418 3419 /* Get the nodename that this host goes by */ 3420 (void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME); 3421 tag.dt_hn[MD_MAX_NODENAME] = '\0'; 3422 3423 /* Get a time stamp for NOW */ 3424 uniqtime32(&tag.dt_tv); 3425 3426 /* Setup the data tag record */ 3427 dt_setup(s, &tag); 3428 3429 /* Free any list of tags if they exist */ 
3430 dtl_freel(&s->s_dtlp); 3431 3432 /* Put the new tag onto the tag list */ 3433 (void) dtl_addl(s, &tag); 3434 3435 return (0); 3436 } 3437 3438 /* 3439 * If called during upgrade, this routine expects a non-translated 3440 * (aka target) dev. 3441 * Should not call for MN diskset since data tags are not supported. 3442 */ 3443 static int 3444 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip) 3445 { 3446 int err = 0; 3447 md_dev64_t dev; 3448 caddr_t tbuf; 3449 daddr_t physblk; 3450 mddb_block_t blk; 3451 mddb_dt_t *dtp; 3452 mddb_dtag_t *dtagp; 3453 set_t setno = s->s_setno; 3454 3455 /* If have not allocated a data tag record, there is nothing to do */ 3456 if (lbp->lb_dtblkcnt == 0) 3457 return (1); 3458 3459 dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3460 3461 if (dtp == (mddb_dt_t *)NULL) 3462 return (1); 3463 3464 /* shorthand */ 3465 dev = md_xlate_targ_2_mini(rip->ri_dev); 3466 if (dev == NODEV64) { 3467 return (1); 3468 } 3469 3470 tbuf = (caddr_t)rip->ri_dtp; 3471 3472 for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) { 3473 physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip); 3474 err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0); 3475 /* error reading the tag */ 3476 if (err) { 3477 err = 1; 3478 goto out; 3479 } 3480 tbuf += MDDB_BSIZE; 3481 } 3482 3483 /* magic is valid? */ 3484 if (dtp->dt_mag != MDDB_MAGIC_DT) { 3485 err = 1; 3486 goto out; 3487 } 3488 3489 /* revision is valid? */ 3490 if (revchk(MDDB_REV_DT, dtp->dt_rev)) { 3491 err = 1; 3492 goto out; 3493 } 3494 3495 /* crc is valid? */ 3496 if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) { 3497 err = 1; 3498 goto out; 3499 } 3500 3501 /* shorthand */ 3502 dtagp = &dtp->dt_dtag; 3503 3504 /* set number match? */ 3505 if (dtagp->dt_setno != setno) { 3506 err = 1; 3507 goto out; 3508 } 3509 3510 /* tag is not empty? 
*/ 3511 if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' && 3512 (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) && 3513 dtagp->dt_id == 0) { 3514 err = 2; 3515 goto out; 3516 } 3517 3518 /* Mark the locator as having tagged data */ 3519 rip->ri_flags |= MDDB_F_TAGDATA; 3520 3521 out: 3522 if (err) { 3523 if (err == 1) { 3524 md_set_setstatus(setno, MD_SET_BADTAG); 3525 rip->ri_flags |= MDDB_F_BADTAG; 3526 } 3527 if (dtp != NULL) { 3528 kmem_free(dtp, MDDB_DT_BYTES); 3529 rip->ri_dtp = (mddb_dt_t *)NULL; 3530 } 3531 } 3532 3533 return (err); 3534 } 3535 3536 /* Should not call for MN diskset since data tags are not supported */ 3537 static int 3538 dt_write(mddb_set_t *s) 3539 { 3540 int li; 3541 int err = 0; 3542 int werr; 3543 int empty_tag = 0; 3544 mddb_dtag_t *dtagp; 3545 mddb_dt_t *dtp; 3546 mddb_lb_t *lbp = s->s_lbp; 3547 set_t setno = s->s_setno; 3548 uint_t set_status = md_get_setstatus(setno); 3549 3550 3551 ASSERT(md_set[setno].s_dtp != NULL); 3552 3553 /* Nowhere to write to */ 3554 if (lbp->lb_dtblkcnt == 0) 3555 return (err); 3556 3557 if (set_status & MD_SET_BADTAG) 3558 return (err); 3559 3560 /* shorthand */ 3561 dtp = (mddb_dt_t *)md_set[setno].s_dtp; 3562 dtagp = &dtp->dt_dtag; 3563 3564 /* See if the tag is empty. */ 3565 if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' && 3566 (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) && 3567 dtagp->dt_id == 0) 3568 empty_tag = 1; 3569 3570 /* Write the tag to the locators and reset appropriate flags. */ 3571 for (li = 0; li < lbp->lb_loccnt; li++) { 3572 mddb_locator_t *lp = &lbp->lb_locators[li]; 3573 3574 if ((! 
(lp->l_flags & MDDB_F_ACTIVE)) || 3575 (lp->l_flags & MDDB_F_DELETED) || 3576 (lp->l_flags & MDDB_F_EWRITE)) 3577 continue; 3578 3579 werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk, 3580 MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER); 3581 3582 if (werr) { 3583 err |= werr; 3584 continue; 3585 } 3586 3587 if (empty_tag) 3588 lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA); 3589 else { 3590 lp->l_flags |= MDDB_F_TAGDATA; 3591 lp->l_flags &= ~MDDB_F_BADTAG; 3592 } 3593 } 3594 3595 if (err) 3596 return (err); 3597 3598 3599 /* If the tags were written, check to see if any tags remain. */ 3600 for (li = 0; li < lbp->lb_loccnt; li++) { 3601 mddb_locator_t *lp = &lbp->lb_locators[li]; 3602 3603 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 3604 (lp->l_flags & MDDB_F_DELETED) || 3605 (lp->l_flags & MDDB_F_EWRITE)) 3606 continue; 3607 3608 if (lp->l_flags & MDDB_F_TAGDATA) 3609 break; 3610 } 3611 3612 /* If there are no tags, then clear CLRTAG and TAGDATA */ 3613 if (li == lbp->lb_loccnt) { 3614 md_clr_setstatus(setno, MD_SET_CLRTAG); 3615 md_clr_setstatus(setno, MD_SET_TAGDATA); 3616 } 3617 3618 return (err); 3619 } 3620 3621 /* Should not call for MN diskset since data tags are not supported */ 3622 static int 3623 dt_alloc_if_needed(mddb_set_t *s) 3624 { 3625 int i; 3626 int li; 3627 int moveit = 0; 3628 mddb_lb_t *lbp = s->s_lbp; 3629 mddb_block_t blkcnt = lbp->lb_dtblkcnt; 3630 set_t setno = s->s_setno; 3631 uint_t set_status = md_get_setstatus(setno); 3632 3633 /* 3634 * If the data tag record is allocated (blkcnt != 0) and a bad tag was 3635 * not detected, there is nothing to do. 3636 */ 3637 if (blkcnt != 0 && ! 
(set_status & MD_SET_BADTAG)) 3638 return (0); 3639 3640 /* Bitmap not setup, checks can't be done */ 3641 if (s->s_totalblkcnt == 0) 3642 return (0); 3643 3644 /* While reading the tag(s) an invalid tag data record was seen */ 3645 if (set_status & MD_SET_BADTAG) 3646 /* See if the invalid tag needs to be moved */ 3647 for (i = 0; i < MDDB_DT_BLOCKS; i++) 3648 if (blkcheck(s, (i + lbp->lb_dtfirstblk))) { 3649 moveit = 1; 3650 break; 3651 } 3652 3653 /* Need to move or allocate the tag data record */ 3654 if (moveit || blkcnt == 0) { 3655 lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS); 3656 if (lbp->lb_dtfirstblk == 0) { 3657 cmn_err(CE_WARN, 3658 "Unable to allocate data tag record"); 3659 return (0); 3660 } 3661 lbp->lb_dtblkcnt = MDDB_DT_BLOCKS; 3662 3663 /* Mark the locators so that they get written to disk. */ 3664 for (li = 0; li < lbp->lb_loccnt; li++) { 3665 mddb_locator_t *lp = &lbp->lb_locators[li]; 3666 3667 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 3668 (lp->l_flags & MDDB_F_DELETED) || 3669 (lp->l_flags & MDDB_F_EWRITE)) 3670 continue; 3671 3672 lp->l_flags |= MDDB_F_BADTAG; 3673 } 3674 return (1); 3675 } 3676 3677 /* 3678 * Make sure the blocks are owned, since the calculation in 3679 * computefreeblks() is bypassed when MD_SET_BADTAG is set. 3680 */ 3681 for (i = 0; i < MDDB_DT_BLOCKS; i++) 3682 blkbusy(s, (i + lbp->lb_dtfirstblk)); 3683 3684 return (1); 3685 } 3686 3687 /* 3688 * Writestart writes the incore mddb out to all of the replicas. 3689 * This is called when a diskset is started and when an error has 3690 * been enountered during the write to a mddb. 3691 * 3692 * flag can be 2 values: 3693 * MDDB_WRITECOPY_ALL - write all records to all mddbs. This is 3694 * always used for traditional and local disksets. 3695 * This is the normal path for MN disksets since the slave 3696 * nodes aren't actually allowed to write to disk. 3697 * MDDB_WRITECOPY_SYNC - special case for MN diskset. 
When a new 3698 * master has been chosen, the new master may need to 3699 * write its incore mddb to disk (this is the case where the 3700 * old master had executed a message but hadn't relayed it 3701 * to this slave yet). New master should not write the 3702 * change log records since new master would be overwriting 3703 * valuable data. Only used during a reconfig cycle. 3704 */ 3705 static int 3706 writestart( 3707 mddb_set_t *s, 3708 int flag 3709 ) 3710 { 3711 int li; 3712 mddb_locator_t *lp; 3713 mddb_lb_t *lbp; 3714 mddb_ln_t *lnp; 3715 int err = 0; 3716 uint_t set_status; 3717 3718 lbp = s->s_lbp; 3719 3720 for (li = 0; li < lbp->lb_loccnt; li++) { 3721 lp = &lbp->lb_locators[li]; 3722 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3723 continue; 3724 if (! (lp->l_flags & MDDB_F_SUSPECT)) 3725 continue; 3726 if (writecopy(s, li, flag)) 3727 return (1); 3728 lp->l_flags |= MDDB_F_UP2DATE; 3729 } 3730 3731 for (li = 0; li < lbp->lb_loccnt; li++) { 3732 lp = &lbp->lb_locators[li]; 3733 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3734 continue; 3735 if ((lp->l_flags & MDDB_F_UP2DATE)) 3736 continue; 3737 if (checkcopy(s, li)) 3738 if (err = writecopy(s, li, flag)) 3739 return (1); 3740 lp->l_flags |= MDDB_F_UP2DATE; 3741 } 3742 3743 /* 3744 * Call fixoptrecord even during a reconfig cycle since a replica 3745 * failure may force the master to re-assign the optimized 3746 * resync record to another replica. 
3747 */ 3748 if (fixoptrecords(s)) 3749 return (1); 3750 3751 set_status = md_get_setstatus(s->s_setno); 3752 3753 /* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */ 3754 for (li = 0; li < lbp->lb_loccnt; li++) { 3755 lp = &lbp->lb_locators[li]; 3756 3757 if (lp->l_flags & MDDB_F_DELETED) 3758 continue; 3759 3760 if (((lp->l_flags & MDDB_F_ACTIVE) != 0 && 3761 (lp->l_flags & MDDB_F_OLDACT) == 0) || 3762 ((lp->l_flags & MDDB_F_ACTIVE) == 0 && 3763 (lp->l_flags & MDDB_F_OLDACT) != 0)) 3764 break; 3765 3766 if ((set_status & MD_SET_TAGDATA) || 3767 (set_status & MD_SET_CLRTAG)) 3768 if ((lp->l_flags & MDDB_F_TAGDATA) || 3769 (lp->l_flags & MDDB_F_BADTAG)) 3770 break; 3771 } 3772 3773 /* 3774 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) 3775 * the lbp identifier and the set identifier doesn't match. 3776 */ 3777 if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) { 3778 3779 /* Only call for traditional and local sets */ 3780 if (!(lbp->lb_flags & MDDB_MNSET)) 3781 (void) dt_write(s); 3782 3783 setidentifier(s, &lbp->lb_ident); 3784 3785 if (err = push_lb(s)) { 3786 (void) upd_med(s, "writestart(0)"); 3787 return (err); 3788 } 3789 3790 (void) upd_med(s, "writestart(0)"); 3791 3792 if (err = push_lb(s)) { 3793 (void) upd_med(s, "writestart(1)"); 3794 return (err); 3795 } 3796 3797 (void) upd_med(s, "writestart(1)"); 3798 3799 lnp = s->s_lnp; 3800 uniqtime32(&lnp->ln_timestamp); 3801 if (lbp->lb_flags & MDDB_MNSET) 3802 lnp->ln_revision = MDDB_REV_MNLN; 3803 else 3804 lnp->ln_revision = MDDB_REV_LN; 3805 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 3806 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 3807 lbp->lb_lnblkcnt, 0); 3808 /* 3809 * If a MN diskset and this is the master, set the PARSE_LOCNM 3810 * flag in the mddb_set structure to show that the locator 3811 * names have changed. 
3812 * Don't set parseflags as a result of a new master sync 3813 * during reconfig cycle since slaves nodes are already 3814 * in-sync with the new master. 3815 */ 3816 3817 if ((lbp->lb_flags & MDDB_MNSET) && 3818 (md_set[s->s_setno].s_am_i_master) && 3819 (flag != MDDB_WRITECOPY_SYNC)) { 3820 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 3821 } 3822 3823 if (err) 3824 return (err); 3825 } 3826 3827 for (li = 0; li < lbp->lb_loccnt; li++) { 3828 lp = &lbp->lb_locators[li]; 3829 if (lp->l_flags & MDDB_F_DELETED) 3830 continue; 3831 if (lp->l_flags & MDDB_F_ACTIVE) { 3832 lp->l_flags |= MDDB_F_OLDACT; 3833 } else { 3834 lp->l_flags &= ~MDDB_F_OLDACT; 3835 } 3836 } 3837 3838 md_clr_setstatus(s->s_setno, MD_SET_STALE); 3839 3840 return (0); 3841 } 3842 3843 /* 3844 * selectreplicas selects the working replicas and may write the incore 3845 * version of the mddb out to the replicas ondisk. 3846 * 3847 * flag can be 3 values: 3848 * MDDB_RETRYSCAN - quick scan to see if there is an error. 3849 * If no new error, returns without writing mddb 3850 * to disks. If a new error is seen, writes out 3851 * mddb to disks. 3852 * MDDB_SCANALL - lengthy scan to check out mddbs and always writes 3853 * out mddb to the replica ondisk. Calls writecopy 3854 * with MDDB_WRITECOPY_ALL flag which writes out 3855 * all records to the replicas ondisk. 3856 * MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore 3857 * and ondisk mddbs by writing incore values to disk. 3858 * Calls writecopy with MDDB_WRITECOPY_SYNC flag so 3859 * that change log records are not written out. 3860 * Only used by MN disksets. 3861 * 3862 * Returns: 3863 * 0 - Successful 3864 * 1 - Unable to write incore mddb data to disk since < 50% replicas. 
3865 */ 3866 int 3867 selectreplicas( 3868 mddb_set_t *s, 3869 int flag 3870 ) 3871 { 3872 int li; 3873 int alc; 3874 int lc; 3875 mddb_locator_t *lp; 3876 mddb_lb_t *lbp = s->s_lbp; 3877 set_t setno = s->s_setno; 3878 int wc_flag; 3879 3880 /* 3881 * can never transition from stale to not stale 3882 */ 3883 if (md_get_setstatus(setno) & MD_SET_STALE) { 3884 for (li = 0; li < lbp->lb_loccnt; li++) { 3885 lp = &lbp->lb_locators[li]; 3886 if (lp->l_flags & MDDB_F_DELETED) 3887 continue; 3888 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3889 lp->l_flags |= MDDB_F_ACTIVE; 3890 } else { 3891 lp->l_flags &= ~MDDB_F_ACTIVE; 3892 } 3893 } 3894 return (1); 3895 } 3896 3897 if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) { 3898 for (li = 0; li < lbp->lb_loccnt; li++) { 3899 lp = &lbp->lb_locators[li]; 3900 if (lp->l_flags & MDDB_F_DELETED) 3901 continue; 3902 if (lp->l_flags & MDDB_F_ACTIVE) { 3903 lp->l_flags |= MDDB_F_OLDACT; 3904 lp->l_flags &= ~MDDB_F_SUSPECT; 3905 } else { 3906 lp->l_flags |= MDDB_F_SUSPECT; 3907 lp->l_flags &= ~MDDB_F_OLDACT; 3908 } 3909 3910 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3911 lp->l_flags |= MDDB_F_ACTIVE; 3912 lp->l_flags &= ~MDDB_F_EWRITE; 3913 lp->l_flags &= ~MDDB_F_TOOSMALL; 3914 } else { 3915 lp->l_flags &= ~MDDB_F_ACTIVE; 3916 } 3917 } 3918 computefreeblks(s); /* set up free block bits */ 3919 } else { 3920 for (li = 0; li < lbp->lb_loccnt; li++) { 3921 lp = &lbp->lb_locators[li]; 3922 if (! 
(lp->l_flags & MDDB_F_ACTIVE)) 3923 continue; 3924 if (lp->l_flags & MDDB_F_EWRITE) 3925 break; 3926 } 3927 3928 /* 3929 * if there are no errors this is error has already 3930 * been processed return current state 3931 */ 3932 if (li == lbp->lb_loccnt) 3933 return (md_get_setstatus(setno) & MD_SET_TOOFEW); 3934 3935 lp->l_flags &= ~MDDB_F_ACTIVE; 3936 do { 3937 lp = &lbp->lb_locators[li]; 3938 lp->l_flags &= ~MDDB_F_UP2DATE; 3939 } while (++li < lbp->lb_loccnt); 3940 } 3941 3942 alc = 0; 3943 lc = 0; 3944 for (li = 0; li < lbp->lb_loccnt; li++) { 3945 lp = &lbp->lb_locators[li]; 3946 if (lp->l_flags & MDDB_F_DELETED) 3947 continue; 3948 lc++; 3949 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3950 continue; 3951 alc++; 3952 } 3953 3954 if (alc < ((lc + 1) / 2)) { 3955 md_set_setstatus(setno, MD_SET_TOOFEW); 3956 return (1); 3957 } 3958 3959 /* Set wc_flag based on flag passed in. */ 3960 if (flag == MDDB_SCANALLSYNC) 3961 wc_flag = MDDB_WRITECOPY_SYNC; 3962 else 3963 wc_flag = MDDB_WRITECOPY_ALL; 3964 3965 do { 3966 if (! writestart(s, wc_flag)) { 3967 md_clr_setstatus(setno, MD_SET_TOOFEW); 3968 return (0); 3969 } 3970 alc = 0; 3971 for (li = 0; li < lbp->lb_loccnt; li++) { 3972 lp = &lbp->lb_locators[li]; 3973 if ((lp->l_flags & MDDB_F_DELETED) || 3974 (lp->l_flags & MDDB_F_EMASTER)) 3975 continue; 3976 3977 if (lp->l_flags & MDDB_F_EWRITE) { 3978 lp->l_flags &= ~MDDB_F_ACTIVE; 3979 lp->l_flags &= ~MDDB_F_UP2DATE; 3980 continue; 3981 } 3982 alc++; 3983 } 3984 } while (alc >= ((lc + 1) / 2)); 3985 md_set_setstatus(setno, MD_SET_TOOFEW); 3986 return (1); 3987 } 3988 3989 static int 3990 checkstate( 3991 mddb_set_t *s, 3992 int probe 3993 ) 3994 { 3995 int error; 3996 uint_t set_status = md_get_setstatus(s->s_setno); 3997 3998 ASSERT(s != NULL); 3999 4000 if (! (set_status & MD_SET_STALE) && ! 
(set_status & MD_SET_TOOFEW)) 4001 return (0); 4002 4003 if (probe == MDDB_NOPROBE) 4004 return (1); 4005 4006 single_thread_start(s); 4007 error = selectreplicas(s, MDDB_SCANALL); 4008 single_thread_end(s); 4009 4010 if (error == 0 && s->s_zombie != 0) { 4011 mutex_exit(SETMUTEX(s->s_setno)); 4012 error = mddb_deleterec(s->s_zombie); 4013 mutex_enter(SETMUTEX(s->s_setno)); 4014 if (error == 0) 4015 s->s_zombie = 0; 4016 } 4017 return (error); 4018 } 4019 4020 static int 4021 writeretry( 4022 mddb_set_t *s 4023 ) 4024 { 4025 if (selectreplicas(s, MDDB_RETRYSCAN)) 4026 if (selectreplicas(s, MDDB_SCANALL)) 4027 return (1); 4028 return (0); 4029 } 4030 4031 static void 4032 free_mbipp(mddb_mb_ic_t **mbipp) 4033 { 4034 mddb_mb_ic_t *mbip1, *mbip2; 4035 4036 for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) { 4037 mbip2 = mbip1->mbi_next; 4038 kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE); 4039 } 4040 *mbipp = (mddb_mb_ic_t *)NULL; 4041 } 4042 4043 static mddb_ri_t * 4044 save_rip(mddb_set_t *s) 4045 { 4046 mddb_ri_t *trip = s->s_rip; 4047 mddb_ri_t *nrip = NULL; 4048 mddb_ri_t **nripp = &nrip; 4049 mddb_ri_t *rip; 4050 4051 while (trip) { 4052 /* Run to the end of the list */ 4053 for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next) 4054 /* void */; 4055 4056 /* Add the new member */ 4057 *nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP); 4058 4059 ASSERT(*nripp != NULL); 4060 4061 /* shorthand */ 4062 rip = *nripp; 4063 4064 *rip = *trip; /* structure assignment */ 4065 4066 /* Clear the stuff that is not needed for hints */ 4067 rip->ri_flags = 0; 4068 rip->ri_commitcnt = 0; 4069 rip->ri_transplant = 0; 4070 rip->ri_mbip = (mddb_mb_ic_t *)NULL; 4071 rip->ri_dtp = (mddb_dt_t *)NULL; 4072 rip->ri_lbp = (mddb_lb_t *)NULL; 4073 rip->ri_did_icp = (mddb_did_ic_t *)NULL; 4074 rip->ri_devid = (ddi_devid_t)NULL; 4075 rip->ri_old_devid = (ddi_devid_t)NULL; 4076 rip->ri_next = (mddb_ri_t *)NULL; 4077 4078 trip = trip->ri_next; 4079 } 4080 return (nrip); 4081 } 4082 4083 
static void 4084 free_rip(mddb_ri_t **ripp) 4085 { 4086 mddb_ri_t *rip; 4087 mddb_ri_t *arip; 4088 4089 for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) { 4090 arip = rip->ri_next; 4091 if (rip->ri_devid != (ddi_devid_t)NULL) { 4092 ddi_devid_free(rip->ri_devid); 4093 rip->ri_devid = (ddi_devid_t)NULL; 4094 } 4095 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 4096 ddi_devid_free(rip->ri_old_devid); 4097 rip->ri_old_devid = (ddi_devid_t)NULL; 4098 } 4099 kmem_free((caddr_t)rip, sizeof (*rip)); 4100 } 4101 *ripp = (mddb_ri_t *)NULL; 4102 } 4103 4104 /* 4105 * this routine selects the correct replica to use 4106 * the rules are as follows 4107 * 1. if all replica has same init time select highest commit count 4108 * 2. if some but not all replicas are from another hostid discard 4109 * them. 4110 * 3. find which init time is present is most replicas 4111 * 4. discard all replicas which do not match most init times 4112 * 5. select replica with highest commit count 4113 */ 4114 4115 static mddb_lb_t * 4116 selectlocator( 4117 mddb_set_t *s 4118 ) 4119 { 4120 mddb_ri_t *rip = s->s_rip; 4121 mddb_ri_t *r, *r1; 4122 mddb_lb_t *lbp; 4123 struct timeval32 *tp = (struct timeval32 *)NULL; 4124 int different; 4125 int same; 4126 int count; 4127 int maxcount; 4128 set_t setno = s->s_setno; 4129 size_t sz; 4130 int mn_set = 0; 4131 4132 /* Clear the ri_transplant flag on all the rip entries. */ 4133 /* Set ri_commitcnt to locator's commitcnt - if available */ 4134 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4135 r->ri_transplant = 0; 4136 if (r->ri_lbp != (mddb_lb_t *)NULL) { 4137 r->ri_commitcnt = r->ri_lbp->lb_commitcnt; 4138 /* If any locators have MN bit set, set flag */ 4139 if (r->ri_lbp->lb_flags & MDDB_MNSET) 4140 mn_set = 1; 4141 } 4142 } 4143 4144 /* 4145 * A data tag is being used, so use it to limit the selection first. 4146 * Data tags not used in MN diskset. 
4147 */ 4148 if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) { 4149 mddb_dt_t *dtp = (mddb_dt_t *)md_set[setno].s_dtp; 4150 4151 /* 4152 * now toss any locators that have a different data tag 4153 */ 4154 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4155 if (r->ri_lbp == (mddb_lb_t *)NULL) 4156 continue; 4157 4158 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4159 /* If same tag, keep it */ 4160 if (dtl_cmp(&dtp->dt_dtag, 4161 &r->ri_dtp->dt_dtag) == 0) 4162 continue; 4163 } 4164 4165 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4166 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4167 r->ri_dtp = (mddb_dt_t *)NULL; 4168 } 4169 4170 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4171 if (!(md_get_setstatus(setno) & 4172 MD_SET_REPLICATED_IMPORT)) { 4173 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4174 sz = ddi_devid_sizeof(r->ri_old_devid); 4175 kmem_free((caddr_t)r->ri_old_devid, sz); 4176 r->ri_old_devid = (ddi_devid_t)NULL; 4177 } 4178 } 4179 4180 kmem_free((caddr_t)r->ri_lbp, 4181 dbtob(r->ri_lbp->lb_blkcnt)); 4182 r->ri_lbp = (mddb_lb_t *)NULL; 4183 4184 r->ri_transplant = 1; 4185 } 4186 4187 /* Tag used, clear the bit */ 4188 md_clr_setstatus(s->s_setno, MD_SET_USETAG); 4189 4190 if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) { 4191 /* 4192 * Get rid of the list of tags. 4193 */ 4194 dtl_freel(&s->s_dtlp); 4195 4196 /* 4197 * Re-create the list with the tag used. 4198 */ 4199 (void) dtl_addl(s, &dtp->dt_dtag); 4200 } 4201 } 4202 4203 /* 4204 * scan to see if all replicas have same time 4205 */ 4206 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4207 if (r->ri_lbp == (mddb_lb_t *)NULL) 4208 continue; 4209 if (tp == NULL) { 4210 tp = &r->ri_lbp->lb_inittime; 4211 continue; 4212 } 4213 /* CSTYLED */ 4214 if (timercmp(tp, &r->ri_lbp->lb_inittime, !=)) 4215 break; 4216 } 4217 4218 /* 4219 * if r == NULL then they were all them same. 
Choose highest 4220 * commit count 4221 */ 4222 if (r == (mddb_ri_t *)NULL) 4223 goto out; 4224 4225 /* 4226 * If here, a bogus replica is present and at least 1 lb_inittime 4227 * did not match. 4228 */ 4229 4230 /* 4231 * look and see if any but not all are from different id 4232 */ 4233 4234 different = 0; 4235 same = 0; 4236 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4237 if (r->ri_lbp == (mddb_lb_t *)NULL) 4238 continue; 4239 if (cmpidentifier(s, &r->ri_lbp->lb_ident)) 4240 different = 1; 4241 else 4242 same = 1; 4243 } 4244 4245 /* 4246 * now go through and throw out different if there are some 4247 * that are the same 4248 */ 4249 if (different != 0 && same != 0) { 4250 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4251 if (r->ri_lbp == (mddb_lb_t *)NULL) 4252 continue; 4253 4254 if (!cmpidentifier(s, &r->ri_lbp->lb_ident)) 4255 continue; 4256 4257 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4258 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4259 r->ri_dtp = (mddb_dt_t *)NULL; 4260 } 4261 4262 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4263 if (!(md_get_setstatus(setno) & 4264 MD_SET_REPLICATED_IMPORT)) { 4265 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4266 sz = ddi_devid_sizeof(r->ri_old_devid); 4267 kmem_free((caddr_t)r->ri_old_devid, sz); 4268 r->ri_old_devid = (ddi_devid_t)NULL; 4269 } 4270 } 4271 4272 kmem_free((caddr_t)r->ri_lbp, 4273 dbtob(r->ri_lbp->lb_blkcnt)); 4274 r->ri_lbp = (mddb_lb_t *)NULL; 4275 4276 r->ri_transplant = 1; 4277 } 4278 } 4279 4280 /* 4281 * go through and pick highest. 
Use n square because it is 4282 * simple and 40 some is max possible 4283 */ 4284 maxcount = 0; 4285 lbp = (mddb_lb_t *)NULL; 4286 for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) { 4287 if (r1->ri_lbp == (mddb_lb_t *)NULL) 4288 continue; 4289 count = 0; 4290 for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4291 if (r->ri_lbp == (mddb_lb_t *)NULL) 4292 continue; 4293 if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */ 4294 &r->ri_lbp->lb_inittime, ==)) 4295 count++; 4296 } 4297 if (count > maxcount) { 4298 maxcount = count; 4299 lbp = r1->ri_lbp; 4300 } 4301 } 4302 4303 /* 4304 * now go though and toss any that are of a different time stamp 4305 */ 4306 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4307 if (r->ri_lbp == (mddb_lb_t *)NULL) 4308 continue; 4309 if (timercmp(&lbp->lb_inittime, /* CSTYLED */ 4310 &r->ri_lbp->lb_inittime, ==)) 4311 continue; 4312 4313 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4314 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4315 r->ri_dtp = (mddb_dt_t *)NULL; 4316 } 4317 4318 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4319 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4320 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4321 sz = ddi_devid_sizeof(r->ri_old_devid); 4322 kmem_free((caddr_t)r->ri_old_devid, sz); 4323 r->ri_old_devid = (ddi_devid_t)NULL; 4324 } 4325 } 4326 4327 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4328 r->ri_lbp = (mddb_lb_t *)NULL; 4329 4330 r->ri_transplant = 1; 4331 } 4332 4333 out: 4334 /* 4335 * Find the locator with the highest commit count, and make it the 4336 * "chosen" one. 4337 */ 4338 lbp = (mddb_lb_t *)NULL; 4339 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4340 if (r->ri_lbp == (mddb_lb_t *)NULL) 4341 continue; 4342 4343 if (lbp == NULL) { 4344 lbp = r->ri_lbp; 4345 continue; 4346 } 4347 4348 if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt) 4349 lbp = r->ri_lbp; 4350 } 4351 4352 /* Toss all locator blocks, except the "chosen" one. 
*/ 4353 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4354 if (r->ri_lbp == (mddb_lb_t *)NULL) 4355 continue; 4356 4357 /* Get rid of all dtp's */ 4358 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4359 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4360 r->ri_dtp = (mddb_dt_t *)NULL; 4361 } 4362 4363 if (r->ri_lbp == lbp) 4364 continue; 4365 4366 /* Get rid of extra locator devid block info */ 4367 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4368 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4369 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4370 sz = ddi_devid_sizeof(r->ri_old_devid); 4371 kmem_free((caddr_t)r->ri_old_devid, sz); 4372 r->ri_old_devid = (ddi_devid_t)NULL; 4373 } 4374 } 4375 4376 /* Get rid of extra locators */ 4377 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4378 r->ri_lbp = (mddb_lb_t *)NULL; 4379 } 4380 return (lbp); 4381 } 4382 4383 static void 4384 locator2cfgloc( 4385 mddb_lb_t *lbp, 4386 mddb_cfg_loc_t *clp, 4387 int li, 4388 side_t sideno, 4389 mddb_did_ic_t *did_icp 4390 ) 4391 { 4392 mddb_drvnm_t *dn; 4393 mddb_locator_t *lp = &lbp->lb_locators[li]; 4394 mddb_sidelocator_t *slp; 4395 mddb_mnsidelocator_t *mnslp; 4396 mddb_did_info_t *did_info; 4397 int i, sz, szalloc; 4398 int mn_set = 0; 4399 mddb_mnlb_t *mnlbp; 4400 4401 if (lbp->lb_flags & MDDB_MNSET) { 4402 mn_set = 1; 4403 mnlbp = (mddb_mnlb_t *)lbp; 4404 for (i = 0; i < MD_MNMAXSIDES; i++) { 4405 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4406 if (mnslp->mnl_sideno == sideno) 4407 break; 4408 } 4409 if (i == MD_MNMAXSIDES) 4410 return; 4411 } else { 4412 slp = &lbp->lb_sidelocators[sideno][li]; 4413 } 4414 4415 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4416 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 4417 if (did_info->info_flags & MDDB_DID_EXISTS) { 4418 sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); 4419 if (clp->l_devid_flags & MDDB_DEVID_SPACE) { 4420 /* 4421 * copy device id from mddb to 4422 * cfg_loc structure 4423 */ 4424 szalloc 
= clp->l_devid_sz; 4425 if (sz <= szalloc) { 4426 for (i = 0; i < sz; i++) { 4427 ((char *)(uintptr_t) 4428 clp->l_devid)[i] = 4429 ((char *)did_icp-> 4430 did_ic_devid[li])[i]; 4431 } 4432 clp->l_devid_flags |= MDDB_DEVID_VALID; 4433 (void) strcpy(clp->l_minor_name, 4434 did_info->info_minor_name); 4435 } else { 4436 clp->l_devid_flags |= 4437 MDDB_DEVID_NOSPACE; 4438 } 4439 } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { 4440 clp->l_devid_flags = MDDB_DEVID_SZ; 4441 clp->l_devid_sz = sz; 4442 } 4443 } 4444 } 4445 4446 /* 4447 * Even if a devid exists, use the dev, drvnm and mnum in the locators 4448 * and sidelocators. During startup, the dev, drvnm and mnum in 4449 * these structures may not match the devid (the locators and 4450 * sidelocators will be updated to match the devid by the routine 4451 * load_old_replicas). Using out-of-sync values won't cause any 4452 * problems since ridev will re-derive these from the devid and mnum. 4453 * After startup, the dev, drvnm and mnum in these structures have 4454 * been updated and can be used. 4455 */ 4456 4457 clp->l_blkno = lp->l_blkno; 4458 clp->l_flags = lp->l_flags; 4459 clp->l_dev = lp->l_dev; 4460 4461 if (mn_set) { 4462 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 4463 clp->l_mnum = mnslp->mnl_mnum; 4464 } else { 4465 dn = &lbp->lb_drvnm[slp->l_drvnm_index]; 4466 clp->l_mnum = slp->l_mnum; 4467 } 4468 (void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM); 4469 } 4470 4471 /* 4472 * Find the index into the mnsidelocator where entry will go. 4473 * Then index can be fed into both splitname2locatorblocks and 4474 * cfgloc2locator so that those entries can be kept in sync. 
4475 * 4476 * Returns: 4477 * -1 if failed to find unused slot or if a traditional diskset 4478 * index, if successful (0 <= index <= MD_MNMAXSIDES) 4479 */ 4480 static int 4481 checklocator( 4482 mddb_lb_t *lbp, 4483 int li, 4484 side_t sideno 4485 ) 4486 { 4487 uchar_t i; 4488 mddb_mnsidelocator_t *mnslp; 4489 mddb_mnlb_t *mnlbp; 4490 int index = -1; 4491 4492 if (lbp->lb_flags & MDDB_MNSET) { 4493 /* 4494 * Checking side locator structure. First, check if 4495 * there is already an entry for this side. If so, 4496 * then use that entry. Otherwise, find an entry 4497 * that has a sideno of 0. 4498 */ 4499 mnlbp = (mddb_mnlb_t *)lbp; 4500 for (i = 0; i < MD_MNMAXSIDES; i++) { 4501 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4502 if (mnslp->mnl_sideno == sideno) { 4503 /* Found a match - stop looking */ 4504 index = i; 4505 break; 4506 } else if ((mnslp->mnl_sideno == 0) && (index == -1)) { 4507 /* Set first empty slot, but keep looking */ 4508 index = i; 4509 } 4510 } 4511 /* Didn't find empty slot or previously used slot */ 4512 if ((i == MD_MNMAXSIDES) && (index == -1)) { 4513 return (-1); 4514 } 4515 return (index); 4516 } else 4517 return (0); 4518 } 4519 4520 /* 4521 * Takes locator information (driver name, minor number, sideno) and 4522 * stores it in the locator block. 4523 * For traditional diskset, the sideno is the index into the sidelocator 4524 * array in the locator block. 4525 * For the MN diskset, the sideno is the nodeid which can be any number, 4526 * so the index passed in is the index into the mnsidelocator array 4527 * in the locator block. 
4528 */ 4529 static int 4530 cfgloc2locator( 4531 mddb_lb_t *lbp, 4532 mddb_cfg_loc_t *clp, 4533 int li, 4534 side_t sideno, 4535 int index /* Only useful in MNsets when > 1 */ 4536 ) 4537 { 4538 uchar_t i; 4539 mddb_sidelocator_t *slp; 4540 mddb_mnsidelocator_t *mnslp; 4541 mddb_set_t *s; 4542 int mn_set = 0; 4543 mddb_mnlb_t *mnlbp; 4544 4545 if (lbp->lb_flags & MDDB_MNSET) { 4546 mnlbp = (mddb_mnlb_t *)lbp; 4547 mn_set = 1; 4548 /* 4549 * Index will be the slot that has the given sideno or 4550 * the first empty slot if no match is found. 4551 * This was pre-checked out in check locator. 4552 */ 4553 mnslp = &mnlbp->lb_mnsidelocators[index][li]; 4554 } else { 4555 slp = &lbp->lb_sidelocators[sideno][li]; 4556 } 4557 4558 /* 4559 * Look for the driver name 4560 */ 4561 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4562 if (lbp->lb_drvnm[i].dn_len == 0) 4563 continue; 4564 if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4565 MD_MAXDRVNM) == 0) 4566 break; 4567 } 4568 4569 /* 4570 * Didn't find one, add a new one 4571 */ 4572 if (i == MDDB_DRVNMCNT) { 4573 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4574 if (lbp->lb_drvnm[i].dn_len == 0) 4575 break; 4576 } 4577 if (i == MDDB_DRVNMCNT) 4578 return (1); 4579 (void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4580 MD_MAXDRVNM); 4581 lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver); 4582 } 4583 4584 /* Fill in the drvnm index */ 4585 if (mn_set) { 4586 mnslp->mnl_drvnm_index = i; 4587 mnslp->mnl_mnum = clp->l_mnum; 4588 mnslp->mnl_sideno = sideno; 4589 } else { 4590 slp->l_drvnm_index = i; 4591 slp->l_mnum = clp->l_mnum; 4592 } 4593 4594 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4595 /* 4596 * This device id could already be associated with this index 4597 * if this is not the first side added to the set. 4598 * If device id is 0, there is no device id for this device. 
4599 */ 4600 if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0) 4601 return (0); 4602 s = (mddb_set_t *)md_set[lbp->lb_setno].s_db; 4603 if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid, 4604 clp->l_minor_name)) { 4605 return (1); 4606 } 4607 } 4608 4609 return (0); 4610 } 4611 4612 /* 4613 * See if there are mediator hosts and try to use the data. 4614 */ 4615 static int 4616 mediate( 4617 mddb_set_t *s 4618 ) 4619 { 4620 mddb_lb_t *lbp = s->s_lbp; 4621 med_data_lst_t *meddlp = NULL; 4622 med_data_lst_t *tmeddlp = NULL; 4623 med_data_t *meddp; 4624 int medok = 0; 4625 int medacc = 0; 4626 uint_t maxcc; 4627 int golden = 0; 4628 int err = 1; 4629 set_t setno = s->s_setno; 4630 4631 /* Do not have a mediator, then the state is stale */ 4632 if (s->s_med.n_cnt == 0) 4633 return (err); 4634 4635 /* Contact the mediator hosts for the data */ 4636 meddlp = get_med_host_data(&s->s_med, s->s_setname, setno); 4637 4638 /* No mediator data, stale */ 4639 if (meddlp == NULL) 4640 return (err); 4641 4642 /* Mark all the mediator data that is not for this set as errored */ 4643 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4644 struct timeval32 tmptime; 4645 meddp = tmeddlp->mdl_med; 4646 4647 /* Count the number of mediators contacted */ 4648 medacc++; 4649 4650 /* Paranoid check */ 4651 if (meddp->med_dat_sn != setno) 4652 meddp->med_dat_fl |= MED_DFL_ERROR; 4653 4654 TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id); 4655 4656 /*CSTYLED*/ 4657 if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=)) 4658 meddp->med_dat_fl |= MED_DFL_ERROR; 4659 } 4660 4661 /* Get the max commitcount */ 4662 maxcc = 0; 4663 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4664 meddp = tmeddlp->mdl_med; 4665 if (meddp->med_dat_fl & MED_DFL_ERROR) 4666 continue; 4667 if (meddp->med_dat_cc > maxcc) 4668 maxcc = meddp->med_dat_cc; 4669 } 4670 4671 /* Now mark the records that don't have the highest cc as errored */ 4672 for (tmeddlp = meddlp; 
tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4673 meddp = tmeddlp->mdl_med; 4674 if (meddp->med_dat_fl & MED_DFL_ERROR) 4675 continue; 4676 if (meddp->med_dat_cc != maxcc) 4677 meddp->med_dat_fl |= MED_DFL_ERROR; 4678 } 4679 4680 /* Now mark the records that don't match the lb commitcnt as errored */ 4681 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4682 meddp = tmeddlp->mdl_med; 4683 if (meddp->med_dat_fl & MED_DFL_ERROR) 4684 continue; 4685 if (meddp->med_dat_cc != lbp->lb_commitcnt) 4686 meddp->med_dat_fl |= MED_DFL_ERROR; 4687 } 4688 4689 /* Is there a "golden" copy and how many valid mediators */ 4690 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4691 meddp = tmeddlp->mdl_med; 4692 if (meddp->med_dat_fl & MED_DFL_ERROR) 4693 continue; 4694 4695 if (meddp->med_dat_fl & MED_DFL_GOLDEN) 4696 golden++; 4697 4698 medok++; 4699 } 4700 4701 /* No survivors, stale */ 4702 if (medok == 0) 4703 goto out; 4704 4705 /* No mediator quorum and no golden copies, stale */ 4706 if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) { 4707 /* Skip odd numbers, no exact 50% */ 4708 if (s->s_med.n_cnt & 1) 4709 goto out; 4710 /* Have 50%, allow an accept */ 4711 if (medacc == (s->s_med.n_cnt / 2)) 4712 md_set_setstatus(setno, MD_SET_ACCOK); 4713 goto out; 4714 } 4715 4716 /* We either have a quorum or a golden copy, or both */ 4717 err = 0; 4718 4719 out: 4720 if (meddlp) { 4721 for (/* void */; meddlp != NULL; meddlp = tmeddlp) { 4722 tmeddlp = meddlp->mdl_nx; 4723 kmem_free(meddlp->mdl_med, sizeof (med_data_t)); 4724 kmem_free(meddlp, sizeof (med_data_lst_t)); 4725 } 4726 } 4727 4728 return (err); 4729 } 4730 4731 /* 4732 * 1. read masterblks and locator blocks for all know database locations 4733 * a. keep track of which have good master blks 4734 * b. 
keep track of which have good locators
 *
 */
/*
 * get_mbs_n_lbs:
 *
 * Walk the replica information list (s->s_rip) and, for every replica that
 * can be opened, read and validate its master blocks, its locator block and
 * (for device ID style replicas) its device ID block and device IDs.
 *
 * For each replica that passes every check:
 *	- MDDB_F_LOCACC is set in ri_flags,
 *	- the locator block is saved in ri_lbp,
 *	- the incore device ID data is saved in ri_did_icp (devid style only),
 *	- its data tag, if any, is read (non-MN sets only),
 *	- unless a replicated import is in progress, every non-deleted
 *	  locator it names is handed to ridev().
 *
 * A replica that fails any check is skipped; its ri_lbp stays NULL and the
 * buffers that were allocated for it are reused for the next replica or
 * freed before returning.
 *
 * Parameters:
 *	s	 - pointer to mddb_set structure
 *	write_lb - set to 1 when the device ID information was stripped from
 *		   a locator block because md_devid_destroy is set, so that
 *		   the caller knows the locator block must be rewritten.
 *
 * Returns:
 *	0		 - normal completion (including "no usable replica")
 *	MDDB_E_TAGDATA	 - more than one unique data tag was found and
 *			   MD_SET_USETAG is not set for the set.
 */
static int
get_mbs_n_lbs(
	mddb_set_t	*s,
	int		*write_lb
)
{
	mddb_lb_t		*lbp = NULL;	/* pointer to locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN set */
	mddb_did_ic_t		*did_icp = NULL; /* ptr to Device ID incore */
	mddb_did_blk_t		*did_blkp = NULL;
	int			did_blkp_sz = 0;
	mddb_did_db_t		*did_dbp;
	mddb_did_info_t		*did_info;
	caddr_t			did_block;
	mddb_ri_t		*rip;
	mddb_dtag_lst_t		*dtlp;
	mddb_locator_t		*lp;
	daddr_t			physblk;
	int			li;
	uint_t			blk;
	md_dev64_t		dev;
	caddr_t			buffer;
	uint_t			lb_blkcnt;
	int			retval = 0;
	int			err = 0;
	int			lb_ok = 0;
	int			lb_total = 0;
	int			lb_tagged = 0;
	int			lb_tags;
	set_t			setno = s->s_setno;
	int			cont_flag, i;
	mddb_did_db_t		*did_dbp1, *did_dbp2;
	int			mn_set = 0;
	mddb_cfg_loc_t		*cl;

	/*
	 * read in master blocks and locator block for all known locators.
	 * lb_blkcnt will be set correctly for MN set later once getmasters
	 * has determined that the set is a MN set.
	 */
	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);

	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		/* Keep only the flags that must survive a re-read */
		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
		    MDDB_F_EMASTER);
		rip->ri_lbp = (mddb_lb_t *)NULL;
		rip->ri_did_icp = (mddb_did_ic_t *)NULL;

		/*
		 * Translated dev is only used in calls to getmasters and
		 * getblks which expect a translated (aka miniroot) dev.
		 */
		dev = md_xlate_targ_2_mini(rip->ri_dev);
		if (dev == NODEV64) {
			/* Set error flag that getmasters would have set */
			/* if getmasters had been allowed to fail */
			rip->ri_flags |= MDDB_F_EMASTER;
		}

		/*
		 * Invalid device id on system (due to failed or
		 * removed device) or invalid devt during upgrade
		 * (due to powered off device) will cause this
		 * replica to be marked in error and not used.
		 */
		if (rip->ri_flags & MDDB_F_EMASTER)
			continue;

		/* get all master blocks, does mddb_devopen() */
		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
		    &rip->ri_flags, &mn_set);

		/* if invalid master block - try next replica */
		if (! rip->ri_mbip)
			continue;

		/*
		 * If lbp alloc'd to wrong size - reset it.
		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
		 */
		if (lbp) {
			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
				lbp = (mddb_lb_t *)NULL;
			}
		}

		if (lbp == (mddb_lb_t *)NULL) {
			/* If a MN set, set lb_blkcnt for MN loc blk size */
			if (mn_set)
				lb_blkcnt = MDDB_MNLBCNT;
			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
			    KM_SLEEP);
		}

		/*
		 * Read in all the sectors for the locator block
		 * NOTE: Need to use getblks, rather than readblklst.
		 *	because it is too early and things are
		 *	NOT set up yet for read*()'s
		 */
		buffer = (caddr_t)lbp;
		for (blk = 0; blk < lb_blkcnt; blk++) {
			physblk = getphysblk(blk, rip->ri_mbip);
			err = getblks(s, buffer, dev, physblk,
			    btodb(MDDB_BSIZE), 0);
			if (err) {
				/* remember why this replica was rejected */
				rip->ri_flags |= err;
				break;
			}
			buffer += MDDB_BSIZE;
		}

		if (err)
			continue;

		/* Verify the locator block */
		if (blk != lb_blkcnt)
			continue;
		if (lbp->lb_magic != MDDB_MAGIC_LB)
			continue;
		if (lbp->lb_blkcnt != lb_blkcnt)
			continue;
		if (mn_set) {
			/* If a MN set, check for MNLB revision in lb. */
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
		} else {
			/* If not a MN set, check for LB revision in lb. */
			if (revchk(MDDB_REV_LB, lbp->lb_revision))
				continue;
		}
		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
			continue;

		/*
		 * With the addition of MultiNode Disksets, we must make sure
		 * to verify that this is the correct set.  A node could
		 * have been out of the config for a while and this disk could
		 * have been moved to a different diskset and we don't want
		 * to accidentally start the wrong set.
		 *
		 * We don't do this check if we're in the middle of
		 * importing a set.
		 */
		if (!(md_get_setstatus(s->s_setno) &
		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
		    (lbp->lb_setno != s->s_setno))
			continue;

		rip->ri_flags |= MDDB_F_LOCACC;

		/*
		 * a commit count of zero means this locator has been deleted
		 */
		if (lbp->lb_commitcnt == 0)
			continue;

		/*
		 * If replica is in the device ID style and md_devid_destroy
		 * flag is set, turn off device id style.  This is only to be
		 * used in a catastrophic failure case.  Examples would be
		 * where the device id of all drives in the system
		 * (especially the mirror'd root drives) had been changed
		 * by firmware upgrade or by a patch to an existing disk
		 * driver.  Another example would be in the case of non-unique
		 * device ids due to a bug.  The device id would be valid on
		 * the system, but would return the wrong dev_t.
		 */
		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
			lbp->lb_didfirstblk = 0;
			lbp->lb_didblkcnt = 0;
			*write_lb = 1;
		}


		/*
		 * If replica is in device ID style, read in device ID
		 * block and verify device ID block information.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {

			/* Read in device ID block */
			if (did_icp == NULL) {
				did_icp = (mddb_did_ic_t *)
				    kmem_zalloc(sizeof (mddb_did_ic_t),
				    KM_SLEEP);
			} else {
				/* Reuse did_icp, but clear out data */
				if (did_icp->did_ic_blkp !=
				    (mddb_did_blk_t *)NULL) {
					kmem_free((caddr_t)did_icp->did_ic_blkp,
					    did_blkp_sz);
					did_blkp = (mddb_did_blk_t *)NULL;
					did_icp->did_ic_blkp =
					    (mddb_did_blk_t *)NULL;
				}
				if (did_icp->did_ic_dbp !=
				    (mddb_did_db_t *)NULL) {
					did_dbp1 = did_icp->did_ic_dbp;
					while (did_dbp1) {
						did_dbp2 = did_dbp1->db_next;
						kmem_free((caddr_t)
						    did_dbp1->db_ptr,
						    dbtob(did_dbp1->db_blkcnt));
						kmem_free((caddr_t)did_dbp1,
						    sizeof (mddb_did_db_t));
						did_dbp1 = did_dbp2;
					}
					did_icp->did_ic_dbp =
					    (mddb_did_db_t *)NULL;
				}
				for (i = 0; i < MDDB_NLB; i++) {
					did_icp->did_ic_devid[i] =
					    (ddi_devid_t)NULL;
				}
			}

			/*
			 * Validate the on-disk device ID block count before
			 * it is used to size the allocation and the read
			 * loop below.  A zero count would make kmem_zalloc()
			 * return NULL and the block header checks would then
			 * dereference it; any other unexpected count would
			 * be rejected after the read anyway.
			 */
			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
				continue;

			/* Can't reuse blkp since size could be different */
			if (did_blkp != (mddb_did_blk_t *)NULL) {
				kmem_free(did_blkp, did_blkp_sz);
			}
			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
			    KM_SLEEP);
			did_icp->did_ic_blkp = did_blkp;
			buffer = (caddr_t)did_blkp;
			for (blk = lbp->lb_didfirstblk;
			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
			    blk++) {
				physblk = getphysblk(blk, rip->ri_mbip);
				err = getblks(s, buffer, dev, physblk,
				    btodb(MDDB_BSIZE), 0);
				if (err) {
					rip->ri_flags |= err;
					break;
				}
				buffer += MDDB_BSIZE;
			}
			if (err)
				continue;

			/* Verify the Device ID block */
			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
				continue;
			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
				continue;
			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
				continue;
			if (crcchk(did_blkp, &did_blkp->blk_checksum,
			    dbtob(lbp->lb_didblkcnt), NULL))
				continue;

			/*
			 * Check if device ID block is out of sync with the
			 * Locator Block by checking if the locator block
			 * commitcnt does not match the device id block
			 * commitcnt.  If an 'out of sync' condition
			 * exists, discard this replica since it has
			 * inconsistent data and can't be used in
			 * determining the best replica.
			 *
			 * An 'out of sync' condition could happen if old
			 * SDS code was running with new devid style replicas
			 * or if a failure occurred between the writing of
			 * the locator block's commitcnt and the device
			 * id block's commitcnt.
			 *
			 * If old SDS code had been running, the upgrade
			 * process should detect this situation and
			 * have removed all of the device id information
			 * via the md_devid_destroy flag in md.conf.
			 */
			if (did_blkp->blk_commitcnt !=
			    lbp->lb_commitcnt) {
				continue;
			}
		}


		/*
		 * If replica is still in device ID style, read in all
		 * of the device IDs, verify the checksum of the device IDs.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			/*
			 * Reset valid bit in device id info block flags. This
			 * flag is stored on disk, but the valid bit is reset
			 * when reading in the replica.  If the corresponding
			 * device id is valid (aka meaning that the system
			 * knows about this device id), the valid bit will
			 * be set at a later time.  The valid bit for this
			 * replica's device ID will be set in this routine.
			 * The valid bits for the rest of the device id's
			 * will be set after the 'best' replica has
			 * been selected in routine load_old_replicas.
			 * Reset updated bit in device id info block flags.
			 * This flag is also stored on disk, reset when read
			 * in and set when the locators and side locators
			 * have been updated to match this valid device
			 * id information.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				if (did_info->info_flags & MDDB_DID_EXISTS)
					did_info->info_flags &=
					    ~(MDDB_DID_VALID |
					    MDDB_DID_UPDATED);
			}

			/*
			 * cont_flag is set when a devid cannot be read or
			 * fails its checksum, so that the replica can be
			 * skipped once the inner loop has been left.
			 */
			cont_flag = 0;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				did_block = (caddr_t)NULL;
				if (did_info->info_flags & MDDB_DID_EXISTS) {
					/*
					 * Check if block has
					 * already been read in
					 */
					did_dbp = did_icp->did_ic_dbp;
					while (did_dbp != 0) {
						if (did_dbp->db_firstblk ==
						    did_info->info_firstblk)
							break;
						else
							did_dbp =
							    did_dbp->db_next;
					}
					/* if block not found, read it in */
					if (did_dbp == NULL) {
						did_block = (caddr_t)
						    (kmem_zalloc(dbtob(
						    did_info->info_blkcnt),
						    KM_SLEEP));
						buffer = (caddr_t)did_block;
						for (blk =
						    did_info->info_firstblk;
						    blk < (did_info->
						    info_firstblk +
						    did_info->info_blkcnt);
						    blk++) {
							physblk =
							    getphysblk(blk,
							    rip->ri_mbip);
							err = getblks(s,
							    buffer, dev,
							    physblk, btodb(
							    MDDB_BSIZE), 0);
							if (err) {
								rip->ri_flags |=
								    err;
								break;
							}
							buffer += MDDB_BSIZE;
						}
						if (err) {
							kmem_free(did_block,
							    dbtob(did_info->
							    info_blkcnt));
							did_block =
							    (caddr_t)NULL;
							cont_flag = 1;
							break;
						}

						/*
						 * Block read in -
						 * alloc Disk Block area
						 */
						did_dbp = (mddb_did_db_t *)
						    kmem_zalloc(
						    sizeof (mddb_did_db_t),
						    KM_SLEEP);
						did_dbp->db_ptr = did_block;
						did_dbp->db_firstblk =
						    did_info->info_firstblk;
						did_dbp->db_blkcnt =
						    did_info->info_blkcnt;

						/* Add to front of dbp list */
						did_dbp->db_next =
						    did_icp->did_ic_dbp;
						did_icp->did_ic_dbp = did_dbp;
					}
					/* Check validity of devid in block */
					if (crcchk(((char *)did_dbp->db_ptr +
					    did_info->info_offset),
					    &did_info->info_checksum,
					    did_info->info_length, NULL)) {
						cont_flag = 1;
						break;
					}

					/* Block now pointed to by did_dbp */
					did_icp->did_ic_devid[li] =
					    (ddi_devid_t)((char *)
					    did_dbp->db_ptr +
					    did_info->info_offset);
				}
			}
			if (cont_flag)
				continue;
		}

		/*
		 * All blocks containing devids are now in core.
		 */

		/*
		 * If we're doing a replicated import (also known as
		 * remote copy import), the device id in the locator
		 * block is incorrect and we need to fix it up here
		 * along with the l_dev otherwise we run into lots of
		 * trouble later on.
		 *
		 * The fix-up matches on device IDs, so it is only done for
		 * a device ID style locator block; otherwise did_blkp and
		 * did_icp are either NULL or left over from a previously
		 * examined replica.
		 */
		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT) &&
		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
			mddb_ri_t	*trip;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];

				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (did_icp->did_ic_devid[li] == NULL)
					continue;

				for (trip = s->s_rip; trip != NULL;
				    trip = trip->ri_next) {
					if (trip->ri_old_devid == NULL)
						continue;
					if (ddi_devid_compare(
					    trip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0) {
						continue;
					}

					/* update l_dev and side mnum */
					lp->l_dev = md_cmpldev(trip->ri_dev);
					lbp->lb_sidelocators[0][li].l_mnum =
					    md_getminor(trip->ri_dev);
				}
			}
		}

		/*
		 * If there is a valid devid, verify that this locator
		 * block has information about itself by checking the
		 * device ID, minor_name and block
		 * number from this replica's incore data structure
		 * against the locator block information that has just
		 * been read in from disk.
		 *
		 * If not a valid devid, verify that this locator block
		 * has information about itself by checking the minor
		 * number, block number and driver name from this
		 * replica's incore data structure against the locator
		 * block information that has just been read in from disk.
		 *
		 * In every variant below, li == lbp->lb_loccnt after the
		 * loop means that no locator matched this replica.
		 */
		if ((rip->ri_devid != NULL) &&
		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.  Check against devid,
			 * slice part of minor number, and block number.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (((md_get_setstatus(setno) &
				    MD_SET_REPLICATED_IMPORT)) &&
				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
					if (ddi_devid_compare(rip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				} else {
					if (ddi_devid_compare(rip->ri_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				}

				if (strcmp(rip->ri_minor_name,
				    did_info->info_minor_name) != 0)
					continue;

				if (lp->l_blkno == rip->ri_blkno)
					break;
			}
		} else {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.
			 */
			if (!mn_set) {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_sidelocator_t	*slp;

					lp = &lbp->lb_locators[li];
					slp = &lbp->
					    lb_sidelocators[s->s_sideno][li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (slp->l_mnum != md_getminor(
					    rip->ri_dev))
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;
					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			} else {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_mnsidelocator_t	*mnslp;
					mddb_mnlb_t		*mnlbp;

					/*
					 * Check all possible locators looking
					 * for a match to the currently read-in
					 * locator, must match on:
					 *	- blkno
					 *	- side locator for this
					 *	  node's side
					 *	- side locator minor number
					 *	- side locator driver name
					 */

					/*
					 * Looking at sidelocs:
					 * cast lbp -> mnlbp
					 */
					mnlbp = (mddb_mnlb_t *)lbp;
					lp = &mnlbp->lb_locators[li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;

					for (i = 0; i < MD_MNMAXSIDES; i++) {
						mnslp = &mnlbp->
						    lb_mnsidelocators[i][li];
						if (mnslp->mnl_sideno ==
						    s->s_sideno) {
							break;
						}
					}
					/* No matching side found */
					if (i == MD_MNMAXSIDES)
						continue;
					if (mnslp->mnl_mnum !=
					    md_getminor(rip->ri_dev))
						continue;
					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			}
		}

		/*
		 * Didn't find ourself in this locator block it means
		 * the locator block is a stale transplant. Probably from
		 * a user doing a dd.
		 */
		if (li == lbp->lb_loccnt)
			continue;

		/*
		 * Keep track of the number of accessed and valid
		 * locator blocks.
		 */
		lb_ok++;

		/*
		 * Read the tag in, skips invalid or blank tags.
		 * Only valid tags allocate storage
		 * Data tags are not used in MN disksets.
		 */
		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
			/*
			 * Keep track of the number of tagged
			 * locator blocks.
			 */
			lb_tagged++;

			/* Keep a list of unique tags. */
			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
		}

		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			/*
			 * go through locator block and add any other
			 * locations of the data base.
			 * For the replicated import case, this was done earlier
			 * and we really don't need or want to do so again
			 */
			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				/*
				 * First call asks only for the size of the
				 * devid; the second call, made once space
				 * has been allocated, fills it in.
				 */
				cl->l_devid_flags = MDDB_DEVID_GETSZ;
				cl->l_devid = (uint64_t)0;
				cl->l_devid_sz = 0;
				cl->l_old_devid = (uint64_t)0;
				cl->l_old_devid_sz = 0;
				cl->l_minor_name[0] = '\0';
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
					if ((cl->l_devid = (uintptr_t)kmem_alloc
					    (cl->l_devid_sz, KM_SLEEP))
					    == NULL) {
						continue;
					} else {
						cl->l_devid_flags =
						    MDDB_DEVID_SPACE;
					}
				}
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);

				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
					kmem_free((caddr_t)(uintptr_t)
					    cl->l_devid, cl->l_devid_sz);
			}
			kmem_free(cl, sizeof (mddb_cfg_loc_t));
		}

		/*
		 * Save LB for later.  Ownership of lbp (and of did_icp for
		 * a devid style replica) passes to rip, so the local
		 * pointers are cleared and fresh buffers will be allocated
		 * for the next replica.
		 */
		rip->ri_lbp = lbp;
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			rip->ri_did_icp = did_icp;
			did_icp = (mddb_did_ic_t *)NULL;
			did_blkp = (mddb_did_blk_t *)NULL;
		} else
			rip->ri_did_icp = NULL;
		lbp = (mddb_lb_t *)NULL;
	}

	/* Free whatever was left over from the last rejected replica */
	if (lbp != (mddb_lb_t *)NULL)
		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));

	if (did_icp != (mddb_did_ic_t *)NULL) {
		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
			did_blkp = (mddb_did_blk_t *)NULL;
		}
		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
			did_dbp1 = did_icp->did_ic_dbp;
			while (did_dbp1) {
				did_dbp2 = did_dbp1->db_next;
				kmem_free((caddr_t)did_dbp1->db_ptr,
				    dbtob(did_dbp1->db_blkcnt));
				kmem_free((caddr_t)did_dbp1,
				    sizeof (mddb_did_db_t));
				did_dbp1 = did_dbp2;
			}
		}
		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
	}

	if (did_blkp != (mddb_did_blk_t *)NULL) {
		kmem_free((caddr_t)did_blkp, did_blkp_sz);
	}

	/* No locator blocks were ok */
	if (lb_ok == 0)
		goto out;

	/* No tagged data was found - will be 0 for MN diskset */
	if (lb_tagged == 0)
		goto out;

	/* Find the highest non-deleted replica count */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		int		lb_tot = 0;

		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
			continue;

		if (rip->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
			lp = &rip->ri_lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			lb_tot++;
		}

		if (lb_tot > lb_total)
			lb_total = lb_tot;
	}

	/* Count the number of unique tags */
	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
		lb_tags++;

	/* Should have at least one tag at this point */
	ASSERT(lb_tags > 0);


	/*
	 * If the number of tagged locators is not the same as the number of
	 * OK locators OR more than one tag exists, then make sure the
	 * selected tag will be written out later.
	 */
	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
		md_set_setstatus(setno, MD_SET_TAGDATA);

	/* Only a single tag, take the tagged data */
	if (lb_tags == 1) {
		dt_setup(s, &s->s_dtlp->dtl_dt);
		md_set_setstatus(setno, MD_SET_USETAG);
		goto out;
	}

	/* Multiple tags, not selecting a tag, tag mode is on */
	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
		retval = MDDB_E_TAGDATA;

out:

	return (retval);
}

/*
 * 1. Select a locator.
 * 2. check if enough locators now have current copies
 * 3.
read in database from one of latest 5496 * 4. if known to have latest make all database the same 5497 * 5. if configuration has changed rewrite locators 5498 * 5499 * Parameters: 5500 * s - pointer to mddb_set structure 5501 * flag - used in MN disksets to tell if this node is being joined to 5502 * a diskset that is in the STALE state. If the flag is 5503 * MDDB_MN_STALE, then this node should be marked in the STALE 5504 * state even if > 50% mddbs are available. (The diskset can 5505 * only change from STALE->OK if all nodes withdraw from the 5506 * MN diskset and then rejoin). 5507 */ 5508 static int 5509 load_old_replicas( 5510 mddb_set_t *s, 5511 int flag 5512 ) 5513 { 5514 mddb_lb_t *lbp = NULL; 5515 mddb_mnlb_t *mnlbp = NULL; 5516 mddb_ri_t *rip; 5517 mddb_locator_t *lp; 5518 mddb_db_t *dbp; 5519 mddb_de_ic_t *dep; 5520 int li; 5521 int alc; 5522 int lc; 5523 int tlc; 5524 int retval = 0; 5525 caddr_t p; 5526 size_t maxrecsize; 5527 set_t setno = s->s_setno; 5528 mddb_did_db_t *did_dbp1; 5529 mddb_did_info_t *did_info; 5530 mddb_did_ic_t *did_icp = NULL; 5531 md_dev64_t *newdev; 5532 mddb_sidelocator_t *slp = 0; 5533 mddb_mnsidelocator_t *mnslp = 0; 5534 uchar_t i; 5535 char *name; 5536 ddi_devid_t ret_devid; 5537 md_dev64_t dev; 5538 uint_t len, sz; 5539 char *minor_name; 5540 int write_lb = 0; 5541 int rval; 5542 int stale_rtn = 0; 5543 5544 /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */ 5545 if (retval = get_mbs_n_lbs(s, &write_lb)) 5546 goto errout; 5547 5548 if ((lbp = s->s_lbp = selectlocator(s)) == NULL) { 5549 retval = MDDB_E_NOLOCBLK; 5550 goto errout; 5551 } 5552 5553 /* If a multi-node set, then set md_set.s_status flag */ 5554 if (lbp->lb_flags & MDDB_MNSET) { 5555 md_set_setstatus(setno, MD_SET_MNSET); 5556 /* 5557 * If data tag area had been allocated before set type was 5558 * known - free it now. 
5559 */ 5560 if (md_set[setno].s_dtp) { 5561 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 5562 md_set[setno].s_dtp = NULL; 5563 } 5564 } 5565 5566 /* 5567 * If the replica is in devid format, setup the devid incore ptr. 5568 */ 5569 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5570 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5571 if (rip->ri_lbp == s->s_lbp) { 5572 did_icp = s->s_did_icp = rip->ri_did_icp; 5573 break; 5574 } 5575 } 5576 /* 5577 * If no devid incore info found - something has gone 5578 * wrong so errout. 5579 */ 5580 if (rip == NULL) { 5581 retval = MDDB_E_NODEVID; 5582 goto errout; 5583 } 5584 5585 /* 5586 * Add all blocks containing devids to free list. 5587 * Then remove addresses that actually contain devids. 5588 */ 5589 did_dbp1 = did_icp->did_ic_dbp; 5590 while (did_dbp1) { 5591 if (mddb_devid_free_add(s, did_dbp1->db_firstblk, 5592 0, dbtob(did_dbp1->db_blkcnt))) { 5593 retval = MDDB_E_NOSPACE; 5594 goto errout; 5595 } 5596 5597 did_dbp1 = did_dbp1->db_next; 5598 } 5599 for (li = 0; li < lbp->lb_loccnt; li++) { 5600 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5601 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5602 continue; 5603 5604 if (mddb_devid_free_delete(s, did_info->info_firstblk, 5605 did_info->info_offset, did_info->info_length)) { 5606 /* unable to find disk block */ 5607 retval = MDDB_E_NODEVID; 5608 goto errout; 5609 } 5610 } 5611 } 5612 5613 /* 5614 * create mddb_mbaray, count all locators and active locators. 5615 */ 5616 alc = 0; 5617 lc = 0; 5618 for (li = 0; li < lbp->lb_loccnt; li++) { 5619 ddi_devid_t li_devid; 5620 5621 lp = &lbp->lb_locators[li]; 5622 5623 if (lp->l_flags & MDDB_F_DELETED) 5624 continue; 5625 5626 /* Count non-deleted replicas */ 5627 lc++; 5628 5629 /* 5630 * Use the devid of this locator to compare with the rip 5631 * list. 
The scenario to watch out for here is that this 5632 * locator could be on a disk that is dead and there could 5633 * be a valid entry in the rip list for a different disk 5634 * that has been moved to the dead disks dev_t. We don't 5635 * want to match with the moved disk. 5636 */ 5637 li_devid = NULL; 5638 (void) mddb_devid_get(s, li, &li_devid, &minor_name); 5639 5640 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5641 if (match_mddb(rip, li_devid, minor_name, 5642 md_expldev(lp->l_dev), lp->l_blkno)) { 5643 break; 5644 } 5645 } 5646 if (rip == NULL) { 5647 /* 5648 * If rip not found, then mark error in master block 5649 * so that no writes are later attempted to this 5650 * replica. rip may not be setup if ridev 5651 * failed due to un-found driver name. 5652 */ 5653 lp->l_flags |= MDDB_F_EMASTER; 5654 continue; 5655 } 5656 5657 s->s_mbiarray[li] = rip->ri_mbip; 5658 5659 lp->l_flags &= MDDB_F_ACTIVE; 5660 lp->l_flags |= (int)rip->ri_flags; 5661 5662 if (rip->ri_transplant) 5663 lp->l_flags &= ~MDDB_F_ACTIVE; 5664 5665 if (lp->l_flags & MDDB_F_LOCACC) 5666 alc++; 5667 } 5668 5669 /* Save on a divide - calculate 50% + 1 up front */ 5670 tlc = ((lc + 1) / 2); 5671 5672 if (alc > tlc) { /* alc > tlc - OK */ 5673 md_clr_setstatus(setno, MD_SET_STALE); 5674 } else if (alc < tlc) { /* alc < tlc - stale */ 5675 md_set_setstatus(setno, MD_SET_STALE); 5676 } else if (lc & 1) { /* alc == tlc && odd - OK */ 5677 md_clr_setstatus(setno, MD_SET_STALE); 5678 } else { /* alc == tlc && even - ? 
*/ 5679 /* Can do an accept, and are */ 5680 if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) { 5681 md_clr_setstatus(setno, MD_SET_STALE); 5682 } else { /* possibly has a mediator */ 5683 if (mediate(s)) { 5684 md_set_setstatus(setno, MD_SET_STALE); 5685 } else { 5686 md_clr_setstatus(setno, MD_SET_STALE); 5687 } 5688 } 5689 5690 /* 5691 * The mirrored_root_flag allows the sysadmin to decide to 5692 * start the local set in a read/write (non-stale) mode 5693 * when there are only 50% available mddbs on the system and 5694 * when the root file system is on a mirror. This is useful 5695 * in a 2 disk system where 1 disk failure would cause an mddb 5696 * quorum failure and subsequent boot failures since the root 5697 * filesystem would be in a read-only state. 5698 */ 5699 if (mirrored_root_flag == 1 && setno == 0 && 5700 svm_bootpath[0] != 0) { 5701 md_clr_setstatus(setno, MD_SET_STALE); 5702 } else { 5703 if (md_get_setstatus(setno) & MD_SET_STALE) { 5704 /* Allow half mode - CAREFUL! */ 5705 if (mddb_allow_half) 5706 md_clr_setstatus(setno, MD_SET_STALE); 5707 } 5708 } 5709 5710 /* 5711 * In a MN diskset, 5712 * - if 50% mddbs are unavailable and this 5713 * has been marked STALE above 5714 * - master node isn't in the STALE state 5715 * - this node isn't the master node (this node 5716 * isn't the first node to join the set) 5717 * then clear the STALE state and set TOOFEW. 5718 * 5719 * If this node is the master node and set was marked STALE, 5720 * then the set stays STALE. 5721 * 5722 * If this node is not the master and this node's state is 5723 * STALE and the master node is not marked STALE, 5724 * then master node must be in the TOOFEW state or the 5725 * master is panic'ing. A MN diskset can only be placed into 5726 * the STALE state by having the first node join the set 5727 * with <= 50% mddbs. 
There's no way for a MN diskset to 5728 * transition between STALE and not-STALE states unless all 5729 * nodes are withdrawn from the diskset or all nodes in the 5730 * diskset are rebooted at the same time. 5731 * 5732 * So, mark this node's state as TOOFEW instead of STALE. 5733 */ 5734 if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE)) 5735 == (MD_SET_MNSET | MD_SET_STALE)) && 5736 ((flag & MDDB_MN_STALE) == 0) && 5737 (!(md_set[setno].s_am_i_master))) { 5738 md_clr_setstatus(setno, MD_SET_STALE); 5739 md_set_setstatus(setno, MD_SET_TOOFEW); 5740 } 5741 } 5742 5743 /* 5744 * If a MN set is marked STALE on the other nodes, 5745 * mark it stale here. Override all other considerations 5746 * such as a mediator or > 50% mddbs available. 5747 */ 5748 if (md_get_setstatus(setno) & MD_SET_MNSET) { 5749 if (flag & MDDB_MN_STALE) 5750 md_set_setstatus(setno, MD_SET_STALE); 5751 } 5752 5753 /* 5754 * read a good copy of the locator names 5755 * if an error occurs reading what is suppose 5756 * to be a good copy continue looking for another 5757 * good copy 5758 */ 5759 s->s_lnp = NULL; 5760 for (li = 0; li < lbp->lb_loccnt; li++) { 5761 lp = &lbp->lb_locators[li]; 5762 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5763 (lp->l_flags & MDDB_F_EMASTER)) 5764 continue; 5765 5766 /* Find rip entry for this locator if one exists */ 5767 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5768 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5769 lp->l_blkno)) 5770 break; 5771 } 5772 5773 if (rip == NULL) { 5774 continue; 5775 } 5776 5777 /* 5778 * Use the rip commitcnt since the commitcnt in lbp could 5779 * been cleared by selectlocator. Looking for a replica with 5780 * the same commitcnt as the 'golden' copy in order to 5781 * get the same data. 
5782 */ 5783 if (rip->ri_commitcnt != lbp->lb_commitcnt) { 5784 continue; 5785 } 5786 5787 /* 5788 * Now have a copy of the database that is equivalent 5789 * to the chosen locator block with respect to 5790 * inittime, identifier and commitcnt. Trying the 5791 * equivalent databases in the order that they were 5792 * written will provide the most up to date data. 5793 */ 5794 lp->l_flags |= readlocnames(s, li); 5795 if (s->s_lnp) 5796 break; 5797 } 5798 5799 if (s->s_lnp == NULL) { 5800 retval = MDDB_E_NOLOCNMS; 5801 goto errout; 5802 } 5803 5804 /* 5805 * read a good copy of the data base 5806 * if an error occurs reading what is suppose 5807 * to be a good copy continue looking for another 5808 * good copy 5809 */ 5810 5811 s->s_dbp = NULL; 5812 for (li = 0; li < lbp->lb_loccnt; li++) { 5813 lp = &lbp->lb_locators[li]; 5814 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5815 (lp->l_flags & MDDB_F_EMASTER)) 5816 continue; 5817 5818 /* Find rip entry for this locator if one exists */ 5819 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5820 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5821 lp->l_blkno)) 5822 break; 5823 } 5824 5825 if (rip == NULL) { 5826 continue; 5827 } 5828 5829 /* 5830 * Use the rip commitcnt since the commitcnt in lbp could 5831 * been cleared by selectlocator. Looking for a replica with 5832 * the same commitcnt as the 'golden' copy in order to 5833 * get the same data. 5834 */ 5835 if (rip->ri_commitcnt != lbp->lb_commitcnt) { 5836 continue; 5837 } 5838 5839 /* 5840 * Now have a copy of the database that is equivalent 5841 * to the chosen locator block with respect to 5842 * inittime, identifier and commitcnt. Trying the 5843 * equivalent databases in the order that they were 5844 * written will provide the most up to date data. 
5845 */ 5846 lp->l_flags |= readcopy(s, li); 5847 5848 if (s->s_dbp) 5849 break; 5850 } 5851 5852 if (s->s_dbp == NULL) { 5853 retval = MDDB_E_NODIRBLK; 5854 goto errout; 5855 } 5856 5857 lp->l_flags |= MDDB_F_MASTER; 5858 lp->l_flags |= MDDB_F_UP2DATE; 5859 5860 /* 5861 * go through and find largest record; 5862 * Also fixup the user data area's 5863 */ 5864 maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size); 5865 5866 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) 5867 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 5868 if (dep->de_flags & MDDB_F_OPT) 5869 getoptrecord(s, dep); 5870 else { 5871 allocuserdata(dep); 5872 maxrecsize = MAX(dep->de_recsize, maxrecsize); 5873 } 5874 5875 if (maxrecsize > s->s_databuffer_size) { 5876 p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); 5877 if (s->s_databuffer_size) 5878 kmem_free(s->s_databuffer, s->s_databuffer_size); 5879 s->s_databuffer = p; 5880 s->s_databuffer_size = maxrecsize; 5881 } 5882 5883 /* If we can clear the tag data record, do it now. */ 5884 /* Data tags not supported on MN sets */ 5885 if ((md_get_setstatus(setno) & MD_SET_CLRTAG) && 5886 (!(md_get_setstatus(setno) & MD_SET_MNSET))) 5887 dt_setup(s, NULL); 5888 5889 /* This will return non-zero if STALE or TOOFEW */ 5890 /* This will write out chosen replica image to all replicas */ 5891 stale_rtn = selectreplicas(s, MDDB_SCANALL); 5892 5893 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5894 ddi_devid_t devidptr; 5895 5896 /* 5897 * ignore the return value from selectreplicas because we 5898 * may have a STALE or TOOFEW set in the case of a partial 5899 * replicated diskset. We will fix that up later. 
5900 */ 5901 5902 lbp = s->s_lbp; 5903 for (li = 0; li < lbp->lb_loccnt; li++) { 5904 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5905 5906 if (did_info->info_flags & MDDB_DID_EXISTS) { 5907 devidptr = s->s_did_icp->did_ic_devid[li]; 5908 lp = &lbp->lb_locators[li]; 5909 for (rip = s->s_rip; rip != NULL; 5910 rip = rip->ri_next) { 5911 if (rip->ri_old_devid == 0) 5912 continue; 5913 if (ddi_devid_compare(rip->ri_old_devid, 5914 devidptr) != 0) { 5915 continue; 5916 } 5917 if (update_locatorblock(s, 5918 md_expldev(lp->l_dev), 5919 rip->ri_devid, rip->ri_old_devid)) { 5920 goto errout; 5921 } 5922 } 5923 } 5924 } 5925 } else { 5926 if (stale_rtn) 5927 goto errout; 5928 } 5929 5930 /* 5931 * If the replica is in device id style - validate the device id's, 5932 * if present, in the locator block devid area. 5933 */ 5934 newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP); 5935 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5936 for (li = 0; li < lbp->lb_loccnt; li++) { 5937 newdev[li] = 0; 5938 lp = &lbp->lb_locators[li]; 5939 if (lp->l_flags & MDDB_F_DELETED) 5940 continue; 5941 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5942 dev = md_expldev(lp->l_dev); 5943 if (did_info->info_flags & MDDB_DID_EXISTS) { 5944 /* Validate device id on current system */ 5945 newdev[li] = dev; 5946 if (mddb_devid_validate( 5947 did_icp->did_ic_devid[li], 5948 &(newdev[li]), 5949 did_info->info_minor_name) == 0) { 5950 /* Set valid flag */ 5951 did_info->info_flags |= MDDB_DID_VALID; 5952 } else { 5953 lp->l_flags |= MDDB_F_EMASTER; 5954 } 5955 } else if (!(MD_UPGRADE)) { 5956 /* 5957 * If a device doesn't have a device id, 5958 * check if there is now a device ID 5959 * associated with device. If one exists, 5960 * add it to the locator block devid area. 5961 * If there's not enough space to add it, 5962 * print a warning. 5963 * Don't do this during upgrade. 
5964 */ 5965 dev_t ddi_dev = md_dev64_to_dev(dev); 5966 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == 5967 DDI_SUCCESS) { 5968 if (ddi_lyr_get_minor_name(ddi_dev, 5969 S_IFBLK, &minor_name) 5970 == DDI_SUCCESS) { 5971 if (mddb_devid_add(s, li, 5972 ret_devid, minor_name)) { 5973 cmn_err(CE_WARN, 5974 "Not enough space" 5975 " in metadevice" 5976 " state" 5977 " database\n"); 5978 cmn_err(CE_WARN, 5979 "to add relocation" 5980 " information for" 5981 " device:\n"); 5982 cmn_err(CE_WARN, 5983 " major = %d, " 5984 " minor = %d\n", 5985 getmajor(ddi_dev), 5986 getminor(ddi_dev)); 5987 } else { 5988 write_lb = 1; 5989 } 5990 kmem_free(minor_name, 5991 strlen(minor_name) + 1); 5992 } 5993 ddi_devid_free(ret_devid); 5994 } 5995 } 5996 } 5997 5998 /* 5999 * If a device has a valid device id and if the dev_t 6000 * associated with the device id has changed, update the 6001 * driver name, minor num and dev_t in the local and side 6002 * locators to match the dev_t that the system currently 6003 * associates with the device id. 6004 * 6005 * Don't do this during upgrade. 
6006 */ 6007 if (!(MD_UPGRADE)) { 6008 for (li = 0; li < lbp->lb_loccnt; li++) { 6009 lp = &lbp->lb_locators[li]; 6010 if (lp->l_flags & MDDB_F_DELETED) 6011 continue; 6012 did_info = &(did_icp->did_ic_blkp->blk_info 6013 [li]); 6014 if ((did_info->info_flags & MDDB_DID_VALID) && 6015 !(did_info->info_flags & 6016 MDDB_DID_UPDATED)) { 6017 if (lbp->lb_flags & MDDB_MNSET) { 6018 int j; 6019 int index = -1; 6020 mnlbp = (mddb_mnlb_t *)lbp; 6021 for (j = 0; j < MD_MNMAXSIDES; 6022 j++) { 6023 mnslp = &mnlbp-> 6024 lb_mnsidelocators[j] 6025 [li]; 6026 if (mnslp->mnl_sideno == 6027 s->s_sideno) 6028 break; 6029 if (mnslp->mnl_sideno == 6030 0) 6031 index = j; 6032 } 6033 if (j == MD_MNMAXSIDES) { 6034 /* 6035 * No match found; take 6036 * empty 6037 */ 6038 mnslp = &mnlbp-> 6039 lb_mnsidelocators 6040 [index][li]; 6041 write_lb = 1; 6042 mnslp->mnl_mnum = 6043 md_getminor(newdev 6044 [li]); 6045 } else if (mnslp->mnl_mnum != 6046 md_getminor(newdev[li])) { 6047 write_lb = 1; 6048 mnslp->mnl_mnum = 6049 md_getminor(newdev 6050 [li]); 6051 } 6052 } else { 6053 slp = &lbp-> 6054 lb_sidelocators[s->s_sideno] 6055 [li]; 6056 if (slp->l_mnum != 6057 md_getminor(newdev[li])) { 6058 write_lb = 1; 6059 slp->l_mnum = 6060 md_getminor(newdev 6061 [li]); 6062 } 6063 } 6064 name = ddi_major_to_name(md_getmajor( 6065 newdev[li])); 6066 if (lbp->lb_flags & MDDB_MNSET) 6067 i = mnslp->mnl_drvnm_index; 6068 else 6069 i = slp->l_drvnm_index; 6070 if (strncmp(lbp->lb_drvnm[i].dn_data, 6071 name, lbp->lb_drvnm[i].dn_len) != 6072 0) { 6073 /* Driver name has changed */ 6074 len = strlen(name); 6075 /* Look for the driver name */ 6076 for (i = 0; i < MDDB_DRVNMCNT; 6077 i++) { 6078 if (lbp->lb_drvnm[i]. 6079 dn_len != len) 6080 continue; 6081 if (strncmp(lbp-> 6082 lb_drvnm[i].dn_data, 6083 name, len) == 0) 6084 break; 6085 } 6086 /* Didn't find one, add it */ 6087 if (i == MDDB_DRVNMCNT) { 6088 for (i = 0; i < 6089 MDDB_DRVNMCNT; 6090 i++) { 6091 if (lbp-> 6092 lb_drvnm[i]. 
6093 dn_len == 0) 6094 break; 6095 } 6096 if (i == 6097 MDDB_DRVNMCNT) { 6098 cmn_err(CE_WARN, 6099 "Unable to " 6100 " update " 6101 "driver " 6102 " name for " 6103 "dev: " 6104 "major = %d" 6105 ", minor = " 6106 "%d\n", 6107 md_getmajor( 6108 newdev[li]), 6109 md_getminor( 6110 newdev 6111 [li])); 6112 continue; 6113 } 6114 (void) strncpy(lbp-> 6115 lb_drvnm[i].dn_data, 6116 name, MD_MAXDRVNM); 6117 lbp->lb_drvnm[i]. 6118 dn_len = (uchar_t) 6119 strlen(name); 6120 } 6121 /* Fill in the drvnm index */ 6122 if (lbp->lb_flags & 6123 MDDB_MNSET) 6124 mnslp->mnl_drvnm_index = 6125 i; 6126 else 6127 slp->l_drvnm_index = i; 6128 write_lb = 1; 6129 } 6130 did_info->info_flags |= 6131 MDDB_DID_UPDATED; 6132 } 6133 } 6134 } 6135 } 6136 kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB); 6137 6138 /* 6139 * If locator block has been changed by get_mbs_n_lbs, 6140 * by addition of new device id, by updated minor name or 6141 * by updated driver name - write out locator block. 6142 */ 6143 if (write_lb) { 6144 rval = push_lb(s); 6145 (void) upd_med(s, "load_old_replicas(0)"); 6146 if (rval) 6147 goto errout; 6148 } 6149 6150 /* 6151 * If the tag was moved, allocated, or a BADTAG was seen for some other 6152 * reason, then make sure tags are written to all the replicas. 6153 * Data tags not supported on MN sets. 6154 */ 6155 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) { 6156 if (! (lc = dt_alloc_if_needed(s))) { 6157 for (li = 0; li < lbp->lb_loccnt; li++) { 6158 lp = &lbp->lb_locators[li]; 6159 6160 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 6161 (lp->l_flags & MDDB_F_EMASTER)) 6162 continue; 6163 6164 if (lp->l_flags & MDDB_F_BADTAG) { 6165 lc = 1; 6166 break; 6167 } 6168 } 6169 } 6170 6171 if (lc) { 6172 md_set_setstatus(setno, MD_SET_TAGDATA); 6173 md_clr_setstatus(setno, MD_SET_BADTAG); 6174 (void) selectreplicas(s, MDDB_SCANALL); 6175 } 6176 } 6177 6178 errout: 6179 6180 /* Free extraneous rip components. 
*/ 6181 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 6182 /* Get rid of lbp's and dtp's */ 6183 6184 if (rip->ri_lbp != lbp) { 6185 if (rip->ri_dtp != (mddb_dt_t *)NULL) { 6186 kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES); 6187 rip->ri_dtp = (mddb_dt_t *)NULL; 6188 } 6189 6190 if (rip->ri_devid != (ddi_devid_t)NULL) { 6191 sz = (int)ddi_devid_sizeof(rip->ri_devid); 6192 kmem_free((caddr_t)rip->ri_devid, sz); 6193 rip->ri_devid = (ddi_devid_t)NULL; 6194 } 6195 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 6196 sz = (int)ddi_devid_sizeof(rip->ri_old_devid); 6197 kmem_free((caddr_t)rip->ri_old_devid, sz); 6198 rip->ri_old_devid = (ddi_devid_t)NULL; 6199 } 6200 6201 if (rip->ri_lbp != (mddb_lb_t *)NULL) { 6202 mddb_devid_icp_free(&rip->ri_did_icp, 6203 rip->ri_lbp); 6204 6205 kmem_free((caddr_t)rip->ri_lbp, 6206 dbtob(rip->ri_lbp->lb_blkcnt)); 6207 rip->ri_lbp = (mddb_lb_t *)NULL; 6208 } 6209 } 6210 6211 if (lbp != NULL) { 6212 for (li = 0; li < lbp->lb_loccnt; li++) { 6213 lp = &lbp->lb_locators[li]; 6214 if (lp->l_flags & MDDB_F_DELETED) 6215 continue; 6216 if (rip->ri_dev == md_expldev(lp->l_dev) && 6217 rip->ri_blkno == lp->l_blkno) 6218 break; 6219 } 6220 if (li < lbp->lb_loccnt) 6221 continue; 6222 } 6223 6224 /* 6225 * Get rid of mbp's: 6226 * if lbp, those out of lb_loccnt bounds 6227 * if !lbp, all of them. 6228 */ 6229 if (rip->ri_mbip) { 6230 md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev); 6231 if (dev64 != NODEV64) 6232 mddb_devclose(dev64); 6233 6234 free_mbipp(&rip->ri_mbip); 6235 } 6236 /* 6237 * Turn off MDDB_F_EMASTER flag in a diskset since diskset 6238 * code always ends up calling ridev for all replicas 6239 * before calling load_old_replicas. ridev will reset 6240 * MDDB_F_EMASTER flag if flag was due to unresolved devid. 6241 */ 6242 if (setno != MD_LOCAL_SET) 6243 rip->ri_flags &= ~MDDB_F_EMASTER; 6244 } 6245 return (retval); 6246 } 6247 6248 /* 6249 * Given the devt from the md.conf info, get the devid for the device. 
6250 */ 6251 static void 6252 lookup_db_devid(mddb_cfg_loc_t *cl) 6253 { 6254 dev_t ldev; 6255 ddi_devid_t devid; 6256 char *minor; 6257 6258 if (ddi_name_to_major(cl->l_driver) == (major_t)-1) { 6259 cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver); 6260 return; 6261 } 6262 6263 ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum); 6264 if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) { 6265 cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x", 6266 cl->l_driver, cl->l_mnum); 6267 return; 6268 } 6269 6270 if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) { 6271 cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x", 6272 cl->l_mnum); 6273 return; 6274 } 6275 6276 cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ; 6277 cl->l_devid_sz = (int)ddi_devid_sizeof(devid); 6278 cl->l_devid = (uint64_t)(uintptr_t)devid; 6279 (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX); 6280 6281 kmem_free(minor, strlen(minor) + 1); 6282 } 6283 6284 /* 6285 * grab driver name, minor, block and devid out of 6286 * strings like "driver:minor:block:devid" 6287 */ 6288 static int 6289 parse_db_loc( 6290 char *str, 6291 mddb_cfg_loc_t *clp 6292 ) 6293 { 6294 char *p, *e; 6295 char *minor_name; 6296 ddi_devid_t ret_devid; 6297 6298 clp->l_dev = 0; 6299 p = clp->l_driver; 6300 e = p + sizeof (clp->l_driver) - 1; 6301 while ((*str != ':') && (*str != '\0') && (p < e)) 6302 *p++ = *str++; 6303 *p = '\0'; 6304 if (*str++ != ':') 6305 return (-1); 6306 clp->l_mnum = 0; 6307 while (ISNUM(*str)) { 6308 clp->l_mnum *= 10; 6309 clp->l_mnum += *str++ - '0'; 6310 } 6311 if (*str++ != ':') 6312 return (-1); 6313 clp->l_blkno = 0; 6314 while (ISNUM(*str)) { 6315 clp->l_blkno *= 10; 6316 clp->l_blkno += *str++ - '0'; 6317 } 6318 if (*str++ != ':') 6319 return (-1); 6320 6321 /* 6322 * If the md_devid_destroy flag is set, ignore the device ids. 6323 * This is only to used in a catastrophic failure case. 
Examples 6324 * would be where the device id of all drives in the system 6325 * (especially the mirror'd root drives) had been changed 6326 * by firmware upgrade or by a patch to an existing disk 6327 * driver. Another example would be in the case of non-unique 6328 * device ids due to a bug. The device id would be valid on 6329 * the system, but would return the wrong dev_t. 6330 */ 6331 if (md_devid_destroy) { 6332 clp->l_devid_flags = 0; 6333 clp->l_devid = (uint64_t)NULL; 6334 clp->l_devid_sz = 0; 6335 clp->l_old_devid = (uint64_t)NULL; 6336 clp->l_old_devid_sz = 0; 6337 clp->l_minor_name[0] = '\0'; 6338 return (0); 6339 } 6340 6341 if (ddi_devid_str_decode(str, 6342 (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE) 6343 return (-1); 6344 6345 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 6346 clp->l_devid_flags = 0; 6347 clp->l_old_devid = (uint64_t)NULL; 6348 clp->l_old_devid_sz = 0; 6349 6350 /* If no device id associated with device, just return */ 6351 if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) { 6352 clp->l_devid_sz = 0; 6353 clp->l_minor_name[0] = '\0'; 6354 if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 && 6355 md_keep_repl_state == 0) { 6356 /* 6357 * No devid in md.conf; we're in recovery mode so 6358 * lookup the devid for the device as specified by 6359 * the devt in md.conf. 6360 */ 6361 lookup_db_devid(clp); 6362 } 6363 return (0); 6364 } 6365 6366 clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | 6367 MDDB_DEVID_SZ; 6368 clp->l_devid_sz = (int)ddi_devid_sizeof( 6369 (ddi_devid_t)(uintptr_t)clp->l_devid); 6370 (void) strcpy(clp->l_minor_name, minor_name); 6371 kmem_free(minor_name, strlen(minor_name) + 1); 6372 6373 return (0); 6374 } 6375 6376 /* 6377 * grab driver name, minor, and block out of 6378 * strings like "driver:minor:block:devid driver:minor:block:devid ..." 
6379 */ 6380 static void 6381 parse_db_string( 6382 char *str 6383 ) 6384 { 6385 char *p, *e; 6386 mddb_cfg_loc_t *cl; 6387 char restore_space; 6388 6389 /* CSTYLED */ 6390 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); 6391 for (p = str; (*p != '\0'); ) { 6392 for (; ((*p != '\0') && (ISWHITE(*p))); ++p) 6393 ; 6394 if (*p == '\0') 6395 break; 6396 for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e) 6397 ; 6398 /* 6399 * Only give parse_db_loc 1 entry, so stuff a null into 6400 * the string if we're not at the end. We need to save this 6401 * char and restore it after call. 6402 */ 6403 restore_space = '\0'; 6404 if (*e != '\0') { 6405 restore_space = *e; 6406 *e = '\0'; 6407 } 6408 if (parse_db_loc(p, cl) != 0) { 6409 cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p); 6410 } else { 6411 (void) ridev( 6412 &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip, 6413 cl, NULL, MDDB_F_PTCHED); 6414 if (cl->l_devid_flags & MDDB_DEVID_SPACE) { 6415 kmem_free((caddr_t)(uintptr_t)cl->l_devid, 6416 cl->l_devid_sz); 6417 } 6418 } 6419 if (restore_space != '\0') { 6420 *e = restore_space; 6421 } 6422 p = e; 6423 } 6424 kmem_free(cl, sizeof (mddb_cfg_loc_t)); 6425 } 6426 6427 /* 6428 * grab database locations supplied by md.conf as properties 6429 */ 6430 static void 6431 parse_db_strings(void) 6432 { 6433 int bootlist_id; 6434 int proplen; 6435 /* 6436 * size of _bootlist_name should match uses of line and entry in 6437 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c) 6438 */ 6439 char _bootlist_name[MDDB_BOOTLIST_MAX_LEN]; 6440 char *bootlist_name; 6441 caddr_t prop; 6442 6443 /* 6444 * Step through the bootlist properties one at a time by forming the 6445 * correct name, fetching the property, parsing the property and 6446 * then freeing the memory. If a property does not exist or returns 6447 * some form of error just ignore it. 
There is no guarantee that 6448 * the properties will always exist in sequence, for example 6449 * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with 6450 * mddb_bootlist3 existing. 6451 */ 6452 bootlist_name = &_bootlist_name[0]; 6453 for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) { 6454 6455 proplen = 0; 6456 (void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id); 6457 6458 if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo, 6459 DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop, 6460 &proplen) != DDI_PROP_SUCCESS) 6461 continue; 6462 6463 if (proplen <= 0) 6464 continue; 6465 6466 if (md_init_debug) 6467 cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop); 6468 6469 parse_db_string(prop); 6470 kmem_free(prop, proplen); 6471 } 6472 } 6473 6474 static int 6475 initit( 6476 set_t setno, 6477 int flag 6478 ) 6479 { 6480 int i; 6481 mddb_set_t *s; 6482 mddb_lb_t *lbp; /* pointer to locator block */ 6483 mddb_ln_t *lnp; /* pointer to locator names */ 6484 mddb_db_t *dbp; /* pointer to directory block */ 6485 mddb_did_blk_t *did_blkp; /* pointer to Device ID block */ 6486 mddb_did_ic_t *did_icp; /* pointer to Device ID incore area */ 6487 mddb_bf_t *bfp; 6488 side_t sideno; 6489 side_t maxsides; 6490 mddb_block_t lb_blkcnt; 6491 int retval = 0; 6492 md_dev64_t dev; 6493 mddb_mnlb_t *mnlbp; 6494 int devid_flag; 6495 6496 /* single thread's all loads/unloads of set's */ 6497 mutex_enter(&mddb_lock); 6498 mutex_enter(SETMUTEX(setno)); 6499 6500 if (((mddb_set_t *)md_set[setno].s_db) == NULL) { 6501 mutex_exit(SETMUTEX(setno)); 6502 mutex_exit(&mddb_lock); 6503 return (MDDB_E_NOTNOW); 6504 } 6505 6506 s = (mddb_set_t *)md_set[setno].s_db; 6507 6508 single_thread_start(s); 6509 6510 /* 6511 * init is already underway, block. Return success. 
6512 */ 6513 if (s->s_lbp) { 6514 single_thread_end(s); 6515 mutex_exit(SETMUTEX(setno)); 6516 mutex_exit(&mddb_lock); 6517 return (0); 6518 } 6519 6520 uniqtime32(&s->s_inittime); 6521 6522 /* grab database locations patched by /etc/system */ 6523 if (setno == MD_LOCAL_SET) 6524 parse_db_strings(); 6525 6526 s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc( 6527 sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP); 6528 6529 s->s_zombie = 0; 6530 s->s_staledeletes = 0; 6531 s->s_optcmtcnt = 0; 6532 s->s_opthavelck = 0; 6533 s->s_optwantlck = 0; 6534 s->s_optwaiterr = 0; 6535 s->s_opthungerr = 0; 6536 6537 /* 6538 * KEEPTAG can never be set for a MN diskset since no tags are 6539 * allowed to be stored in a MN diskset. No way to check 6540 * if this is a MN diskset or not at this point since the mddb 6541 * hasn't been read in from disk yet. (flag will only have 6542 * MUTLINODE bit set if a new set is being created.) 6543 */ 6544 if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG)) 6545 dt_setup(s, NULL); 6546 6547 md_clr_setstatus(s->s_setno, MD_SET_TOOFEW); 6548 6549 for (i = 0; i < mddb_maxbufheaders; i++) { 6550 bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP); 6551 sema_init(&bfp->bf_buf.b_io, 0, NULL, 6552 SEMA_DEFAULT, NULL); 6553 sema_init(&bfp->bf_buf.b_sem, 0, NULL, 6554 SEMA_DEFAULT, NULL); 6555 bfp->bf_buf.b_offset = -1; 6556 freebuffer(s, bfp); 6557 } 6558 6559 retval = load_old_replicas(s, flag); 6560 /* If 0 return value - success */ 6561 if (! retval) { 6562 single_thread_end(s); 6563 mutex_exit(SETMUTEX(setno)); 6564 mutex_exit(&mddb_lock); 6565 return (0); 6566 } 6567 6568 /* 6569 * If here, then the load_old_replicas() failed 6570 */ 6571 6572 6573 /* If the database was supposed to exist. */ 6574 if (flag & MDDB_MUSTEXIST) { 6575 if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) { 6576 for (i = 0; i < mddb_maxcopies; i++) { 6577 if (! 
s->s_mbiarray[i]) 6578 continue; 6579 dev = md_expldev( 6580 s->s_lbp->lb_locators[i].l_dev); 6581 dev = md_xlate_targ_2_mini(dev); 6582 if (dev != NODEV64) 6583 mddb_devclose(dev); 6584 6585 free_mbipp(&s->s_mbiarray[i]); 6586 } 6587 6588 kmem_free((caddr_t)s->s_mbiarray, 6589 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 6590 s->s_mbiarray = NULL; 6591 } 6592 6593 if (s->s_lnp != (mddb_ln_t *)NULL) { 6594 kmem_free((caddr_t)s->s_lnp, 6595 dbtob(s->s_lbp->lb_lnblkcnt)); 6596 s->s_lnp = (mddb_ln_t *)NULL; 6597 } 6598 6599 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 6600 6601 if (s->s_lbp != (mddb_lb_t *)NULL) { 6602 kmem_free((caddr_t)s->s_lbp, 6603 dbtob(s->s_lbp->lb_blkcnt)); 6604 s->s_lbp = (mddb_lb_t *)NULL; 6605 } 6606 6607 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 6608 kmem_free((caddr_t)bfp, sizeof (*bfp)); 6609 6610 single_thread_end(s); 6611 mutex_exit(SETMUTEX(setno)); 6612 mutex_exit(&mddb_lock); 6613 6614 if (retval == MDDB_E_TAGDATA) 6615 return (retval); 6616 6617 /* Want a bit more detailed error messages */ 6618 if (mddb_db_err_detail) 6619 return (retval); 6620 6621 return (MDDB_E_NODB); 6622 } 6623 6624 6625 /* 6626 * MDDB_NOOLDOK set - Creating a new database, so do 6627 * more initialization. 6628 */ 6629 6630 lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6631 MDDB_LOCAL_LBCNT : MDDB_LBCNT); 6632 if (flag & MDDB_MULTINODE) { 6633 lb_blkcnt = MDDB_MNLBCNT; 6634 } 6635 6636 if (s->s_lbp == NULL) 6637 s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP); 6638 lbp = s->s_lbp; 6639 6640 bzero((caddr_t)lbp, dbtob(lb_blkcnt)); 6641 lbp->lb_setno = setno; 6642 lbp->lb_magic = MDDB_MAGIC_LB; 6643 if (flag & MDDB_MULTINODE) { 6644 lbp->lb_revision = MDDB_REV_MNLB; 6645 } else { 6646 lbp->lb_revision = MDDB_REV_LB; 6647 } 6648 lbp->lb_inittime = s->s_inittime; 6649 if (flag & MDDB_MULTINODE) { 6650 mnlbp = (mddb_mnlb_t *)lbp; 6651 for (i = 0; i < MDDB_NLB; i++) { 6652 for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) { 6653 mddb_mnsidelocator_t *mnslp; 6654 mnslp = &mnlbp->lb_mnsidelocators[sideno][i]; 6655 mnslp->mnl_mnum = NODEV32; 6656 mnslp->mnl_sideno = 0; 6657 mnslp->mnl_drvnm_index = 0; 6658 } 6659 } 6660 } else { 6661 maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES); 6662 for (i = 0; i < MDDB_NLB; i++) { 6663 for (sideno = 0; sideno < maxsides; sideno++) { 6664 mddb_sidelocator_t *slp; 6665 slp = &lbp->lb_sidelocators[sideno][i]; 6666 slp->l_mnum = NODEV32; 6667 } 6668 } 6669 } 6670 lbp->lb_blkcnt = lb_blkcnt; 6671 6672 /* lb starts on block 0 */ 6673 /* locator names starts after locator block */ 6674 lbp->lb_lnfirstblk = lb_blkcnt; 6675 if (flag & MDDB_MULTINODE) { 6676 lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT; 6677 } else { 6678 lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6679 MDDB_LOCAL_LNCNT : MDDB_LNCNT); 6680 } 6681 6682 if (flag & MDDB_MULTINODE) { 6683 /* Creating a multinode diskset */ 6684 md_set_setstatus(setno, MD_SET_MNSET); 6685 lbp->lb_flags |= MDDB_MNSET; 6686 } 6687 6688 /* Data portion of mddb located after locator names */ 6689 lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt; 6690 6691 /* the btodb that follows is converting the directory block size */ 6692 /* Data tag part of mddb located after first block of mddb data */ 6693 lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk + 6694 btodb(MDDB_BSIZE)); 6695 /* Data tags are not used in MN diskset - so set count to 0 */ 6696 if (flag & MDDB_MULTINODE) 6697 lbp->lb_dtblkcnt = (mddb_block_t)0; 6698 else 6699 lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS; 6700 6701 6702 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP); 6703 lnp->ln_magic = MDDB_MAGIC_LN; 6704 if (flag & MDDB_MULTINODE) { 6705 lnp->ln_revision = MDDB_REV_MNLN; 6706 } else { 6707 lnp->ln_revision = MDDB_REV_LN; 6708 } 6709 s->s_lnp = lnp; 6710 6711 /* 6712 * Set up Device ID portion of Locator Block. 6713 * Do not set locator to device id style if 6714 * md_devid_destroy is 1 and md_keep_repl_state is 1 6715 * (destroy all device id data and keep replica in 6716 * non device id mode). 6717 * 6718 * This is logically equivalent to set locator to 6719 * device id style if md_devid_destroy is 0 or 6720 * md_keep_repl_state is 0. 6721 * 6722 * In SunCluster environment, device id mode is disabled 6723 * which means diskset will be run in non-devid mode. For 6724 * localset, the behavior will remain intact and run in 6725 * device id mode. 6726 * 6727 * In multinode diskset devids are turned off. 
6728 */ 6729 devid_flag = 1; 6730 if (cluster_bootflags & CLUSTER_CONFIGURED) 6731 if (setno != MD_LOCAL_SET) 6732 devid_flag = 0; 6733 if (flag & MDDB_MULTINODE) 6734 devid_flag = 0; 6735 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) 6736 devid_flag = 0; 6737 /* 6738 * if we weren't devid style before and md_keep_repl_state=1 6739 * we need to stay non-devid 6740 */ 6741 if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) && 6742 (md_keep_repl_state == 1)) 6743 devid_flag = 0; 6744 if (devid_flag) { 6745 lbp->lb_didfirstblk = lbp->lb_dtfirstblk + 6746 lbp->lb_dtblkcnt; 6747 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 6748 lbp->lb_flags |= MDDB_DEVID_STYLE; 6749 6750 did_icp = (mddb_did_ic_t *)kmem_zalloc 6751 (sizeof (mddb_did_ic_t), KM_SLEEP); 6752 did_blkp = (mddb_did_blk_t *) 6753 kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); 6754 did_blkp->blk_magic = MDDB_MAGIC_DI; 6755 did_blkp->blk_revision = MDDB_REV_DI; 6756 did_icp->did_ic_blkp = did_blkp; 6757 s->s_did_icp = did_icp; 6758 } 6759 6760 setidentifier(s, &lbp->lb_ident); 6761 uniqtime32(&lbp->lb_timestamp); 6762 dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 6763 dbp->db_magic = MDDB_MAGIC_DB; 6764 dbp->db_revision = MDDB_REV_DB; 6765 uniqtime32(&dbp->db_timestamp); 6766 dbp->db_nextblk = 0; 6767 dbp->db_firstentry = NULL; 6768 dbp->db_blknum = lbp->lb_dbfirstblk; 6769 dbp->db_recsum = MDDB_GLOBAL_XOR; 6770 s->s_dbp = dbp; 6771 single_thread_end(s); 6772 mutex_exit(SETMUTEX(setno)); 6773 mutex_exit(&mddb_lock); 6774 return (0); 6775 } 6776 6777 mddb_set_t * 6778 mddb_setenter( 6779 set_t setno, 6780 int flag, 6781 int *errorcodep 6782 ) 6783 { 6784 mddb_set_t *s; 6785 int err = 0; 6786 size_t sz = sizeof (void *) * MD_MAXUNITS; 6787 6788 mutex_enter(SETMUTEX(setno)); 6789 if (! 
md_set[setno].s_db) { 6790 mutex_exit(SETMUTEX(setno)); 6791 if (errorcodep != NULL) 6792 *errorcodep = MDDB_E_NOTOWNER; 6793 return (NULL); 6794 } 6795 6796 /* Allocate s_un and s_ui arrays if not already present. */ 6797 if (md_set[setno].s_un == NULL) { 6798 md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP); 6799 if (md_set[setno].s_un == NULL) { 6800 mutex_exit(SETMUTEX(setno)); 6801 if (errorcodep != NULL) 6802 *errorcodep = MDDB_E_NOTOWNER; 6803 return (NULL); 6804 } 6805 } 6806 if (md_set[setno].s_ui == NULL) { 6807 md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP); 6808 if (md_set[setno].s_ui == NULL) { 6809 mutex_exit(&md_set[setno].s_dbmx); 6810 kmem_free(md_set[setno].s_un, sz); 6811 md_set[setno].s_un = NULL; 6812 if (errorcodep != NULL) 6813 *errorcodep = MDDB_E_NOTOWNER; 6814 return (NULL); 6815 } 6816 } 6817 s = (mddb_set_t *)md_set[setno].s_db; 6818 if (s->s_lbp) 6819 return (s); 6820 6821 if (flag & MDDB_NOINIT) 6822 return (s); 6823 6824 /* 6825 * Release the set mutex - it will be acquired and released in 6826 * initit after acquiring the mddb_lock. This is done to assure 6827 * that mutexes are always acquired in the same order to prevent 6828 * possible deadlock 6829 */ 6830 mutex_exit(SETMUTEX(setno)); 6831 6832 if ((err = initit(setno, flag)) != 0) { 6833 if (errorcodep != NULL) 6834 *errorcodep = err; 6835 return (NULL); 6836 } 6837 6838 mutex_enter(SETMUTEX(setno)); 6839 return ((mddb_set_t *)md_set[setno].s_db); 6840 } 6841 6842 /* 6843 * Release the set lock for a given set. 6844 * 6845 * In a MN diskset, this routine may send messages to the rpc.mdcommd 6846 * in order to have the slave nodes re-parse parts of the mddb. 6847 * Messages are only sent if the global ioctl lock is not held. 6848 * 6849 * With the introduction of multi-threaded ioctls, there is no way 6850 * to determine which thread(s) are holding the ioctl lock. 
So, if 6851 * the ioctl lock is held (by process X) process X will send the 6852 * messages to the slave nodes when process X releases the ioctl lock. 6853 */ 6854 void 6855 mddb_setexit( 6856 mddb_set_t *s 6857 ) 6858 { 6859 md_mn_msg_mddb_parse_t *mddb_parse_msg; 6860 md_mn_kresult_t *kresult; 6861 mddb_lb_t *lbp = s->s_lbp; 6862 int i; 6863 int rval = 1; 6864 6865 /* 6866 * If not a MN diskset OR 6867 * a MN diskset but this node isn't master, 6868 * then release the mutex. 6869 */ 6870 if (!(MD_MNSET_SETNO(s->s_setno)) || 6871 ((MD_MNSET_SETNO(s->s_setno)) && 6872 (!md_set[s->s_setno].s_am_i_master))) { 6873 mutex_exit(SETMUTEX(s->s_setno)); 6874 return; 6875 } 6876 6877 /* 6878 * If global ioctl lock is held, then send no messages, 6879 * just release mutex and return. 6880 * 6881 */ 6882 if (md_status & MD_GBL_IOCTL_LOCK) { 6883 mutex_exit(SETMUTEX(s->s_setno)); 6884 return; 6885 } 6886 6887 /* 6888 * This thread is not holding the ioctl lock, so drop the set 6889 * lock, send messages to slave nodes to reparse portions 6890 * of the mddb and return. 6891 * 6892 * If the block parse flag is set, do not send parse messages. 6893 * This flag is set when master is adding a new mddb that would 6894 * cause parse messages to be sent to the slaves, but the slaves 6895 * don't have knowledge of the new mddb yet since the mddb add 6896 * operation hasn't been run on the slave nodes yet. When the 6897 * master unblocks the parse flag, the parse messages will be 6898 * generated. 6899 * 6900 * If s_mn_parseflags_sending is non-zero, then another thread 6901 * is already currently sending a parse message, so just release 6902 * the mutex and return. If an mddb change occurred that results 6903 * in a parse message to be generated, the thread that is currently 6904 * sending a parse message would generate the additional parse message. 
6905 * 6906 * If s_mn_parseflags_sending is zero and parsing is not blocked, 6907 * then loop until s_mn_parseflags is 0 (until there are no more 6908 * messages to send). 6909 * While s_mn_parseflags is non-zero, 6910 * put snapshot of parse_flags in s_mn_parseflags_sending 6911 * set s_mn_parseflags to zero 6912 * release mutex 6913 * send message 6914 * re-grab mutex 6915 * set s_mn_parseflags_sending to zero 6916 */ 6917 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP); 6918 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && 6919 (s->s_mn_parseflags & MDDB_PARSE_MASK) && 6920 (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) { 6921 /* Grab snapshot of parse flags */ 6922 s->s_mn_parseflags_sending = s->s_mn_parseflags; 6923 s->s_mn_parseflags = 0; 6924 6925 mutex_exit(SETMUTEX(s->s_setno)); 6926 6927 /* 6928 * Send the message to the slaves to re-parse 6929 * the indicated portions of the mddb. Send the status 6930 * of the 50 mddbs in this set so that slaves know which 6931 * mddbs that the master node thinks are 'good'. 6932 * Otherwise, slave may reparse, but from wrong replica. 6933 */ 6934 mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; 6935 for (i = 0; i < MDDB_NLB; i++) { 6936 mddb_parse_msg->msg_lb_flags[i] = 6937 lbp->lb_locators[i].l_flags; 6938 } 6939 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 6940 while (rval != 0) { 6941 rval = mdmn_ksend_message(s->s_setno, 6942 MD_MN_MSG_MDDB_PARSE, 0, 0, 6943 (char *)mddb_parse_msg, 6944 sizeof (md_mn_msg_mddb_parse_t), kresult); 6945 if (rval != 0) 6946 cmn_err(CE_WARN, "mddb_setexit: Unable to send " 6947 "mddb update message to other nodes in " 6948 "diskset %s\n", s->s_setname); 6949 } 6950 kmem_free(kresult, sizeof (md_mn_kresult_t)); 6951 6952 /* 6953 * Re-grab mutex to clear sending field and to 6954 * see if another parse message needs to be generated. 
6955 */ 6956 mutex_enter(SETMUTEX(s->s_setno)); 6957 s->s_mn_parseflags_sending = 0; 6958 } 6959 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); 6960 mutex_exit(SETMUTEX(s->s_setno)); 6961 } 6962 6963 static void 6964 mddb_setexit_no_parse( 6965 mddb_set_t *s 6966 ) 6967 { 6968 mutex_exit(SETMUTEX(s->s_setno)); 6969 } 6970 6971 uint_t 6972 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt) 6973 { 6974 uint_t li; 6975 mddb_lb_t *lbp = s->s_lbp; 6976 mddb_locator_t *lp; 6977 ddi_devid_t ret_devid; 6978 uint_t devid_len; 6979 dev_t ddi_dev; 6980 mddb_did_ic_t *did_icp; 6981 mddb_did_blk_t *did_blkp; 6982 char *minor_name; 6983 size_t sz; 6984 int retval; 6985 int err; 6986 md_dev64_t dev64; /* tmp var to make code look better */ 6987 6988 6989 /* Need disk block(s) to hold mddb_did_blk_t */ 6990 *blk_cnt = MDDB_DID_BLOCKS; 6991 6992 if (doit) { 6993 /* 6994 * Alloc mddb_did_blk_t disk block and fill in header area. 6995 * Don't fill in did magic number until end of routine so 6996 * if machine panics in the middle of conversion, the 6997 * device id information will be thrown away at the 6998 * next snarfing of this set. 6999 * Need to set DEVID_STYLE so that mddb_devid_add will 7000 * function properly. 
7001 */ 7002 /* grab the mutex */ 7003 if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) { 7004 return (1); 7005 } 7006 single_thread_start(s); 7007 lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS); 7008 if (lbp->lb_didfirstblk == 0) { 7009 single_thread_end(s); 7010 mddb_setexit(s); 7011 return (1); 7012 } 7013 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 7014 did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t), 7015 KM_SLEEP); 7016 did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES, 7017 KM_SLEEP); 7018 7019 did_blkp->blk_revision = MDDB_REV_DI; 7020 did_icp->did_ic_blkp = did_blkp; 7021 s->s_did_icp = did_icp; 7022 lbp->lb_flags |= MDDB_DEVID_STYLE; 7023 } 7024 7025 /* Fill in information in mddb_did_info_t array */ 7026 for (li = 0; li < lbp->lb_loccnt; li++) { 7027 lp = &lbp->lb_locators[li]; 7028 if (lp->l_flags & MDDB_F_DELETED) 7029 continue; 7030 7031 dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 7032 ddi_dev = md_dev64_to_dev(dev64); 7033 if (ddi_dev == NODEV) { 7034 /* 7035 * No translation available for replica. 7036 * Could fail conversion to device id replica, 7037 * but instead will just continue with next 7038 * replica in list. 7039 */ 7040 continue; 7041 } 7042 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) { 7043 /* 7044 * Just count each devid as at least 1 block. This 7045 * is conservative since several device id's may fit 7046 * into 1 disk block, but it's better to overestimate 7047 * the number of blocks needed than to underestimate. 
7048 */ 7049 devid_len = (int)ddi_devid_sizeof(ret_devid); 7050 *blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1)); 7051 if (doit) { 7052 if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, 7053 &minor_name) == DDI_SUCCESS) { 7054 if (mddb_devid_add(s, li, ret_devid, 7055 minor_name)) { 7056 cmn_err(CE_WARN, 7057 "Not enough space in metadb" 7058 " to add device id for" 7059 " dev: major = %d, " 7060 "minor = %d\n", 7061 getmajor(ddi_dev), 7062 getminor(ddi_dev)); 7063 } 7064 sz = strlen(minor_name) + 1; 7065 kmem_free(minor_name, sz); 7066 } 7067 } 7068 ddi_devid_free(ret_devid); 7069 } 7070 } 7071 7072 if (doit) { 7073 did_blkp->blk_magic = MDDB_MAGIC_DI; 7074 retval = push_lb(s); 7075 (void) upd_med(s, "mddb_lb_did_convert(0)"); 7076 single_thread_end(s); 7077 mddb_setexit(s); 7078 if (retval != 0) 7079 return (1); 7080 } 7081 7082 return (0); 7083 } 7084 7085 static mddb_set_t * 7086 init_set( 7087 mddb_config_t *cp, 7088 int flag, 7089 int *errp 7090 ) 7091 { 7092 mddb_set_t *s; 7093 char *setname = NULL; 7094 set_t setno = MD_LOCAL_SET; 7095 side_t sideno = 0; 7096 struct timeval32 *created = NULL; 7097 7098 if (cp != NULL) { 7099 setname = cp->c_setname; 7100 setno = cp->c_setno; 7101 sideno = cp->c_sideno; 7102 created = &cp->c_timestamp; 7103 } 7104 7105 if (setno >= MD_MAXSETS) 7106 return ((mddb_set_t *)NULL); 7107 7108 if (md_set[setno].s_db) 7109 return (mddb_setenter(setno, flag, errp)); 7110 7111 s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP); 7112 7113 cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL); 7114 cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL); 7115 cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL); 7116 cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL); 7117 cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL); 7118 7119 s->s_setno = setno; 7120 s->s_sideno = sideno; 7121 if (setno == MD_LOCAL_SET) { 7122 (void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial), 7123 "%u", zone_get_hostid(NULL)); 7124 } else { 7125 
s->s_ident.createtime = *created; 7126 s->s_setname = (char *)kmem_alloc(strlen(setname) + 1, 7127 KM_SLEEP); 7128 (void) strcpy(s->s_setname, setname); 7129 } 7130 7131 /* have a config struct, copy mediator information */ 7132 if (cp != NULL) 7133 s->s_med = cp->c_med; /* structure assignment */ 7134 7135 md_set[setno].s_db = (void *) s; 7136 7137 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64); 7138 7139 return (mddb_setenter(setno, flag, errp)); 7140 } 7141 7142 void 7143 mddb_unload_set( 7144 set_t setno 7145 ) 7146 { 7147 7148 mddb_set_t *s; 7149 mddb_db_t *dbp, *adbp = NULL; 7150 mddb_de_ic_t *dep, *dep2; 7151 mddb_bf_t *bfp; 7152 int i; 7153 md_dev64_t dev; 7154 7155 if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL) 7156 return; 7157 7158 single_thread_start(s); 7159 7160 s->s_opthavequeuinglck = 0; 7161 s->s_optwantqueuinglck = 0; 7162 7163 for (dbp = s->s_dbp; dbp != 0; dbp = adbp) { 7164 for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) { 7165 if (dep->de_rb_userdata != NULL) { 7166 if (dep->de_icreqsize) 7167 kmem_free(dep->de_rb_userdata_ic, 7168 dep->de_icreqsize); 7169 else 7170 kmem_free(dep->de_rb_userdata, 7171 dep->de_reqsize); 7172 } 7173 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 7174 dep2 = dep->de_next; 7175 kmem_free((caddr_t)dep, sizeofde(dep)); 7176 } 7177 adbp = dbp->db_next; 7178 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 7179 } 7180 s->s_dbp = (mddb_db_t *)NULL; 7181 7182 free_rip(&s->s_rip); 7183 7184 for (i = 0; i < mddb_maxcopies; i++) { 7185 if (! s->s_mbiarray) 7186 break; 7187 7188 if (! 
s->s_mbiarray[i]) 7189 continue; 7190 7191 dev = md_expldev(s->s_lbp->lb_locators[i].l_dev); 7192 dev = md_xlate_targ_2_mini(dev); 7193 if (dev != NODEV64) 7194 mddb_devclose(dev); 7195 7196 free_mbipp(&s->s_mbiarray[i]); 7197 } 7198 7199 if (s->s_mbiarray) { 7200 kmem_free((caddr_t)s->s_mbiarray, 7201 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 7202 s->s_mbiarray = (mddb_mb_ic_t **)NULL; 7203 } 7204 7205 if (s->s_lnp) { 7206 kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt)); 7207 s->s_lnp = (mddb_ln_t *)NULL; 7208 } 7209 7210 if (s->s_lbp) { 7211 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 7212 kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt)); 7213 s->s_lbp = (mddb_lb_t *)NULL; 7214 } 7215 7216 if (s->s_freebitmap) { 7217 kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 7218 s->s_freebitmap = NULL; 7219 s->s_freebitmapsize = 0; 7220 } 7221 7222 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 7223 kmem_free((caddr_t)bfp, sizeof (*bfp)); 7224 7225 if (s->s_databuffer_size) { 7226 kmem_free(s->s_databuffer, s->s_databuffer_size); 7227 s->s_databuffer_size = 0; 7228 } 7229 7230 if (s->s_setname != NULL) 7231 kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1); 7232 7233 /* Data tags not supported on MN sets. */ 7234 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) 7235 dtl_freel(&s->s_dtlp); 7236 7237 md_set[setno].s_db = NULL; 7238 ASSERT(s->s_singlelockwanted == 0); 7239 kmem_free(s, sizeof (mddb_set_t)); 7240 7241 /* Take care of things setup in the md_set array */ 7242 if (! 
(md_get_setstatus(setno) & MD_SET_KEEPTAG)) { 7243 if (md_set[setno].s_dtp) { 7244 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 7245 md_set[setno].s_dtp = NULL; 7246 } 7247 } 7248 7249 md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT | 7250 MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE | 7251 MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET | 7252 MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC | 7253 MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); 7254 7255 mutex_exit(SETMUTEX(setno)); 7256 } 7257 7258 /* 7259 * returns 0 if name can be put into locator block 7260 * returns 1 if locator block prefixes are all used 7261 * 7262 * Takes splitname (suffix, prefix, sideno) and 7263 * stores it in the locator name structure. 7264 * For traditional diskset, the sideno is the index into the suffixes 7265 * array in the locator name structure. 7266 * For the MN diskset, the sideno is the nodeid which can be any number, 7267 * so the index passed in is the index into the mnsuffixes array 7268 * in the locator structure. This index was computed by the 7269 * routine checklocator which basically checked the locator block 7270 * mnside locator structure. 
7271 */ 7272 static int 7273 splitname2locatorblock( 7274 md_splitname *spn, 7275 mddb_ln_t *lnp, 7276 int li, 7277 side_t sideno, 7278 int index 7279 ) 7280 { 7281 uchar_t i; 7282 md_name_suffix *sn; 7283 md_mnname_suffix_t *mnsn; 7284 mddb_mnln_t *mnlnp; 7285 7286 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7287 if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len) 7288 continue; 7289 if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data, 7290 SPN_PREFIX(spn).pre_len) == 0) 7291 break; 7292 } 7293 if (i == MDDB_PREFIXCNT) { 7294 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7295 if (lnp->ln_prefixes[i].pre_len == 0) 7296 break; 7297 } 7298 if (i == MDDB_PREFIXCNT) 7299 return (1); 7300 bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data, 7301 SPN_PREFIX(spn).pre_len); 7302 lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len; 7303 } 7304 7305 if (lnp->ln_revision == MDDB_REV_MNLN) { 7306 /* If a MN diskset, use index */ 7307 mnlnp = (mddb_mnln_t *)lnp; 7308 mnsn = &mnlnp->ln_mnsuffixes[index][li]; 7309 mnsn->mn_ln_sideno = sideno; 7310 mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len; 7311 mnsn->mn_ln_suffix.suf_prefix = i; 7312 bcopy(SPN_SUFFIX(spn).suf_data, 7313 mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len); 7314 } else { 7315 sn = &lnp->ln_suffixes[sideno][li]; 7316 sn->suf_len = SPN_SUFFIX(spn).suf_len; 7317 sn->suf_prefix = i; 7318 bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data, 7319 SPN_SUFFIX(spn).suf_len); 7320 } 7321 return (0); 7322 } 7323 7324 /* 7325 * Find the locator name for the given sideno and convert the locator name 7326 * information into a splitname structure. 
7327 */ 7328 void 7329 mddb_locatorblock2splitname( 7330 mddb_ln_t *lnp, 7331 int li, 7332 side_t sideno, 7333 md_splitname *spn 7334 ) 7335 { 7336 int iprefix; 7337 md_name_suffix *sn; 7338 md_mnname_suffix_t *mnsn; 7339 int i; 7340 mddb_mnln_t *mnlnp; 7341 7342 if (lnp->ln_revision == MDDB_REV_MNLN) { 7343 mnlnp = (mddb_mnln_t *)lnp; 7344 for (i = 0; i < MD_MNMAXSIDES; i++) { 7345 mnsn = &mnlnp->ln_mnsuffixes[i][li]; 7346 if (mnsn->mn_ln_sideno == sideno) 7347 break; 7348 } 7349 if (i == MD_MNMAXSIDES) 7350 return; 7351 7352 SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len; 7353 bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data, 7354 SPN_SUFFIX(spn).suf_len); 7355 iprefix = mnsn->mn_ln_suffix.suf_prefix; 7356 } else { 7357 sn = &lnp->ln_suffixes[sideno][li]; 7358 SPN_SUFFIX(spn).suf_len = sn->suf_len; 7359 bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data, 7360 SPN_SUFFIX(spn).suf_len); 7361 iprefix = sn->suf_prefix; 7362 } 7363 SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len; 7364 bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data, 7365 SPN_PREFIX(spn).pre_len); 7366 } 7367 7368 static int 7369 getdeldev( 7370 mddb_config_t *cp, 7371 int command, 7372 md_error_t *ep 7373 ) 7374 { 7375 mddb_set_t *s; 7376 mddb_lb_t *lbp; 7377 mddb_locator_t *locators; 7378 uint_t loccnt; 7379 mddb_mb_ic_t *mbip; 7380 mddb_block_t blk; 7381 int err = 0; 7382 int i, j; 7383 int li; 7384 uint_t commitcnt; 7385 set_t setno = cp->c_setno; 7386 uint_t set_status; 7387 md_dev64_t dev; 7388 int flags = MDDB_MUSTEXIST; 7389 7390 cp->c_dbmax = MDDB_NLB; 7391 7392 /* 7393 * Data checking 7394 */ 7395 if (setno >= md_nsets || cp->c_id < 0 || 7396 cp->c_id > cp->c_dbmax) { 7397 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 7398 } 7399 7400 if (cp->c_flags & MDDB_C_STALE) 7401 flags |= MDDB_MN_STALE; 7402 7403 if ((s = mddb_setenter(setno, flags, &err)) == NULL) 7404 return (mddbstatus2error(ep, err, NODEV32, setno)); 7405 7406 cp->c_flags = 0; 7407 
7408 lbp = s->s_lbp; 7409 loccnt = lbp->lb_loccnt; 7410 locators = lbp->lb_locators; 7411 7412 /* shorthand */ 7413 set_status = md_get_setstatus(setno); 7414 7415 if (set_status & MD_SET_STALE) 7416 cp->c_flags |= MDDB_C_STALE; 7417 7418 if (set_status & MD_SET_TOOFEW) 7419 cp->c_flags |= MDDB_C_TOOFEW; 7420 7421 cp->c_sideno = s->s_sideno; 7422 7423 cp->c_dbcnt = 0; 7424 /* 7425 * go through and count active entries 7426 */ 7427 for (i = 0; i < loccnt; i++) { 7428 if (locators[i].l_flags & MDDB_F_DELETED) 7429 continue; 7430 cp->c_dbcnt++; 7431 } 7432 7433 /* 7434 * add the ability to accept a locator block index 7435 * which is not relative to previously deleted replicas. This 7436 * is for support of MD_DEBUG=STAT in metastat since it asks for 7437 * replica information specifically for each of the mirror resync 7438 * records. MDDB_CONFIG_SUBCMD uses one of the pad spares in 7439 * the mddb_config_t type. 7440 */ 7441 if (cp->c_subcmd == MDDB_CONFIG_ABS) { 7442 if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) { 7443 mddb_setexit(s); 7444 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7445 setno)); 7446 } 7447 li = cp->c_id; 7448 } else { 7449 if (cp->c_id >= cp->c_dbcnt) { 7450 mddb_setexit(s); 7451 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7452 setno)); 7453 } 7454 7455 /* CSTYLED */ 7456 for (li = 0, j = 0; /* void */; li++) { 7457 if (locators[li].l_flags & MDDB_F_DELETED) 7458 continue; 7459 j++; 7460 if (j > cp->c_id) 7461 break; 7462 } 7463 } 7464 7465 if (command == MDDB_ENDDEV) { 7466 daddr_t ib = 0, jb; 7467 7468 blk = 0; 7469 if ((s != NULL) && s->s_mbiarray[li]) { 7470 mbip = s->s_mbiarray[li]; 7471 while ((jb = getphysblk(blk++, mbip)) > 0) { 7472 if (jb > ib) 7473 ib = jb; 7474 } 7475 cp->c_dbend = (int)ib; 7476 } else { 7477 cp->c_dbend = 0; 7478 } 7479 } 7480 7481 locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp); 7482 mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname); 7483 7484 if (command != 
MDDB_DELDEV) { 7485 mddb_setexit(s); 7486 return (0); 7487 } 7488 7489 /* Currently don't allow addition/deletion of sides during upgrade */ 7490 if (MD_UPGRADE) { 7491 cmn_err(CE_WARN, 7492 "Deletion of replica not allowed during upgrade.\n"); 7493 mddb_setexit(s); 7494 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 7495 } 7496 7497 /* 7498 * If here, replica delete in progress. 7499 */ 7500 single_thread_start(s); 7501 7502 if ((! (locators[li].l_flags & MDDB_F_EMASTER)) && 7503 (locators[li].l_flags & MDDB_F_ACTIVE)) { 7504 commitcnt = lbp->lb_commitcnt; 7505 lbp->lb_commitcnt = 0; 7506 setidentifier(s, &lbp->lb_ident); 7507 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 7508 /* 7509 * Don't need to write out device id area, since locator 7510 * block on this replica is being deleted by setting the 7511 * commitcnt to 0. 7512 */ 7513 (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 7514 MDDB_WR_ONLY_MASTER); 7515 lbp->lb_commitcnt = commitcnt; 7516 } 7517 7518 if (s->s_mbiarray[li]) 7519 free_mbipp(&s->s_mbiarray[li]); 7520 7521 if (! 
(locators[li].l_flags & MDDB_F_EMASTER)) { 7522 dev = md_expldev(locators[li].l_dev); 7523 dev = md_xlate_targ_2_mini(dev); 7524 if (dev != NODEV64) 7525 mddb_devclose(dev); 7526 } 7527 7528 s->s_mbiarray[li] = 0; 7529 lbp->lb_locators[li].l_flags = MDDB_F_DELETED; 7530 7531 /* Only support data tags for traditional and local sets */ 7532 if ((md_get_setstatus(setno) & MD_SET_STALE) && 7533 (!(lbp->lb_flags & MDDB_MNSET)) && 7534 setno != MD_LOCAL_SET) 7535 if (set_dtag(s, ep)) 7536 mdclrerror(ep); 7537 7538 /* Write data tags to all accessible devices */ 7539 /* Only support data tags for traditional and local sets */ 7540 if (!(lbp->lb_flags & MDDB_MNSET)) { 7541 (void) dt_write(s); 7542 } 7543 7544 /* Delete device id of deleted replica */ 7545 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7546 (void) mddb_devid_delete(s, li); 7547 } 7548 /* write new locator to all devices */ 7549 err = writelocall(s); 7550 7551 (void) upd_med(s, "getdeldev(0)"); 7552 7553 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno, 7554 md_expldev(locators[li].l_dev)); 7555 7556 computefreeblks(s); /* recompute always it may be larger */ 7557 cp->c_dbcnt--; 7558 err |= fixoptrecords(s); 7559 if (err) { 7560 if (writeretry(s)) { 7561 single_thread_end(s); 7562 mddb_setexit(s); 7563 return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno)); 7564 } 7565 } 7566 7567 single_thread_end(s); 7568 mddb_setexit(s); 7569 return (0); 7570 } 7571 7572 static int 7573 getdriver( 7574 mddb_cfg_loc_t *clp 7575 ) 7576 { 7577 major_t majordev; 7578 7579 /* 7580 * Data checking 7581 */ 7582 if (clp->l_dev <= 0) 7583 return (EINVAL); 7584 7585 majordev = getmajor(expldev(clp->l_dev)); 7586 7587 if (ddi_major_to_name(majordev) == (char *)NULL) 7588 return (EINVAL); 7589 7590 if (MD_UPGRADE) 7591 (void) strcpy(clp->l_driver, md_targ_major_to_name(majordev)); 7592 else 7593 (void) strcpy(clp->l_driver, ddi_major_to_name(majordev)); 7594 return (0); 7595 } 7596 7597 /* 7598 * update_valid_replica - 
updates the locator block namespace (prefix 7599 * and/or suffix) with new pathname and devname. 7600 * RETURN 7601 * 1 Error 7602 * 0 Success 7603 */ 7604 static int 7605 update_valid_replica( 7606 side_t side, 7607 mddb_locator_t *lp, 7608 mddb_set_t *s, 7609 int li, 7610 char *devname, 7611 char *pathname, 7612 md_dev64_t devt 7613 ) 7614 { 7615 uchar_t pre_len, suf_len; 7616 md_name_suffix *sn; 7617 mddb_ln_t *lnp; 7618 uchar_t pre_index; 7619 uchar_t i; 7620 7621 if (md_expldev(lp->l_dev) != devt) { 7622 return (0); 7623 } 7624 7625 if (pathname[strlen(pathname) - 1] == '/') 7626 pathname[strlen(pathname) - 1] = '\0'; 7627 7628 pre_len = (uchar_t)strlen(pathname); 7629 suf_len = (uchar_t)strlen(devname); 7630 7631 if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX)) 7632 return (1); 7633 7634 lnp = s->s_lnp; 7635 7636 /* 7637 * Future note: Need to do something here for the MN diskset case 7638 * when device ids are supported in disksets. 7639 * Can't add until merging devids_in_diskset code into code base 7640 * Currently only called with side of 0. 7641 */ 7642 7643 sn = &lnp->ln_suffixes[side][li]; 7644 7645 /* 7646 * Check if prefix (Ex: /dev/dsk) needs to be changed. 7647 * If new prefix is the same as the previous prefix - no change. 7648 * 7649 * If new prefix is not the same, check if new prefix 7650 * matches an existing one. If so, use that one. 7651 * 7652 * If new prefix doesn't exist, add a new prefix. If not enough 7653 * space, return failure. 7654 */ 7655 pre_index = sn->suf_prefix; 7656 /* Check if new prefix is the same as the old prefix. */ 7657 if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) || 7658 (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname, 7659 pre_len) != 0)) { 7660 /* Check if new prefix is an already known prefix. 
*/ 7661 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7662 if (lnp->ln_prefixes[i].pre_len != pre_len) { 7663 continue; 7664 } 7665 if (bcmp(lnp->ln_prefixes[i].pre_data, pathname, 7666 pre_len) == 0) { 7667 break; 7668 } 7669 } 7670 /* If no match found for new prefix - add the new prefix */ 7671 if (i == MDDB_PREFIXCNT) { 7672 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7673 if (lnp->ln_prefixes[i].pre_len == 0) 7674 break; 7675 } 7676 /* No space to add new prefix - return failure */ 7677 if (i == MDDB_PREFIXCNT) { 7678 return (1); 7679 } 7680 bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len); 7681 lnp->ln_prefixes[i].pre_len = pre_len; 7682 } 7683 sn->suf_prefix = i; 7684 } 7685 7686 /* Now, update the suffix (Ex: c0t0d0s0) if needed */ 7687 if ((sn->suf_len != suf_len) || 7688 (bcmp(sn->suf_data, devname, suf_len) != 0)) { 7689 bcopy(devname, sn->suf_data, suf_len); 7690 sn->suf_len = suf_len; 7691 } 7692 return (0); 7693 } 7694 7695 7696 /* 7697 * md_update_locator_namespace - If in devid style and active and the devid's 7698 * exist and are valid update the locator namespace pathname 7699 * and devname. 
7700 * RETURN 7701 * 1 Error 7702 * 0 Success 7703 */ 7704 int 7705 md_update_locator_namespace( 7706 set_t setno, /* which set to get name from */ 7707 side_t side, 7708 char *dname, 7709 char *pname, 7710 md_dev64_t devt 7711 ) 7712 { 7713 mddb_set_t *s; 7714 mddb_lb_t *lbp; 7715 int li; 7716 uint_t flg; 7717 int err = 0; 7718 mddb_ln_t *lnp; 7719 7720 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 7721 return (1); 7722 single_thread_start(s); 7723 lbp = s->s_lbp; 7724 /* must be DEVID_STYLE */ 7725 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7726 for (li = 0; li < lbp->lb_loccnt; li++) { 7727 mddb_locator_t *lp = &lbp->lb_locators[li]; 7728 7729 if (lp->l_flags & MDDB_F_DELETED) { 7730 continue; 7731 } 7732 7733 /* replica also must be active */ 7734 if (lp->l_flags & MDDB_F_ACTIVE) { 7735 flg = s->s_did_icp->did_ic_blkp-> 7736 blk_info[li].info_flags; 7737 /* only update if did exists and is valid */ 7738 if ((flg & MDDB_DID_EXISTS) && 7739 (flg & MDDB_DID_VALID)) { 7740 if (update_valid_replica(side, lp, s, 7741 li, dname, pname, devt)) { 7742 err = 1; 7743 goto out; 7744 } 7745 } 7746 } 7747 } 7748 } 7749 lnp = s->s_lnp; 7750 uniqtime32(&lnp->ln_timestamp); 7751 if (lbp->lb_flags & MDDB_MNSET) 7752 lnp->ln_revision = MDDB_REV_MNLN; 7753 else 7754 lnp->ln_revision = MDDB_REV_LN; 7755 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 7756 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 7757 lbp->lb_lnblkcnt, 0); 7758 /* 7759 * If a MN diskset and this is the master, set the PARSE_LOCNM 7760 * flag in the mddb_set structure to show that the locator 7761 * names have changed. 
7762 */ 7763 7764 if ((lbp->lb_flags & MDDB_MNSET) && 7765 (md_set[s->s_setno].s_am_i_master)) { 7766 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 7767 } 7768 out: 7769 single_thread_end(s); 7770 mddb_setexit(s); 7771 if (err) 7772 return (1); 7773 return (0); 7774 } 7775 7776 /* 7777 * update_locatorblock - for active entries in the locator block, check 7778 * the devt to see if it matches the given devt. If so, and 7779 * there is an associated device id which is not the same 7780 * as the passed in devid, delete old devid and add a new one. 7781 * 7782 * During import of replicated disksets, old_didptr contains 7783 * the original disk's device id. Use this device id in 7784 * addition to the devt to determine if an entry is a match 7785 * and should be updated with the new device id of the 7786 * replicated disk. Specifically, this is the case being handled: 7787 * 7788 * Original_disk Replicated_disk Disk_Available_During_Import 7789 * c1t1d0 c1t3d0 no - so old name c1t1d0 shown 7790 * c1t2d0 c1t1d0 yes - name is c1t1d0 7791 * c1t3d0 c1t2d0 yes - name is c1t2d0 7792 * 7793 * Can't just match on devt since devt for the first and third 7794 * disks will be the same, but the original disk's device id 7795 * is known and can be used to distinguish which disk's 7796 * replicated device id should be updated. 
7797 * RETURN 7798 * MDDB_E_NODEVID 7799 * MDDB_E_NOLOCBLK 7800 * 1 Error 7801 * 0 Success 7802 */ 7803 static int 7804 update_locatorblock( 7805 mddb_set_t *s, 7806 md_dev64_t dev, 7807 ddi_devid_t didptr, 7808 ddi_devid_t old_didptr 7809 ) 7810 { 7811 mddb_lb_t *lbp = NULL; 7812 mddb_locator_t *lp; 7813 int li; 7814 uint_t flg; 7815 ddi_devid_t devid_ptr; 7816 int retval = 0; 7817 char *minor_name; 7818 int repl_import_flag; 7819 7820 /* Set replicated flag if this is a replicated import */ 7821 repl_import_flag = md_get_setstatus(s->s_setno) & 7822 MD_SET_REPLICATED_IMPORT; 7823 7824 lbp = s->s_lbp; 7825 /* find replicas that haven't been deleted */ 7826 for (li = 0; li < lbp->lb_loccnt; li++) { 7827 lp = &lbp->lb_locators[li]; 7828 7829 if ((lp->l_flags & MDDB_F_DELETED)) { 7830 continue; 7831 } 7832 /* 7833 * check to see if locator devt matches given dev 7834 * and if there is a device ID associated with it 7835 */ 7836 flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags; 7837 if ((md_expldev(lp->l_dev) == dev) && 7838 (flg & MDDB_DID_EXISTS)) { 7839 if (flg & MDDB_DID_VALID) { 7840 continue; /* cont to nxt active entry */ 7841 } 7842 devid_ptr = s->s_did_icp->did_ic_devid[li]; 7843 if (devid_ptr == NULL) { 7844 return (MDDB_E_NODEVID); 7845 } 7846 7847 /* 7848 * During a replicated import the old_didptr 7849 * must match the current devid before the 7850 * devid can be updated. 
7851 */ 7852 if (repl_import_flag) { 7853 if (ddi_devid_compare(devid_ptr, 7854 old_didptr) != 0) 7855 continue; 7856 } 7857 7858 if (ddi_devid_compare(devid_ptr, didptr) != 0) { 7859 /* 7860 * devid's not equal so 7861 * delete and add 7862 */ 7863 if (ddi_lyr_get_minor_name( 7864 md_dev64_to_dev(dev), 7865 S_IFBLK, &minor_name) == DDI_SUCCESS) { 7866 (void) mddb_devid_delete(s, li); 7867 (void) mddb_devid_add(s, li, didptr, 7868 minor_name); 7869 kmem_free(minor_name, 7870 strlen(minor_name)+1); 7871 break; 7872 } else { 7873 retval = 1; 7874 goto err_out; 7875 } 7876 } 7877 } 7878 } /* end for */ 7879 retval = push_lb(s); 7880 (void) upd_med(s, "update_locatorblock(0)"); 7881 err_out: 7882 return (retval); 7883 } 7884 7885 static int 7886 update_mb_devid( 7887 mddb_set_t *s, 7888 mddb_ri_t *rip, 7889 ddi_devid_t devidptr 7890 ) 7891 { 7892 mddb_mb_ic_t *mbip; 7893 mddb_mb_t *mb = NULL; 7894 daddr_t blkno; 7895 md_dev64_t device; 7896 uint_t sz; 7897 int mb2free = 0; 7898 int err = 0; 7899 7900 7901 /* 7902 * There is case where a disk may not have mddb, 7903 * and only has dummy mddb which contains 7904 * a valid devid we like to update and in this 7905 * case, the rip_lbp will be NULL but we still 7906 * like to update the devid embedded in the 7907 * dummy mb block. 
7908 * 7909 */ 7910 if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) { 7911 mbip = rip->ri_mbip; 7912 mb = &mbip->mbi_mddb_mb; 7913 } else { 7914 /* 7915 * Done if it is non-replicated set 7916 */ 7917 if (devidptr != (ddi_devid_t)NULL) { 7918 mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, 7919 KM_SLEEP); 7920 mb->mb_magic = MDDB_MAGIC_DU; 7921 mb->mb_revision = MDDB_REV_MB; 7922 mb2free = 1; 7923 } else { 7924 goto out; 7925 } 7926 } 7927 7928 blkno = rip->ri_blkno; 7929 device = rip->ri_dev; 7930 /* 7931 * Replace the mb_devid with the new/valid one 7932 */ 7933 if (devidptr != (ddi_devid_t)NULL) { 7934 /* 7935 * Zero out what we have previously 7936 */ 7937 if (mb->mb_devid_len) 7938 bzero(mb->mb_devid, mb->mb_devid_len); 7939 sz = ddi_devid_sizeof(devidptr); 7940 bcopy((char *)devidptr, (char *)mb->mb_devid, sz); 7941 mb->mb_devid_len = sz; 7942 } 7943 7944 mb->mb_setno = s->s_setno; 7945 uniqtime32(&mb->mb_timestamp); 7946 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 7947 /* 7948 * putblks will 7949 * 7950 * - drop the s_dbmx lock 7951 * - biowait 7952 * - regain the s_dbmx lock 7953 * 7954 * Need to update this if we wants to handle 7955 * mb_next != NULL which it is unlikely will happen 7956 */ 7957 err = putblks(s, (caddr_t)mb, blkno, 1, device, 0); 7958 7959 if (mb2free) { 7960 kmem_free(mb, MDDB_BSIZE); 7961 } 7962 out: 7963 return (err); 7964 } 7965 7966 static int 7967 setdid( 7968 mddb_config_t *cp 7969 ) 7970 { 7971 ddi_devid_t devidp; 7972 dev_t ddi_dev; 7973 mddb_set_t *s; 7974 int err = 0; 7975 mddb_ri_t *rip; 7976 7977 /* 7978 * Data integrity check 7979 */ 7980 if (cp->c_setno >= md_nsets || cp->c_devt <= 0) 7981 return (EINVAL); 7982 7983 if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE)) 7984 return (0); 7985 7986 ddi_dev = md_dev64_to_dev(cp->c_devt); 7987 if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) { 7988 return (-1); 7989 } 7990 if (devidp == NULL) { 7991 return (-1); 7992 } 7993 7994 if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, 
&err)) == NULL) 7995 return (-1); 7996 single_thread_start(s); 7997 7998 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 7999 if (rip->ri_lbp == (mddb_lb_t *)NULL) 8000 continue; 8001 /* 8002 * We only update what is asked 8003 */ 8004 if (rip->ri_dev == cp->c_devt) { 8005 if (update_mb_devid(s, rip, devidp) != 0) { 8006 err = -1; 8007 goto out; 8008 } 8009 } 8010 } 8011 8012 if (update_locatorblock(s, cp->c_devt, devidp, NULL)) { 8013 err = -1; 8014 goto out; 8015 } 8016 8017 out: 8018 single_thread_end(s); 8019 mddb_setexit(s); 8020 ddi_devid_free(devidp); 8021 return (err); 8022 } 8023 8024 static int 8025 delnewside( 8026 mddb_config_t *cp, 8027 int command, 8028 md_error_t *ep 8029 ) 8030 { 8031 mddb_set_t *s; 8032 int li; 8033 mddb_lb_t *lbp; /* pointer to locator block */ 8034 mddb_ln_t *lnp; /* pointer to locator names */ 8035 mddb_mnln_t *mnlnp; /* pointer to locator names */ 8036 mddb_locator_t *lp; 8037 mddb_sidelocator_t *slp; 8038 mddb_cfg_loc_t *clp; 8039 int err = 0; 8040 set_t setno = cp->c_setno; 8041 ddi_devid_t devid; 8042 ddi_devid_t ret_devid = NULL; 8043 char *minor_name; 8044 uint_t use_devid = 0; 8045 dev_t ddi_dev; 8046 md_mnname_suffix_t *mnsn; 8047 mddb_mnlb_t *mnlbp; 8048 mddb_mnsidelocator_t *mnslp; 8049 8050 /* Currently don't allow addition/deletion of sides during upgrade */ 8051 if (MD_UPGRADE) { 8052 cmn_err(CE_WARN, 8053 "Addition and deletion of sides not allowed" 8054 " during upgrade. 
\n"); 8055 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8056 } 8057 8058 /* 8059 * Data integrity check 8060 */ 8061 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 8062 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8063 8064 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 8065 return (mddbstatus2error(ep, err, NODEV32, setno)); 8066 8067 single_thread_start(s); 8068 clp = &cp->c_locator; 8069 8070 lbp = s->s_lbp; 8071 8072 if (lbp->lb_setno != setno) { 8073 single_thread_end(s); 8074 mddb_setexit(s); 8075 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 8076 } 8077 8078 /* 8079 * Find this device/blkno pair 8080 */ 8081 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 8082 ddi_dev = md_dev64_to_dev(clp->l_dev); 8083 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 8084 (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) 8085 == DDI_SUCCESS)) { 8086 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 8087 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 8088 use_devid = 1; 8089 (void) strcpy(clp->l_minor_name, minor_name); 8090 } 8091 kmem_free(minor_name, strlen(minor_name)+1); 8092 } 8093 if (use_devid != 1 && ret_devid != NULL) 8094 ddi_devid_free(ret_devid); 8095 } 8096 for (li = 0; li < lbp->lb_loccnt; li++) { 8097 lp = &lbp->lb_locators[li]; 8098 if (lp->l_flags & MDDB_F_DELETED) 8099 continue; 8100 if (use_devid) { 8101 if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0) 8102 continue; 8103 if ((ddi_devid_compare(devid, 8104 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 8105 (strcmp(clp->l_minor_name, minor_name) == 0) && 8106 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 8107 break; 8108 } 8109 } else { 8110 if (lp->l_dev == clp->l_dev && 8111 (daddr_t)lp->l_blkno == clp->l_blkno) { 8112 break; 8113 } 8114 } 8115 } 8116 8117 if (li == lbp->lb_loccnt) { 8118 if (use_devid) 8119 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8120 single_thread_end(s); 8121 mddb_setexit(s); 8122 return (mdmddberror(ep, 
MDE_DB_INVALID, NODEV32, setno)); 8123 } 8124 8125 lnp = s->s_lnp; 8126 if (command == MDDB_NEWSIDE) { 8127 int index = 0; 8128 /* 8129 * If a MN diskset, need to find the index where the new 8130 * locator information is to be stored in the mnsidelocator 8131 * field of the locator block so that the locator name can 8132 * be stored at the same array index in the mnsuffixes 8133 * field of the locator names structure. 8134 */ 8135 if (lbp->lb_flags & MDDB_MNSET) { 8136 if ((index = checklocator(lbp, li, 8137 cp->c_sideno)) == -1) { 8138 if (use_devid) { 8139 ddi_devid_free((ddi_devid_t) 8140 (uintptr_t)clp->l_devid); 8141 } 8142 single_thread_end(s); 8143 mddb_setexit(s); 8144 return (mdmddberror(ep, MDE_DB_TOOSMALL, 8145 NODEV32, setno)); 8146 } 8147 } 8148 8149 /* 8150 * Store the locator name before the sidelocator information 8151 * in case a panic occurs between these 2 steps. Must have 8152 * the locator name information in order to print reasonable 8153 * error information. 8154 */ 8155 if (splitname2locatorblock(&cp->c_devname, lnp, li, 8156 cp->c_sideno, index)) { 8157 if (use_devid) 8158 ddi_devid_free( 8159 (ddi_devid_t)(uintptr_t)clp->l_devid); 8160 single_thread_end(s); 8161 mddb_setexit(s); 8162 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8163 setno)); 8164 } 8165 8166 if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) { 8167 if (use_devid) 8168 ddi_devid_free( 8169 (ddi_devid_t)(uintptr_t)clp->l_devid); 8170 single_thread_end(s); 8171 mddb_setexit(s); 8172 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8173 setno)); 8174 } 8175 } 8176 8177 if (use_devid) 8178 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8179 8180 if (command == MDDB_DELSIDE) { 8181 int i; 8182 for (i = 0; i < lbp->lb_loccnt; i++) { 8183 if (lbp->lb_flags & MDDB_MNSET) { 8184 int j; 8185 mnlbp = (mddb_mnlb_t *)lbp; 8186 for (j = 0; j < MD_MNMAXSIDES; j++) { 8187 mnslp = &mnlbp->lb_mnsidelocators[j][i]; 8188 if (mnslp->mnl_sideno == cp->c_sideno) 8189 break; 8190 
} 8191 if (j < MD_MNMAXSIDES) { 8192 mnslp->mnl_mnum = NODEV32; 8193 mnslp->mnl_sideno = 0; 8194 mnlnp = (mddb_mnln_t *)lnp; 8195 mnsn = &(mnlnp->ln_mnsuffixes[j][i]); 8196 bzero((caddr_t)mnsn, 8197 sizeof (md_mnname_suffix_t)); 8198 } 8199 } else { 8200 slp = &lbp->lb_sidelocators[cp->c_sideno][i]; 8201 bzero((caddr_t)&lnp->ln_suffixes 8202 [cp->c_sideno][i], sizeof (md_name_suffix)); 8203 slp->l_mnum = NODEV32; 8204 } 8205 } 8206 } 8207 8208 /* write new locator names to all devices */ 8209 uniqtime32(&lnp->ln_timestamp); 8210 if (lbp->lb_flags & MDDB_MNSET) 8211 lnp->ln_revision = MDDB_REV_MNLN; 8212 else 8213 lnp->ln_revision = MDDB_REV_LN; 8214 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8215 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8216 lbp->lb_lnblkcnt, 0); 8217 /* 8218 * If a MN diskset and this is the master, set the PARSE_LOCNM 8219 * flag in the mddb_set structure to show that the locator 8220 * names have changed. 8221 */ 8222 8223 if ((lbp->lb_flags & MDDB_MNSET) && 8224 (md_set[s->s_setno].s_am_i_master)) { 8225 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8226 } 8227 if (err) { 8228 if (writeretry(s)) { 8229 single_thread_end(s); 8230 mddb_setexit(s); 8231 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8232 } 8233 } 8234 8235 uniqtime32(&lbp->lb_timestamp); 8236 /* write new locator to all devices */ 8237 err = writelocall(s); 8238 8239 (void) upd_med(s, "delnewside(0)"); 8240 8241 computefreeblks(s); /* recompute always it may be larger */ 8242 if (err) { 8243 if (writeretry(s)) { 8244 single_thread_end(s); 8245 mddb_setexit(s); 8246 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8247 } 8248 } 8249 8250 single_thread_end(s); 8251 mddb_setexit(s); 8252 8253 return (0); 8254 } 8255 8256 static int 8257 newdev( 8258 mddb_config_t *cp, 8259 int command, 8260 md_error_t *ep 8261 ) 8262 { 8263 mddb_set_t *s; 8264 mddb_mb_ic_t *mbip, *mbip1; 8265 int i, j; 8266 int li; 8267 mddb_lb_t *lbp; /* pointer to locator 
block */ 8268 mddb_ln_t *lnp; /* pointer to locator names */ 8269 mddb_locator_t *lp; 8270 mddb_cfg_loc_t *clp; 8271 int err = 0; 8272 set_t setno = cp->c_setno; 8273 ddi_devid_t devid2; 8274 ddi_devid_t ret_devid = NULL; 8275 char *minor_name; 8276 uint_t use_devid = 0; 8277 dev_t ddi_dev; 8278 int old_flags; 8279 int flags; 8280 int mn_set = 0; 8281 int index; 8282 8283 8284 /* Currently don't allow addition of new replica during upgrade */ 8285 if (MD_UPGRADE) { 8286 cmn_err(CE_WARN, 8287 "Addition of new replica not allowed during upgrade.\n"); 8288 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8289 } 8290 8291 /* 8292 * Data integrity check 8293 */ 8294 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 8295 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8296 8297 /* Determine the flag settings for multinode sets */ 8298 flags = MDDB_NOOLDOK; 8299 if (cp->c_multi_node) 8300 flags |= MDDB_MULTINODE; 8301 8302 if ((s = mddb_setenter(setno, flags, &err)) == NULL) { 8303 if (err != MDDB_E_NOTOWNER) 8304 return (mddbstatus2error(ep, err, NODEV32, setno)); 8305 s = init_set(cp, flags, &err); 8306 if (s == NULL) 8307 return (mddbstatus2error(ep, err, NODEV32, setno)); 8308 } 8309 8310 single_thread_start(s); 8311 8312 /* shorthand */ 8313 clp = &cp->c_locator; 8314 8315 /* shorthand */ 8316 lbp = s->s_lbp; 8317 8318 if (lbp->lb_setno != setno) { 8319 single_thread_end(s); 8320 mddb_setexit(s); 8321 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 8322 } 8323 8324 /* 8325 * See if this device/blkno pair is already a replica 8326 */ 8327 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 8328 ddi_dev = expldev(clp->l_dev); 8329 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 8330 (ddi_lyr_get_minor_name(ddi_dev, 8331 S_IFBLK, &minor_name) == DDI_SUCCESS)) { 8332 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 8333 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 8334 use_devid = 1; 8335 (void) strcpy(clp->l_minor_name, minor_name); 8336 } 
8337 kmem_free(minor_name, strlen(minor_name)+1); 8338 } 8339 if (use_devid != 1 && ret_devid != NULL) 8340 ddi_devid_free(ret_devid); 8341 } 8342 8343 for (i = 0; i < lbp->lb_loccnt; i++) { 8344 lp = &lbp->lb_locators[i]; 8345 if (lp->l_flags & MDDB_F_DELETED) 8346 continue; 8347 if (use_devid) { 8348 if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0) 8349 continue; 8350 if ((ddi_devid_compare(devid2, 8351 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 8352 (strcmp(clp->l_minor_name, minor_name) == 0) && 8353 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 8354 if (command == MDDB_NEWDEV) { 8355 ddi_devid_free((ddi_devid_t)(uintptr_t) 8356 clp->l_devid); 8357 single_thread_end(s); 8358 mddb_setexit(s); 8359 return (mdmddberror(ep, 8360 MDE_DB_EXISTS, NODEV32, setno)); 8361 } 8362 } 8363 } else { 8364 if (lp->l_dev == clp->l_dev && 8365 (daddr_t)lp->l_blkno == clp->l_blkno) { 8366 if (command == MDDB_NEWDEV) { 8367 single_thread_end(s); 8368 mddb_setexit(s); 8369 return (mdmddberror(ep, 8370 MDE_DB_EXISTS, NODEV32, setno)); 8371 } 8372 } 8373 } 8374 } 8375 8376 /* 8377 * Really is a new replica, go get the master blocks 8378 */ 8379 mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno, 8380 (uint_t *)0, &mn_set); 8381 if (! mbip) { 8382 if (use_devid) 8383 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8384 single_thread_end(s); 8385 mddb_setexit(s); 8386 return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno)); 8387 } 8388 8389 /* 8390 * Compute free blocks in replica. 
8391 */ 8392 computefreeblks(s); 8393 8394 /* 8395 * Check if this is large enough 8396 */ 8397 for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next) 8398 i += mbip1->mbi_mddb_mb.mb_blkcnt; 8399 for (j = i; j < s->s_totalblkcnt; j++) { 8400 if (blkcheck(s, j)) { 8401 while (mbip) { 8402 mbip1 = mbip->mbi_next; 8403 kmem_free((caddr_t)mbip, MDDB_IC_BSIZE); 8404 mbip = mbip1; 8405 } 8406 if (use_devid) 8407 ddi_devid_free( 8408 (ddi_devid_t)(uintptr_t)clp->l_devid); 8409 mddb_devclose(md_expldev(clp->l_dev)); 8410 single_thread_end(s); 8411 mddb_setexit(s); 8412 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8413 setno)); 8414 } 8415 } 8416 8417 /* Look for a deleted slot */ 8418 for (li = 0; li < lbp->lb_loccnt; li++) { 8419 lp = &lbp->lb_locators[li]; 8420 if (lp->l_flags & MDDB_F_DELETED) 8421 break; 8422 } 8423 8424 /* If no deleted slots, add a new one */ 8425 if (li == lbp->lb_loccnt) { 8426 /* Already have the max replicas, bail */ 8427 if (lbp->lb_loccnt == MDDB_NLB) { 8428 if (use_devid) 8429 ddi_devid_free((ddi_devid_t)(uintptr_t) 8430 clp->l_devid); 8431 mddb_devclose(md_expldev(clp->l_dev)); 8432 single_thread_end(s); 8433 mddb_setexit(s); 8434 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 8435 setno)); 8436 } 8437 lbp->lb_loccnt++; 8438 lp = &lbp->lb_locators[li]; 8439 } 8440 8441 /* Initialize the new or deleted slot */ 8442 old_flags = lp->l_flags; 8443 lp->l_dev = clp->l_dev; 8444 lp->l_blkno = (daddr32_t)clp->l_blkno; 8445 lp->l_flags = clp->l_flags; 8446 8447 /* shorthand */ 8448 lnp = s->s_lnp; 8449 8450 index = 0; 8451 if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) { 8452 /* 8453 * If a MN diskset, need to find the index where the new 8454 * locator information is to be stored in the mnsidelocator 8455 * field of the locator block so that the locator name can 8456 * be stored at the same array index in the mnsuffixes 8457 * field of the locator names structure. 
8458 */ 8459 lbp->lb_flags |= MDDB_MNSET; 8460 if ((index = checklocator(lbp, li, s->s_sideno)) == -1) { 8461 if (use_devid) 8462 ddi_devid_free((ddi_devid_t)(uintptr_t)clp-> 8463 l_devid); 8464 lp->l_flags = old_flags; 8465 lbp->lb_loccnt--; 8466 mddb_devclose(md_expldev(clp->l_dev)); 8467 single_thread_end(s); 8468 mddb_setexit(s); 8469 return (mdmddberror(ep, MDE_DB_TOOSMALL, 8470 NODEV32, setno)); 8471 } 8472 } 8473 /* 8474 * Store the locator name before the sidelocator information 8475 * in case a panic occurs between these 2 steps. Must have 8476 * the locator name information in order to print reasonable 8477 * error information. 8478 */ 8479 if (splitname2locatorblock(&cp->c_devname, lnp, li, 8480 s->s_sideno, index)) { 8481 if (use_devid) 8482 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8483 lp->l_flags = old_flags; 8484 lbp->lb_loccnt--; 8485 mddb_devclose(md_expldev(clp->l_dev)); 8486 single_thread_end(s); 8487 mddb_setexit(s); 8488 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); 8489 } 8490 8491 /* 8492 * Compute free blocks in replica before calling cfgloc2locator 8493 * since cfgloc2locator may attempt to alloc an unused block 8494 * to store the device id. 8495 * mbiarray needs to be setup before calling computefreeblks. 
8496 */ 8497 s->s_mbiarray[li] = mbip; 8498 computefreeblks(s); 8499 8500 if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) { 8501 if (use_devid) 8502 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8503 lp->l_flags = old_flags; 8504 lbp->lb_loccnt--; 8505 s->s_mbiarray[li] = 0; 8506 mddb_devclose(md_expldev(clp->l_dev)); 8507 single_thread_end(s); 8508 mddb_setexit(s); 8509 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); 8510 } 8511 8512 if (use_devid) 8513 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8514 8515 uniqtime32(&lbp->lb_timestamp); 8516 lp->l_flags = MDDB_F_ACTIVE; 8517 8518 /* write db copy to new device */ 8519 err = writecopy(s, li, MDDB_WRITECOPY_ALL); 8520 lp->l_flags |= MDDB_F_UP2DATE; 8521 8522 /* write new locator names to all devices */ 8523 uniqtime32(&lnp->ln_timestamp); 8524 if (lbp->lb_flags & MDDB_MNSET) 8525 lnp->ln_revision = MDDB_REV_MNLN; 8526 else 8527 lnp->ln_revision = MDDB_REV_LN; 8528 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8529 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8530 lbp->lb_lnblkcnt, 0); 8531 /* 8532 * If a MN diskset and this is the master, set the PARSE_LOCNM 8533 * flag in the mddb_set structure to show that the locator 8534 * names have changed. 
8535 */ 8536 8537 if ((lbp->lb_flags & MDDB_MNSET) && 8538 (md_set[s->s_setno].s_am_i_master)) { 8539 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8540 } 8541 if (err) { 8542 if (writeretry(s)) { 8543 single_thread_end(s); 8544 mddb_setexit(s); 8545 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8546 } 8547 } 8548 8549 /* Data tags not supported on MN sets */ 8550 if ((md_get_setstatus(setno) & MD_SET_STALE) && 8551 (!(lbp->lb_flags & MDDB_MNSET)) && 8552 setno != MD_LOCAL_SET) 8553 if (set_dtag(s, ep)) 8554 mdclrerror(ep); 8555 8556 /* Write data tags to all accessible devices */ 8557 /* Data tags not supported on MN sets */ 8558 if (!(lbp->lb_flags & MDDB_MNSET)) { 8559 (void) dt_write(s); 8560 } 8561 8562 /* write new locator to all devices */ 8563 err = writelocall(s); 8564 8565 (void) upd_med(s, "newdev(0)"); 8566 8567 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno, 8568 md_expldev(clp->l_dev)); 8569 8570 computefreeblks(s); /* recompute always it may be smaller */ 8571 if (err) { 8572 if (writeretry(s)) { 8573 single_thread_end(s); 8574 mddb_setexit(s); 8575 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8576 } 8577 } 8578 8579 single_thread_end(s); 8580 mddb_setexit(s); 8581 8582 return (0); 8583 } 8584 8585 #ifdef DEBUG 8586 static void 8587 mddb_check_set( 8588 set_t setno 8589 ) 8590 { 8591 mddb_set_t *s; 8592 mddb_db_t *dbp; 8593 mddb_de_ic_t *dep; 8594 mddb_rb32_t *rbp; 8595 8596 if (! 
md_set[setno].s_db) 8597 return; 8598 8599 s = (mddb_set_t *)md_set[setno].s_db; 8600 8601 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8602 for (dep = dbp->db_firstentry; 8603 dep != NULL; dep = dep->de_next) { 8604 rbp = dep->de_rb; 8605 ASSERT(rbp->rb_magic == MDDB_MAGIC_RB); 8606 if (dep->de_rb_userdata) 8607 ASSERT((uintptr_t)dep->de_rb_userdata > 2000); 8608 } 8609 } 8610 } 8611 #endif /* DEBUG */ 8612 8613 /* 8614 * Exported Entry Points 8615 */ 8616 #ifdef DEBUG 8617 void 8618 mddb_check(void) 8619 { 8620 int i; 8621 8622 for (i = 0; i < md_nsets; i++) { 8623 if (! md_set[i].s_db) 8624 return; 8625 8626 mddb_check_set(i); 8627 } 8628 8629 } 8630 #endif /* DEBUG */ 8631 8632 int 8633 mddb_configure( 8634 mddb_cfgcmd_t command, 8635 mddb_config_t *cp 8636 ) 8637 { 8638 mddb_set_t *s; 8639 md_error_t *ep = &cp->c_mde; 8640 int flag = 0; 8641 int err = 0; 8642 set_t setno = cp->c_setno; 8643 8644 mdclrerror(ep); 8645 8646 switch (command) { 8647 case MDDB_NEWDEV: 8648 err = newdev(cp, command, ep); 8649 break; 8650 8651 case MDDB_NEWSIDE: 8652 case MDDB_DELSIDE: 8653 err = delnewside(cp, command, ep); 8654 break; 8655 8656 case MDDB_GETDEV: 8657 case MDDB_DELDEV: 8658 case MDDB_ENDDEV: 8659 err = getdeldev(cp, command, ep); 8660 break; 8661 8662 case MDDB_GETDRVRNAME: 8663 err = getdriver(&cp->c_locator); 8664 break; 8665 8666 case MDDB_USEDEV: 8667 /* 8668 * Note: must allow USEDEV ioctl during upgrade to 8669 * support auto-take disksets. 
8670 * 8671 * Also during the set import if the md_devid_destroy 8672 * flag is set then error out 8673 */ 8674 8675 if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy) 8676 return (mdmderror(ep, MDE_INVAL_UNIT, 8677 MD_ADM_MINOR)); 8678 8679 if (setno >= md_nsets) 8680 return (mdmderror(ep, MDE_INVAL_UNIT, 8681 MD_ADM_MINOR)); 8682 8683 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == 8684 NULL) { 8685 if ((s = init_set(cp, MDDB_NOINIT, &err)) == 8686 NULL) { 8687 err = mddbstatus2error(ep, err, 8688 NODEV32, setno); 8689 break; 8690 } 8691 } 8692 if (setno == MD_LOCAL_SET) 8693 flag = MDDB_F_IOCTL; 8694 if (cp->c_locator.l_old_devid) { 8695 md_set_setstatus(setno, 8696 MD_SET_REPLICATED_IMPORT); 8697 } 8698 err = ridev(&s->s_rip, &cp->c_locator, NULL, flag); 8699 mddb_setexit(s); 8700 break; 8701 8702 case MDDB_RELEASESET: 8703 mutex_enter(&mddb_lock); 8704 mddb_unload_set(cp->c_setno); 8705 mutex_exit(&mddb_lock); 8706 break; 8707 8708 case MDDB_SETDID: 8709 err = setdid(cp); 8710 break; 8711 8712 default: 8713 err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, 8714 cp->c_setno); 8715 } 8716 8717 return (err); 8718 } 8719 8720 int 8721 mddb_getoptloc( 8722 mddb_optloc_t *ol 8723 ) 8724 { 8725 mddb_set_t *s; 8726 mddb_db_t *dbp; 8727 mddb_de_ic_t *dep; 8728 mddb_recid_t id; 8729 set_t setno; 8730 8731 ol->li[0] = -1; 8732 ol->li[1] = -1; 8733 8734 id = ol->recid; 8735 setno = DBSET(id); 8736 if (setno >= md_nsets) 8737 return (EINVAL); 8738 8739 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL) 8740 return (0); 8741 8742 id = DBID(id); 8743 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8744 for (dep = dbp->db_firstentry; 8745 dep != NULL; dep = dep->de_next) { 8746 if (dep->de_recid != id) 8747 continue; 8748 ol->li[0] = dep->de_optinfo[0].o_li; 8749 ol->li[1] = dep->de_optinfo[1].o_li; 8750 mddb_setexit(s); 8751 return (0); 8752 } 8753 } 8754 mddb_setexit(s); 8755 return (0); 8756 } 8757 8758 void 8759 mddb_init(void) 8760 { 8761 
mddb_set_t *s; 8762 8763 mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL); 8764 if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL) 8765 mddb_setexit(s); 8766 } 8767 8768 8769 void 8770 mddb_unload(void) 8771 { 8772 int i; 8773 8774 mutex_enter(&mddb_lock); 8775 8776 for (i = 0; i < md_nsets; i++) { 8777 md_clr_setstatus(i, MD_SET_KEEPTAG); 8778 mddb_unload_set(i); 8779 } 8780 8781 crcfreetab(); 8782 8783 mutex_exit(&mddb_lock); 8784 } 8785 8786 mddb_recid_t 8787 mddb_createrec( 8788 size_t usersize, /* size of db record */ 8789 mddb_type_t type, /* type1 of db record */ 8790 uint_t type2, /* type2 of db record */ 8791 md_create_rec_option_t options, /* options for this creation */ 8792 set_t setno /* set number to create record in */ 8793 ) 8794 { 8795 mddb_set_t *s; 8796 mddb_db_t *dbp, *prevdbp, *newdbp; 8797 mddb_db32_t *db32p; 8798 mddb_de_ic_t *dep; 8799 /* LINTED variable unused - used for sizeof calculations */ 8800 mddb_de32_t *de32p; 8801 mddb_rb32_t *rbp; 8802 size_t recsize; 8803 ulong_t blkcnt; 8804 ulong_t maxblocks; 8805 size_t desize, desize_ic; 8806 size_t used; 8807 mddb_recid_t newid; 8808 caddr_t tmppnt; 8809 int i, err = 0; 8810 void *userdata; 8811 uint_t flag_type; 8812 8813 #if defined(_ILP32) && !defined(lint) 8814 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 8815 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 8816 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 8817 #endif 8818 8819 /* 8820 * everyone is supposed to sepcify if it's a 8821 * 32 bit or a 64 bit record 8822 */ 8823 if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) { 8824 return (MDDB_E_INVALID); 8825 } 8826 8827 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 8828 return (err); 8829 8830 if (checkstate(s, MDDB_PROBE)) { 8831 mddb_setexit(s); 8832 return (MDDB_E_NOTNOW); 8833 } 8834 8835 recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) + 8836 usersize, MDDB_BSIZE); 8837 blkcnt = btodb(recsize); 8838 8839 if (mddb_maxblocks) 8840 
maxblocks = mddb_maxblocks; 8841 else 8842 maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) - 8843 sizeof (de32p->de32_blks))) / sizeof (mddb_block_t); 8844 8845 if (blkcnt > maxblocks) { 8846 mddb_setexit(s); 8847 return (MDDB_E_INVALID); 8848 } 8849 /* 8850 * allocate record block 8851 * and new directory block so to avoid sleeping 8852 * after starting single_thread 8853 */ 8854 rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 8855 if ((options & MD_CRO_OPTIMIZE) == 0) 8856 userdata = kmem_zalloc(usersize, KM_SLEEP); 8857 newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP); 8858 8859 /* 8860 * if this is the largest record allocate new buffer for 8861 * checkcopy(); 8862 */ 8863 if (recsize > s->s_databuffer_size) { 8864 tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP); 8865 /* 8866 * this test is incase when to sleep during kmem_alloc 8867 * and some other task bumped max record size 8868 */ 8869 if (recsize > s->s_databuffer_size) { 8870 if (s->s_databuffer_size) 8871 kmem_free(s->s_databuffer, 8872 s->s_databuffer_size); 8873 s->s_databuffer = tmppnt; 8874 s->s_databuffer_size = recsize; 8875 } else { 8876 kmem_free(tmppnt, recsize); 8877 } 8878 } 8879 8880 single_thread_start(s); 8881 8882 newid = 0; 8883 do { 8884 newid++; 8885 if (DBID(newid) == 0) { 8886 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8887 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8888 if ((options & MD_CRO_OPTIMIZE) == 0) 8889 kmem_free(userdata, usersize); 8890 single_thread_end(s); 8891 mddb_setexit(s); 8892 return (MDDB_E_NOTNOW); 8893 } 8894 8895 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8896 for (dep = dbp->db_firstentry; dep; 8897 dep = dep->de_next) { 8898 if (dep->de_recid == newid) 8899 break; 8900 } 8901 if (dep != NULL) 8902 break; 8903 } 8904 } while (dbp); 8905 8906 desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) + 8907 (sizeof (mddb_block_t) * blkcnt); 8908 8909 /* 8910 * see if a directory block exists which will hold this 
entry 8911 */ 8912 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8913 used = sizeof (*db32p); 8914 for (dep = dbp->db_firstentry; 8915 dep != NULL; dep = dep->de_next) { 8916 used += sizeof (*de32p) - sizeof (de32p->de32_blks); 8917 used += sizeof (mddb_block_t) * dep->de_blkcount; 8918 } 8919 if ((used + desize) < MDDB_BSIZE) 8920 break; 8921 } 8922 if (dbp) { 8923 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8924 if (blkcnt > s->s_freeblkcnt) { 8925 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8926 if ((options & MD_CRO_OPTIMIZE) == 0) 8927 kmem_free(userdata, usersize); 8928 single_thread_end(s); 8929 mddb_setexit(s); 8930 return (MDDB_E_NOSPACE); 8931 } 8932 prevdbp = NULL; 8933 } else { 8934 /* 8935 * need to add directory block 8936 */ 8937 if ((blkcnt + 1) > s->s_freeblkcnt) { 8938 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8939 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8940 if ((options & MD_CRO_OPTIMIZE) == 0) 8941 kmem_free(userdata, usersize); 8942 single_thread_end(s); 8943 mddb_setexit(s); 8944 return (MDDB_E_NOSPACE); 8945 } 8946 for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next) 8947 ; 8948 dbp->db_next = newdbp; 8949 bzero((caddr_t)dbp->db_next, sizeof (*newdbp)); 8950 dbp->db_nextblk = getfreeblks(s, 1); 8951 dbp->db_next->db_blknum = dbp->db_nextblk; 8952 prevdbp = dbp; 8953 dbp = dbp->db_next; 8954 dbp->db_nextblk = 0; 8955 dbp->db_firstentry = NULL; 8956 dbp->db_recsum = 0; 8957 dbp->db_magic = MDDB_MAGIC_DB; 8958 } 8959 /* 8960 * ready to add record 8961 */ 8962 desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) + 8963 (sizeof (mddb_block_t) * blkcnt); 8964 if (dbp->db_firstentry) { 8965 for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next) 8966 ; 8967 dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); 8968 dep = dep->de_next; 8969 } else { 8970 dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); 8971 dbp->db_firstentry = dep; 8972 } 8973 bzero((caddr_t)dep, desize_ic); 8974 dep->de_recid = 
newid; 8975 /* 8976 * Optimized records have an owner node associated with them in 8977 * a MN diskset. The owner is only set on a node that is actively 8978 * writing to that record. The other nodes will show that record 8979 * as having an invalid owner. The owner for an optimized record 8980 * is used during fixoptrecord to determine which node should 8981 * write out the record when the replicas associated with that 8982 * optimized record have been changed. 8983 */ 8984 if (MD_MNSET_SETNO(s->s_setno)) { 8985 dep->de_owner_nodeid = MD_MN_INVALID_NID; 8986 } 8987 dep->de_type1 = type; 8988 dep->de_type2 = type2; 8989 dep->de_reqsize = usersize; 8990 dep->de_recsize = recsize; 8991 dep->de_blkcount = blkcnt; 8992 flag_type = options & 8993 (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID | 8994 MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG | 8995 MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG); 8996 switch (flag_type) { 8997 case MD_CRO_OPTIMIZE: 8998 dep->de_flags = MDDB_F_OPT; 8999 getoptdev(s, dep, 0); 9000 getoptdev(s, dep, 1); 9001 break; 9002 case MD_CRO_STRIPE: 9003 dep->de_flags = MDDB_F_STRIPE; 9004 break; 9005 case MD_CRO_MIRROR: 9006 dep->de_flags = MDDB_F_MIRROR; 9007 break; 9008 case MD_CRO_RAID: 9009 dep->de_flags = MDDB_F_RAID; 9010 break; 9011 case MD_CRO_SOFTPART: 9012 dep->de_flags = MDDB_F_SOFTPART; 9013 break; 9014 case MD_CRO_TRANS_MASTER: 9015 dep->de_flags = MDDB_F_TRANS_MASTER; 9016 break; 9017 case MD_CRO_TRANS_LOG: 9018 dep->de_flags = MDDB_F_TRANS_LOG; 9019 break; 9020 case MD_CRO_HOTSPARE: 9021 dep->de_flags = MDDB_F_HOTSPARE; 9022 break; 9023 case MD_CRO_HOTSPARE_POOL: 9024 dep->de_flags = MDDB_F_HOTSPARE_POOL; 9025 break; 9026 case MD_CRO_CHANGELOG: 9027 dep->de_flags = MDDB_F_CHANGELOG; 9028 break; 9029 } 9030 /* 9031 * try to get all blocks consecutive. 
If not possible 9032 * just get them one at a time 9033 */ 9034 dep->de_blks[0] = getfreeblks(s, blkcnt); 9035 if (dep->de_blks[0]) { 9036 for (i = 1; i < blkcnt; i++) 9037 dep->de_blks[i] = dep->de_blks[0] + i; 9038 } else { 9039 for (i = 0; i < blkcnt; i++) 9040 dep->de_blks[i] = getfreeblks(s, 1); 9041 } 9042 dep->de_rb = rbp; 9043 bzero((caddr_t)rbp, recsize); 9044 rbp->rb_magic = MDDB_MAGIC_RB; 9045 9046 /* Do we have to create an old style (32 bit) record? */ 9047 if (options & MD_CRO_32BIT) { 9048 if (options & MD_CRO_FN) 9049 rbp->rb_revision = MDDB_REV_RBFN; 9050 else 9051 rbp->rb_revision = MDDB_REV_RB; 9052 } else { 9053 if (options & MD_CRO_FN) 9054 rbp->rb_revision = MDDB_REV_RB64FN; 9055 else 9056 rbp->rb_revision = MDDB_REV_RB64; 9057 } 9058 9059 /* set de_rb_userdata for non optimization records */ 9060 if ((options & MD_CRO_OPTIMIZE) == 0) { 9061 dep->de_rb_userdata = userdata; 9062 } 9063 9064 uniqtime32(&rbp->rb_timestamp); 9065 /* Generate the crc for this record */ 9066 rec_crcgen(s, dep, rbp); 9067 tmppnt = (caddr_t)rbp; 9068 /* 9069 * the following code writes new records to all instances of 9070 * the data base. Writing one block at a time to each instance 9071 * is safe because they are not yet in a directory entry which 9072 * has been written to the data base 9073 */ 9074 err = 0; 9075 if ((options & MD_CRO_OPTIMIZE) == 0) { 9076 for (i = 0; i < blkcnt; i++) { 9077 err |= writeall(s, (caddr_t)tmppnt, 9078 dep->de_blks[i], 1, 0); 9079 tmppnt += MDDB_BSIZE; 9080 } 9081 } else { 9082 if ((MD_MNSET_SETNO(s->s_setno)) && 9083 md_set[s->s_setno].s_am_i_master) { 9084 /* 9085 * If a MN diskset then only master writes out newly 9086 * created optimized record. 
9087 */ 9088 err |= writeoptrecord(s, dep); 9089 } 9090 } 9091 uniqtime32(&dbp->db_timestamp); 9092 dbp->db_revision = MDDB_REV_DB; 9093 /* Don't include opt resync and change log records in global XOR */ 9094 if (!(dep->de_flags & MDDB_F_OPT) && 9095 !(dep->de_flags & MDDB_F_CHANGELOG)) 9096 dbp->db_recsum ^= rbp->rb_checksum; 9097 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 9098 create_db32rec(db32p, dbp); 9099 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9100 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); 9101 if (prevdbp) { 9102 dbp = prevdbp; 9103 uniqtime32(&dbp->db_timestamp); 9104 dbp->db_revision = MDDB_REV_DB; 9105 create_db32rec(db32p, dbp); 9106 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9107 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); 9108 } 9109 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9110 if (err) { 9111 if (writeretry(s)) { 9112 s->s_zombie = newid; 9113 single_thread_end(s); 9114 mddb_setexit(s); 9115 return (MDDB_E_NOTNOW); 9116 } 9117 } 9118 single_thread_end(s); 9119 mddb_setexit(s); 9120 9121 ASSERT((newid & MDDB_SETMASK) == 0); 9122 return (MAKERECID(setno, newid)); 9123 } 9124 9125 int 9126 mddb_deleterec( 9127 mddb_recid_t id 9128 ) 9129 { 9130 mddb_set_t *s; 9131 mddb_db_t *dbp; 9132 mddb_db32_t *db32p; 9133 mddb_de_ic_t *dep, *dep1; 9134 int i; 9135 9136 #if defined(_ILP32) && !defined(lint) 9137 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 9138 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 9139 #endif 9140 9141 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); 9142 ASSERT(s != NULL); 9143 9144 id = DBID(id); 9145 if (checkstate(s, MDDB_PROBE)) { 9146 mddb_setexit(s); 9147 return (MDDB_E_NOTNOW); 9148 } 9149 9150 ASSERT(s->s_lbp != NULL); 9151 single_thread_start(s); 9152 9153 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9154 dep1 = NULL; 9155 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9156 if (dep->de_recid == id) 9157 break; 9158 dep1 = 
dep; 9159 } 9160 if (dep != NULL) 9161 break; 9162 } 9163 /* 9164 * no such record 9165 */ 9166 if (dep == NULL) { 9167 single_thread_end(s); 9168 ASSERT(s->s_staledeletes != 0); 9169 s->s_staledeletes--; 9170 mddb_setexit(s); 9171 return (0); 9172 } 9173 9174 if (!(dep->de_flags & MDDB_F_OPT) && 9175 !(dep->de_flags & MDDB_F_CHANGELOG)) { 9176 dbp->db_recsum ^= dep->de_rb->rb_checksum; 9177 dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle; 9178 } 9179 9180 if (dep->de_rb_userdata != NULL) { 9181 if (dep->de_icreqsize) 9182 kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize); 9183 else 9184 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 9185 } 9186 9187 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 9188 9189 for (i = 0; i < dep->de_blkcount; i++) 9190 blkfree(s, dep->de_blks[i]); 9191 if (dep1) 9192 dep1->de_next = dep->de_next; 9193 else 9194 dbp->db_firstentry = dep->de_next; 9195 9196 kmem_free(dep, sizeofde(dep)); 9197 9198 uniqtime32(&dbp->db_timestamp); 9199 dbp->db_revision = MDDB_REV_DB; 9200 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 9201 create_db32rec(db32p, dbp); 9202 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9203 if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) { 9204 if (writeretry(s)) { 9205 /* 9206 * staledelete is used to mark deletes which failed. 
9207 * its only use is to not panic when the user retries 9208 * the delete once the database is active again 9209 */ 9210 single_thread_end(s); 9211 s->s_staledeletes++; 9212 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9213 mddb_setexit(s); 9214 return (MDDB_E_NOTNOW); 9215 } 9216 } 9217 single_thread_end(s); 9218 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9219 mddb_setexit(s); 9220 return (0); 9221 } 9222 9223 mddb_recid_t 9224 mddb_getnextrec( 9225 mddb_recid_t id, 9226 mddb_type_t typ, 9227 uint_t type2 9228 ) 9229 { 9230 mddb_set_t *s; 9231 mddb_db_t *dbp; 9232 mddb_de_ic_t *dep; 9233 int searching, err; 9234 set_t setno; 9235 9236 setno = DBSET(id); 9237 id = DBID(id); 9238 searching = id; 9239 9240 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 9241 return (err); 9242 9243 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9244 for (dep = dbp->db_firstentry; 9245 dep != NULL; dep = dep->de_next) { 9246 if (searching) { 9247 if (dep->de_recid == id) 9248 searching = 0; 9249 } else { 9250 if ((typ == MDDB_ALL || dep->de_type1 == typ) && 9251 (type2 == 0 || dep->de_type2 == type2)) { 9252 id = dep->de_recid; 9253 mddb_setexit(s); 9254 ASSERT((id & MDDB_SETMASK) == 0); 9255 return (MAKERECID(setno, id)); 9256 } 9257 } 9258 } 9259 } 9260 9261 mddb_setexit(s); 9262 9263 if (searching) 9264 return (MDDB_E_NORECORD); 9265 return (0); 9266 } 9267 9268 void * 9269 mddb_getrecaddr( 9270 mddb_recid_t id 9271 ) 9272 { 9273 mddb_set_t *s; 9274 mddb_db_t *dbp; 9275 mddb_de_ic_t *dep; 9276 void *rval; 9277 9278 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9279 return (NULL); 9280 9281 id = DBID(id); 9282 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9283 for (dep = dbp->db_firstentry; 9284 dep != NULL; dep = dep->de_next) { 9285 if (dep->de_recid != id) 9286 continue; 9287 if (dep->de_rb_userdata) 9288 rval = (void *)dep->de_rb_userdata; 9289 else 9290 rval = (void *)dep->de_rb->rb_data; 9291 mddb_setexit(s); 9292 return (rval); 
9293 } 9294 } 9295 9296 mddb_setexit(s); 9297 return (NULL); 9298 } 9299 9300 9301 mddb_de_ic_t * 9302 mddb_getrecdep( 9303 mddb_recid_t id 9304 ) 9305 { 9306 mddb_set_t *s; 9307 mddb_db_t *dbp; 9308 mddb_de_ic_t *dep; 9309 9310 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9311 return (NULL); 9312 9313 id = DBID(id); 9314 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9315 for (dep = dbp->db_firstentry; 9316 dep != NULL; dep = dep->de_next) { 9317 if (dep->de_recid != id) 9318 continue; 9319 mddb_setexit(s); 9320 return (dep); 9321 } 9322 } 9323 9324 mddb_setexit(s); 9325 return (NULL); 9326 } 9327 9328 void * 9329 mddb_getrecaddr_resize( 9330 mddb_recid_t id, 9331 size_t icsize, 9332 off_t off 9333 ) 9334 { 9335 mddb_set_t *s; 9336 mddb_db_t *dbp; 9337 mddb_de_ic_t *dep; 9338 void *rval = NULL; 9339 9340 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9341 return (NULL); 9342 9343 id = DBID(id); 9344 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9345 for (dep = dbp->db_firstentry; 9346 dep != NULL; dep = dep->de_next) { 9347 if (dep->de_recid != id) 9348 continue; 9349 if (dep->de_rb_userdata) 9350 rval = (void *)dep->de_rb_userdata; 9351 else 9352 rval = (void *)dep->de_rb->rb_data; 9353 break; 9354 } 9355 if (rval != NULL) 9356 break; 9357 } 9358 9359 if (rval == NULL) { 9360 mddb_setexit(s); 9361 return (NULL); 9362 } 9363 9364 if (dep->de_rb_userdata) { 9365 caddr_t nud; 9366 9367 if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) { 9368 mddb_setexit(s); 9369 return (rval); 9370 } 9371 ASSERT((dep->de_reqsize + off) <= icsize); 9372 nud = kmem_zalloc(icsize, KM_SLEEP); 9373 bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize); 9374 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 9375 dep->de_rb_userdata = nud + off; 9376 dep->de_rb_userdata_ic = nud; 9377 dep->de_icreqsize = icsize; 9378 rval = nud; 9379 } else { 9380 size_t recsize; 9381 /* LINTED variable unused - used for sizeof calculations */ 
9382 mddb_rb32_t *nrbp; 9383 9384 recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) + 9385 icsize, MDDB_BSIZE); 9386 if (dep->de_recsize < recsize) 9387 cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only " 9388 "nonoptimized records can be resized\n"); 9389 } 9390 9391 mddb_setexit(s); 9392 return (rval); 9393 } 9394 9395 int 9396 mddb_getrecprivate( 9397 mddb_recid_t id 9398 ) 9399 { 9400 mddb_set_t *s; 9401 mddb_db_t *dbp; 9402 mddb_de_ic_t *dep; 9403 int err = 0; 9404 int private; 9405 9406 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9407 return (err); 9408 9409 id = DBID(id); 9410 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9411 for (dep = dbp->db_firstentry; 9412 dep != NULL; dep = dep->de_next) { 9413 if (dep->de_recid != id) 9414 continue; 9415 private = (int)dep->de_rb->rb_private; 9416 mddb_setexit(s); 9417 return (private); 9418 } 9419 } 9420 9421 mddb_setexit(s); 9422 return (MDDB_E_NORECORD); 9423 } 9424 9425 void 9426 mddb_setrecprivate( 9427 mddb_recid_t id, 9428 uint_t private 9429 ) 9430 { 9431 mddb_set_t *s; 9432 mddb_db_t *dbp; 9433 mddb_de_ic_t *dep; 9434 9435 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) { 9436 ASSERT(0); 9437 return; 9438 } 9439 9440 id = DBID(id); 9441 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9442 for (dep = dbp->db_firstentry; 9443 dep != NULL; dep = dep->de_next) { 9444 if (dep->de_recid != id) 9445 continue; 9446 dep->de_rb->rb_private = private; 9447 mddb_setexit(s); 9448 return; 9449 } 9450 } 9451 9452 mddb_setexit(s); 9453 ASSERT(0); 9454 } 9455 9456 mddb_type_t 9457 mddb_getrectype1( 9458 mddb_recid_t id 9459 ) 9460 { 9461 mddb_set_t *s; 9462 mddb_db_t *dbp; 9463 mddb_de_ic_t *dep; 9464 int err = 0; 9465 mddb_type_t rval; 9466 9467 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9468 return (err); 9469 9470 id = DBID(id); 9471 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9472 for (dep = dbp->db_firstentry; 9473 dep 
!= NULL; dep = dep->de_next) { 9474 if (dep->de_recid != id) 9475 continue; 9476 rval = dep->de_type1; 9477 mddb_setexit(s); 9478 return (rval); 9479 } 9480 } 9481 9482 mddb_setexit(s); 9483 return (MDDB_E_NORECORD); 9484 } 9485 9486 int 9487 mddb_getrectype2( 9488 mddb_recid_t id 9489 ) 9490 { 9491 mddb_set_t *s; 9492 mddb_db_t *dbp; 9493 mddb_de_ic_t *dep; 9494 int err = 0; 9495 int rval; 9496 9497 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9498 return (err); 9499 9500 id = DBID(id); 9501 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9502 for (dep = dbp->db_firstentry; 9503 dep != NULL; dep = dep->de_next) { 9504 if (dep->de_recid != id) 9505 continue; 9506 rval = (int)dep->de_type2; 9507 mddb_setexit(s); 9508 return (rval); 9509 } 9510 } 9511 9512 mddb_setexit(s); 9513 return (MDDB_E_NORECORD); 9514 } 9515 9516 int 9517 mddb_getrecsize( 9518 mddb_recid_t id 9519 ) 9520 { 9521 mddb_set_t *s; 9522 mddb_db_t *dbp; 9523 mddb_de_ic_t *dep; 9524 int err = 0; 9525 int rval; 9526 9527 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9528 return (err); 9529 9530 id = DBID(id); 9531 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9532 for (dep = dbp->db_firstentry; 9533 dep != NULL; dep = dep->de_next) { 9534 if (dep->de_recid != id) 9535 continue; 9536 rval = (int)dep->de_reqsize; 9537 mddb_setexit(s); 9538 return (rval); 9539 } 9540 } 9541 9542 mddb_setexit(s); 9543 return (MDDB_E_NORECORD); 9544 } 9545 9546 9547 mddb_recstatus_t 9548 mddb_getrecstatus( 9549 mddb_recid_t id 9550 ) 9551 { 9552 mddb_set_t *s; 9553 mddb_db_t *dbp; 9554 mddb_de_ic_t *dep; 9555 int err = 0; 9556 mddb_recstatus_t e_err; 9557 9558 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9559 return ((mddb_recstatus_t)err); 9560 9561 id = DBID(id); 9562 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9563 for (dep = dbp->db_firstentry; 9564 dep != NULL; dep = dep->de_next) { 9565 if (dep->de_recid == id) 9566 break; 9567 
} 9568 if (dep) 9569 break; 9570 } 9571 9572 e_err = MDDB_OK; 9573 9574 if (! dep) 9575 e_err = MDDB_NORECORD; 9576 else if (! dep->de_rb->rb_commitcnt) 9577 e_err = MDDB_NODATA; 9578 else if (md_get_setstatus(s->s_setno) & MD_SET_STALE) 9579 e_err = MDDB_STALE; 9580 9581 mddb_setexit(s); 9582 return (e_err); 9583 } 9584 9585 static int mddb_commitrec_retries = 5; 9586 9587 /* 9588 * Commit given record to disk. 9589 * If committing an optimized record, do not call 9590 * with md ioctl lock held. 9591 */ 9592 int 9593 mddb_commitrec( 9594 mddb_recid_t id 9595 ) 9596 { 9597 mddb_set_t *s; 9598 mddb_db_t *dbp; 9599 mddb_de_ic_t *dep; 9600 mddb_recid_t ids[2]; 9601 mddb_rb32_t *rbp; 9602 static int err = 0; 9603 md_mn_msg_mddb_optrecerr_t *msg_recerr; 9604 md_mn_kresult_t *kres; 9605 mddb_lb_t *lbp; 9606 mddb_mnlb_t *mnlbp; 9607 mddb_locator_t *lp; 9608 mddb_mnsidelocator_t *mnslp; 9609 mddb_drvnm_t *dn; 9610 int li; 9611 md_replica_recerr_t *recerr; 9612 int i, j; 9613 int rval; 9614 int hit_err = 0; 9615 int retry = mddb_commitrec_retries; 9616 int gave_up = 0; 9617 9618 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); 9619 ASSERT(s != NULL); 9620 9621 if (checkstate(s, MDDB_PROBE)) { 9622 mddb_setexit(s); 9623 return (MDDB_E_NOTNOW); 9624 } 9625 9626 if (DBID(id) == 0) { 9627 mddb_setexit(s); 9628 return (0); 9629 } 9630 9631 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9632 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9633 if (dep->de_recid == DBID(id)) 9634 break; 9635 } 9636 if (dep) 9637 break; 9638 } 9639 9640 if (dep == NULL) { 9641 mddb_setexit(s); 9642 return (MDDB_E_NORECORD); 9643 } 9644 9645 if (! (dep->de_flags & MDDB_F_OPT)) { 9646 ids[0] = id; 9647 ids[1] = 0; 9648 mddb_setexit(s); 9649 return (mddb_commitrecs(ids)); 9650 } 9651 9652 /* 9653 * following code allows multiple processes to be doing 9654 * optimization commits in parallel. 
9655 * NOTE: if lots of optimization commits then the lock 9656 * will not get released until it winds down 9657 */ 9658 if (s->s_optwaiterr) { 9659 while (s->s_optwaiterr) { 9660 s->s_opthungerr = 1; 9661 cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno)); 9662 } 9663 if (checkstate(s, MDDB_PROBE)) { 9664 mddb_setexit(s); 9665 return (MDDB_E_NOTNOW); 9666 } 9667 } 9668 if (s->s_optcmtcnt++ == 0) { 9669 single_thread_start(s); 9670 s->s_opthavelck = 1; 9671 if (s->s_optwantlck) { 9672 cv_broadcast(&s->s_optwantlck_cv); 9673 s->s_optwantlck = 0; 9674 } 9675 } else { 9676 while (! s->s_opthavelck) { 9677 s->s_optwantlck = 1; 9678 cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno)); 9679 } 9680 } 9681 9682 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9683 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9684 if (dep->de_recid == DBID(id)) 9685 break; 9686 } 9687 if (dep) 9688 break; 9689 } 9690 9691 if (dep == NULL) { 9692 if (! (--s->s_optcmtcnt)) { 9693 single_thread_end(s); 9694 s->s_opthavelck = 0; 9695 } 9696 mddb_setexit(s); 9697 return (MDDB_E_NORECORD); 9698 } 9699 9700 rbp = dep->de_rb; 9701 rbp->rb_commitcnt++; 9702 uniqtime32(&rbp->rb_timestamp); 9703 /* Generate the crc for this record */ 9704 rec_crcgen(s, dep, rbp); 9705 9706 if (writeoptrecord(s, dep)) { 9707 if (MD_MNSET_SETNO(s->s_setno)) { 9708 hit_err = 1; 9709 } 9710 s->s_optwaiterr++; 9711 } 9712 if (MD_MNSET_SETNO(s->s_setno)) { 9713 /* If last thread out, release single_thread_start */ 9714 if (! (--s->s_optcmtcnt)) { 9715 single_thread_end(s); 9716 s->s_opthavelck = 0; 9717 } 9718 /* 9719 * If this thread had a writeoptrecords failure, then 9720 * need to send message to master. 9721 * But, multiple threads could all be running on the 9722 * same single_thread_start, so serialize the threads 9723 * by making each thread grab single_thread_start. 
9724 * 9725 * After return from sending message to master message, 9726 * replicas associated with optimized record will havei 9727 * been changed (via a callback from the master to all 9728 * nodes), so retry call to writeoptrecord. 9729 * This code is replacing the call to writeretry that 9730 * occurs for the local and traditional disksets. 9731 */ 9732 if (hit_err) { 9733 single_thread_start(s); 9734 /* 9735 * If > 50% of replicas are alive then continue 9736 * to send message to master until writeoptrecord 9737 * succeeds. For now, assume that minor name, 9738 * major number on this node is the same as on 9739 * the master node. Once devids are turned on 9740 * for MN disksets, can send devid. 9741 */ 9742 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP); 9743 msg_recerr = kmem_zalloc( 9744 sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP); 9745 while (!(md_get_setstatus(s->s_setno) & 9746 MD_SET_TOOFEW)) { 9747 bzero((caddr_t)msg_recerr, 9748 sizeof (md_mn_msg_mddb_optrecerr_t)); 9749 lbp = s->s_lbp; 9750 mnlbp = (mddb_mnlb_t *)lbp; 9751 for (i = 0; i < 2; i++) { 9752 li = dep->de_optinfo[i].o_li; 9753 lp = &lbp->lb_locators[li]; 9754 for (j = 0; j < MD_MNMAXSIDES; j++) { 9755 mnslp = 9756 &mnlbp-> 9757 lb_mnsidelocators[j][li]; 9758 if (mnslp->mnl_sideno == 9759 s->s_sideno) 9760 break; 9761 } 9762 if (j == MD_MNMAXSIDES) 9763 continue; 9764 9765 dn = &lbp-> 9766 lb_drvnm[mnslp->mnl_drvnm_index]; 9767 recerr = &msg_recerr->msg_recerr[i]; 9768 recerr->r_li = li; 9769 recerr->r_flags = 9770 dep->de_optinfo[i].o_flags; 9771 recerr->r_blkno = lp->l_blkno; 9772 recerr->r_mnum = md_getminor(lp->l_dev); 9773 (void) strncpy(recerr->r_driver_name, 9774 dn->dn_data, MD_MAXDRVNM); 9775 } 9776 9777 /* Release locks */ 9778 single_thread_end(s); 9779 mutex_exit(SETMUTEX(s->s_setno)); 9780 9781 /* 9782 * Send message to master about optimized 9783 * record failure. 
After return, master 9784 * should have marked failed replicas 9785 * and sent parse message to slaves causing 9786 * slaves to have fixed up the optimized 9787 * record. 9788 * On return from ksend_message, retry 9789 * the write since this node should have fixed 9790 * the optimized resync records it owns. 9791 */ 9792 rval = mdmn_ksend_message(s->s_setno, 9793 MD_MN_MSG_MDDB_OPTRECERR, 9794 MD_MSGF_NO_BCAST, 0, 9795 (char *)msg_recerr, 9796 sizeof (md_mn_msg_mddb_optrecerr_t), 9797 kres); 9798 if (!MDMN_KSEND_MSG_OK(rval, kres)) { 9799 cmn_err(CE_WARN, "mddb_commitrec: " 9800 "Unable to send optimized " 9801 "resync record failure " 9802 "message to other nodes in " 9803 "diskset %s\n", s->s_setname); 9804 mdmn_ksend_show_error(rval, kres, 9805 "MD_MN_MSG_MDDB_OPTRECERR"); 9806 } 9807 9808 /* Regrab locks */ 9809 mutex_enter(SETMUTEX(s->s_setno)); 9810 single_thread_start(s); 9811 9812 /* Start over in case mddb changed */ 9813 for (dbp = s->s_dbp; dbp != NULL; 9814 dbp = dbp->db_next) { 9815 for (dep = dbp->db_firstentry; dep; 9816 dep = dep->de_next) { 9817 if (dep->de_recid == DBID(id)) 9818 break; 9819 } 9820 if (dep) 9821 break; 9822 } 9823 if (dep) { 9824 rbp = dep->de_rb; 9825 rbp->rb_commitcnt++; 9826 uniqtime32(&rbp->rb_timestamp); 9827 /* Generate the crc for this record */ 9828 rec_crcgen(s, dep, rbp); 9829 9830 /* 9831 * If writeoptrecord succeeds, then 9832 * break out. 
9833 */ 9834 if (!(writeoptrecord(s, dep))) 9835 break; 9836 } 9837 if (--retry == 0) { 9838 cmn_err(CE_WARN, "mddb_commitrec: " 9839 "giving up writing optimized " 9840 "resync record for " 9841 "diskset %s, device %s,%d " 9842 "blkno 0x%x, flags 0x%x\n", 9843 s->s_setname, recerr->r_driver_name, 9844 recerr->r_mnum, recerr->r_blkno, 9845 recerr->r_flags); 9846 gave_up++; 9847 break; 9848 } 9849 } 9850 kmem_free(kres, sizeof (md_mn_kresult_t)); 9851 kmem_free(msg_recerr, 9852 sizeof (md_mn_msg_mddb_optrecerr_t)); 9853 9854 /* Resync record should be fixed - if possible */ 9855 s->s_optwaiterr--; 9856 if (s->s_optwaiterr == 0) { 9857 /* All errors have been handled */ 9858 if (s->s_opthungerr) { 9859 s->s_opthungerr = 0; 9860 cv_broadcast(&s->s_opthungerr_cv); 9861 } 9862 } 9863 single_thread_end(s); 9864 mddb_setexit(s); 9865 if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) { 9866 return (MDDB_E_NOTNOW); 9867 } else if (gave_up) { 9868 return (MDDB_E_STALE); 9869 } else { 9870 return (0); 9871 } 9872 } 9873 } else { 9874 /* If set is a traditional or local set */ 9875 if (! 
(--s->s_optcmtcnt)) { 9876 err = 0; 9877 if (s->s_optwaiterr) { 9878 err = writeretry(s); 9879 s->s_optwaiterr = 0; 9880 if (s->s_opthungerr) { 9881 s->s_opthungerr = 0; 9882 cv_broadcast(&s->s_opthungerr_cv); 9883 } 9884 } 9885 single_thread_end(s); 9886 s->s_opthavelck = 0; 9887 mddb_setexit(s); 9888 if (err) 9889 return (MDDB_E_NOTNOW); 9890 return (0); 9891 } 9892 if (s->s_optwaiterr) { 9893 while (s->s_optwaiterr) { 9894 s->s_opthungerr = 1; 9895 cv_wait(&s->s_opthungerr_cv, 9896 SETMUTEX(s->s_setno)); 9897 } 9898 if (checkstate(s, MDDB_NOPROBE)) { 9899 mddb_setexit(s); 9900 return (MDDB_E_NOTNOW); 9901 } 9902 } 9903 } 9904 9905 mddb_setexit(s); 9906 return (0); 9907 } 9908 9909 int 9910 mddb_commitrecs( 9911 mddb_recid_t ids[] 9912 ) 9913 { 9914 mddb_set_t *s; 9915 mddb_db_t *dbp; 9916 mddb_de_ic_t *dep; 9917 mddb_rb32_t *rbp; 9918 mddb_rb32_t *saverbp; 9919 mddb_lb_t *lbp; 9920 int li; 9921 uint_t checksum; 9922 mddb_recid_t *idp; 9923 int err = 0; 9924 set_t setno; 9925 9926 if (panicstr) 9927 cmn_err(CE_PANIC, "md: mddb: commit not allowed"); 9928 9929 /* 9930 * scan through and make sure ids are from the same set 9931 */ 9932 setno = DBSET(ids[0]); 9933 for (idp = ids; *idp != NULL; idp++) 9934 ASSERT(DBSET(*idp) == setno); 9935 9936 s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL); 9937 9938 if (checkstate(s, MDDB_PROBE)) { 9939 mddb_setexit(s); 9940 return (MDDB_E_NOTNOW); 9941 } 9942 9943 ASSERT(s->s_lbp != NULL); 9944 err = 0; 9945 9946 if (! 
ids[0]) { 9947 mddb_setexit(s); 9948 return (0); 9949 } 9950 9951 single_thread_start(s); 9952 /* 9953 * scan through and make sure ids all exist 9954 */ 9955 for (idp = ids; *idp != NULL; idp++) { 9956 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9957 for (dep = dbp->db_firstentry; dep; 9958 dep = dep->de_next) { 9959 if (dep->de_recid == DBID(*idp)) 9960 break; 9961 } 9962 if (dep != NULL) 9963 break; 9964 } 9965 if (dep == NULL) { 9966 single_thread_end(s); 9967 mddb_setexit(s); 9968 return (MDDB_E_NORECORD); 9969 } 9970 } 9971 9972 /* 9973 * scan through records fix commit counts and 9974 * zero fiddles and update time stamp and rechecksum record 9975 */ 9976 checksum = 0; 9977 idp = ids; 9978 saverbp = NULL; 9979 while (*idp) { 9980 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9981 for (dep = dbp->db_firstentry; dep; 9982 dep = dep->de_next) { 9983 if (dep->de_recid == DBID(*idp)) 9984 break; 9985 } 9986 if (dep != NULL) 9987 break; 9988 } 9989 rbp = dep->de_rb; 9990 ASSERT(! (dep->de_flags & MDDB_F_OPT)); 9991 9992 getuserdata(setno, dep); 9993 /* Don't do fiddles for CHANGE LOG records */ 9994 if (!(dep->de_flags & MDDB_F_CHANGELOG)) { 9995 checksum ^= rbp->rb_checksum_fiddle; 9996 rbp->rb_checksum_fiddle = 0; 9997 checksum ^= rbp->rb_checksum; 9998 saverbp = rbp; 9999 } 10000 rbp->rb_commitcnt++; 10001 uniqtime32(&rbp->rb_timestamp); 10002 /* Generate the crc for this record */ 10003 rec_crcgen(s, dep, rbp); 10004 10005 /* Don't do fiddles for CHANGE LOG records */ 10006 if (!(dep->de_flags & MDDB_F_CHANGELOG)) { 10007 checksum ^= rbp->rb_checksum; 10008 } 10009 idp++; 10010 } 10011 10012 if (saverbp) 10013 saverbp->rb_checksum_fiddle = checksum; 10014 10015 /* 10016 * If this is a MN set but we are not the master, then we are not 10017 * supposed to update the mddb on disk. So we finish at this point. 
10018 */ 10019 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 10020 (md_set[setno].s_am_i_master == 0)) { 10021 single_thread_end(s); 10022 mddb_setexit(s); 10023 return (0); 10024 } 10025 10026 lbp = s->s_lbp; 10027 for (li = 0; li < lbp->lb_loccnt; li++) { 10028 if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE)) 10029 continue; 10030 10031 idp = ids; 10032 while (*idp) { 10033 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 10034 dep = dbp->db_firstentry; 10035 while (dep && (dep->de_recid != DBID(*idp))) 10036 dep = dep->de_next; 10037 if (dep != NULL) 10038 break; 10039 } 10040 rbp = dep->de_rb; 10041 err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 10042 dep->de_blkcount, li, (mddb_bf_t **)0, 10043 MDDB_WR_ONLY_MASTER); 10044 if (err) 10045 break; 10046 idp++; 10047 } 10048 if (err) 10049 break; 10050 } 10051 if (err) { 10052 if (writeretry(s)) { 10053 single_thread_end(s); 10054 mddb_setexit(s); 10055 return (MDDB_E_NOTNOW); 10056 } 10057 } 10058 single_thread_end(s); 10059 mddb_setexit(s); 10060 return (0); 10061 } 10062 10063 mddb_recid_t 10064 mddb_makerecid( 10065 set_t setno, 10066 mddb_recid_t id 10067 ) 10068 { 10069 return (MAKERECID(setno, id)); 10070 } 10071 10072 set_t 10073 mddb_getsetnum( 10074 mddb_recid_t id 10075 ) 10076 { 10077 return (DBSET(id)); 10078 } 10079 10080 char * 10081 mddb_getsetname( 10082 set_t setno 10083 ) 10084 { 10085 return (((mddb_set_t *)md_set[setno].s_db)->s_setname); 10086 } 10087 10088 side_t 10089 mddb_getsidenum( 10090 set_t setno 10091 ) 10092 { 10093 if (md_set[setno].s_db) 10094 return (((mddb_set_t *)md_set[setno].s_db)->s_sideno); 10095 return (0); 10096 } 10097 10098 int 10099 mddb_ownset( 10100 set_t setno 10101 ) 10102 { 10103 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db) 10104 return (1); 10105 10106 if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp) 10107 return (1); 10108 10109 return (0); 10110 } 10111 10112 /*ARGSUSED*/ 10113 int 
10114 getmed_ioctl(mddb_med_parm_t *medpp, int mode) 10115 { 10116 mddb_set_t *s; 10117 int err = 0; 10118 set_t setno = medpp->med_setno; 10119 md_error_t *ep = &medpp->med_mde; 10120 10121 mdclrerror(ep); 10122 10123 if (setno >= md_nsets) 10124 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10125 10126 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10127 return (0); 10128 10129 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 10130 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 10131 10132 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10133 return (mddbstatus2error(ep, err, NODEV32, setno)); 10134 10135 medpp->med = s->s_med; /* structure assignment */ 10136 10137 mddb_setexit(s); 10138 10139 return (0); 10140 } 10141 10142 int 10143 setmed_ioctl(mddb_med_parm_t *medpp, int mode) 10144 { 10145 10146 mddb_set_t *s; 10147 int err = 0; 10148 set_t setno = medpp->med_setno; 10149 md_error_t *ep = &medpp->med_mde; 10150 10151 mdclrerror(ep); 10152 10153 if ((mode & FWRITE) == 0) 10154 return (mdsyserror(ep, EACCES)); 10155 10156 /* 10157 * This should be the only thing that prevents LOCAL sets from having 10158 * mediators, at least in the kernel, userland needs to have some code 10159 * written. 
10160 */ 10161 if (setno == MD_LOCAL_SET) 10162 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10163 10164 if (setno >= md_nsets) 10165 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10166 10167 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10168 return (0); 10169 10170 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 10171 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 10172 10173 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10174 return (mddbstatus2error(ep, err, NODEV32, setno)); 10175 10176 s->s_med = medpp->med; /* structure assignment */ 10177 10178 mddb_setexit(s); 10179 10180 return (0); 10181 } 10182 10183 int 10184 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode) 10185 { 10186 10187 mddb_set_t *s; 10188 int err = 0; 10189 set_t setno = medpp->med_setno; 10190 md_error_t *ep = &medpp->med_mde; 10191 10192 mdclrerror(ep); 10193 10194 if ((mode & FWRITE) == 0) 10195 return (mdsyserror(ep, EACCES)); 10196 10197 if (setno >= md_nsets) 10198 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10199 10200 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10201 return (0); 10202 10203 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 10204 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 10205 10206 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10207 return (mddbstatus2error(ep, err, NODEV32, setno)); 10208 10209 single_thread_start(s); 10210 (void) upd_med(s, "updmed_ioctl()"); 10211 single_thread_end(s); 10212 10213 mddb_setexit(s); 10214 10215 return (0); 10216 } 10217 10218 int 10219 take_set(mddb_config_t *cp, int mode) 10220 { 10221 int err = 0; 10222 mddb_med_upd_parm_t medup; 10223 set_t setno = cp->c_setno; 10224 md_error_t *ep = &cp->c_mde; 10225 int snarf_ok = 0; 10226 10227 if (md_get_setstatus(setno) & MD_SET_SNARFED) 10228 return (0); 10229 10230 err = mddb_configure(MDDB_GETDEV, cp); 10231 if (! 
err && mdisok(ep)) { 10232 if (md_snarf_db_set(setno, ep) != 0) 10233 goto out; 10234 snarf_ok = 1; 10235 } 10236 10237 /* 10238 * Clear replicated import flag since this is 10239 * used during the take of a diskset with 10240 * previously unresolved replicated disks. 10241 */ 10242 if (md_get_setstatus(setno) & 10243 MD_SET_REPLICATED_IMPORT) { 10244 md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT); 10245 } 10246 10247 if (! err && mdisok(ep)) { 10248 if (! cp->c_flags) { 10249 medup.med_setno = setno; 10250 mdclrerror(&medup.med_mde); 10251 10252 err = updmed_ioctl(&medup, mode); 10253 if (! mdisok(&medup.med_mde)) 10254 (void) mdstealerror(ep, &medup.med_mde); 10255 } 10256 } 10257 10258 out: 10259 /* 10260 * In the case that the snarf failed, the diskset is 10261 * left with s_db set, but s_lbp not set. The node is not 10262 * an owner of the set and won't be allowed to release the 10263 * diskset in order to cleanup. With s_db set, any call to the 10264 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist) 10265 * will cause the diskset to be loaded. So, cleanup the diskset so 10266 * that an inadvertent start of the diskset doesn't happen later. 10267 */ 10268 if ((snarf_ok == 0) && md_set[setno].s_db && 10269 (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) { 10270 mutex_enter(&mddb_lock); 10271 mddb_unload_set(setno); 10272 mutex_exit(&mddb_lock); 10273 } 10274 return (err); 10275 } 10276 10277 /*ARGSUSED*/ 10278 int 10279 release_set(mddb_config_t *cp, int mode) 10280 { 10281 int err = 0; 10282 set_t setno = cp->c_setno; 10283 md_error_t *ep = &cp->c_mde; 10284 10285 /* 10286 * Data integrity check 10287 */ 10288 if (setno >= md_nsets) 10289 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10290 10291 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 10292 md_haltsnarf_enter(setno); 10293 /* 10294 * Attempt to mark set as HOLD. 
If it is marked as HOLD, this means 10295 * that the mirror code is currently searching all mirrors for a 10296 * errored component that needs a hotspare. While this search is in 10297 * progress, we cannot release the set and thgerefore we return EBUSY. 10298 * Once we have set HOLD, the mirror function (check_4_hotspares) will 10299 * block before the search until the set is released. 10300 */ 10301 if (md_holdset_testandenter(setno) != 0) { 10302 md_haltsnarf_exit(setno); 10303 rw_exit(&md_unit_array_rw.lock); 10304 return (EBUSY); 10305 } 10306 10307 if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0) 10308 err = mddb_configure(MDDB_RELEASESET, cp); 10309 10310 md_holdset_exit(setno); 10311 md_haltsnarf_exit(setno); 10312 rw_exit(&md_unit_array_rw.lock); 10313 10314 if (! err && mdisok(ep)) { 10315 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno, 10316 NODEV64); 10317 } 10318 10319 return (err); 10320 } 10321 10322 int 10323 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode) 10324 { 10325 mddb_set_t *s; 10326 int err = 0; 10327 mddb_dtag_lst_t *dtlp; 10328 set_t setno = dtgpp->dtgp_setno; 10329 md_error_t *ep = &dtgpp->dtgp_mde; 10330 10331 mdclrerror(ep); 10332 10333 if ((mode & FREAD) == 0) 10334 return (mdsyserror(ep, EACCES)); 10335 10336 if (setno >= md_nsets) 10337 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10338 10339 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10340 return (0); 10341 10342 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) 10343 return (mddbstatus2error(ep, err, NODEV32, setno)); 10344 10345 /* 10346 * Data tags not supported on MN sets so return invalid operation. 10347 * This ioctl could be called before the mddb has been read in so 10348 * the set status may not yet be set to MNSET, so code following 10349 * this check must handle a MN diskset properly. 
10350 */ 10351 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10352 mddb_setexit(s); 10353 return (mderror(ep, MDE_INVAL_MNOP)); 10354 } 10355 10356 /* s_dtlp is NULL for MN diskset */ 10357 dtlp = s->s_dtlp; 10358 while (dtlp != NULL) { 10359 if (dtgpp->dtgp_dt.dt_id == 0 || 10360 dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) { 10361 bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt, 10362 sizeof (mddb_dtag_t)); 10363 break; 10364 } 10365 dtlp = dtlp->dtl_nx; 10366 } 10367 10368 /* Walked the whole list and id not found, return error */ 10369 if (dtlp == (mddb_dtag_lst_t *)NULL) { 10370 mddb_setexit(s); 10371 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10372 } 10373 10374 mddb_setexit(s); 10375 10376 return (0); 10377 } 10378 10379 int 10380 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode) 10381 { 10382 mddb_set_t *s; 10383 int err = 0; 10384 mddb_config_t *cp; 10385 mddb_ri_t *trip = NULL; 10386 mddb_dtag_t *dtagp = NULL; 10387 set_t setno = dtupp->dtup_setno; 10388 md_error_t *ep = &dtupp->dtup_mde; 10389 10390 mdclrerror(ep); 10391 10392 if ((mode & FWRITE) == 0) 10393 return (mdsyserror(ep, EACCES)); 10394 10395 if (setno >= md_nsets) 10396 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10397 10398 if (dtupp->dtup_id < 0) 10399 return (mdsyserror(ep, EINVAL)); 10400 else if (dtupp->dtup_id == 0) 10401 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10402 10403 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10404 return (0); 10405 10406 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0) 10407 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno)); 10408 10409 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) 10410 return (mddbstatus2error(ep, err, NODEV32, setno)); 10411 10412 /* 10413 * Data tags not supported on MN sets so return invalid operation. 
10414 * This ioctl could be called before the mddb has been read in so 10415 * the set status may not yet be set to MNSET, so code following 10416 * this check must handle a MN diskset properly. 10417 */ 10418 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10419 mddb_setexit(s); 10420 return (mderror(ep, MDE_INVAL_MNOP)); 10421 } 10422 10423 /* Validate and find the id requested - nothing found if MN diskset */ 10424 if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) { 10425 mddb_setexit(s); 10426 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10427 } 10428 10429 /* Usetag is only valid when more than one tag exists */ 10430 if (dtl_cntl(s) < 2) { 10431 mddb_setexit(s); 10432 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno)); 10433 } 10434 10435 /* Put the selected tag in place */ 10436 dt_setup(s, dtagp); 10437 10438 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); 10439 10440 /* Save the hint information */ 10441 trip = save_rip(s); 10442 10443 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ 10444 cp->c_setno = setno; 10445 cp->c_sideno = s->s_sideno; 10446 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); 10447 cp->c_setname[MD_MAX_SETNAME] = '\0'; 10448 cp->c_med = s->s_med; /* struct assignment */ 10449 10450 mddb_setexit(s); 10451 10452 s = NULL; 10453 10454 /* shorthand */ 10455 setno = cp->c_setno; 10456 10457 /* Let unload know not to free the tag */ 10458 md_set_setstatus(setno, MD_SET_KEEPTAG); 10459 10460 /* Release the set */ 10461 if (err = release_set(cp, mode)) 10462 goto out; 10463 10464 if (! 
mdisok(&cp->c_mde)) { 10465 (void) mdstealerror(ep, &cp->c_mde); 10466 err = 1; 10467 goto out; 10468 } 10469 10470 /* Re-init set using the saved mddb_config_t structure */ 10471 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 10472 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 10473 err = mddbstatus2error(ep, err, NODEV32, setno); 10474 goto out; 10475 } 10476 } 10477 10478 ASSERT(s->s_rip == (mddb_ri_t *)NULL); 10479 10480 /* use the saved rip structure */ 10481 s->s_rip = trip; 10482 trip = (mddb_ri_t *)NULL; 10483 10484 /* Let the take code know a tag is being used */ 10485 md_set_setstatus(setno, MD_SET_USETAG); 10486 10487 mddb_setexit(s); 10488 10489 s = NULL; 10490 10491 /* Take the set */ 10492 if (err = take_set(cp, mode)) 10493 goto out; 10494 10495 if (! mdisok(&cp->c_mde)) 10496 (void) mdstealerror(ep, &cp->c_mde); 10497 10498 out: 10499 md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG)); 10500 10501 kmem_free(cp, sizeof (mddb_config_t)); 10502 10503 if (trip) 10504 free_rip(&trip); 10505 10506 if (s) 10507 mddb_setexit(s); 10508 10509 return (err); 10510 } 10511 10512 int 10513 accept_ioctl(mddb_accept_parm_t *accpp, int mode) 10514 { 10515 mddb_set_t *s; 10516 int err = 0; 10517 mddb_config_t *cp; 10518 mddb_ri_t *trip = NULL; 10519 set_t setno = accpp->accp_setno; 10520 md_error_t *ep = &accpp->accp_mde; 10521 10522 mdclrerror(ep); 10523 10524 if ((mode & FWRITE) == 0) 10525 return (mdsyserror(ep, EACCES)); 10526 10527 if (setno >= md_nsets) 10528 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10529 10530 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10531 return (0); 10532 10533 if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0) 10534 return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno)); 10535 10536 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10537 return (mddbstatus2error(ep, err, NODEV32, setno)); 10538 10539 /* 10540 * Data tags not supported on MN sets so return invalid operation. 
10541 * mddb is guaranteed to be incore at this point, so this 10542 * check will catch all MN disksets. 10543 */ 10544 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10545 mddb_setexit(s); 10546 return (mderror(ep, MDE_INVAL_MNOP)); 10547 } 10548 10549 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); 10550 10551 trip = save_rip(s); 10552 10553 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ 10554 cp->c_setno = setno; 10555 cp->c_sideno = s->s_sideno; 10556 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); 10557 cp->c_setname[MD_MAX_SETNAME] = '\0'; 10558 cp->c_med = s->s_med; /* struct assignment */ 10559 10560 /* Tag the data */ 10561 if (err = set_dtag(s, ep)) { 10562 err = mdsyserror(ep, err); 10563 goto out; 10564 } 10565 10566 /* If we had a BADTAG, it will be re-written, so clear the bit. */ 10567 if (md_get_setstatus(setno) & MD_SET_BADTAG) 10568 md_clr_setstatus(setno, MD_SET_BADTAG); 10569 10570 if (err = dt_write(s)) { 10571 err = mdsyserror(ep, err); 10572 goto out; 10573 } 10574 10575 mddb_setexit(s); 10576 10577 s = NULL; 10578 10579 /* shorthand */ 10580 setno = cp->c_setno; 10581 10582 /* Clear the keeptag */ 10583 md_clr_setstatus(setno, MD_SET_KEEPTAG); 10584 10585 /* Release the set */ 10586 if (err = release_set(cp, mode)) 10587 goto out; 10588 10589 if (! 
mdisok(&cp->c_mde)) { 10590 (void) mdstealerror(ep, &cp->c_mde); 10591 goto out; 10592 } 10593 10594 /* Re-init set using the saved mddb_config_t structure */ 10595 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 10596 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 10597 err = mddbstatus2error(ep, err, NODEV32, setno); 10598 goto out; 10599 } 10600 } 10601 10602 ASSERT(s->s_rip == (mddb_ri_t *)NULL); 10603 10604 /* Free the allocated rip structure */ 10605 if (s->s_rip != (mddb_ri_t *)NULL) 10606 free_rip(&s->s_rip); 10607 10608 /* use the saved rip structure */ 10609 s->s_rip = trip; 10610 trip = (mddb_ri_t *)NULL; 10611 10612 /* Let the set init code know an accept is in progress */ 10613 md_set_setstatus(setno, MD_SET_ACCEPT); 10614 10615 mddb_setexit(s); 10616 10617 s = NULL; 10618 10619 /* Take the set */ 10620 if (err = take_set(cp, mode)) 10621 goto out; 10622 10623 if (! mdisok(&cp->c_mde)) 10624 (void) mdstealerror(ep, &cp->c_mde); 10625 10626 out: 10627 md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT)); 10628 10629 kmem_free(cp, sizeof (mddb_config_t)); 10630 10631 if (trip) 10632 free_rip(&trip); 10633 10634 if (s) 10635 mddb_setexit(s); 10636 10637 return (err); 10638 } 10639 10640 /* 10641 * mddb_getinvlb_devid - cycles through the locator block and determines 10642 * if the device id's for any of the replica disks are invalid. 10643 * If so, it returns the diskname in the ctdptr. 
10644 * RETURN 10645 * -1 Error 10646 * cnt number of invalid device id's 10647 */ 10648 int 10649 mddb_getinvlb_devid( 10650 set_t setno, 10651 int count, 10652 int size, 10653 char **ctdptr 10654 ) 10655 { 10656 mddb_set_t *s; 10657 int err = 0; 10658 mddb_lb_t *lbp; 10659 int li; 10660 mddb_did_blk_t *did_blk; 10661 mddb_did_info_t *did_info; 10662 int len; 10663 int cnt = 0; 10664 char *cptr; 10665 md_name_suffix *sn; 10666 int i, dont_add_it; 10667 char *tmpctd, *diskname; 10668 char *tmpname; 10669 10670 cptr = *ctdptr; 10671 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 10672 return (-1); 10673 } 10674 10675 single_thread_start(s); 10676 lbp = s->s_lbp; 10677 10678 if (lbp->lb_setno != setno) { 10679 single_thread_end(s); 10680 mddb_setexit(s); 10681 return (-1); 10682 } 10683 10684 /* check for lb being devid style */ 10685 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 10686 did_blk = s->s_did_icp->did_ic_blkp; 10687 for (li = 0; li < lbp->lb_loccnt; li++) { 10688 did_info = &(did_blk->blk_info[li]); 10689 /* Only if devid exists and isn't valid */ 10690 if ((did_info->info_flags & MDDB_DID_EXISTS) && 10691 !(did_info->info_flags & MDDB_DID_VALID)) { 10692 /* 10693 * if we count more invalid did's than 10694 * was passed in there's an error somewhere 10695 */ 10696 if (cnt++ > count) { 10697 single_thread_end(s); 10698 mddb_setexit(s); 10699 return (-1); 10700 } 10701 10702 /* 10703 * Future note: Need to do something here 10704 * for the MN diskset case when device ids 10705 * are supported in disksets. 10706 * Can't add until merging devids_in_diskset 10707 * code into code base. 
10708 */ 10709 10710 sn = &s->s_lnp->ln_suffixes[0][li]; 10711 /* 10712 * check to make sure length of device name is 10713 * not greater than computed first time through 10714 */ 10715 len = sn->suf_len; 10716 if (len > size) { 10717 single_thread_end(s); 10718 mddb_setexit(s); 10719 return (-1); 10720 } 10721 tmpctd = *ctdptr; 10722 /* strip off slice part */ 10723 diskname = md_strdup(sn->suf_data); 10724 tmpname = strrchr(diskname, 's'); 10725 *tmpname = '\0'; 10726 dont_add_it = 0; 10727 /* look to see if diskname is already in list */ 10728 for (i = 0; i < (cnt-1); i++) { 10729 if (strcmp(diskname, tmpctd) == 0) { 10730 /* already there, don't add */ 10731 dont_add_it = 1; 10732 break; 10733 } 10734 /* point to next diskname in list */ 10735 tmpctd += size; 10736 } 10737 if (dont_add_it == 0) { 10738 /* add diskname to list */ 10739 (void) strcpy(cptr, diskname); 10740 cptr += size; 10741 } 10742 kmem_free(diskname, strlen(sn->suf_data) + 1); 10743 } 10744 } 10745 } 10746 /* null terminate the list */ 10747 *cptr = '\0'; 10748 /* 10749 * need to save the new pointer so that calling routine can continue 10750 * to add information onto the end. 10751 */ 10752 *ctdptr = cptr; 10753 single_thread_end(s); 10754 mddb_setexit(s); 10755 return (cnt); 10756 } 10757 10758 /* 10759 * mddb_validate_lb - count the number of lb's with invalid device id's. Keep 10760 * track of length of longest devicename. 
10761 * RETURN 10762 * -1 error 10763 * cnt number of lb's with invalid devid's 10764 */ 10765 int 10766 mddb_validate_lb( 10767 set_t setno, 10768 int *rmaxsz 10769 ) 10770 { 10771 mddb_set_t *s; 10772 int err = 0; 10773 mddb_lb_t *lbp; 10774 int li; 10775 mddb_did_blk_t *did_blk; 10776 mddb_did_info_t *did_info; 10777 int len; 10778 int cnt = 0; 10779 10780 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10781 return (-1); 10782 10783 single_thread_start(s); 10784 lbp = s->s_lbp; 10785 10786 if (lbp->lb_setno != setno) { 10787 single_thread_end(s); 10788 mddb_setexit(s); 10789 return (-1); 10790 } 10791 10792 /* lb must be in devid style */ 10793 if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) 10794 goto mvl_out; 10795 10796 did_blk = s->s_did_icp->did_ic_blkp; 10797 for (li = 0; li < lbp->lb_loccnt; li++) { 10798 char *minor_name; 10799 mddb_locator_t *lp; 10800 dev_t ddi_dev; 10801 ddi_devid_t devid; 10802 ddi_devid_t rtn_devid = NULL; 10803 int get_rval; 10804 10805 did_info = &(did_blk->blk_info[li]); 10806 if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) || 10807 (did_info->info_flags & MDDB_DID_VALID)) 10808 continue; 10809 10810 /* Here we know, did exists but isn't valid */ 10811 10812 lp = &lbp->lb_locators[li]; 10813 ddi_dev = expldev(lp->l_dev); 10814 get_rval = mddb_devid_get(s, li, &devid, &minor_name); 10815 ASSERT(get_rval == 1); 10816 if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) && 10817 (ddi_devid_compare(rtn_devid, devid) == 0)) { 10818 did_info->info_flags = MDDB_DID_VALID | 10819 MDDB_DID_EXISTS | MDDB_DID_UPDATED; 10820 } else { 10821 cnt++; 10822 /* 10823 * Future note: Need to do something here 10824 * for the MN diskset case when device ids 10825 * are supported in disksets. 10826 * Can't add until merging devids_in_diskset 10827 * code into code base. 
10828 */ 10829 len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len; 10830 if (*rmaxsz < len) 10831 *rmaxsz = len; 10832 } 10833 if (rtn_devid != NULL) 10834 ddi_devid_free(rtn_devid); 10835 } 10836 10837 mvl_out: 10838 10839 if (push_lb(s) != 0) 10840 cnt = -1; 10841 (void) upd_med(s, "mddb_validate_lb(0)"); 10842 single_thread_end(s); 10843 mddb_setexit(s); 10844 return (cnt); 10845 } 10846 10847 int 10848 check_active_locators() 10849 { 10850 mddb_set_t *s; 10851 mddb_lb_t *lbp; 10852 int li; 10853 int active = 0; 10854 10855 mutex_enter(&mddb_lock); 10856 /* there is nothing here..so we can unload */ 10857 if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) { 10858 mutex_exit(&mddb_lock); 10859 return (0); 10860 } 10861 s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db; 10862 lbp = s->s_lbp; 10863 if (lbp == NULL) { 10864 mutex_exit(&mddb_lock); 10865 return (0); 10866 } 10867 10868 for (li = 0; li < lbp->lb_loccnt; li++) { 10869 mddb_locator_t *lp = &lbp->lb_locators[li]; 10870 if (lp->l_flags & MDDB_F_ACTIVE) { 10871 active = 1; 10872 break; 10873 } 10874 } 10875 mutex_exit(&mddb_lock); 10876 return (active); 10877 } 10878 10879 /* 10880 * regetoptrecord: 10881 * -------------- 10882 * Update the in-core optimized resync record contents by re-reading the 10883 * record from the on-disk metadb. 10884 * The contents of the resync record will be overwritten by calling this 10885 * routine. This means that callers that require the previous contents to 10886 * be preserved must save the data before calling this routine. 10887 * Return values: 10888 * 0 - successfully read in resync record from a mddb 10889 * 1 - failure. Unable to read resync record from either mddb. 
10890 */ 10891 static int 10892 regetoptrecord( 10893 mddb_set_t *s, 10894 mddb_de_ic_t *dep 10895 ) 10896 { 10897 mddb_lb_t *lbp; 10898 mddb_locator_t *lp; 10899 mddb_rb32_t *rbp, *crbp; 10900 int li; 10901 int i; 10902 int err = 0; 10903 size_t recsize; 10904 10905 #if defined(_ILP32) && !defined(lint) 10906 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 10907 #endif 10908 10909 recsize = dep->de_recsize; 10910 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 10911 10912 single_thread_start(s); 10913 rbp = dep->de_rb; 10914 10915 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 10916 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 10917 10918 lbp = s->s_lbp; 10919 10920 for (i = 0; i < 2; i++) { 10921 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 10922 continue; 10923 li = dep->de_optinfo[i].o_li; 10924 lp = &lbp->lb_locators[li]; 10925 10926 if (! (lp->l_flags & MDDB_F_ACTIVE) || 10927 (lp->l_flags & MDDB_F_EMASTER)) 10928 continue; 10929 10930 /* 10931 * re-read the optimized resync record with failfast set 10932 * since a failed disk could lead to a very long wait. 
10933 */ 10934 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 10935 dep->de_blkcount, li, B_FAILFAST); 10936 10937 if (err) 10938 continue; 10939 10940 if (rbp->rb_magic != MDDB_MAGIC_RB) 10941 continue; 10942 10943 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 10944 continue; 10945 10946 /* Check the crc for this record */ 10947 if (rec_crcchk(s, dep, rbp)) { 10948 continue; 10949 } 10950 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 10951 10952 if (rbp == crbp) { 10953 if (rbp->rb_checksum != crbp->rb_checksum) 10954 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 10955 break; 10956 } 10957 rbp = crbp; 10958 } 10959 10960 single_thread_end(s); 10961 10962 if (rbp == crbp) { 10963 rbp->rb_private = 0; 10964 kmem_free((caddr_t)crbp, recsize); 10965 return (0); 10966 } 10967 uniqtime32(&rbp->rb_timestamp); 10968 /* Generate the crc for this record */ 10969 rec_crcgen(s, dep, rbp); 10970 kmem_free((caddr_t)crbp, recsize); 10971 return (1); 10972 } 10973 10974 /* 10975 * mddb_reread_rr: 10976 * Re-read the resync record from the on-disk copy. This is required for 10977 * multi-node support so that a new mirror-owner can determine if a resync 10978 * operation is required to guarantee data integrity. 
10979 * 10980 * Arguments: 10981 * setno Associated set 10982 * id Resync record ID 10983 * 10984 * Return Value: 10985 * 0 successful reread 10986 * -1 invalid set (not multi-node or non-existant) 10987 * >0 metadb state invalid, failed to reread 10988 */ 10989 int 10990 mddb_reread_rr( 10991 set_t setno, 10992 mddb_recid_t id 10993 ) 10994 { 10995 mddb_set_t *s; 10996 int err = 0; 10997 mddb_db_t *dbp; 10998 mddb_de_ic_t *dep; 10999 11000 if (setno >= md_nsets) 11001 return (-1); 11002 11003 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 11004 return (-1); 11005 11006 if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) { 11007 mddb_setexit(s); 11008 return (-1); 11009 } 11010 11011 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11012 dep = dbp->db_firstentry; 11013 while (dep && (dep->de_recid != DBID(id))) 11014 dep = dep->de_next; 11015 if (dep != NULL) 11016 break; 11017 } 11018 11019 if (dep != NULL) { 11020 err = regetoptrecord(s, dep); 11021 } else { 11022 err = -1; 11023 } 11024 mddb_setexit(s); 11025 return (err); 11026 } 11027 11028 /* 11029 * Set owner associated with MN optimized resync record. 11030 * 11031 * Optimized records have an owner node associated with them in 11032 * a MN diskset. The owner is only set on a node that is actively 11033 * writing to that record. The other nodes will show that record 11034 * as having an invalid owner. The owner for an optimized record 11035 * is used during fixoptrecord to determine which node should 11036 * write out the record when the replicas associated with that 11037 * optimized record have been changed. 11038 * 11039 * Called directly from mirror driver and not from an ioctl. 11040 * 11041 * Returns 11042 * NULL if successful. 11043 * MDDB_E_NORECORD if record not found. 
11044 */ 11045 int 11046 mddb_setowner( 11047 mddb_recid_t id, 11048 md_mn_nodeid_t owner 11049 ) 11050 { 11051 mddb_set_t *s; 11052 mddb_db_t *dbp; 11053 mddb_de_ic_t *dep; 11054 int found = 0; 11055 11056 11057 if (DBSET(id) >= md_nsets) 11058 return (MDDB_E_NORECORD); 11059 11060 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 11061 return (MDDB_E_NORECORD); 11062 11063 id = DBID(id); 11064 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11065 for (dep = dbp->db_firstentry; 11066 dep != NULL; dep = dep->de_next) { 11067 if (dep->de_recid != id) 11068 continue; 11069 dep->de_owner_nodeid = owner; 11070 found = 1; 11071 break; 11072 } 11073 if (found) 11074 break; 11075 } 11076 11077 mddb_setexit(s); 11078 11079 if (!found) { 11080 return (MDDB_E_NORECORD); 11081 } 11082 11083 return (NULL); 11084 } 11085 11086 /* 11087 * mddb_parse re-reads portions of the mddb from disk given a list 11088 * of good replicas to read from and flags describing 11089 * which portion of the mddb to read in. 11090 * 11091 * Used in a MN diskset when the master has made a change to some part 11092 * of the mddb and wants to relay this information to the slaves. 
11093 */ 11094 int 11095 mddb_parse(mddb_parse_parm_t *mpp) 11096 { 11097 mddb_set_t *s; 11098 int err = 0; 11099 mddb_locator_t *lp, *old_lp; 11100 mddb_lb_t *lbp, *old_lbp; 11101 int rval = 0; 11102 int i, li; 11103 int found_good_one = 0; 11104 mddb_ln_t *lnp; 11105 mddb_block_t ln_blkcnt; 11106 md_error_t *ep = &mpp->c_mde; 11107 11108 if (mpp->c_setno >= md_nsets) 11109 return (EINVAL); 11110 11111 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11112 return (0); 11113 11114 if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11115 return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno)); 11116 } 11117 11118 if (!(MD_MNSET_SETNO(mpp->c_setno))) { 11119 mddb_setexit_no_parse(s); 11120 return (EINVAL); 11121 } 11122 11123 /* 11124 * Master node initiated this request, so there's no work for 11125 * the master node to do. 11126 */ 11127 if (md_set[mpp->c_setno].s_am_i_master) { 11128 mddb_setexit_no_parse(s); 11129 return (rval); 11130 } 11131 11132 single_thread_start(s); 11133 11134 if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) { 11135 lbp = 0; 11136 for (i = 0; i < MDDB_NLB; i++) { 11137 /* Walk through master's active list */ 11138 if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE)) 11139 continue; 11140 if (s->s_mbiarray[i] == NULL) 11141 continue; 11142 11143 /* Assumes master blocks are already setup */ 11144 if (lbp == (mddb_lb_t *)NULL) { 11145 lbp = (mddb_lb_t *)kmem_zalloc( 11146 dbtob(MDDB_MNLBCNT), KM_SLEEP); 11147 } 11148 err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i); 11149 11150 if (err) 11151 continue; 11152 11153 if (lbp->lb_magic != MDDB_MAGIC_LB) 11154 continue; 11155 if (lbp->lb_blkcnt != MDDB_MNLBCNT) 11156 continue; 11157 if (revchk(MDDB_REV_MNLB, lbp->lb_revision)) 11158 continue; 11159 if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT), 11160 NULL)) 11161 continue; 11162 if (lbp->lb_setno != s->s_setno) 11163 continue; 11164 /* 11165 * a commit count of zero means this locator has 11166 * been deleted 11167 */ 11168 if 
(lbp->lb_commitcnt == 0) { 11169 continue; 11170 } 11171 /* Found a good locator - keep it */ 11172 found_good_one = 1; 11173 break; 11174 } 11175 11176 /* 11177 * If found a good copy of the mddb, then read it into 11178 * this node's locator block. Fix up the set's s_mbiarray 11179 * pointer (master block incore array pointer) to be 11180 * in sync with the newly read in locator block. If a 11181 * new mddb was added, read in the master blocks associated 11182 * with the new mddb. If an mddb was deleted, free the 11183 * master blocks associated with deleted mddb. 11184 */ 11185 if (found_good_one) { 11186 /* Compare old and new view of mddb locator blocks */ 11187 old_lbp = s->s_lbp; 11188 for (li = 0; li < lbp->lb_loccnt; li++) { 11189 int mn_set; 11190 11191 lp = &lbp->lb_locators[li]; 11192 old_lp = &old_lbp->lb_locators[li]; 11193 11194 /* If old and new views match, continue */ 11195 if ((lp->l_flags & MDDB_F_ACTIVE) == 11196 (old_lp->l_flags & MDDB_F_ACTIVE)) 11197 continue; 11198 11199 if (lp->l_flags & MDDB_F_ACTIVE) { 11200 /* 11201 * If new mddb has been added - delete 11202 * old mbiarray and get new one. 11203 * 11204 * When devids are supported, will 11205 * need to get dev from devid. 11206 */ 11207 if (s->s_mbiarray[li]) { 11208 free_mbipp(&s->s_mbiarray[li]); 11209 } 11210 /* 11211 * If getmasters fails, getmasters 11212 * will set appropriate error flags. 11213 */ 11214 s->s_mbiarray[li] = getmasters(s, 11215 md_expldev(lp->l_dev), lp->l_blkno, 11216 (uint_t *)&(lp->l_flags), &mn_set); 11217 } else if (lp->l_flags & MDDB_F_DELETED) { 11218 /* 11219 * If old one has been deleted - 11220 * delete old mbiarray. 
11221 */ 11222 if (s->s_mbiarray[li]) { 11223 free_mbipp(&s->s_mbiarray[li]); 11224 } 11225 } 11226 } 11227 11228 /* Free this node's old view of mddb locator blocks */ 11229 kmem_free((caddr_t)s->s_lbp, 11230 dbtob(s->s_lbp->lb_blkcnt)); 11231 s->s_lbp = lbp; 11232 } else { 11233 if (lbp) 11234 kmem_free(lbp, dbtob(MDDB_MNLBCNT)); 11235 } 11236 } 11237 11238 if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) { 11239 lnp = s->s_lnp; 11240 lbp = s->s_lbp; 11241 ln_blkcnt = lbp->lb_lnblkcnt; 11242 s->s_lnp = NULL; /* readlocnames does this anyway */ 11243 for (li = 0; li < lbp->lb_loccnt; li++) { 11244 lp = &lbp->lb_locators[li]; 11245 11246 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 11247 (lp->l_flags & MDDB_F_EMASTER)) 11248 continue; 11249 11250 /* Successfully read the locator names */ 11251 if (readlocnames(s, li) == 0) 11252 break; 11253 } 11254 11255 if (li == lbp->lb_loccnt) { 11256 /* Did not successfully read locnames; restore lnp */ 11257 s->s_lnp = lnp; 11258 } else { 11259 /* readlocnames successful, free old struct */ 11260 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); 11261 } 11262 } 11263 11264 if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) { 11265 mddb_de_ic_t *dep, *tdep, *first_dep, *dep2; 11266 mddb_db_t *dbp; 11267 mddb_db32_t *db32p; 11268 mddb_de32_t *de32p, *de32p2; 11269 int writeout; 11270 11271 lbp = s->s_lbp; 11272 /* 11273 * Walk through directory block and directory entry incore 11274 * linked list looking for optimized resync records. 11275 * For each opt record found, re-read in directory block. 11276 * The directoy block consists of a number of directory 11277 * entries. The directory entry for this opt record will 11278 * describe which 2 mddbs actually contain the resync record 11279 * since it could have been relocated by the master node 11280 * due to mddb failure or mddb deletion. 
If this node 11281 * is the record owner for this opt record, then write out 11282 * the record to the 2 mddbs listed in the directory entry 11283 * if the mddbs locations are different than previously known. 11284 */ 11285 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11286 for (dep = dbp->db_firstentry; dep; 11287 dep = dep->de_next) { 11288 /* Found an opt record */ 11289 if (dep->de_flags & MDDB_F_OPT) 11290 break; 11291 } 11292 /* If no opt records found, go to next dbp */ 11293 if (dep == NULL) 11294 continue; 11295 11296 /* 11297 * Reread directory block from disk since 11298 * master could have rewritten in during fixoptrecord. 11299 */ 11300 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, 11301 KM_SLEEP); 11302 create_db32rec(db32p, dbp); 11303 for (li = 0; li < lbp->lb_loccnt; li++) { 11304 lp = &lbp->lb_locators[li]; 11305 11306 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 11307 (lp->l_flags & MDDB_F_EMASTER)) 11308 continue; 11309 11310 err = readblks(s, (caddr_t)db32p, 11311 db32p->db32_blknum, 1, li); 11312 if (err) 11313 continue; 11314 11315 /* Reverify db; go to next mddb if bad */ 11316 if ((db32p->db32_magic != MDDB_MAGIC_DB) || 11317 (revchk(MDDB_REV_DB, 11318 db32p->db32_revision)) || 11319 (crcchk(db32p, &db32p->db32_checksum, 11320 MDDB_BSIZE, NULL))) { 11321 continue; 11322 } else { 11323 break; 11324 } 11325 } 11326 /* 11327 * If all mddbs are unavailable then panic since 11328 * this slave cannot be allowed to continue out-of-sync 11329 * with the master node. Since the optimized resync 11330 * records are written by all nodes, all nodes must 11331 * stay in sync with the master. 11332 * 11333 * This also handles the case when all storage 11334 * connectivity to a slave node has failed. The 11335 * slave node will send an MDDB_OPTRECERR message to 11336 * the master node when the slave node has been unable 11337 * to write an optimized resync record to both 11338 * designated mddbs. 
After the master has fixed the 11339 * optimized records to be on available mddbs, the 11340 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS) 11341 * is sent to all slave nodes. If a slave node is 11342 * unable to access any mddb in order to read in the 11343 * relocated optimized resync record, then the slave 11344 * node must panic. 11345 */ 11346 if (li == lbp->lb_loccnt) { 11347 kmem_free((caddr_t)db32p, MDDB_BSIZE); 11348 cmn_err(CE_PANIC, "md: mddb: Node unable to " 11349 "access any SVM state database " 11350 "replicas for diskset %s\n", s->s_setname); 11351 } 11352 /* 11353 * Setup temp copy of linked list of de's. 11354 * Already have an incore copy, but need to walk 11355 * the directory entry list contained in the 11356 * new directory block that was just read in above. 11357 * After finding the directory entry of an opt record 11358 * by walking the incore list, find the corresponding 11359 * entry in the temporary list and then update 11360 * the incore directory entry record with 11361 * the (possibly changed) mddb location stored 11362 * for the optimized resync records. 11363 */ 11364 de32p = (mddb_de32_t *) 11365 ((void *) ((caddr_t) 11366 (&db32p->db32_firstentry) 11367 + sizeof (db32p->db32_firstentry))); 11368 tdep = (mddb_de_ic_t *) 11369 kmem_zalloc(sizeof (mddb_de_ic_t) - 11370 sizeof (mddb_block_t) + 11371 sizeof (mddb_block_t) * 11372 de32p->de32_blkcount, KM_SLEEP); 11373 de32tode(de32p, tdep); 11374 first_dep = tdep; 11375 while (de32p && de32p->de32_next) { 11376 de32p2 = nextentry(de32p); 11377 dep2 = (mddb_de_ic_t *)kmem_zalloc( 11378 sizeof (mddb_de_ic_t) - 11379 sizeof (mddb_block_t) + 11380 sizeof (mddb_block_t) * 11381 de32p2->de32_blkcount, KM_SLEEP); 11382 de32tode(de32p2, dep2); 11383 tdep->de_next = dep2; 11384 tdep = dep2; 11385 de32p = de32p2; 11386 } 11387 11388 /* Now, walk the incore directory entry list */ 11389 for (dep = dbp->db_firstentry; dep; 11390 dep = dep->de_next) { 11391 if (! 
(dep->de_flags & MDDB_F_OPT)) 11392 continue; 11393 /* 11394 * Found an opt record in the incore copy. 11395 * Find the corresponding entry in the temp 11396 * list. If anything has changed in the 11397 * opt record info between the incore copy 11398 * and the temp copy, update the incore copy 11399 * and set a flag to writeout the opt record 11400 * to the new mddb locations. 11401 */ 11402 for (tdep = first_dep; tdep; 11403 tdep = tdep->de_next) { 11404 if (dep->de_recid == tdep->de_recid) { 11405 writeout = 0; 11406 /* Check first mddb location */ 11407 if ((dep->de_optinfo[0].o_li != 11408 tdep->de_optinfo[0].o_li) || 11409 (dep->de_optinfo[0]. 11410 o_flags != tdep->de_optinfo 11411 [0].o_flags)) { 11412 dep->de_optinfo[0] = 11413 tdep->de_optinfo[0]; 11414 writeout = 1; 11415 } 11416 /* Check second mddb location */ 11417 if ((dep->de_optinfo[1].o_li != 11418 tdep->de_optinfo[1].o_li) || 11419 (dep->de_optinfo[1]. 11420 o_flags != tdep->de_optinfo 11421 [1].o_flags)) { 11422 dep->de_optinfo[1] = 11423 tdep->de_optinfo[1]; 11424 writeout = 1; 11425 } 11426 /* 11427 * Record owner should rewrite 11428 * it 11429 */ 11430 if ((writeout) && 11431 (dep->de_owner_nodeid == 11432 md_set[mpp->c_setno]. 11433 s_nodeid)) 11434 (void) writeoptrecord(s, 11435 dep); 11436 break; 11437 } 11438 } 11439 } 11440 /* 11441 * Update the incore checksum information for this 11442 * directory block to match the newly read in checksum. 11443 * This should have only changed if the incore and 11444 * temp directory entries differed, but it takes 11445 * more code to do the check than to just update 11446 * the information everytime. 
11447 */ 11448 dbp->db_checksum = db32p->db32_checksum; 11449 11450 /* Now free everything */ 11451 tdep = first_dep; 11452 while (tdep) { 11453 dep2 = tdep->de_next; 11454 kmem_free((caddr_t)tdep, 11455 sizeofde(tdep)); 11456 tdep = dep2; 11457 } 11458 kmem_free((caddr_t)db32p, MDDB_BSIZE); 11459 } 11460 rval = 0; 11461 } 11462 out: 11463 single_thread_end(s); 11464 mddb_setexit_no_parse(s); 11465 return (rval); 11466 } 11467 11468 int 11469 mddb_block(mddb_block_parm_t *mbp) 11470 { 11471 mddb_set_t *s; 11472 int err = 0; 11473 md_error_t *ep = &mbp->c_mde; 11474 11475 if (mbp->c_setno >= md_nsets) 11476 return (EINVAL); 11477 11478 /* 11479 * If the new_master flag is set for this setno we are in the middle 11480 * of a reconfig cycle, and blocking or unblocking is not needed. 11481 * Hence we can return success immediately 11482 */ 11483 if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) { 11484 return (0); 11485 } 11486 11487 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11488 return (0); 11489 11490 if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11491 return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno)); 11492 } 11493 11494 if (!(MD_MNSET_SETNO(mbp->c_setno))) { 11495 mddb_setexit_no_parse(s); 11496 return (EINVAL); 11497 } 11498 11499 single_thread_start(s); 11500 11501 if (mbp->c_blk_flags & MDDB_BLOCK_PARSE) 11502 md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); 11503 11504 if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE) 11505 md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); 11506 11507 single_thread_end(s); 11508 mddb_setexit_no_parse(s); 11509 return (err); 11510 } 11511 11512 /* 11513 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords 11514 * to relocate any optimized resync records to available mddbs. 11515 * This routine is only called on the master node. 11516 * 11517 * Used in a MN diskset when a slave node has failed to write an optimized 11518 * resync record. 
The failed mddb information is sent to the master node 11519 * so the master can relocate the optimized records, if possible. If the 11520 * failed mddb information has a mddb marked as failed that was previously 11521 * marked active on the master, the master sets its incore mddb state to 11522 * EWRITE and sets the PARSE_LOCBLK flag. The master node then attempts 11523 * to relocate any optimized records on the newly failed mddbs by calling 11524 * fixoptrecords. (fixoptrecords will set the PARSE_OPTRECS flag if any 11525 * optimized records are relocated.) 11526 * 11527 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE 11528 * flags and will send a PARSE message to the slave nodes. The PARSE_LOCBLK 11529 * flag causes the slave node to re-read in the locator block from disk. 11530 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory 11531 * blocks and write out any optimized resync records that have been 11532 * relocated to a different mddb. 
11533 */ 11534 int 11535 mddb_optrecfix(mddb_optrec_parm_t *mop) 11536 { 11537 mddb_set_t *s; 11538 int err = 0; 11539 mddb_lb_t *lbp; 11540 mddb_mnlb_t *mnlbp; 11541 mddb_locator_t *lp; 11542 int li; 11543 mddb_mnsidelocator_t *mnslp; 11544 mddb_drvnm_t *dn; 11545 int i, j; 11546 md_replica_recerr_t *recerr; 11547 md_error_t *ep = &mop->c_mde; 11548 int something_changed = 0; 11549 int alc, lc; 11550 int setno; 11551 11552 setno = mop->c_setno; 11553 if (mop->c_setno >= md_nsets) 11554 return (EINVAL); 11555 11556 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11557 return (0); 11558 11559 if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11560 return (mddbstatus2error(ep, err, NODEV32, mop->c_setno)); 11561 } 11562 11563 if (!(MD_MNSET_SETNO(mop->c_setno))) { 11564 mddb_setexit(s); 11565 return (EINVAL); 11566 } 11567 11568 single_thread_start(s); 11569 lbp = s->s_lbp; 11570 mnlbp = (mddb_mnlb_t *)lbp; 11571 11572 /* 11573 * If slave node has seen an mddb failure, but the master node 11574 * hasn't encountered this failure, mark the mddb as failed on 11575 * the master node and set the something_changed flag to 1. 
11576 */ 11577 for (i = 0; i < 2; i++) { 11578 recerr = &mop->c_recerr[i]; 11579 if (recerr->r_flags & MDDB_F_EWRITE) { 11580 li = recerr->r_li; 11581 lp = &lbp->lb_locators[li]; 11582 for (j = 0; j < MD_MNMAXSIDES; j++) { 11583 mnslp = &mnlbp->lb_mnsidelocators[j][li]; 11584 if (mnslp->mnl_sideno == s->s_sideno) 11585 break; 11586 } 11587 /* Do quick check using li */ 11588 if (j != MD_MNMAXSIDES) 11589 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 11590 11591 if ((j != MD_MNMAXSIDES) && 11592 (strncmp(dn->dn_data, recerr->r_driver_name, 11593 MD_MAXDRVNM) == 0) && 11594 (recerr->r_blkno == lp->l_blkno) && 11595 (recerr->r_mnum == mnslp->mnl_mnum)) { 11596 if ((lp->l_flags & MDDB_F_ACTIVE) || 11597 ((lp->l_flags & MDDB_F_EWRITE) == 0)) { 11598 something_changed = 1; 11599 lp->l_flags |= MDDB_F_EWRITE; 11600 lp->l_flags &= ~MDDB_F_ACTIVE; 11601 } 11602 } else { 11603 /* 11604 * Passed in li from slave does not match 11605 * the replica in the master's structures. 11606 * This could have occurred if a delete 11607 * mddb command was running when the 11608 * optimized resync record had a failure. 11609 * Search all replicas for this entry. 11610 * If no match, just ignore. 11611 * If a match, set replica in error. 
11612 */ 11613 for (li = 0; li < lbp->lb_loccnt; li++) { 11614 lp = &lbp->lb_locators[li]; 11615 if (lp->l_flags & MDDB_F_DELETED) 11616 continue; 11617 11618 for (j = 0; j < MD_MNMAXSIDES; j++) { 11619 mnslp = 11620 &mnlbp-> 11621 lb_mnsidelocators[j][li]; 11622 if (mnslp->mnl_sideno == 11623 s->s_sideno) 11624 break; 11625 } 11626 if (j == MD_MNMAXSIDES) 11627 continue; 11628 11629 dn = &lbp-> 11630 lb_drvnm[mnslp->mnl_drvnm_index]; 11631 if ((strncmp(dn->dn_data, 11632 recerr->r_driver_name, 11633 MD_MAXDRVNM) == 0) && 11634 (recerr->r_blkno == lp->l_blkno) && 11635 (recerr->r_mnum == 11636 mnslp->mnl_mnum)) { 11637 if ((lp->l_flags & 11638 MDDB_F_ACTIVE) || 11639 ((lp->l_flags & 11640 MDDB_F_EWRITE) == 0)) { 11641 something_changed = 1; 11642 lp->l_flags |= 11643 MDDB_F_EWRITE; 11644 lp->l_flags &= 11645 ~MDDB_F_ACTIVE; 11646 } 11647 break; 11648 } 11649 } 11650 } 11651 } 11652 } 11653 11654 /* 11655 * If this message changed nothing, then we're done since this 11656 * failure has already been handled. 11657 * If some mddb state has been changed, send a parse message to 11658 * the slave nodes so that the slaves will re-read the locator 11659 * block from disk. 11660 */ 11661 if (something_changed == 0) { 11662 single_thread_end(s); 11663 mddb_setexit(s); 11664 return (0); 11665 } else { 11666 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 11667 } 11668 11669 /* 11670 * Scan replicas setting MD_SET_TOOFEW if 11671 * 50% or more of the mddbs have seen errors. 11672 * Note: Don't call selectreplicas or writeretry 11673 * since these routines may end up setting the ACTIVE flag 11674 * on a failed mddb if the master is able to access the mddb 11675 * but the slave node couldn't. Need to have the ACTIVE flag 11676 * turned off in order to relocate the optimized records to 11677 * mddbs that are (hopefully) available on all nodes. 
11678 */ 11679 alc = 0; 11680 lc = 0; 11681 for (li = 0; li < lbp->lb_loccnt; li++) { 11682 lp = &lbp->lb_locators[li]; 11683 if (lp->l_flags & MDDB_F_DELETED) 11684 continue; 11685 lc++; 11686 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11687 continue; 11688 alc++; 11689 } 11690 11691 /* 11692 * If more than 50% mddbs have failed, then don't relocate opt recs. 11693 * The node sending the mddb failure information will detect TOOFEW 11694 * and will panic when it attempts to re-write the optimized record. 11695 */ 11696 if (alc < ((lc + 1) / 2)) { 11697 md_set_setstatus(setno, MD_SET_TOOFEW); 11698 (void) push_lb(s); 11699 (void) upd_med(s, "mddb_optrecfix(0)"); 11700 single_thread_end(s); 11701 mddb_setexit(s); 11702 return (0); 11703 } 11704 11705 /* Attempt to relocate optimized records that are on failed mddbs */ 11706 (void) fixoptrecords(s); 11707 11708 /* Push changed locator block out to disk */ 11709 (void) push_lb(s); 11710 (void) upd_med(s, "mddb_optrecfix(1)"); 11711 11712 /* Recheck for TOOFEW after writing out locator blocks */ 11713 alc = 0; 11714 lc = 0; 11715 for (li = 0; li < lbp->lb_loccnt; li++) { 11716 lp = &lbp->lb_locators[li]; 11717 if (lp->l_flags & MDDB_F_DELETED) 11718 continue; 11719 lc++; 11720 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11721 continue; 11722 alc++; 11723 } 11724 11725 /* If more than 50% mddbs have failed, then don't relocate opt recs */ 11726 if (alc < ((lc + 1) / 2)) { 11727 md_set_setstatus(setno, MD_SET_TOOFEW); 11728 single_thread_end(s); 11729 mddb_setexit(s); 11730 return (0); 11731 } 11732 11733 single_thread_end(s); 11734 mddb_setexit(s); 11735 return (0); 11736 } 11737 11738 /* 11739 * Check if incore mddb on master node matches ondisk mddb. 11740 * If not, master writes out incore view to all mddbs. 11741 * Have previously verified that master is an owner of the 11742 * diskset (master has snarfed diskset) and that diskset is 11743 * not stale. 
11744 * 11745 * Meant to be called during reconfig cycle during change of master. 11746 * Previous master in diskset may have changed the mddb and 11747 * panic'd before relaying information to slave nodes. New 11748 * master node just writes out its incore view of the mddb and 11749 * the replay of the change log will resync all the nodes. 11750 * 11751 * Only supported for MN disksets. 11752 * 11753 * Return values: 11754 * 0 - success 11755 * non-zero - failure 11756 */ 11757 int 11758 mddb_check_write_ioctl(mddb_config_t *info) 11759 { 11760 int err = 0; 11761 set_t setno = info->c_setno; 11762 mddb_set_t *s; 11763 int li; 11764 mddb_locator_t *lp; 11765 mddb_lb_t *lbp; 11766 mddb_mnlb_t *mnlbp_od; 11767 mddb_ln_t *lnp; 11768 mddb_mnln_t *mnlnp_od; 11769 mddb_db_t *dbp; 11770 mddb_de_ic_t *dep; 11771 int write_out_mddb; 11772 md_error_t *ep = &info->c_mde; 11773 int mddb_err = 0; 11774 int prev_li = 0; 11775 int rval = 0; 11776 int alc, lc; 11777 int mddbs_present = 0; 11778 11779 /* Verify that setno is in valid range */ 11780 if (setno >= md_nsets) 11781 return (EINVAL); 11782 11783 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11784 return (0); 11785 11786 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 11787 return (mddbstatus2error(ep, err, NODEV32, setno)); 11788 } 11789 11790 /* Calling diskset must be a MN diskset */ 11791 if (!(MD_MNSET_SETNO(setno))) { 11792 mddb_setexit(s); 11793 return (EINVAL); 11794 } 11795 11796 /* Re-verify that set is not stale */ 11797 if (md_get_setstatus(setno) & MD_SET_STALE) { 11798 mddb_setexit(s); 11799 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno)); 11800 } 11801 11802 lbp = s->s_lbp; 11803 lnp = s->s_lnp; 11804 11805 /* 11806 * Previous master could have died during the write of data to 11807 * the mddbs so that the ondisk mddbs may not be consistent. 11808 * So, need to check the contents of the first and last active mddb 11809 * to see if the mddbs need to be rewritten. 
11810 */ 11811 for (li = 0; li < lbp->lb_loccnt; li++) { 11812 int checkcopy_err; 11813 11814 lp = &lbp->lb_locators[li]; 11815 /* Find replica that is active */ 11816 if (lp->l_flags & MDDB_F_DELETED) 11817 continue; 11818 mddbs_present = 1; 11819 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11820 continue; 11821 if (s->s_mbiarray[li] == NULL) 11822 continue; 11823 /* Check locator block */ 11824 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), 11825 KM_SLEEP); 11826 /* read in on-disk locator block */ 11827 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); 11828 11829 /* If err, try next mddb */ 11830 if (err) { 11831 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11832 continue; 11833 } 11834 11835 /* 11836 * We resnarf all changelog entries for this set. 11837 * They may have been altered by the previous master 11838 */ 11839 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11840 for (dep = dbp->db_firstentry; dep; dep = 11841 dep->de_next) { 11842 if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) { 11843 continue; 11844 } 11845 /* 11846 * This has been alloc'ed while 11847 * joining the set 11848 */ 11849 if (dep->de_rb) { 11850 kmem_free(dep->de_rb, dep->de_recsize); 11851 dep->de_rb = (mddb_rb32_t *)NULL; 11852 } 11853 if (dep->de_rb_userdata) { 11854 kmem_free(dep->de_rb_userdata, 11855 dep->de_reqsize); 11856 dep->de_rb_userdata = (caddr_t)NULL; 11857 } 11858 11859 err = getrecord(s, dep, li); 11860 if (err) { 11861 /* 11862 * When we see on error while reading 11863 * the changelog entries, we move on 11864 * to the next mddb 11865 */ 11866 err = 1; 11867 break; /* out of inner for-loop */ 11868 } 11869 allocuserdata(dep); 11870 } 11871 if (err) 11872 break; /* out of outer for-loop */ 11873 } 11874 11875 /* If err, try next mddb */ 11876 if (err) { 11877 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11878 continue; 11879 } 11880 11881 /* Is incore locator block same as ondisk? 
*/ 11882 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) 11883 == 1) { 11884 write_out_mddb = 1; 11885 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11886 break; 11887 } 11888 11889 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11890 11891 /* If lb ok, check locator names */ 11892 mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT), 11893 KM_SLEEP); 11894 /* read in on-disk locator names */ 11895 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, 11896 lbp->lb_lnblkcnt, li); 11897 11898 /* If err, try next mddb */ 11899 if (err) { 11900 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); 11901 continue; 11902 } 11903 11904 /* Are incore locator names same as ondisk? */ 11905 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) 11906 == 1) { 11907 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11908 write_out_mddb = 1; 11909 break; 11910 } 11911 11912 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11913 11914 /* 11915 * Check records in mddb. 11916 * If a read error is encountered, set the error flag and 11917 * continue to the next mddb. Otherwise, if incore data is 11918 * different from ondisk, then set the flag to write out 11919 * the mddb and break out. 11920 */ 11921 checkcopy_err = checkcopy(s, li); 11922 if (checkcopy_err == MDDB_F_EREAD) { 11923 lp->l_flags |= MDDB_F_EREAD; 11924 mddb_err = 1; 11925 continue; 11926 } else if (checkcopy_err == 1) { 11927 write_out_mddb = 1; 11928 break; 11929 } 11930 /* 11931 * Have found first active mddb and the data is the same as 11932 * incore - break out of loop 11933 */ 11934 write_out_mddb = 0; 11935 break; 11936 } 11937 11938 /* 11939 * Skip checking for last active mddb if: 11940 * - already found a mismatch in the first active mddb 11941 * (write_out_mddb is 1) OR 11942 * - didn't find a readable mddb when looking for first 11943 * active mddb (there are mddbs present but all failed 11944 * when read was attempted). 
11945 * 11946 * In either case, go to write_out_mddb label in order to attempt 11947 * to write out the data. If < 50% mddbs are available, panic. 11948 */ 11949 if ((write_out_mddb == 1) || 11950 ((li == lbp->lb_loccnt) && mddbs_present)) { 11951 write_out_mddb = 1; 11952 goto write_out_mddb; 11953 } 11954 11955 /* 11956 * Save which index was checked for the first active mddb. If only 1 11957 * active mddb, don't want to recheck the same mddb when looking for 11958 * last active mddb. 11959 */ 11960 prev_li = li; 11961 11962 /* 11963 * Now, checking for last active mddb. If found same index as before 11964 * (only 1 active mddb), then skip. 11965 */ 11966 for (li = (lbp->lb_loccnt - 1); li >= 0; li--) { 11967 int checkcopy_err; 11968 11969 lp = &lbp->lb_locators[li]; 11970 /* Find replica that is active */ 11971 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11972 continue; 11973 if (lp->l_flags & MDDB_F_DELETED) 11974 continue; 11975 if (s->s_mbiarray[li] == NULL) 11976 continue; 11977 /* If already checked mddb, bail out */ 11978 if (li == prev_li) 11979 break; 11980 /* Check locator block */ 11981 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), 11982 KM_SLEEP); 11983 /* read in on-disk locator block */ 11984 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); 11985 11986 /* If err, try next mddb */ 11987 if (err) { 11988 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11989 continue; 11990 } 11991 11992 11993 /* Is incore locator block same as ondisk? 
*/ 11994 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) 11995 == 1) { 11996 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11997 write_out_mddb = 1; 11998 break; 11999 } 12000 12001 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 12002 12003 /* If lb ok, check locator names */ 12004 mnlnp_od = (mddb_mnln_t *) 12005 kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP); 12006 12007 /* read in on-disk locator names */ 12008 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, 12009 lbp->lb_lnblkcnt, li); 12010 12011 /* If err, try next mddb */ 12012 if (err) { 12013 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); 12014 continue; 12015 } 12016 12017 /* Are incore locator names same as ondisk? */ 12018 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) 12019 == 1) { 12020 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 12021 write_out_mddb = 1; 12022 break; 12023 } 12024 12025 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 12026 12027 /* 12028 * Check records in mddb. 12029 * If a read error is encountered, set the error flag and 12030 * continue to the next mddb. Otherwise, if incore data is 12031 * different from ondisk, then set the flag to write out 12032 * the mddb and break out. 12033 */ 12034 checkcopy_err = checkcopy(s, li); 12035 if (checkcopy_err == MDDB_F_EREAD) { 12036 lp->l_flags |= MDDB_F_EREAD; 12037 mddb_err = 1; 12038 continue; 12039 } else if (checkcopy_err == 1) { 12040 write_out_mddb = 1; 12041 break; 12042 } 12043 /* 12044 * Have found last active mddb and the data is the same as 12045 * incore - break out of loop 12046 */ 12047 write_out_mddb = 0; 12048 break; 12049 } 12050 12051 /* 12052 * If ondisk and incore versions of the mddb don't match, then 12053 * write out this node's incore version to disk. 12054 * Or, if unable to read a copy of the mddb, attempt to write 12055 * out a new one. 
12056 */ 12057 write_out_mddb: 12058 if (write_out_mddb) { 12059 /* Recompute free blocks based on incore information */ 12060 computefreeblks(s); /* set up free block bits */ 12061 12062 /* 12063 * Write directory entries and record blocks. 12064 * Use flag MDDB_WRITECOPY_SYNC so that writecopy 12065 * routine won't write out change log records. 12066 */ 12067 for (li = 0; li < lbp->lb_loccnt; li++) { 12068 lp = &lbp->lb_locators[li]; 12069 /* Don't write to inactive or deleted mddbs */ 12070 if (! (lp->l_flags & MDDB_F_ACTIVE)) 12071 continue; 12072 if (lp->l_flags & MDDB_F_DELETED) 12073 continue; 12074 if (s->s_mbiarray[li] == NULL) 12075 continue; 12076 /* If encounter a write error, save it for later */ 12077 if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) { 12078 lp->l_flags |= MDDB_F_EWRITE; 12079 mddb_err = 1; 12080 } 12081 } 12082 12083 /* 12084 * Write out locator blocks to all replicas. 12085 * push_lb will set MDDB_F_EWRITE on replicas that fail. 12086 */ 12087 if (push_lb(s)) 12088 mddb_err = 1; 12089 (void) upd_med(s, "mddb_check_write_ioctl(0)"); 12090 12091 /* Write out locator names to all replicas */ 12092 lnp = s->s_lnp; 12093 uniqtime32(&lnp->ln_timestamp); 12094 lnp->ln_revision = MDDB_REV_MNLN; 12095 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 12096 12097 /* writeall sets MDDB_F_EWRITE if writes fails to replica */ 12098 if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 12099 lbp->lb_lnblkcnt, 0)) 12100 mddb_err = 1; 12101 12102 /* 12103 * The writes to the replicas above would have set 12104 * the MDDB_F_EWRITE flags if any write error was 12105 * encountered. 12106 * If < 50% of the mddbs are available, panic. 
12107 */ 12108 lc = alc = 0; 12109 for (li = 0; li < lbp->lb_loccnt; li++) { 12110 lp = &lbp->lb_locators[li]; 12111 if (lp->l_flags & MDDB_F_DELETED) 12112 continue; 12113 lc++; 12114 /* 12115 * If mddb: 12116 * - is not active (previously had an error) 12117 * - had an error reading the master blocks or 12118 * - had an error in writing to the mddb 12119 * then don't count this mddb in the active count. 12120 */ 12121 if (! (lp->l_flags & MDDB_F_ACTIVE) || 12122 (lp->l_flags & MDDB_F_EMASTER) || 12123 (lp->l_flags & MDDB_F_EWRITE)) 12124 continue; 12125 alc++; 12126 } 12127 if (alc < ((lc + 1) / 2)) { 12128 cmn_err(CE_PANIC, 12129 "md: Panic due to lack of DiskSuite state\n" 12130 " database replicas. Fewer than 50%% of " 12131 "the total were available,\n so panic to " 12132 "ensure data integrity."); 12133 } 12134 } 12135 12136 /* 12137 * If encountered an error during checking or writing of 12138 * mddbs, call selectreplicas so that replica error can 12139 * be properly handled. This will involve another attempt 12140 * to write the mddb out to any mddb marked MDDB_F_EWRITE. 12141 * If mddb still fails, it will have the MDDB_F_ACTIVE bit 12142 * turned off. Set the MDDB_SCANALLSYNC flag so that 12143 * selectreplicas doesn't overwrite the change log entries. 12144 * 12145 * Set the PARSE_LOCBLK flag in the mddb_set structure to show 12146 * that the locator block has been changed. 12147 */ 12148 if (mddb_err) { 12149 (void) selectreplicas(s, MDDB_SCANALLSYNC); 12150 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 12151 } 12152 12153 write_out_end: 12154 mddb_setexit(s); 12155 return (rval); 12156 } 12157 12158 /* 12159 * Set/reset/get set flags in set structure. 12160 * Used during reconfig cycle 12161 * Only supported for MN disksets. 
12162 * 12163 * Return values: 12164 * 0 - success 12165 * non-zero - failure 12166 */ 12167 int 12168 mddb_setflags_ioctl(mddb_setflags_config_t *info) 12169 { 12170 set_t setno = info->sf_setno; 12171 12172 /* Verify that setno is in valid range */ 12173 if (setno >= md_nsets) 12174 return (EINVAL); 12175 12176 /* 12177 * When setting the flags, the set may not 12178 * be snarfed yet. So, don't check for SNARFED or MNset 12179 * and don't call mddb_setenter. 12180 * In order to discourage bad ioctl calls, 12181 * verify that magic field in structure is set correctly. 12182 */ 12183 if (info->sf_magic != MDDB_SETFLAGS_MAGIC) 12184 return (EINVAL); 12185 12186 switch (info->sf_flags) { 12187 case MDDB_NM_SET: 12188 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) 12189 md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC); 12190 if (info->sf_setflags & MD_SET_MN_START_RC) 12191 md_set_setstatus(setno, MD_SET_MN_START_RC); 12192 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) 12193 md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC); 12194 break; 12195 12196 case MDDB_NM_RESET: 12197 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) 12198 md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC); 12199 if (info->sf_setflags & MD_SET_MN_START_RC) 12200 md_clr_setstatus(setno, MD_SET_MN_START_RC); 12201 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) 12202 md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC); 12203 break; 12204 12205 case MDDB_NM_GET: 12206 info->sf_setflags = md_get_setstatus(setno) & 12207 (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC| 12208 MD_SET_MN_MIR_STATE_RC); 12209 break; 12210 } 12211 12212 return (0); 12213 } 12214 12215 /* 12216 * md_update_minor 12217 * 12218 * This function updates the minor in the namespace entry for an 12219 * underlying metadevice. The function is called in mod_imp_set 12220 * where mod is sp, stripe, mirror and raid. 
12221 * 12222 */ 12223 int 12224 md_update_minor( 12225 set_t setno, 12226 side_t side, 12227 mdkey_t key 12228 ) 12229 { 12230 struct nm_next_hdr *nh; 12231 struct nm_name *n; 12232 char *shn; 12233 int retval = 1; 12234 12235 /* 12236 * Load the devid name space if it exists 12237 */ 12238 (void) md_load_namespace(setno, NULL, NM_DEVID); 12239 if (! md_load_namespace(setno, NULL, 0L)) { 12240 /* 12241 * Unload the devid namespace 12242 */ 12243 (void) md_unload_namespace(setno, NM_DEVID); 12244 return (0); 12245 } 12246 12247 rw_enter(&nm_lock.lock, RW_READER); 12248 12249 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 12250 retval = 0; 12251 goto out; 12252 } 12253 12254 /* 12255 * Look up the key 12256 */ 12257 if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) { 12258 /* 12259 * Find the entry, update its n_minor if metadevice 12260 */ 12261 if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) 12262 == NULL) { 12263 retval = 0; 12264 goto out; 12265 } 12266 12267 if (strcmp(shn, "md") == 0) { 12268 n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); 12269 } 12270 } 12271 12272 out: 12273 rw_exit(&nm_lock.lock); 12274 return (retval); 12275 } 12276 12277 /* 12278 * md_update_top_device_minor 12279 * 12280 * This function updates the minor in the namespace entry for a top 12281 * level metadevice. The function is called in mod_imp_set where 12282 * mod is sp, stripe, mirror and raid. 12283 * 12284 */ 12285 int 12286 md_update_top_device_minor( 12287 set_t setno, 12288 side_t side, 12289 md_dev64_t dev 12290 ) 12291 { 12292 struct nm_next_hdr *nh; 12293 struct nm_name *n; 12294 char *shn; 12295 int retval = 1; 12296 12297 /* 12298 * Load the devid name space if it exists 12299 */ 12300 (void) md_load_namespace(setno, NULL, NM_DEVID); 12301 if (! 
md_load_namespace(setno, NULL, 0L)) { 12302 /* 12303 * Unload the devid namespace 12304 */ 12305 (void) md_unload_namespace(setno, NM_DEVID); 12306 return (0); 12307 } 12308 12309 rw_enter(&nm_lock.lock, RW_READER); 12310 12311 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 12312 retval = 0; 12313 goto out; 12314 } 12315 12316 /* 12317 * Look up the key 12318 */ 12319 if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) { 12320 /* 12321 * Find the entry, update its n_minor if metadevice 12322 */ 12323 if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) 12324 == NULL) { 12325 retval = 0; 12326 goto out; 12327 } 12328 12329 if (strcmp(shn, "md") == 0) { 12330 n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); 12331 } 12332 } 12333 12334 out: 12335 rw_exit(&nm_lock.lock); 12336 return (retval); 12337 } 12338 12339 static void 12340 md_imp_nm( 12341 mddb_set_t *s 12342 ) 12343 { 12344 mddb_db_t *dbp; 12345 mddb_de_ic_t *dep; 12346 struct nm_rec_hdr *hdr; 12347 struct nm_header *hhdr; 12348 set_t setno = s->s_setno; 12349 12350 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 12351 for (dep = dbp->db_firstentry; dep != NULL; 12352 dep = dep->de_next) { 12353 switch (dep->de_type1) { 12354 12355 case MDDB_NM_HDR: 12356 case MDDB_DID_NM_HDR: 12357 12358 hhdr = (struct nm_header *) 12359 dep->de_rb_userdata; 12360 12361 hdr = &hhdr->h_names; 12362 if (hdr->r_next_recid > 0) { 12363 hdr->r_next_recid = MAKERECID(setno, 12364 DBID(hdr->r_next_recid)); 12365 } 12366 12367 hdr = &hhdr->h_shared; 12368 if (hdr->r_next_recid > 0) { 12369 hdr->r_next_recid = MAKERECID(setno, 12370 DBID(hdr->r_next_recid)); 12371 } 12372 break; 12373 12374 case MDDB_NM: 12375 case MDDB_DID_NM: 12376 case MDDB_SHR_NM: 12377 case MDDB_DID_SHR_NM: 12378 12379 hdr = (struct nm_rec_hdr *) 12380 dep->de_rb_userdata; 12381 12382 if (hdr->r_next_recid > 0) { 12383 hdr->r_next_recid = MAKERECID 12384 (setno, DBID(hdr->r_next_recid)); 12385 } 12386 break; 
12387 12388 default: 12389 break; 12390 } 12391 } 12392 } 12393 } 12394 12395 static int 12396 update_db_rec( 12397 mddb_set_t *s 12398 ) 12399 { 12400 mddb_db_t *dbp; 12401 mddb_de_ic_t *dep; 12402 mddb_recid_t ids[2]; 12403 12404 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 12405 for (dep = dbp->db_firstentry; dep != NULL; 12406 dep = dep->de_next) { 12407 if (! (dep->de_flags & MDDB_F_OPT)) { 12408 ids[0] = MAKERECID(s->s_setno, dep->de_recid); 12409 ids[1] = 0; 12410 if (mddb_commitrecs(ids)) { 12411 return (MDDB_E_NORECORD); 12412 } 12413 } 12414 } 12415 } 12416 return (0); 12417 } 12418 12419 static int 12420 update_mb( 12421 mddb_set_t *s 12422 ) 12423 { 12424 mddb_ri_t *rip; 12425 int err = 0; 12426 12427 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 12428 if (rip->ri_flags & MDDB_F_EMASTER) 12429 /* disk is powered off or not there */ 12430 continue; 12431 12432 if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { 12433 /* 12434 * It is a replicated set 12435 */ 12436 if (rip->ri_devid == (ddi_devid_t)NULL) { 12437 return (-1); 12438 } 12439 err = update_mb_devid(s, rip, rip->ri_devid); 12440 } else { 12441 /* 12442 * It is a non-replicated set 12443 * and there is no need to update 12444 * devid 12445 */ 12446 err = update_mb_devid(s, rip, NULL); 12447 } 12448 12449 if (err) 12450 return (err); 12451 } 12452 12453 return (0); 12454 } 12455 12456 static int 12457 update_setname( 12458 set_t setno 12459 ) 12460 { 12461 struct nm_next_hdr *nh; 12462 struct nm_shared_name *shn, *new_shn; 12463 char *prefix = "/dev/md/"; 12464 char *shrname; 12465 int len; 12466 mdkey_t o_key; 12467 uint32_t o_count, o_data; 12468 mddb_recid_t recid, ids[3]; 12469 int err = 0; 12470 mddb_set_t *dbp; 12471 12472 /* Import setname */ 12473 dbp = (mddb_set_t *)md_set[setno].s_db; 12474 len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1; 12475 shrname = kmem_zalloc(len, KM_SLEEP); 12476 (void) sprintf(shrname, "%s%s%s", prefix, 
dbp->s_setname, "/dsk/"); 12477 12478 rw_enter(&nm_lock.lock, RW_WRITER); 12479 if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) { 12480 /* 12481 * No namespace is okay 12482 */ 12483 err = 0; 12484 goto out; 12485 } 12486 12487 if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh, 12488 0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) { 12489 /* 12490 * No metadevice is okay 12491 */ 12492 err = 0; 12493 goto out; 12494 } 12495 12496 /* 12497 * We have it, go ahead and update the namespace. 12498 */ 12499 o_key = shn->sn_key; 12500 o_count = shn->sn_count; 12501 o_data = shn->sn_data; 12502 12503 if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED | 12504 NM_NOCOMMIT)) { 12505 err = MDDB_E_NORECORD; 12506 goto out; 12507 } 12508 if ((new_shn = (struct nm_shared_name *)alloc_entry( 12509 nh, md_set[setno].s_nmid, len, NM_SHARED | 12510 NM_NOCOMMIT, &recid)) == NULL) { 12511 err = MDDB_E_NORECORD; 12512 goto out; 12513 } 12514 12515 new_shn->sn_key = o_key; 12516 new_shn->sn_count = o_count; 12517 new_shn->sn_data = o_data; 12518 new_shn->sn_namlen = (ushort_t)len; 12519 (void) strcpy(new_shn->sn_name, shrname); 12520 12521 ids[0] = recid; 12522 ids[1] = md_set[setno].s_nmid; 12523 ids[2] = 0; 12524 err = mddb_commitrecs(ids); 12525 12526 out: 12527 if (shrname) 12528 kmem_free(shrname, len); 12529 rw_exit(&nm_lock.lock); 12530 return (err); 12531 } 12532 12533 /* 12534 * Returns 0 on success. 12535 * Returns -1 on failure with ep filled in. 
12536 */ 12537 static int 12538 md_imp_db( 12539 set_t setno, 12540 int stale_flag, 12541 md_error_t *ep 12542 ) 12543 { 12544 mddb_set_t *s; 12545 int err = 0; 12546 mddb_dt_t *dtp; 12547 mddb_lb_t *lbp; 12548 int i; 12549 int loccnt; 12550 12551 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 12552 return (mddbstatus2error(ep, err, NODEV32, setno)); 12553 } 12554 12555 /* Update dt */ 12556 if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) { 12557 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 12558 } 12559 12560 if ((err = dt_write(s)) != 0) { 12561 err = mdsyserror(ep, err); 12562 mddb_setexit(s); 12563 return (err); 12564 } 12565 12566 /* 12567 * Update lb, no need to update the mediator because 12568 * the diskset will only exist on the importing node 12569 * and as such a mediator adds no value. 12570 */ 12571 12572 /* Update lb */ 12573 if (stale_flag & MD_IMP_STALE_SET) { 12574 lbp = s->s_lbp; 12575 loccnt = lbp->lb_loccnt; 12576 for (i = 0; i < loccnt; i++) { 12577 mddb_locator_t *lp = &lbp->lb_locators[i]; 12578 md_dev64_t ndev = md_expldev(lp->l_dev); 12579 ddi_devid_t devid_ptr; 12580 12581 devid_ptr = s->s_did_icp->did_ic_devid[i]; 12582 if (devid_ptr == NULL) { 12583 /* 12584 * Already deleted, go to next one. 
12585 */ 12586 continue; 12587 } 12588 if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev, 12589 NULL)) { 12590 /* disk unavailable, mark deleted */ 12591 lp->l_flags = MDDB_F_DELETED; 12592 /* then remove the device id from the list */ 12593 free_mbipp(&s->s_mbiarray[i]); 12594 (void) mddb_devid_delete(s, i); 12595 } 12596 } 12597 md_clr_setstatus(setno, MD_SET_STALE); 12598 } 12599 12600 if ((err = writelocall(s)) != 0) { 12601 err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno); 12602 mddb_setexit(s); 12603 return (err); 12604 } 12605 12606 mddb_setexit(s); 12607 12608 /* Update db records */ 12609 if ((err = update_db_rec(s)) != 0) { 12610 return (mddbstatus2error(ep, err, NODEV32, setno)); 12611 } 12612 12613 /* Update setname embedded in the namespace */ 12614 if ((err = update_setname(setno)) != 0) 12615 return (mddbstatus2error(ep, err, NODEV32, setno)); 12616 12617 return (err); 12618 } 12619 12620 static void 12621 md_dr_add( 12622 md_set_record *sr, 12623 md_drive_record *dr 12624 ) 12625 { 12626 md_drive_record *drv; 12627 12628 if (sr->sr_driverec == 0) { 12629 sr->sr_driverec = dr->dr_selfid; 12630 return; 12631 } 12632 12633 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); 12634 drv->dr_nextrec != 0; 12635 drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec)) 12636 ; 12637 drv->dr_nextrec = dr->dr_selfid; 12638 } 12639 12640 static void 12641 md_setup_recids( 12642 md_set_record *sr, 12643 mddb_recid_t **ids, 12644 size_t size 12645 ) 12646 { 12647 md_drive_record *drv; 12648 int cnt; 12649 mddb_recid_t *recids; 12650 12651 recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t) 12652 * size, KM_SLEEP); 12653 recids[0] = sr->sr_selfid; 12654 cnt = 1; 12655 12656 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); 12657 /* CSTYLED */ 12658 drv != NULL;) { 12659 recids[cnt++] = drv->dr_selfid; 12660 if (drv->dr_nextrec != 0) 12661 drv = (md_drive_record *)mddb_getrecaddr 12662 (drv->dr_nextrec); 12663 else 12664 
drv = NULL; 12665 } 12666 recids[cnt] = 0; 12667 *ids = &recids[0]; 12668 } 12669 12670 /* 12671 * The purpose of this function is to replace the old_devid with the 12672 * new_devid in the given namespace. This is used for importing 12673 * remotely replicated drives. 12674 */ 12675 int 12676 md_update_namespace_rr_did( 12677 mddb_config_t *cp 12678 ) 12679 { 12680 set_t setno = cp->c_setno; 12681 struct nm_next_hdr *nh; 12682 mdkey_t key = MD_KEYWILD; 12683 side_t side = MD_SIDEWILD; 12684 mddb_recid_t recids[3]; 12685 struct did_min_name *n; 12686 struct nm_next_hdr *did_shr_nh; 12687 struct did_shr_name *shr_n; 12688 mdkey_t ent_did_key; 12689 uint32_t ent_did_count; 12690 uint32_t ent_did_data; 12691 size_t ent_size, size; 12692 ddi_devid_t devid = NULL; 12693 struct did_shr_name *shn; 12694 size_t offset; 12695 struct nm_next_hdr *this_did_shr_nh; 12696 void *old_devid, *new_devid; 12697 12698 if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED)) 12699 return (EIO); 12700 12701 old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid; 12702 new_devid = (void *)(uintptr_t)cp->c_locator.l_devid; 12703 12704 /* 12705 * It is okay if we dont have any configuration 12706 */ 12707 offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name)); 12708 if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED)) 12709 == NULL) { 12710 return (0); 12711 } 12712 while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) { 12713 /* check out every entry in the namespace */ 12714 if ((n = (struct did_min_name *)lookup_entry(nh, setno, 12715 side, key, NODEV64, NM_DEVID)) == NULL) { 12716 continue; 12717 } else { 12718 did_shr_nh = get_first_record(setno, 0, NM_DEVID | 12719 NM_SHARED); 12720 if (did_shr_nh == NULL) { 12721 return (ENOENT); 12722 } 12723 this_did_shr_nh = did_shr_nh->nmn_nextp; 12724 shr_n = (struct did_shr_name *)lookup_shared_entry( 12725 did_shr_nh, n->min_devid_key, (char *)0, 12726 &recids[0], NM_DEVID); 12727 if (shr_n == NULL) { 
12728 return (ENOENT); 12729 } 12730 rw_enter(&nm_lock.lock, RW_WRITER); 12731 devid = (ddi_devid_t)shr_n->did_devid; 12732 /* find this devid in the incore replica */ 12733 if (ddi_devid_compare(devid, old_devid) == 0) { 12734 /* 12735 * found the corresponding entry 12736 * update with new devid 12737 */ 12738 /* first remove old devid info */ 12739 ent_did_key = shr_n ->did_key; 12740 ent_did_count = shr_n->did_count; 12741 ent_did_data = shr_n->did_data; 12742 ent_size = DID_SHR_NAMSIZ(shr_n); 12743 size = ((struct nm_rec_hdr *) 12744 this_did_shr_nh->nmn_record)-> 12745 r_used_size - offset - ent_size; 12746 if (size == 0) { 12747 (void) bzero(shr_n, ent_size); 12748 } else { 12749 (void) ovbcopy((caddr_t)shr_n + 12750 ent_size, shr_n, size); 12751 (void) bzero((caddr_t)shr_n + 12752 size, ent_size); 12753 } 12754 ((struct nm_rec_hdr *)this_did_shr_nh-> 12755 nmn_record)->r_used_size -= 12756 ent_size; 12757 /* add in new devid info */ 12758 if ((shn = (struct did_shr_name *) 12759 alloc_entry(did_shr_nh, 12760 md_set[setno].s_did_nmid, 12761 cp->c_locator.l_devid_sz, 12762 NM_DEVID | NM_SHARED | NM_NOCOMMIT, 12763 &recids[0])) == NULL) { 12764 rw_exit(&nm_lock.lock); 12765 return (ENOMEM); 12766 } 12767 shn->did_key = ent_did_key; 12768 shn->did_count = ent_did_count; 12769 ent_did_data |= NM_DEVID_VALID; 12770 shn->did_data = ent_did_data; 12771 shn->did_size = ddi_devid_sizeof( 12772 new_devid); 12773 bcopy((void *)new_devid, (void *) 12774 shn->did_devid, shn->did_size); 12775 recids[1] = md_set[setno].s_nmid; 12776 recids[2] = 0; 12777 mddb_commitrecs_wrapper(recids); 12778 } 12779 rw_exit(&nm_lock.lock); 12780 } 12781 } 12782 12783 return (0); 12784 } 12785 12786 /* 12787 * namespace is loaded before this is called. 12788 * This function is a wrapper for md_update_namespace_rr_did. 
12789 * 12790 * md_update_namespace_rr_did may be called twice if attempting to 12791 * resolve a replicated device id during the take of a diskset - once 12792 * for the diskset namespace and a second time for the local namespace. 12793 * The local namespace would need to be updated when a drive has been 12794 * found during a take of the diskset that hadn't been resolved during 12795 * the import (aka partial replicated import). 12796 * 12797 * If being called during the import of the diskset (IMPORT flag set) 12798 * md_update_namespace_rr_did will only be called once with the disket 12799 * namespace. 12800 */ 12801 int 12802 md_update_nm_rr_did_ioctl( 12803 mddb_config_t *cp 12804 ) 12805 { 12806 int rval = 0; 12807 12808 /* If update of diskset namespace fails, stop and return failure */ 12809 if ((rval = md_update_namespace_rr_did(cp)) != 0) 12810 return (rval); 12811 12812 if (cp->c_flags & MDDB_C_IMPORT) 12813 return (0); 12814 12815 /* If update of local namespace fails, return failure */ 12816 cp->c_setno = MD_LOCAL_SET; 12817 rval = md_update_namespace_rr_did(cp); 12818 return (rval); 12819 } 12820 12821 /*ARGSUSED*/ 12822 int 12823 md_imp_snarf_set( 12824 mddb_config_t *cp 12825 ) 12826 { 12827 set_t setno; 12828 int stale_flag; 12829 mddb_set_t *s; 12830 int i, err = 0; 12831 md_ops_t *ops; 12832 md_error_t *ep = &cp->c_mde; 12833 12834 setno = cp->c_setno; 12835 stale_flag = cp->c_flags; 12836 12837 mdclrerror(ep); 12838 if (setno >= md_nsets) { 12839 return (mdsyserror(ep, EINVAL)); 12840 } 12841 12842 md_haltsnarf_enter(setno); 12843 if (md_get_setstatus(setno) & MD_SET_IMPORT) { 12844 goto out; 12845 } 12846 12847 /* Set the bit first otherwise load_old_replicas can fail */ 12848 md_set_setstatus(setno, MD_SET_IMPORT); 12849 12850 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 12851 err = mddbstatus2error(ep, err, NODEV32, setno); 12852 goto out; 12853 } 12854 12855 /* 12856 * Upon completion of load_old_replicas, the old setno is 
12857 * restored from the disk so we need to reset 12858 */ 12859 s->s_lbp->lb_setno = setno; 12860 12861 /* 12862 * Fixup the NM records before loading namespace 12863 */ 12864 (void) md_imp_nm(s); 12865 mddb_setexit(s); 12866 12867 /* 12868 * Load the devid name space if it exists 12869 * and ask each module to fixup unit records 12870 */ 12871 if (!md_load_namespace(setno, NULL, NM_DEVID)) { 12872 err = mdsyserror(ep, ENOENT); 12873 goto cleanup; 12874 } 12875 if (!md_load_namespace(setno, NULL, 0L)) { 12876 (void) md_unload_namespace(setno, NM_DEVID); 12877 err = mdsyserror(ep, ENOENT); 12878 goto cleanup; 12879 } 12880 12881 do { 12882 i = 0; 12883 for (ops = md_opslist; ops != NULL; ops = ops->md_next) 12884 if (ops->md_imp_set != NULL) 12885 i += ops->md_imp_set(setno); 12886 } while (i); 12887 12888 /* 12889 * Fixup 12890 * (1) locator block 12891 * (2) locator name block if necessary 12892 * (3) master block 12893 * (4) directory block 12894 * calls appropriate writes to push changes out 12895 */ 12896 if ((err = md_imp_db(setno, stale_flag, ep)) != 0) { 12897 goto cleanup; 12898 } 12899 12900 /* 12901 * Don't unload namespace if importing a replicated diskset. 12902 * Namespace will be unloaded with an explicit RELEASE_SET ioctl. 12903 */ 12904 if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { 12905 md_haltsnarf_exit(setno); 12906 return (err); 12907 } 12908 12909 cleanup: 12910 /* 12911 * Halt the set 12912 */ 12913 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 12914 (void) md_halt_set(setno, MD_HALT_ALL); 12915 rw_exit(&md_unit_array_rw.lock); 12916 12917 /* 12918 * Unload the namespace for the imported set 12919 */ 12920 mutex_enter(&mddb_lock); 12921 mddb_unload_set(setno); 12922 mutex_exit(&mddb_lock); 12923 12924 out: 12925 md_haltsnarf_exit(setno); 12926 md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); 12927 return (err); 12928 } 12929 #endif /* MDDB_FAKE */ 12930