1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/conf.h> 30 #include <sys/time.h> 31 #include <sys/uio.h> 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/systeminfo.h> 35 #include <sys/sysmacros.h> 36 #include <sys/buf.h> 37 #include <sys/kmem.h> 38 #include <sys/file.h> 39 #include <sys/open.h> 40 #include <sys/debug.h> 41 #include <sys/stat.h> 42 #include <sys/lvm/mdvar.h> 43 #include <sys/lvm/md_crc.h> 44 #include <sys/lvm/md_convert.h> 45 #include <sys/types.h> 46 #include <sys/kmem.h> 47 #include <sys/lvm/mdmn_commd.h> 48 #include <sys/cladm.h> 49 50 mhd_mhiargs_t defmhiargs = { 51 1000, 52 { 6000, 6000, 30000 } 53 }; 54 55 #define MDDB 56 57 #include <sys/lvm/mdvar.h> 58 #include <sys/lvm/mdmed.h> 59 #include <sys/lvm/md_names.h> 60 #include <sys/cred.h> 61 #include <sys/ddi.h> 62 #include <sys/sunddi.h> 63 #include <sys/esunddi.h> 64 65 #include <sys/sysevent/eventdefs.h> 66 #include <sys/sysevent/svm.h> 67 68 extern char svm_bootpath[]; 69 70 int 
md_maxbootlist = MAXBOOTLIST; 71 static ulong_t mddb_maxblocks = 0; /* tune for small records */ 72 static int mddb_maxbufheaders = 50; 73 static uint_t mddb_maxcopies = MDDB_NLB; 74 75 /* 76 * If this is set, more detailed messages about DB init will be given, instead 77 * of just the MDE_DB_NODB. 78 */ 79 static int mddb_db_err_detail = 0; 80 81 /* 82 * This lock is used to single-thread load/unload of all sets 83 */ 84 static kmutex_t mddb_lock; 85 86 /* 87 * You really do NOT want to change this boolean. 88 * It can be VERY dangerous to do so. Loss of 89 * data may occur. USE AT YOUR OWN RISK!!!! 90 */ 91 static int mddb_allow_half = 0; 92 /* 93 * For mirrored root allow reboot with only half the replicas available 94 * Flag inserted for Santa Fe project. 95 */ 96 int mirrored_root_flag; 97 98 #define ISWHITE(c) (((c) == ' ') || ((c) == '\t') || \ 99 ((c) == '\r') || ((c) == '\n')) 100 #define ISNUM(c) (((c) >= '0') && ((c) <= '9')) 101 102 #define SETMUTEX(setno) (&md_set[setno].s_dbmx) 103 104 extern md_krwlock_t md_unit_array_rw; /* md.c */ 105 extern set_t md_nsets; /* md.c */ 106 extern int md_nmedh; /* md.c */ 107 extern md_set_t md_set[]; /* md.c */ 108 extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*); 109 extern dev_info_t *md_devinfo; 110 extern int md_init_debug; 111 extern int md_status; 112 extern md_ops_t *md_opslist; 113 extern md_krwlock_t nm_lock; 114 115 static int update_locatorblock(mddb_set_t *s, md_dev64_t dev, 116 ddi_devid_t didptr, ddi_devid_t old_didptr); 117 118 /* 119 * Defines for crc calculation for records 120 * rec_crcgen generates a crc checksum for a record block 121 * rec_crcchk checks the crc checksum for a record block 122 */ 123 #define REC_CRCGEN 0 124 #define REC_CRCCHK 1 125 #define rec_crcgen(s, dep, rbp) \ 126 (void) rec_crcfunc(s, dep, rbp, REC_CRCGEN) 127 #define rec_crcchk(s, dep, rbp) \ 128 rec_crcfunc(s, dep, rbp, REC_CRCCHK) 129 130 /* 131 * During upgrade, SVM basically runs with the devt from the target 
132 * being upgraded. Translations are made from the target devt to the 133 * miniroot devt when writing data out to the disk. This is done by 134 * the following routines: 135 * wrtblklst 136 * writeblks 137 * readblklst 138 * readblks 139 * dt_read 140 * 141 * The following routines are used by the routines listed above and 142 * expect a translated (aka miniroot) devt: 143 * getblks 144 * getmasters 145 * 146 * Also, when calling any system routines, such as ddi_lyr_get_devid, 147 * the translated (aka miniroot) devt must be used. 148 * 149 * By the same token, the major number and major name conversion operations 150 * need to use the name_to_major file from the target system instead 151 * of the name_to_major file on the miniroot. So, calls to 152 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major 153 * when running on an upgrade. Same is true with calls to 154 * ddi_major_to_name. 155 */ 156 157 158 #ifndef MDDB_FAKE 159 160 static int 161 mddb_rwdata( 162 mddb_set_t *s, /* incore db set structure */ 163 int flag, /* B_ASYNC, B_FAILFAST or 0 passed in here */ 164 buf_t *bp 165 ) 166 { 167 int err = 0; 168 169 bp->b_flags = (flag | B_BUSY) & (~B_ASYNC); 170 171 mutex_exit(SETMUTEX(s->s_setno)); 172 if (mdv_strategy_tstpnt == NULL || 173 (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0) 174 (void) bdev_strategy(bp); 175 176 if (flag & B_ASYNC) { 177 mutex_enter(SETMUTEX(s->s_setno)); 178 return (0); 179 } 180 181 err = biowait(bp); 182 mutex_enter(SETMUTEX(s->s_setno)); 183 return (err); 184 } 185 186 static void 187 setidentifier( 188 mddb_set_t *s, 189 identifier_t *ident 190 ) 191 { 192 if (s->s_setno == MD_LOCAL_SET) 193 (void) strcpy(&ident->serial[0], s->s_ident.serial); 194 else 195 ident->createtime = s->s_ident.createtime; 196 } 197 198 static int 199 cmpidentifier( 200 mddb_set_t *s, 201 identifier_t *ident 202 ) 203 { 204 if (s->s_setno == MD_LOCAL_SET) 205 return (strcmp(ident->serial, s->s_ident.serial)); 206 else 207 return 
(timercmp(&ident->createtime, 208 /*CSTYLED*/ 209 &s->s_ident.createtime, !=)); 210 } 211 212 static int 213 mddb_devopen( 214 md_dev64_t dev 215 ) 216 { 217 dev_t ddi_dev = md_dev64_to_dev(dev); 218 219 if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0) 220 return (0); 221 return (1); 222 } 223 224 static void 225 mddb_devclose( 226 md_dev64_t dev 227 ) 228 { 229 (void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred); 230 } 231 232 /* 233 * stripe_skip_ts 234 * 235 * Returns a list of fields to be skipped in the stripe record structure. 236 * These fields are ms_timestamp in the component structure. 237 * Used to skip these fields when calculating the checksum. 238 */ 239 static crc_skip_t * 240 stripe_skip_ts(void *un, uint_t revision) 241 { 242 struct ms_row32_od *small_mdr; 243 struct ms_row *big_mdr; 244 uint_t row, comp, ncomps, compoff; 245 crc_skip_t *skip; 246 crc_skip_t *skip_prev; 247 crc_skip_t skip_start = {0, 0, 0}; 248 ms_unit_t *big_un; 249 ms_unit32_od_t *small_un; 250 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 251 252 switch (revision) { 253 case MDDB_REV_RB: 254 case MDDB_REV_RBFN: 255 small_un = (ms_unit32_od_t *)un; 256 skip_prev = &skip_start; 257 258 if (small_un->un_nrows == 0) 259 return (NULL); 260 /* 261 * walk through all rows to find the total number 262 * of components 263 */ 264 small_mdr = &small_un->un_row[0]; 265 ncomps = 0; 266 for (row = 0; (row < small_un->un_nrows); row++) { 267 ncomps += small_mdr[row].un_ncomp; 268 } 269 270 /* Now walk through the components */ 271 compoff = small_un->un_ocomp + rb_off; 272 for (comp = 0; (comp < ncomps); ++comp) { 273 uint_t mdcp = compoff + 274 (comp * sizeof (ms_comp32_od_t)); 275 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 276 KM_SLEEP); 277 skip->skip_offset = mdcp + 278 offsetof(ms_comp32_od_t, un_mirror.ms_timestamp); 279 skip->skip_size = sizeof (md_timeval32_t); 280 skip_prev->skip_next = skip; 281 skip_prev = skip; 282 } 283 break; 284 
case MDDB_REV_RB64: 285 case MDDB_REV_RB64FN: 286 big_un = (ms_unit_t *)un; 287 skip_prev = &skip_start; 288 289 if (big_un->un_nrows == 0) 290 return (NULL); 291 /* 292 * walk through all rows to find the total number 293 * of components 294 */ 295 big_mdr = &big_un->un_row[0]; 296 ncomps = 0; 297 for (row = 0; (row < big_un->un_nrows); row++) { 298 ncomps += big_mdr[row].un_ncomp; 299 } 300 301 /* Now walk through the components */ 302 compoff = big_un->un_ocomp + rb_off; 303 for (comp = 0; (comp < ncomps); ++comp) { 304 uint_t mdcp = compoff + 305 (comp * sizeof (ms_comp_t)); 306 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 307 KM_SLEEP); 308 skip->skip_offset = mdcp + 309 offsetof(ms_comp_t, un_mirror.ms_timestamp); 310 skip->skip_size = sizeof (md_timeval32_t); 311 skip_prev->skip_next = skip; 312 skip_prev = skip; 313 } 314 break; 315 } 316 /* Return the start of the list of fields to skip */ 317 return (skip_start.skip_next); 318 } 319 320 /* 321 * mirror_skip_ts 322 * 323 * Returns a list of fields to be skipped in the mirror record structure. 324 * This includes un_last_read and sm_timestamp for each submirror 325 * Used to skip these fields when calculating the checksum. 
326 */ 327 static crc_skip_t * 328 mirror_skip_ts(uint_t revision) 329 { 330 int i; 331 crc_skip_t *skip; 332 crc_skip_t *skip_prev; 333 crc_skip_t skip_start = {0, 0, 0}; 334 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 335 336 skip_prev = &skip_start; 337 338 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 339 switch (revision) { 340 case MDDB_REV_RB: 341 case MDDB_REV_RBFN: 342 skip->skip_offset = offsetof(mm_unit32_od_t, 343 un_last_read) + rb_off; 344 break; 345 case MDDB_REV_RB64: 346 case MDDB_REV_RB64FN: 347 skip->skip_offset = offsetof(mm_unit_t, 348 un_last_read) + rb_off; 349 break; 350 } 351 skip->skip_size = sizeof (int); 352 skip_prev->skip_next = skip; 353 skip_prev = skip; 354 355 for (i = 0; i < NMIRROR; i++) { 356 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 357 switch (revision) { 358 case MDDB_REV_RB: 359 case MDDB_REV_RBFN: 360 skip->skip_offset = offsetof(mm_unit32_od_t, 361 un_sm[i].sm_timestamp) + rb_off; 362 break; 363 case MDDB_REV_RB64: 364 case MDDB_REV_RB64FN: 365 skip->skip_offset = offsetof(mm_unit_t, 366 un_sm[i].sm_timestamp) + rb_off; 367 break; 368 } 369 skip->skip_size = sizeof (md_timeval32_t); 370 skip_prev->skip_next = skip; 371 skip_prev = skip; 372 } 373 /* Return the start of the list of fields to skip */ 374 return (skip_start.skip_next); 375 } 376 377 /* 378 * hotspare_skip_ts 379 * 380 * Returns a list of the timestamp fields in the hotspare record structure. 381 * Used to skip these fields when calculating the checksum. 
382 */ 383 static crc_skip_t * 384 hotspare_skip_ts(uint_t revision) 385 { 386 crc_skip_t *skip; 387 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 388 389 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 390 switch (revision) { 391 case MDDB_REV_RB: 392 case MDDB_REV_RBFN: 393 skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) + 394 rb_off; 395 break; 396 case MDDB_REV_RB64: 397 case MDDB_REV_RB64FN: 398 skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) + 399 rb_off; 400 break; 401 } 402 skip->skip_size = sizeof (md_timeval32_t); 403 return (skip); 404 } 405 406 /* 407 * rec_crcfunc 408 * 409 * Calculate or check the checksum for a record 410 * Calculate the crc if check == 0, Check the crc if check == 1 411 * 412 * Record block may be written by different nodes in a multi-owner diskset 413 * (in case of master change), the function rec_crcchk excludes timestamp 414 * fields in crc computation of record data. 415 * Otherwise, timestamp fields will cause each node to have a different 416 * checksum for same record block causing the exclusive-or of all record block 417 * checksums and data block record sums to be non-zero after new master writes 418 * at least one record block. 419 */ 420 static uint_t 421 rec_crcfunc( 422 mddb_set_t *s, 423 mddb_de_ic_t *dep, 424 mddb_rb32_t *rbp, 425 int check 426 ) 427 { 428 crc_skip_t *skip; 429 crc_skip_t *skip_tail; 430 mddb_type_t type = dep->de_type1; 431 uint_t ret; 432 433 /* 434 * Generate a list of the areas to be skipped when calculating 435 * the checksum. 436 * First skip rb_checksum, rb_private and rb_userdata. 
437 */ 438 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 439 skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle); 440 skip->skip_size = 3 * sizeof (uint_t); 441 skip_tail = skip; 442 if (MD_MNSET_SETNO(s->s_setno)) { 443 /* For a MN set, skip rb_timestamp */ 444 skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 445 KM_SLEEP); 446 skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp); 447 skip_tail->skip_size = sizeof (md_timeval32_t); 448 skip->skip_next = skip_tail; 449 450 /* Now add a list of timestamps to be skipped */ 451 if (type >= MDDB_FIRST_MODID) { 452 switch (dep->de_flags) { 453 case MDDB_F_STRIPE: 454 skip_tail->skip_next = 455 stripe_skip_ts((void *)rbp->rb_data, 456 rbp->rb_revision); 457 break; 458 case MDDB_F_MIRROR: 459 skip_tail->skip_next = 460 mirror_skip_ts(rbp->rb_revision); 461 break; 462 case MDDB_F_HOTSPARE: 463 skip_tail->skip_next = 464 hotspare_skip_ts(rbp->rb_revision); 465 break; 466 default: 467 break; 468 } 469 } 470 } 471 472 if (check) { 473 ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 474 } else { 475 crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 476 ret = rbp->rb_checksum; 477 } 478 while (skip) { 479 crc_skip_t *skip_save = skip; 480 481 skip = skip->skip_next; 482 kmem_free(skip_save, sizeof (crc_skip_t)); 483 } 484 return (ret); 485 } 486 487 static mddb_bf_t * 488 allocbuffer( 489 mddb_set_t *s, 490 int sleepflag 491 ) 492 { 493 mddb_bf_t *bfp; 494 495 while ((bfp = s->s_freebufhead) == NULL) { 496 if (sleepflag == MDDB_NOSLEEP) 497 return ((mddb_bf_t *)NULL); 498 ++s->s_bufmisses; 499 #ifdef DEBUG 500 if (s->s_bufmisses == 1) 501 cmn_err(CE_NOTE, 502 "md: mddb: set %u sleeping for buffer", s->s_setno); 503 #endif 504 s->s_bufwakeup = 1; 505 cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno)); 506 } 507 s->s_freebufhead = bfp->bf_next; 508 bzero((caddr_t)bfp, sizeof (*bfp)); 509 bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf; 510 bfp->bf_buf.b_flags = 
B_BUSY; /* initialize flags */ 511 return (bfp); 512 } 513 514 static void 515 freebuffer( 516 mddb_set_t *s, 517 mddb_bf_t *bfp 518 ) 519 { 520 bfp->bf_next = s->s_freebufhead; 521 s->s_freebufhead = bfp; 522 if (s->s_bufwakeup) { 523 cv_broadcast(&s->s_buf_cv); 524 s->s_bufwakeup = 0; 525 } 526 } 527 528 529 static void 530 blkbusy( 531 mddb_set_t *s, 532 mddb_block_t blk 533 ) 534 { 535 int bit, byte; 536 537 s->s_freeblkcnt--; 538 byte = blk / 8; 539 bit = 1 << (blk & 7); 540 ASSERT(! (s->s_freebitmap[byte] & bit)); 541 s->s_freebitmap[byte] |= bit; 542 } 543 544 static void 545 blkfree( 546 mddb_set_t *s, 547 mddb_block_t blk 548 ) 549 { 550 int bit, byte; 551 552 s->s_freeblkcnt++; 553 byte = blk / 8; 554 bit = 1 << (blk & 7); 555 ASSERT(s->s_freebitmap[byte] & bit); 556 s->s_freebitmap[byte] &= ~bit; 557 } 558 559 static int 560 blkcheck( 561 mddb_set_t *s, 562 mddb_block_t blk 563 ) 564 { 565 int bit, byte; 566 567 byte = blk / 8; 568 bit = 1 << (blk & 7); 569 return (s->s_freebitmap[byte] & bit); 570 } 571 572 /* 573 * not fast but simple 574 */ 575 static mddb_block_t 576 getfreeblks( 577 mddb_set_t *s, 578 size_t count 579 ) 580 { 581 int i; 582 size_t contig; 583 584 contig = 0; 585 for (i = 0; i < s->s_totalblkcnt; i++) { 586 if (blkcheck(s, i)) { 587 contig = 0; 588 } else { 589 contig++; 590 if (contig == count) { 591 contig = i - count + 1; 592 for (i = (int)contig; i < contig + count; i++) 593 blkbusy(s, i); 594 return ((mddb_block_t)contig); 595 } 596 } 597 } 598 return (0); 599 } 600 601 static void 602 computefreeblks( 603 mddb_set_t *s 604 ) 605 { 606 mddb_db_t *dbp; 607 mddb_de_ic_t *dep; 608 int i; 609 int minblks; 610 int freeblks; 611 mddb_mb_ic_t *mbip; 612 mddb_lb_t *lbp; 613 mddb_block_t maxblk; 614 mddb_did_db_t *did_dbp; 615 int nblks; 616 617 minblks = 0; 618 lbp = s->s_lbp; 619 maxblk = 0; 620 621 /* 622 * Determine the max number of blocks. 623 */ 624 nblks = (lbp->lb_flags & MDDB_MNSET) ? 
MDDB_MN_MAXBLKS : MDDB_MAXBLKS; 625 /* 626 * go through and find highest logical block 627 */ 628 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 629 if (dbp->db_blknum > maxblk) 630 maxblk = dbp->db_blknum; 631 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 632 for (i = 0; i < dep->de_blkcount; i++) 633 if (dep->de_blks[i] > maxblk) 634 maxblk = dep->de_blks[i]; 635 } 636 637 for (i = 0; i < lbp->lb_loccnt; i++) { 638 mddb_locator_t *lp = &lbp->lb_locators[i]; 639 640 if ((lp->l_flags & MDDB_F_DELETED) || 641 (lp->l_flags & MDDB_F_EMASTER)) 642 continue; 643 644 freeblks = 0; 645 for (mbip = s->s_mbiarray[i]; mbip != NULL; 646 mbip = mbip->mbi_next) { 647 freeblks += mbip->mbi_mddb_mb.mb_blkcnt; 648 } 649 if (freeblks == 0) /* this happen when there is no */ 650 continue; /* master blk */ 651 652 if (freeblks <= maxblk) { 653 lp->l_flags |= MDDB_F_TOOSMALL; 654 lp->l_flags &= ~MDDB_F_ACTIVE; 655 } 656 657 if (freeblks < minblks || minblks == 0) 658 minblks = freeblks; 659 } 660 /* 661 * set up reasonable freespace if no 662 * data bases exist 663 */ 664 if (minblks == 0) 665 minblks = 100; 666 if (minblks > nblks) 667 minblks = nblks; 668 s->s_freeblkcnt = minblks; 669 s->s_totalblkcnt = minblks; 670 if (! 
s->s_freebitmapsize) { 671 s->s_freebitmapsize = nblks / 8; 672 s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize, 673 KM_SLEEP); 674 } 675 bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 676 677 /* locator block sectors */ 678 for (i = 0; i < s->s_lbp->lb_blkcnt; i++) 679 blkbusy(s, i); 680 681 /* locator name sectors */ 682 for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++) 683 blkbusy(s, (s->s_lbp->lb_lnfirstblk + i)); 684 685 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 686 /* locator block device id information */ 687 for (i = 0; i < s->s_lbp->lb_didblkcnt; i++) 688 blkbusy(s, (s->s_lbp->lb_didfirstblk + i)); 689 690 /* disk blocks containing actual device ids */ 691 did_dbp = s->s_did_icp->did_ic_dbp; 692 while (did_dbp) { 693 for (i = 0; i < did_dbp->db_blkcnt; i++) { 694 blkbusy(s, did_dbp->db_firstblk + i); 695 } 696 did_dbp = did_dbp->db_next; 697 } 698 } 699 700 /* Only use data tags if not a MN set */ 701 if (!(lbp->lb_flags & MDDB_MNSET)) { 702 /* Found a bad tag, do NOT mark the data tag blks busy here */ 703 if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) { 704 for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++) 705 blkbusy(s, (s->s_lbp->lb_dtfirstblk + i)); 706 } 707 } 708 709 /* directory block/entry sectors */ 710 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 711 blkbusy(s, dbp->db_blknum); 712 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 713 for (i = 0; i < dep->de_blkcount; i++) 714 blkbusy(s, dep->de_blks[i]); 715 } 716 } 717 718 /* 719 * Add free space to the device id incore free list. 720 * Called: 721 * - During startup when all devid blocks are temporarily placed on the 722 * free list 723 * - After a devid has been deleted via the metadb command. 
724 * - When mddb_devid_free_get adds unused space from a disk block 725 * to free list 726 */ 727 static int 728 mddb_devid_free_add( 729 mddb_set_t *s, 730 uint_t firstblk, 731 uint_t offset, 732 uint_t length 733 ) 734 { 735 mddb_did_free_t *did_freep; 736 737 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 738 return (0); 739 } 740 741 did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t), 742 KM_SLEEP); 743 did_freep->free_blk = firstblk; 744 did_freep->free_offset = offset; 745 did_freep->free_length = length; 746 did_freep->free_next = s->s_did_icp->did_ic_freep; 747 s->s_did_icp->did_ic_freep = did_freep; 748 749 return (0); 750 } 751 752 /* 753 * Remove specific free space from the device id incore free list. 754 * Called at startup (after all devid blocks have been placed on 755 * free list) in order to remove the free space from the list that 756 * contains actual devids. 757 * Returns 0 if area successfully removed. 758 * Returns 1 if no matching area is found - so nothing removed. 759 */ 760 static int 761 mddb_devid_free_delete( 762 mddb_set_t *s, 763 uint_t firstblk, 764 uint_t offset, 765 uint_t length 766 ) 767 { 768 int block_found = 0; 769 mddb_did_free_t *did_freep1; /* next free block */ 770 mddb_did_free_t *did_freep2 = 0; /* previous free block */ 771 mddb_did_free_t *did_freep_before; /* area before offset, len */ 772 mddb_did_free_t *did_freep_after; /* area after offset, len */ 773 uint_t old_length; 774 775 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 776 return (1); 777 } 778 779 /* find free block for this devid */ 780 did_freep1 = s->s_did_icp->did_ic_freep; 781 while (did_freep1) { 782 /* 783 * Look through free list of <block, offset, length> to 784 * find our entry in the free list. Our entry should 785 * exist since the entire devid block was placed into 786 * this free list at startup. 
This code is just removing 787 * the non-free (in-use) portions of the devid block so 788 * that the remaining linked list does indeed just 789 * contain a free list. 790 * 791 * Our entry has been found if 792 * - the blocks match, 793 * - the offset (starting address) in the free list is 794 * less than the offset of our entry and 795 * - the length+offset (ending address) in the free list is 796 * greater than the length+offset of our entry. 797 */ 798 if ((did_freep1->free_blk == firstblk) && 799 (did_freep1->free_offset <= offset) && 800 ((did_freep1->free_length + did_freep1->free_offset) >= 801 (length + offset))) { 802 /* Have found our entry - remove from list */ 803 block_found = 1; 804 did_freep_before = did_freep1; 805 old_length = did_freep1->free_length; 806 /* did_freep1 - pts to next free block */ 807 did_freep1 = did_freep1->free_next; 808 if (did_freep2) { 809 did_freep2->free_next = did_freep1; 810 } else { 811 s->s_did_icp->did_ic_freep = did_freep1; 812 } 813 814 /* 815 * did_freep_before points to area in block before 816 * offset, length. 817 */ 818 did_freep_before->free_length = offset - 819 did_freep_before->free_offset; 820 /* 821 * did_freep_after points to area in block after 822 * offset, length. 823 */ 824 did_freep_after = (mddb_did_free_t *)kmem_zalloc 825 (sizeof (mddb_did_free_t), KM_SLEEP); 826 did_freep_after->free_blk = did_freep_before->free_blk; 827 did_freep_after->free_offset = offset + length; 828 did_freep_after->free_length = old_length - length - 829 did_freep_before->free_length; 830 /* 831 * Add before and after areas to free list 832 * If area before or after offset, length has length 833 * of 0, that entry is not added. 
834 */ 835 if (did_freep_after->free_length) { 836 did_freep_after->free_next = did_freep1; 837 if (did_freep2) { 838 did_freep2->free_next = did_freep_after; 839 } else { 840 s->s_did_icp->did_ic_freep = 841 did_freep_after; 842 } 843 did_freep1 = did_freep_after; 844 } else { 845 kmem_free(did_freep_after, 846 sizeof (mddb_did_free_t)); 847 } 848 849 if (did_freep_before->free_length) { 850 did_freep_before->free_next = did_freep1; 851 if (did_freep2) { 852 did_freep2->free_next = did_freep_before; 853 } else { 854 s->s_did_icp->did_ic_freep = 855 did_freep_before; 856 } 857 } else { 858 kmem_free(did_freep_before, 859 sizeof (mddb_did_free_t)); 860 } 861 break; 862 } else { 863 did_freep2 = did_freep1; 864 did_freep1 = did_freep1->free_next; 865 } 866 } 867 if (block_found == 0) { 868 return (1); 869 } else { 870 return (0); 871 } 872 } 873 874 /* 875 * Find free space of devid length and remove free space from list. 876 * Return a pointer to the previously free area. 877 * 878 * If there's not enough free space on the free list, get an empty 879 * disk block, put the empty disk block on the did_ic_dbp linked list, 880 * and add the disk block space not used for devid to the free list. 881 * 882 * Return pointer to address (inside disk block) of free area for devid. 883 * Return 0 if error. 
884 */ 885 static caddr_t 886 mddb_devid_free_get( 887 mddb_set_t *s, 888 uint_t len, 889 uint_t *blk, 890 uint_t *cnt, 891 uint_t *offset 892 ) 893 { 894 mddb_did_free_t *freep, *freep2; 895 mddb_did_db_t *dbp; 896 uint_t blk_cnt, blk_num; 897 ddi_devid_t devid_ptr = NULL; 898 899 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 900 return (0); 901 } 902 903 freep = s->s_did_icp->did_ic_freep; 904 freep2 = (mddb_did_free_t *)NULL; 905 while (freep) { 906 /* found a free area - remove from free list */ 907 if (len <= freep->free_length) { 908 *blk = freep->free_blk; 909 *offset = freep->free_offset; 910 /* find disk block pointer that contains free area */ 911 dbp = s->s_did_icp->did_ic_dbp; 912 while (dbp) { 913 if (dbp->db_firstblk == *blk) 914 break; 915 else 916 dbp = dbp->db_next; 917 } 918 /* 919 * If a disk block pointer can't be found - something 920 * is wrong, so don't use this free space. 921 */ 922 if (dbp == NULL) { 923 freep2 = freep; 924 freep = freep->free_next; 925 continue; 926 } 927 928 devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset); 929 *cnt = dbp->db_blkcnt; 930 931 /* Update free list information */ 932 freep->free_offset += len; 933 freep->free_length -= len; 934 if (freep->free_length == 0) { 935 if (freep2) { 936 freep2->free_next = 937 freep->free_next; 938 } else { 939 s->s_did_icp->did_ic_freep = 940 freep->free_next; 941 } 942 kmem_free(freep, sizeof (mddb_did_free_t)); 943 } 944 break; 945 } 946 freep2 = freep; 947 freep = freep->free_next; 948 } 949 950 /* Didn't find a free spot */ 951 if (freep == NULL) { 952 /* get free logical disk blk in replica */ 953 blk_cnt = btodb(len + (MDDB_BSIZE - 1)); 954 blk_num = getfreeblks(s, blk_cnt); 955 if (blk_num == 0) 956 return (0); 957 958 /* Add disk block to disk block linked list */ 959 dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP); 960 dbp->db_firstblk = blk_num; 961 dbp->db_blkcnt = blk_cnt; 962 dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP); 963 dbp->db_next = 
s->s_did_icp->did_ic_dbp; 964 s->s_did_icp->did_ic_dbp = dbp; 965 devid_ptr = (ddi_devid_t)dbp->db_ptr; 966 967 /* Update return values */ 968 *blk = blk_num; 969 *offset = 0; 970 *cnt = blk_cnt; 971 972 /* Add unused part of block to free list */ 973 (void) mddb_devid_free_add(s, blk_num, 974 len, (dbtob(blk_cnt) - len)); 975 } 976 977 return ((caddr_t)devid_ptr); 978 } 979 980 /* 981 * Add device id information for locator index to device id area in set. 982 * Get free area to store device id from free list. Update checksum 983 * for mddb_did_blk. 984 * 985 * This routine does not write any data out to disk. 986 * After this routine has been called, the routine, writelocall, should 987 * be called to write both the locator block and device id area out 988 * to disk. 989 */ 990 static int 991 mddb_devid_add( 992 mddb_set_t *s, 993 uint_t index, 994 ddi_devid_t devid, 995 char *minor_name 996 ) 997 { 998 uint_t devid_len; 999 uint_t blk, offset; 1000 ddi_devid_t devid_ptr; 1001 mddb_did_info_t *did_info; 1002 uint_t blkcnt, i; 1003 mddb_did_blk_t *did_blk; 1004 1005 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1006 return (1); 1007 } 1008 if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1)) 1009 return (1); 1010 1011 /* Check if device id has already been added */ 1012 did_blk = s->s_did_icp->did_ic_blkp; 1013 did_info = &(did_blk->blk_info[index]); 1014 if (did_info->info_flags & MDDB_DID_EXISTS) 1015 return (0); 1016 1017 devid_len = ddi_devid_sizeof(devid); 1018 devid_ptr = (ddi_devid_t) 1019 mddb_devid_free_get(s, devid_len, &blk, &blkcnt, 1020 &offset); 1021 if (devid_ptr == NULL) { 1022 return (1); 1023 } 1024 1025 /* Copy devid into devid free area */ 1026 for (i = 0; i < devid_len; i++) 1027 ((char *)devid_ptr)[i] = ((char *)devid)[i]; 1028 1029 /* Update mddb_did_info area for new device id */ 1030 did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID; 1031 1032 /* 1033 * Only set UPDATED flag for non-replicated import cases. 
1034 * This allows the side locator driver name index to get 1035 * updated in load_old_replicas. 1036 */ 1037 if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) 1038 did_info->info_flags |= MDDB_DID_UPDATED; 1039 1040 did_info->info_firstblk = blk; 1041 did_info->info_blkcnt = blkcnt; 1042 did_info->info_offset = offset; 1043 did_info->info_length = devid_len; 1044 (void) strcpy(did_info->info_minor_name, minor_name); 1045 crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL); 1046 1047 /* Add device id pointer to did_ic_devid array */ 1048 s->s_did_icp->did_ic_devid[index] = devid_ptr; 1049 1050 return (0); 1051 } 1052 1053 1054 /* 1055 * Delete device id information for locator index from device id area in set. 1056 * Add device id space to free area. 1057 * 1058 * This routine does not write any data out to disk. 1059 * After this routine has been called, the routine, writelocall, should 1060 * be called to write both the locator block and device id area out 1061 * to disk. 1062 */ 1063 static int 1064 mddb_devid_delete(mddb_set_t *s, uint_t index) 1065 { 1066 mddb_did_info_t *did_info; 1067 mddb_did_blk_t *did_blk; 1068 1069 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1070 return (1); 1071 } 1072 1073 /* Get device id information from mddb_did_blk */ 1074 did_blk = s->s_did_icp->did_ic_blkp; 1075 did_info = &(did_blk->blk_info[index]); 1076 1077 /* 1078 * Ensure that the underlying device supports device ids 1079 * before arbitrarily removing them. 
1080 */ 1081 if (!(did_info->info_flags & MDDB_DID_EXISTS)) { 1082 return (1); 1083 } 1084 1085 /* Remove device id information from mddb_did_blk */ 1086 did_info->info_flags = 0; 1087 1088 /* Remove device id from incore area */ 1089 s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL; 1090 1091 /* Add new free space in disk block to free list */ 1092 (void) mddb_devid_free_add(s, did_info->info_firstblk, 1093 did_info->info_offset, did_info->info_length); 1094 1095 return (0); 1096 } 1097 1098 /* 1099 * Check if there is a device id for a locator index. 1100 * 1101 * Caller of this routine should not free devid or minor_name since 1102 * these will point to internal data structures that should not 1103 * be freed. 1104 */ 1105 static int 1106 mddb_devid_get( 1107 mddb_set_t *s, 1108 uint_t index, 1109 ddi_devid_t *devid, 1110 char **minor_name 1111 ) 1112 { 1113 mddb_did_info_t *did_info; 1114 1115 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1116 return (0); 1117 } 1118 did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]); 1119 1120 if (did_info->info_flags & MDDB_DID_EXISTS) { 1121 *devid = s->s_did_icp->did_ic_devid[index]; 1122 *minor_name = 1123 s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name; 1124 return (1); 1125 } else 1126 return (0); 1127 1128 1129 } 1130 1131 /* 1132 * Check if device id is valid on current system. 1133 * Needs devid, previously known dev_t and current minor_name. 1134 * 1135 * Success: 1136 * Returns 0 if valid device id is found and updates 1137 * dev_t if the dev_t associated with the device id is 1138 * different than dev_t. 1139 * Failure: 1140 * Returns 1 if device id not valid on current system. 1141 */ 1142 static int 1143 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name) 1144 { 1145 int retndevs; 1146 dev_t *ddi_devs; 1147 int devid_flag = 0; 1148 int cnt; 1149 1150 if (dev == 0) 1151 return (1); 1152 /* 1153 * See if devid is valid in the current system. 
1154 * If so, set dev to match the devid. 1155 */ 1156 if (ddi_lyr_devid_to_devlist(devid, minor_name, 1157 &retndevs, &ddi_devs) == DDI_SUCCESS) { 1158 if (retndevs > 0) { 1159 /* devid is valid to use */ 1160 devid_flag = 1; 1161 /* does dev_t in list match dev */ 1162 cnt = 0; 1163 while (cnt < retndevs) { 1164 if (*dev == md_expldev(ddi_devs[cnt])) 1165 break; 1166 cnt++; 1167 } 1168 /* 1169 * If a different dev_t, then setup 1170 * new dev and new major name 1171 */ 1172 if (cnt == retndevs) { 1173 *dev = md_expldev(ddi_devs[0]); 1174 } 1175 ddi_lyr_free_devlist(ddi_devs, retndevs); 1176 } 1177 } 1178 if (devid_flag) 1179 return (0); 1180 else 1181 return (1); 1182 } 1183 1184 1185 /* 1186 * Free the devid incore data areas 1187 */ 1188 static void 1189 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp) 1190 { 1191 mddb_did_free_t *did_freep1, *did_freep2; 1192 mddb_did_db_t *did_dbp1, *did_dbp2; 1193 mddb_did_ic_t *icp = *did_icp; 1194 1195 if (icp) { 1196 if (icp->did_ic_blkp) { 1197 kmem_free((caddr_t)icp->did_ic_blkp, 1198 dbtob(lbp->lb_didblkcnt)); 1199 icp->did_ic_blkp = (mddb_did_blk_t *)NULL; 1200 } 1201 1202 if (icp->did_ic_dbp) { 1203 did_dbp1 = icp->did_ic_dbp; 1204 while (did_dbp1) { 1205 did_dbp2 = did_dbp1->db_next; 1206 kmem_free((caddr_t)did_dbp1->db_ptr, 1207 dbtob(did_dbp1->db_blkcnt)); 1208 kmem_free((caddr_t)did_dbp1, 1209 sizeof (mddb_did_db_t)); 1210 did_dbp1 = did_dbp2; 1211 } 1212 } 1213 1214 if (icp->did_ic_freep) { 1215 did_freep1 = icp->did_ic_freep; 1216 while (did_freep1) { 1217 did_freep2 = did_freep1->free_next; 1218 kmem_free((caddr_t)did_freep1, 1219 sizeof (mddb_did_free_t)); 1220 did_freep1 = did_freep2; 1221 } 1222 } 1223 1224 kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t)); 1225 *did_icp = (mddb_did_ic_t *)NULL; 1226 } 1227 1228 } 1229 1230 static daddr_t 1231 getphysblk( 1232 mddb_block_t blk, 1233 mddb_mb_ic_t *mbip 1234 ) 1235 { 1236 mddb_mb_t *mbp = &(mbip->mbi_mddb_mb); 1237 1238 while (blk >= mbp->mb_blkcnt) 
{ 1239 if (! mbip->mbi_next) 1240 return ((daddr_t)-1); /* no such block */ 1241 blk -= mbp->mb_blkcnt; 1242 mbip = mbip->mbi_next; 1243 mbp = &(mbip->mbi_mddb_mb); 1244 } 1245 1246 if (blk >= mbp->mb_blkmap.m_consecutive) 1247 return ((daddr_t)-1); /* no such block */ 1248 1249 return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk)); 1250 } 1251 1252 /* 1253 * when a buf header is passed in the new buffer must be 1254 * put on the front of the chain. writerec counts on it 1255 */ 1256 static int 1257 putblks( 1258 mddb_set_t *s, /* incore db set structure */ 1259 caddr_t buffer, /* adr of buffer to be written */ 1260 daddr_t blk, /* block number for first block */ 1261 int cnt, /* number of blocks to be written */ 1262 md_dev64_t device, /* device to be written to */ 1263 mddb_bf_t **bufhead /* if non-zero then ASYNC I/O */ 1264 /* and put buf address here */ 1265 ) 1266 { 1267 buf_t *bp; 1268 mddb_bf_t *bfp; 1269 int err = 0; 1270 1271 bfp = allocbuffer(s, MDDB_SLEEPOK); 1272 bp = &bfp->bf_buf; 1273 bp->b_bcount = MDDB_BSIZE * cnt; 1274 bp->b_un.b_addr = buffer; 1275 bp->b_blkno = blk; 1276 bp->b_edev = md_dev64_to_dev(device); 1277 /* 1278 * if a header for a buf chain is passed in this is async io. 1279 * currently only done for optimize records 1280 */ 1281 if (bufhead) { 1282 bfp->bf_next = *bufhead; 1283 *bufhead = bfp; 1284 (void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp); 1285 return (0); 1286 } 1287 err = mddb_rwdata(s, B_WRITE, bp); 1288 freebuffer(s, bfp); 1289 if (err) { 1290 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1291 s->s_setno, device); 1292 return (MDDB_F_EWRITE); 1293 } 1294 return (0); 1295 } 1296 1297 /* 1298 * wrtblklst - takes an array of logical block numbers 1299 * and writes the buffer to those blocks (scatter). 1300 * If called during upgrade, this routine expects a 1301 * non-translated (aka target) dev. 
1302 */ 1303 static int 1304 wrtblklst( 1305 mddb_set_t *s, /* incore set structure */ 1306 caddr_t buffer, /* buffer to be written (record blk) */ 1307 mddb_block_t blka[], /* list of logical blks for record */ 1308 daddr_t cnt, /* number of logical blks */ 1309 const int li, /* locator index */ 1310 mddb_bf_t **bufhead, /* if non-zero then ASYNC I/O */ 1311 /* and put buf address here */ 1312 int master_only /* allow only master node to write */ 1313 ) 1314 { 1315 daddr_t blk; 1316 daddr_t blk1; 1317 int err = 0; 1318 int cons; 1319 mddb_lb_t *lbp = s->s_lbp; 1320 mddb_locator_t *lp = &lbp->lb_locators[li]; 1321 md_dev64_t dev; 1322 mddb_mb_ic_t *mbip = s->s_mbiarray[li]; 1323 1324 /* 1325 * If a MN diskset and only the master can write, 1326 * then a non-master node will just return success. 1327 */ 1328 if ((lbp->lb_flags & MDDB_MNSET) && 1329 (master_only == MDDB_WR_ONLY_MASTER)) { 1330 1331 /* return successfully if we aren't the master */ 1332 if (!(md_set[s->s_setno].s_am_i_master)) { 1333 return (0); 1334 } 1335 } 1336 1337 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 1338 if (dev == NODEV64) { 1339 return (1); 1340 } 1341 1342 blk = getphysblk(blka[0], mbip); 1343 ASSERT(blk >= 0); 1344 1345 cons = 1; 1346 while (cnt) { 1347 if (cons != cnt) { 1348 blk1 = getphysblk(blka[cons], mbip); 1349 ASSERT(blk1 >= 0); 1350 if ((blk + cons) == blk1) { 1351 cons++; 1352 continue; 1353 } 1354 } 1355 if (err = putblks(s, buffer, blk, cons, dev, bufhead)) { 1356 /* 1357 * If an MN diskset and any_node_can_write 1358 * then this request is coming from writeoptrecord 1359 * and l_flags field should not be updated. 1360 * l_flags will be updated as a result of sending 1361 * a class1 message to the master. Setting l_flags 1362 * here will cause slave to be out of sync with 1363 * master. 1364 * 1365 * Otherwise, set the error in l_flags 1366 * (this occurs if this is not a MN diskset or 1367 * only_master_can_write is set). 
1368 */ 1369 if ((!(lbp->lb_flags & MDDB_MNSET)) || 1370 (master_only == MDDB_WR_ONLY_MASTER)) { 1371 lp->l_flags |= MDDB_F_EWRITE; 1372 } 1373 return (err); 1374 } 1375 if (bufhead) 1376 (*bufhead)->bf_locator = lp; 1377 1378 buffer += MDDB_BSIZE * cons; 1379 cnt -= cons; 1380 blka += cons; 1381 if (cnt) { 1382 blk = getphysblk(blka[0], mbip); 1383 ASSERT(blk >= 0); 1384 } 1385 cons = 1; 1386 } 1387 1388 return (0); 1389 } 1390 1391 /* 1392 * writeblks - takes a logical block number/block count pair 1393 * and writes the buffer to those contiguous logical blocks. 1394 * If called during upgrade, this routine expects a non-translated 1395 * (aka target) dev. 1396 */ 1397 static int 1398 writeblks( 1399 mddb_set_t *s, /* incore set structure */ 1400 caddr_t buffer, /* buffer to be written */ 1401 mddb_block_t blk, /* starting logical block number */ 1402 int cnt, /* number of log blocks to be written */ 1403 const int li, /* locator index */ 1404 int master_only /* allow only master node to write */ 1405 ) 1406 { 1407 daddr_t physblk; 1408 int err = 0; 1409 int i; 1410 mddb_lb_t *lbp = s->s_lbp; 1411 mddb_locator_t *lp = &lbp->lb_locators[li]; 1412 md_dev64_t dev; 1413 mddb_block_t *blkarray; 1414 int size; 1415 int ret; 1416 1417 /* 1418 * If a MN diskset and only the master can write, 1419 * then a non-master node will just return success. 
1420 */ 1421 if ((lbp->lb_flags & MDDB_MNSET) && 1422 (master_only == MDDB_WR_ONLY_MASTER)) { 1423 /* return successfully if we aren't the master */ 1424 if (!(md_set[s->s_setno].s_am_i_master)) { 1425 return (0); 1426 } 1427 } 1428 1429 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 1430 if (dev == NODEV64) { 1431 return (1); 1432 } 1433 1434 if (cnt > 1) { 1435 size = sizeof (mddb_block_t) * cnt; 1436 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1437 for (i = 0; i < cnt; i++) 1438 blkarray[i] = blk + i; 1439 ret = wrtblklst(s, buffer, blkarray, cnt, 1440 li, 0, MDDB_WR_ONLY_MASTER); 1441 kmem_free(blkarray, size); 1442 return (ret); 1443 } 1444 physblk = getphysblk(blk, s->s_mbiarray[li]); 1445 ASSERT(physblk > 0); 1446 if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) { 1447 lp->l_flags |= MDDB_F_EWRITE; 1448 return (err); 1449 } 1450 return (0); 1451 } 1452 1453 /* 1454 * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas. 1455 */ 1456 static int 1457 writeall( 1458 mddb_set_t *s, /* incore set structure */ 1459 caddr_t buffer, /* buffer to be written */ 1460 mddb_block_t block, /* starting logical block number */ 1461 int cnt, /* number of log blocks to be written */ 1462 int master_only /* allow only master node to write */ 1463 ) 1464 { 1465 int li; 1466 int err = 0; 1467 mddb_lb_t *lbp = s->s_lbp; 1468 1469 for (li = 0; li < lbp->lb_loccnt; li++) { 1470 mddb_locator_t *lp = &lbp->lb_locators[li]; 1471 1472 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1473 (lp->l_flags & MDDB_F_EWRITE)) 1474 continue; 1475 1476 err |= writeblks(s, buffer, block, cnt, li, master_only); 1477 } 1478 1479 return (err); 1480 } 1481 1482 /* 1483 * writelocall - write the locator block and device id information (if 1484 * replica is in device id format) to all ACTIVE/NON-ERRORER replicas. 1485 * 1486 * Increments the locator block's commitcnt. Updates the device id area's 1487 * commitcnt if the replica is in device id format. 
Regenerates the 1488 * checksums after updating the commitcnt(s). 1489 */ 1490 static int 1491 writelocall( 1492 mddb_set_t *s /* incore set structure */ 1493 ) 1494 { 1495 int li; 1496 int err = 0; 1497 mddb_lb_t *lbp = s->s_lbp; 1498 mddb_did_blk_t *did_blk; 1499 mddb_did_db_t *did_dbp; 1500 1501 s->s_lbp->lb_commitcnt++; 1502 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1503 did_blk = s->s_did_icp->did_ic_blkp; 1504 did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt; 1505 crcgen(did_blk, &did_blk->blk_checksum, 1506 dbtob(lbp->lb_didblkcnt), NULL); 1507 } 1508 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 1509 1510 for (li = 0; li < lbp->lb_loccnt; li++) { 1511 mddb_locator_t *lp = &lbp->lb_locators[li]; 1512 1513 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1514 (lp->l_flags & MDDB_F_EWRITE)) 1515 continue; 1516 1517 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1518 /* write out blocks containing actual device ids */ 1519 did_dbp = s->s_did_icp->did_ic_dbp; 1520 while (did_dbp) { 1521 err |= writeblks(s, (caddr_t)did_dbp->db_ptr, 1522 did_dbp->db_firstblk, 1523 did_dbp->db_blkcnt, li, 1524 MDDB_WR_ONLY_MASTER); 1525 did_dbp = did_dbp->db_next; 1526 } 1527 1528 /* write out device id area block */ 1529 err |= writeblks(s, (caddr_t)did_blk, 1530 lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, 1531 MDDB_WR_ONLY_MASTER); 1532 } 1533 /* write out locator block */ 1534 err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 1535 MDDB_WR_ONLY_MASTER); 1536 } 1537 1538 /* 1539 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag 1540 * in the mddb_set structure to show that the locator block has 1541 * been changed. 1542 */ 1543 1544 if ((lbp->lb_flags & MDDB_MNSET) && 1545 (md_set[s->s_setno].s_am_i_master)) { 1546 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 1547 } 1548 return (err); 1549 } 1550 1551 /* 1552 * If called during upgrade, this routine expects a translated 1553 * (aka miniroot) dev. 
1554 */ 1555 static int 1556 getblks( 1557 mddb_set_t *s, /* incore db set structure */ 1558 caddr_t buffer, /* buffer to read data into */ 1559 md_dev64_t device, /* device to read from */ 1560 daddr_t blk, /* physical block number to read */ 1561 int cnt, /* number of blocks to read */ 1562 int flag /* flags for I/O */ 1563 ) 1564 { 1565 buf_t *bp; 1566 mddb_bf_t *bfp; 1567 int err = 0; 1568 1569 bfp = allocbuffer(s, MDDB_SLEEPOK); /* this will never sleep */ 1570 bp = &bfp->bf_buf; 1571 bp->b_bcount = MDDB_BSIZE * cnt; 1572 bp->b_un.b_addr = buffer; 1573 bp->b_blkno = blk; 1574 bp->b_edev = md_dev64_to_dev(device); 1575 err = mddb_rwdata(s, (B_READ | flag), bp); 1576 freebuffer(s, bfp); 1577 if (err) { 1578 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1579 s->s_setno, device); 1580 return (MDDB_F_EREAD); 1581 } 1582 return (0); 1583 } 1584 1585 /* 1586 * readblklst - takes an array of logical block numbers 1587 * and reads those blocks (gather) into the buffer. 1588 * If called during upgrade, this routine expects a non-translated 1589 * (aka target) dev. 
1590 */ 1591 static int 1592 readblklst( 1593 mddb_set_t *s, /* incore set structure */ 1594 caddr_t buffer, /* buffer to be read (record block) */ 1595 mddb_block_t blka[], /* list of logical blocks to be read */ 1596 daddr_t cnt, /* number of logical blocks */ 1597 int li, /* locator index */ 1598 int flag /* flags for I/O */ 1599 ) 1600 { 1601 daddr_t blk; 1602 daddr_t blk1; 1603 int err = 0; 1604 int cons; 1605 md_dev64_t dev; 1606 mddb_mb_ic_t *mbip; 1607 1608 mbip = s->s_mbiarray[li]; 1609 dev = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1610 dev = md_xlate_targ_2_mini(dev); 1611 if (dev == NODEV64) { 1612 return (1); 1613 } 1614 1615 blk = getphysblk(blka[0], mbip); 1616 ASSERT(blk >= 0); 1617 1618 cons = 1; 1619 while (cnt) { 1620 if (cons != cnt) { 1621 blk1 = getphysblk(blka[cons], mbip); 1622 ASSERT(blk1 >= 0); 1623 if ((blk + cons) == blk1) { 1624 cons++; 1625 continue; 1626 } 1627 } 1628 if (err = getblks(s, buffer, dev, blk, cons, flag)) 1629 return (err); 1630 buffer += MDDB_BSIZE * cons; 1631 cnt -= cons; 1632 blka += cons; 1633 if (cnt) { 1634 blk = getphysblk(blka[0], mbip); 1635 ASSERT(blk >= 0); 1636 } 1637 cons = 1; 1638 } 1639 return (0); 1640 } 1641 1642 /* 1643 * readblks - takes a logical block number/block count pair 1644 * and reads those contiguous logical blocks into the buffer. 1645 * If called during upgrade, this routine expects a non-translated 1646 * (aka target) dev. 
1647 */ 1648 static int 1649 readblks( 1650 mddb_set_t *s, /* incore set structure */ 1651 caddr_t buffer, /* buffer to be read into */ 1652 mddb_block_t blk, /* logical block number to be read */ 1653 int cnt, /* number of logical blocks to be read */ 1654 int li /* locator index */ 1655 ) 1656 { 1657 daddr_t physblk; 1658 md_dev64_t device; 1659 int i; 1660 mddb_block_t *blkarray; 1661 int size; 1662 int ret; 1663 1664 if (cnt > 1) { 1665 size = sizeof (mddb_block_t) * cnt; 1666 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1667 for (i = 0; i < cnt; i++) 1668 blkarray[i] = blk + i; 1669 ret = readblklst(s, buffer, blkarray, cnt, li, 0); 1670 kmem_free(blkarray, size); 1671 return (ret); 1672 } 1673 physblk = getphysblk(blk, s->s_mbiarray[li]); 1674 ASSERT(physblk > 0); 1675 device = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1676 device = md_xlate_targ_2_mini(device); 1677 if (device == NODEV64) { 1678 return (1); 1679 } 1680 return (getblks(s, buffer, device, physblk, 1, 0)); 1681 } 1682 1683 static void 1684 single_thread_start( 1685 mddb_set_t *s 1686 ) 1687 { 1688 while (s->s_singlelockgotten) { 1689 s->s_singlelockwanted++; 1690 cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno)); 1691 } 1692 s->s_singlelockgotten++; 1693 } 1694 1695 static void 1696 single_thread_end( 1697 mddb_set_t *s 1698 ) 1699 { 1700 ASSERT(s->s_singlelockgotten); 1701 s->s_singlelockgotten = 0; 1702 if (s->s_singlelockwanted) { 1703 s->s_singlelockwanted = 0; 1704 cv_broadcast(&s->s_single_thread_cv); 1705 } 1706 } 1707 1708 static size_t 1709 sizeofde( 1710 mddb_de_ic_t *dep 1711 ) 1712 { 1713 size_t size; 1714 1715 size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + 1716 sizeof (mddb_block_t) * dep->de_blkcount; 1717 return (size); 1718 } 1719 1720 static size_t 1721 sizeofde32( 1722 mddb_de32_t *dep 1723 ) 1724 { 1725 size_t size; 1726 1727 size = sizeof (*dep) - sizeof (dep->de32_blks) + 1728 sizeof (mddb_block_t) * dep->de32_blkcount; 1729 return (size); 1730 } 
1731 1732 static mddb_de32_t * 1733 nextentry( 1734 mddb_de32_t *dep 1735 ) 1736 { 1737 mddb_de32_t *ret; 1738 1739 ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep))); 1740 return (ret); 1741 } 1742 1743 static void 1744 create_db32rec( 1745 mddb_db32_t *db32p, 1746 mddb_db_t *dbp 1747 ) 1748 { 1749 mddb_de_ic_t *dep; 1750 mddb_de32_t *de32p; 1751 1752 #if defined(_ILP32) && !defined(lint) 1753 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 1754 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 1755 #endif 1756 1757 dbtodb32(dbp, db32p); 1758 if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0)) 1759 db32p->db32_firstentry = 0x4; 1760 de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry) 1761 + sizeof (db32p->db32_firstentry))); 1762 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 1763 detode32(dep, de32p); 1764 if ((dep->de_next != NULL) && (de32p->de32_next == 0)) 1765 de32p->de32_next = 0x4; 1766 de32p = nextentry(de32p); 1767 } 1768 ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE); 1769 } 1770 1771 /* 1772 * If called during upgrade, this routine expects a translated 1773 * (aka miniroot) dev. 1774 * If master blocks are found, set the mn_set parameter to 1 if the 1775 * the master block revision number is MDDB_REV_MNMB; otherwise, 1776 * set it to 0. 1777 * If master blocks are not found, do not change the mnset parameter. 
1778 */ 1779 static mddb_mb_ic_t * 1780 getmasters( 1781 mddb_set_t *s, 1782 md_dev64_t dev, 1783 daddr_t blkno, 1784 uint_t *flag, 1785 int *mn_set 1786 ) 1787 { 1788 mddb_mb_ic_t *mbi = NULL; 1789 mddb_mb_t *mb; 1790 int error = 0; 1791 ddi_devid_t devid; 1792 1793 1794 if (mddb_devopen(dev)) { 1795 if (flag) 1796 *flag |= MDDB_F_EMASTER; 1797 return ((mddb_mb_ic_t *)NULL); 1798 } 1799 1800 1801 mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP); 1802 mb = &(mbi->mbi_mddb_mb); 1803 if (error = getblks(s, (caddr_t)mb, dev, blkno, 1804 btodb(MDDB_BSIZE), 0)) { 1805 error |= MDDB_F_EMASTER; 1806 } 1807 if (mb->mb_magic != MDDB_MAGIC_MB) { 1808 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1809 } 1810 /* Check for MDDB_REV_MNMB and lower */ 1811 if (revchk(MDDB_REV_MNMB, mb->mb_revision)) { 1812 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1813 } 1814 if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) { 1815 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1816 } 1817 1818 if (!(md_get_setstatus(s->s_setno) & 1819 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) && 1820 (mb->mb_setno != s->s_setno)) { 1821 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1822 } 1823 if (mb->mb_blkno != blkno) { 1824 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1825 } 1826 mb->mb_next = NULL; 1827 mbi->mbi_next = NULL; 1828 1829 if (error) 1830 goto out; 1831 1832 /* 1833 * Check the md_devid_destroy and md_keep_repl_state flags 1834 * to see if we need to regen the devid or not. 1835 * 1836 * Don't care about devid in local set since it is not used 1837 * and this should not be part of set importing 1838 */ 1839 if ((s->s_setno != MD_LOCAL_SET) && 1840 !(md_get_setstatus(s->s_setno) & 1841 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) { 1842 /* 1843 * Now check the destroy flag. 
We also need to handle 1844 * the case where the destroy flag is reset after the 1845 * destroy 1846 */ 1847 if (md_devid_destroy || (mb->mb_devid_len == 0)) { 1848 1849 if (md_devid_destroy) { 1850 bzero(mb->mb_devid, mb->mb_devid_len); 1851 mb->mb_devid_len = 0; 1852 } 1853 1854 /* 1855 * Try to regenerate it if the 'keep' flag is not set 1856 */ 1857 if (!md_keep_repl_state) { 1858 if (ddi_lyr_get_devid(md_dev64_to_dev(dev), 1859 &devid) == DDI_SUCCESS) { 1860 mb->mb_devid_len = 1861 ddi_devid_sizeof(devid); 1862 bcopy(devid, mb->mb_devid, 1863 mb->mb_devid_len); 1864 ddi_devid_free(devid); 1865 } else { 1866 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1867 } 1868 } 1869 1870 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 1871 1872 /* 1873 * Push 1874 */ 1875 if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) { 1876 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1877 } 1878 } 1879 } 1880 1881 if (! error) { 1882 /* Set mn_set parameter to 1 if a MN set */ 1883 if (mb->mb_revision == MDDB_REV_MNMB) 1884 *mn_set = 1; 1885 else 1886 *mn_set = 0; 1887 return (mbi); 1888 } 1889 1890 out: 1891 /* Error Out */ 1892 if (flag) 1893 *flag |= error; 1894 1895 kmem_free((caddr_t)mbi, MDDB_IC_BSIZE); 1896 mddb_devclose(dev); 1897 return ((mddb_mb_ic_t *)NULL); 1898 } 1899 1900 static int 1901 getrecord( 1902 mddb_set_t *s, 1903 mddb_de_ic_t *dep, 1904 int li 1905 ) 1906 { 1907 int err = 0; 1908 mddb_rb32_t *rbp; 1909 1910 #if defined(_ILP32) && !defined(lint) 1911 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 1912 #endif 1913 1914 1915 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP); 1916 rbp = dep->de_rb; 1917 1918 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 1919 dep->de_blkcount, li, 0); 1920 if (err) { 1921 return (MDDB_F_EDATA | err); 1922 } 1923 if (rbp->rb_magic != MDDB_MAGIC_RB) { 1924 return (MDDB_F_EFMT | MDDB_F_EDATA); 1925 } 1926 if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) && 1927 (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) && 1928 
(revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) && 1929 (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) { 1930 return (MDDB_F_EFMT | MDDB_F_EDATA); 1931 } 1932 /* Check crc for this record */ 1933 if (rec_crcchk(s, dep, rbp)) { 1934 return (MDDB_F_EFMT | MDDB_F_EDATA); 1935 } 1936 return (0); 1937 } 1938 1939 /* 1940 * Code to read in the locator name information 1941 */ 1942 static int 1943 readlocnames( 1944 mddb_set_t *s, 1945 int li 1946 ) 1947 { 1948 mddb_ln_t *lnp; 1949 int err = 0; 1950 mddb_block_t ln_blkcnt, ln_blkno; 1951 1952 /* 1953 * read in the locator name blocks 1954 */ 1955 s->s_lnp = NULL; 1956 1957 ln_blkno = s->s_lbp->lb_lnfirstblk; 1958 ln_blkcnt = s->s_lbp->lb_lnblkcnt; 1959 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP); 1960 1961 err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li); 1962 if (err) { 1963 err |= MDDB_F_EDATA; 1964 goto out; 1965 } 1966 if (lnp->ln_magic != MDDB_MAGIC_LN) { 1967 err = MDDB_F_EDATA | MDDB_F_EFMT; 1968 goto out; 1969 } 1970 if (s->s_lbp->lb_flags & MDDB_MNSET) { 1971 if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) { 1972 err = MDDB_F_EDATA | MDDB_F_EFMT; 1973 goto out; 1974 } 1975 } else { 1976 if (revchk(MDDB_REV_LN, lnp->ln_revision)) { 1977 err = MDDB_F_EDATA | MDDB_F_EFMT; 1978 goto out; 1979 } 1980 } 1981 if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) { 1982 err = MDDB_F_EDATA | MDDB_F_EFMT; 1983 goto out; 1984 } 1985 out: 1986 /* 1987 * if error occurred in locator name blocks free them 1988 * and return 1989 */ 1990 if (err) { 1991 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); 1992 return (err); 1993 } 1994 s->s_lnp = lnp; 1995 return (0); 1996 } 1997 1998 /* 1999 * code to read in a copy of the database. 
2000 */ 2001 2002 static int 2003 readcopy( 2004 mddb_set_t *s, 2005 int li 2006 ) 2007 { 2008 uint_t blk; 2009 mddb_db_t *dbp, *dbp1, *dbhp; 2010 mddb_db32_t *db32p; 2011 mddb_de_ic_t *dep, *dep2; 2012 mddb_de32_t *de32p, *de32p2; 2013 int err = 0; 2014 uint_t checksum; 2015 2016 2017 #if defined(_ILP32) && !defined(lint) 2018 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2019 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2020 #endif 2021 2022 dbp = NULL; 2023 dbhp = NULL; 2024 /* 2025 * read in all the directory blocks 2026 */ 2027 blk = s->s_lbp->lb_dbfirstblk; 2028 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2029 2030 for (; blk != 0; blk = dbp->db_nextblk) { 2031 dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 2032 if (! dbhp) { 2033 dbhp = dbp1; 2034 } else { 2035 dbp->db_next = dbp1; 2036 } 2037 dbp = dbp1; 2038 2039 err = readblks(s, (caddr_t)db32p, blk, 1, li); 2040 if (err) { 2041 err |= MDDB_F_EDATA; 2042 break; 2043 } 2044 db32todb(db32p, dbp); 2045 if (db32p->db32_magic != MDDB_MAGIC_DB) { 2046 err = MDDB_F_EDATA | MDDB_F_EFMT; 2047 break; 2048 } 2049 if (revchk(MDDB_REV_DB, db32p->db32_revision)) { 2050 err = MDDB_F_EDATA | MDDB_F_EFMT; 2051 break; 2052 } 2053 if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) { 2054 err = MDDB_F_EDATA | MDDB_F_EFMT; 2055 break; 2056 } 2057 /* 2058 * first go through and fix up all de_next pointers 2059 */ 2060 if (dbp->db_firstentry) { 2061 2062 de32p = (mddb_de32_t *) 2063 ((void *) ((caddr_t)(&db32p->db32_firstentry) 2064 + sizeof (db32p->db32_firstentry))); 2065 2066 dep = (mddb_de_ic_t *) 2067 kmem_zalloc(sizeof (mddb_de_ic_t) - 2068 sizeof (mddb_block_t) + 2069 sizeof (mddb_block_t) * de32p->de32_blkcount, 2070 KM_SLEEP); 2071 de32tode(de32p, dep); 2072 2073 dbp->db_firstentry = dep; 2074 while (de32p && de32p->de32_next) { 2075 2076 de32p2 = nextentry(de32p); 2077 2078 dep2 = (mddb_de_ic_t *)kmem_zalloc( 2079 sizeof (mddb_de_ic_t) - 2080 sizeof (mddb_block_t) + 
2081 sizeof (mddb_block_t) * 2082 de32p2->de32_blkcount, KM_SLEEP); 2083 2084 de32tode(de32p2, dep2); 2085 2086 dep->de_next = dep2; 2087 dep = dep2; 2088 de32p = de32p2; 2089 } 2090 } 2091 /* 2092 * go through and make all of the pointer to record blocks 2093 * are null; 2094 */ 2095 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 2096 dep->de_rb = NULL; 2097 } 2098 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2099 dbp->db_next = NULL; 2100 /* 2101 * if error occurred in directory blocks free them 2102 * and return 2103 */ 2104 if (err) { 2105 dbp = dbhp; 2106 while (dbp) { 2107 dep = dbp->db_firstentry; 2108 while (dep) { 2109 /* No mddb_rb32_t structures yet */ 2110 dep2 = dep->de_next; 2111 kmem_free((caddr_t)dep, sizeofde(dep)); 2112 dep = dep2; 2113 } 2114 dbp1 = dbp->db_next; 2115 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2116 dbp = dbp1; 2117 } 2118 s->s_dbp = NULL; 2119 return (err); 2120 2121 } 2122 /* 2123 */ 2124 err = 0; 2125 checksum = MDDB_GLOBAL_XOR; 2126 for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) { 2127 checksum ^= dbp->db_recsum; 2128 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2129 if (dep->de_flags & MDDB_F_OPT) 2130 continue; 2131 err = getrecord(s, dep, li); 2132 if (err) 2133 break; 2134 /* Don't include CHANGELOG in big XOR */ 2135 if (dep->de_flags & MDDB_F_CHANGELOG) 2136 continue; 2137 checksum ^= dep->de_rb->rb_checksum; 2138 checksum ^= dep->de_rb->rb_checksum_fiddle; 2139 } 2140 if (err) 2141 break; 2142 } 2143 if (checksum) { 2144 if (! 
err) 2145 err = MDDB_F_EDATA | MDDB_F_EFMT; 2146 } 2147 if (err) { 2148 dbp = dbhp; 2149 dbhp = NULL; 2150 while (dbp) { 2151 dep = dbp->db_firstentry; 2152 while (dep) { 2153 if (dep->de_rb) 2154 kmem_free((caddr_t)dep->de_rb, 2155 dep->de_recsize); 2156 dep2 = dep->de_next; 2157 kmem_free((caddr_t)dep, sizeofde(dep)); 2158 dep = dep2; 2159 } 2160 dbp1 = dbp->db_next; 2161 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2162 dbp = dbp1; 2163 } 2164 } 2165 s->s_dbp = dbhp; 2166 return (err); 2167 } 2168 2169 static int 2170 getoptcnt( 2171 mddb_set_t *s, 2172 int li) 2173 { 2174 int result; 2175 mddb_de_ic_t *dep; 2176 mddb_db_t *dbp; 2177 2178 #if defined(_ILP32) && !defined(lint) 2179 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2180 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2181 #endif 2182 2183 result = 0; 2184 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2185 dep = dbp->db_firstentry; 2186 for (; dep != NULL; dep = dep->de_next) { 2187 if (! (dep->de_flags & MDDB_F_OPT)) 2188 continue; 2189 if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) && 2190 (li == dep->de_optinfo[0].o_li)) || 2191 ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) && 2192 (li == dep->de_optinfo[1].o_li))) 2193 result++; 2194 } 2195 } 2196 return (result); 2197 } 2198 2199 static void 2200 getoptdev( 2201 mddb_set_t *s, 2202 mddb_de_ic_t *rdep, 2203 int opti 2204 ) 2205 { 2206 mddb_lb_t *lbp; 2207 mddb_locator_t *lp; 2208 mddb_optinfo_t *otherop; 2209 mddb_optinfo_t *resultop; 2210 int li; 2211 dev_t otherdev; 2212 int blkonly = 0; 2213 int mincnt; 2214 int thiscnt; 2215 2216 lbp = s->s_lbp; 2217 2218 resultop = &rdep->de_optinfo[opti]; 2219 otherop = &rdep->de_optinfo[1-opti]; 2220 2221 resultop->o_flags = 0; 2222 2223 /* 2224 * scan through and see if data bases have to vary by only device 2225 */ 2226 2227 if (otherop->o_flags & MDDB_F_ACTIVE) { 2228 blkonly = 1; 2229 otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev); 2230 for (li = 0; li < lbp->lb_loccnt; 
li++) { 2231 lp = &lbp->lb_locators[li]; 2232 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2233 continue; 2234 if (expldev(lp->l_dev) != otherdev) { 2235 blkonly = 0; 2236 break; 2237 } 2238 } 2239 } 2240 2241 mincnt = 999999; 2242 for (li = 0; li < lbp->lb_loccnt; li++) { 2243 dev_info_t *devi; 2244 int removable = 0; 2245 2246 lp = &lbp->lb_locators[li]; 2247 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2248 continue; 2249 if (otherop->o_flags & MDDB_F_ACTIVE) { 2250 if (blkonly) { 2251 if (otherop->o_li == li) 2252 continue; 2253 } else { 2254 if (otherdev == expldev(lp->l_dev)) 2255 continue; 2256 } 2257 } 2258 2259 /* 2260 * Check if this is a removable device. If it is we 2261 * assume it is something like a USB flash disk, a zip disk 2262 * or even a floppy that is being used to help maintain 2263 * mddb quorum. We don't want to put any optimized resync 2264 * records on these kinds of disks since they are usually 2265 * slower or don't have the same read/write lifetimes as 2266 * a regular fixed disk. 
2267 */ 2268 if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) { 2269 int error; 2270 struct cb_ops *cb; 2271 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 2272 int propvalue = 0; 2273 int proplength = sizeof (int); 2274 2275 if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops) 2276 != NULL) { 2277 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, 2278 prop_op, 2279 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 2280 "removable-media", 2281 (caddr_t)&propvalue, &proplength); 2282 2283 if (error == DDI_PROP_SUCCESS) 2284 removable = 1; 2285 } 2286 2287 ddi_release_devi(devi); 2288 } 2289 2290 if (removable) 2291 continue; 2292 2293 thiscnt = getoptcnt(s, li); 2294 if (thiscnt < mincnt) { 2295 resultop->o_li = li; 2296 mincnt = thiscnt; 2297 resultop->o_flags = MDDB_F_ACTIVE; 2298 } 2299 } 2300 } 2301 2302 static void 2303 allocuserdata( 2304 mddb_de_ic_t *dep 2305 ) 2306 { 2307 mddb_rb32_t *rbp; 2308 2309 #if defined(_ILP32) && !defined(lint) 2310 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2311 #endif 2312 2313 rbp = dep->de_rb; 2314 rbp->rb_private = 0; 2315 dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP); 2316 rbp->rb_userdata = 0x4; /* Make sure this is non-zero */ 2317 bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize); 2318 } 2319 2320 2321 static void 2322 getuserdata( 2323 set_t setno, 2324 mddb_de_ic_t *dep 2325 ) 2326 { 2327 mddb_rb32_t *rbp; 2328 2329 2330 mddb_type_t type = dep->de_type1; 2331 caddr_t data, udata; 2332 2333 #if defined(_ILP32) && !defined(lint) 2334 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2335 #endif 2336 rbp = dep->de_rb; 2337 data = (caddr_t)rbp->rb_data; 2338 udata = (caddr_t)dep->de_rb_userdata; 2339 2340 /* 2341 * If it's a driver record, and an old style record, and not a DRL 2342 * record, we must convert it because it was incore as a 64 bit 2343 * structure but its on disk layout has only 32 bit for block sizes 2344 */ 2345 if (!(md_get_setstatus(setno) & 2346 (MD_SET_IMPORT | 
MD_SET_REPLICATED_IMPORT)) && 2347 (type >= MDDB_FIRST_MODID) && 2348 ((rbp->rb_revision == MDDB_REV_RB) || 2349 (rbp->rb_revision == MDDB_REV_RBFN))) { 2350 2351 switch (dep->de_flags) { 2352 2353 case MDDB_F_STRIPE: 2354 stripe_convert(data, udata, BIG_2_SMALL); 2355 break; 2356 2357 case MDDB_F_MIRROR: 2358 mirror_convert(data, udata, BIG_2_SMALL); 2359 break; 2360 2361 case MDDB_F_RAID: 2362 raid_convert(data, udata, BIG_2_SMALL); 2363 break; 2364 2365 case MDDB_F_SOFTPART: 2366 softpart_convert(data, udata, BIG_2_SMALL); 2367 break; 2368 2369 case MDDB_F_TRANS_MASTER: 2370 trans_master_convert(data, udata, BIG_2_SMALL); 2371 break; 2372 2373 case MDDB_F_TRANS_LOG: 2374 trans_log_convert(data, udata, BIG_2_SMALL); 2375 break; 2376 2377 case MDDB_F_HOTSPARE: 2378 hs_convert(data, udata, BIG_2_SMALL); 2379 break; 2380 2381 case MDDB_F_OPT: 2382 default: 2383 bcopy(udata, data, dep->de_reqsize); 2384 } 2385 } else { 2386 bcopy(udata, data, dep->de_reqsize); 2387 } 2388 } 2389 2390 static void 2391 getoptrecord( 2392 mddb_set_t *s, 2393 mddb_de_ic_t *dep 2394 ) 2395 { 2396 mddb_lb_t *lbp; 2397 mddb_locator_t *lp; 2398 mddb_rb32_t *rbp, *crbp; 2399 int li; 2400 int i; 2401 int err = 0; 2402 size_t recsize; 2403 2404 #if defined(_ILP32) && !defined(lint) 2405 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2406 #endif 2407 2408 lbp = s->s_lbp; 2409 2410 recsize = dep->de_recsize; 2411 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2412 rbp = dep->de_rb; 2413 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2414 2415 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 2416 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2417 2418 for (i = 0; i < 2; i++) { 2419 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2420 continue; 2421 li = dep->de_optinfo[i].o_li; 2422 lp = &lbp->lb_locators[li]; 2423 2424 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 2425 (lp->l_flags & MDDB_F_EMASTER)) 2426 continue; 2427 2428 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 2429 dep->de_blkcount, li, 0); 2430 2431 if (err) 2432 continue; 2433 2434 if (rbp->rb_magic != MDDB_MAGIC_RB) 2435 continue; 2436 2437 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 2438 continue; 2439 2440 /* Check the crc for this record */ 2441 if (rec_crcchk(s, dep, rbp)) { 2442 continue; 2443 } 2444 2445 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 2446 2447 if (rbp == crbp) { 2448 if (rbp->rb_checksum != crbp->rb_checksum) 2449 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2450 break; 2451 } 2452 rbp = crbp; 2453 } 2454 2455 if (rbp == crbp) { 2456 rbp->rb_private = 0; 2457 kmem_free((caddr_t)crbp, recsize); 2458 return; 2459 } 2460 bzero((caddr_t)rbp, recsize); 2461 rbp->rb_magic = MDDB_MAGIC_RB; 2462 rbp->rb_revision = MDDB_REV_RB; 2463 uniqtime32(&rbp->rb_timestamp); 2464 /* Generate the crc for this record */ 2465 rec_crcgen(s, dep, rbp); 2466 kmem_free((caddr_t)crbp, recsize); 2467 } 2468 2469 /* 2470 * writeoptrecord writes out an optimized record. 2471 */ 2472 static int 2473 writeoptrecord( 2474 mddb_set_t *s, 2475 mddb_de_ic_t *dep 2476 ) 2477 { 2478 mddb_rb32_t *rbp; 2479 int li; 2480 int err = 0, wrt_err = 0; 2481 mddb_bf_t *bufhead, *bfp; 2482 mddb_lb_t *lbp = s->s_lbp; 2483 mddb_locator_t *lp; 2484 int i; 2485 2486 #if defined(_ILP32) && !defined(lint) 2487 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2488 #endif 2489 2490 bufhead = NULL; 2491 err = 0; 2492 2493 while (s->s_opthavequeuinglck) { 2494 s->s_optwantqueuinglck++; 2495 cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno)); 2496 } 2497 s->s_opthavequeuinglck++; 2498 rbp = dep->de_rb; 2499 for (i = 0; i < 2; i++) { 2500 /* 2501 * only possible error is xlate. This can 2502 * occur if a replica was off line and came 2503 * back. During the mean time the database grew 2504 * large than the now on line replica can store 2505 */ 2506 if (! 
(dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2507 continue; 2508 li = dep->de_optinfo[i].o_li; 2509 /* 2510 * In a MN diskset, any node can write optimized record(s). 2511 */ 2512 wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 2513 dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); 2514 /* 2515 * For MN diskset, set error in optinfo structure so 2516 * that mddb_commitrec knows which replica failed. 2517 */ 2518 if ((MD_MNSET_SETNO(s->s_setno)) && 2519 (wrt_err & MDDB_F_EWRITE)) { 2520 dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE; 2521 } 2522 err |= wrt_err; 2523 } 2524 s->s_opthavequeuinglck = 0; 2525 if (s->s_optwantqueuinglck) { 2526 s->s_optwantqueuinglck = 0; 2527 cv_broadcast(&s->s_optqueuing_cv); 2528 } 2529 for (bfp = bufhead; bfp; bfp = bufhead) { 2530 mutex_exit(SETMUTEX(s->s_setno)); 2531 (void) biowait(&bfp->bf_buf); 2532 mutex_enter(SETMUTEX(s->s_setno)); 2533 if (bfp->bf_buf.b_flags & B_ERROR) { 2534 /* 2535 * If an MN diskset, don't set replica 2536 * in error since this hasn't been set in master. 2537 * Setting replica in error before master could 2538 * leave the nodes with different views of the 2539 * world since a class 1 configuration change 2540 * could occur in mddb_commitrec as soon as 2541 * all locks are dropped. Must keep this 2542 * node the same as master and can't afford a 2543 * failure from the class 1 config change 2544 * if master succeeded. 2545 */ 2546 if (!(MD_MNSET_SETNO(s->s_setno))) { 2547 bfp->bf_locator->l_flags |= MDDB_F_EWRITE; 2548 } else { 2549 /* 2550 * Find which de_optinfo (which replica) 2551 * had a failure and set the failure in 2552 * the o_flags field. 
2553 */ 2554 lp = &lbp->lb_locators[dep->de_optinfo[0].o_li]; 2555 if (lp == bfp->bf_locator) { 2556 dep->de_optinfo[0].o_flags |= 2557 MDDB_F_EWRITE; 2558 } else { 2559 dep->de_optinfo[1].o_flags |= 2560 MDDB_F_EWRITE; 2561 } 2562 } 2563 err |= MDDB_F_EWRITE; 2564 } 2565 bufhead = bfp->bf_next; 2566 freebuffer(s, bfp); 2567 } 2568 return (err); 2569 } 2570 2571 /* 2572 * Fix up the optimized resync record. Used in the traditional and local 2573 * disksets to move an optimized record from a failed or deleted mddb 2574 * to an active one. 2575 * 2576 * In a MN diskset, the fixing of the optimized record is split between 2577 * the master and slave nodes. If the master node moves the optimized 2578 * resync record, then the master node will send a MDDB_PARSE_OPTRECS 2579 * message to the slave nodes causing the slave nodes to reget the 2580 * directory entry containing the location of the optimized resync record. 2581 * After the record is reread from disk, then writeoptrecord is called 2582 * if the location of the optimized resync record or flags have changed. 2583 * When writeoptrecord is called, the node that is the owner of this record 2584 * will write the optimized record to the location specified in the directory 2585 * entry. Since the master node uses the highest class message (PARSE) 2586 * the record owner node is guaranteed to already have an updated 2587 * directory entry incore. 2588 * 2589 * The other difference between the traditional/local set and MN diskset 2590 * is that the directory entry can be written to disk before the optimized 2591 * record in a MN diskset if the record is owned by a slave node. So, 2592 * the users of an optimized record must handle the failure case when no 2593 * data is available from an optimized record since the master node could 2594 * have failed during the relocation of the optimized record to another mddb. 
2595 */ 2596 static int 2597 fixoptrecord( 2598 mddb_set_t *s, 2599 mddb_de_ic_t *dep, 2600 mddb_db_t *dbp 2601 ) 2602 { 2603 int changed; 2604 int writedata; 2605 int err = 0; 2606 int i; 2607 mddb_lb_t *lbp; 2608 mddb_optinfo_t *op; 2609 mddb_db32_t *db32p; 2610 int rec_owner; /* Is node owner of record? */ 2611 2612 #if defined(_ILP32) && !defined(lint) 2613 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2614 #endif 2615 2616 lbp = s->s_lbp; 2617 changed = 0; 2618 writedata = 0; 2619 for (i = 0; i < 2; i++) { 2620 op = &dep->de_optinfo[i]; 2621 2622 if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE)) 2623 op->o_flags = 0; 2624 2625 /* 2626 * If optimized record has seen a replica failure, 2627 * assign new replica to record and re-write data 2628 * to new record. 2629 */ 2630 if (! (op->o_flags & MDDB_F_ACTIVE)) { 2631 getoptdev(s, dep, i); 2632 writedata++; 2633 changed++; 2634 /* Set flag for slaves to reread dep and write rec */ 2635 if (lbp->lb_flags & MDDB_MNSET) { 2636 s->s_mn_parseflags |= MDDB_PARSE_OPTRECS; 2637 } 2638 } 2639 2640 /* 2641 * If just an error in the data was seen, set 2642 * the optimized record's replica flag to active (ok) 2643 * and try again. 2644 */ 2645 if (op->o_flags & MDDB_F_EDATA) { 2646 dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE; 2647 writedata++; 2648 } 2649 } 2650 2651 rec_owner = 0; 2652 if (lbp->lb_flags & MDDB_MNSET) { 2653 /* 2654 * If a MN diskset then check the owner of optimized record. 2655 * If the master node owns the record or if there is 2656 * no owner of the record, then the master can write the 2657 * optimized record to disk. 2658 * Master node can write the optimized record now, but 2659 * slave nodes write their records during handling of 2660 * the MDDB_PARSE_OPTRECS message. 
2661 */ 2662 if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) || 2663 (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) { 2664 rec_owner = 1; 2665 } 2666 } else { 2667 /* 2668 * In traditional diskset and local set, this node 2669 * is always the record owner and always the master. 2670 */ 2671 rec_owner = 1; 2672 } 2673 2674 /* 2675 * If this node is the record owner, write out record. 2676 */ 2677 if ((writedata) && (rec_owner)) { 2678 if (err = writeoptrecord(s, dep)) { 2679 return (err); 2680 } 2681 } 2682 if (! changed) 2683 return (0); 2684 uniqtime32(&dbp->db_timestamp); 2685 dbp->db_revision = MDDB_REV_DB; 2686 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2687 create_db32rec(db32p, dbp); 2688 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 2689 err = writeall(s, (caddr_t)db32p, db32p->db32_blknum, 2690 1, MDDB_WR_ONLY_MASTER); 2691 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2692 return (err); 2693 } 2694 2695 static int 2696 fixoptrecords( 2697 mddb_set_t *s 2698 ) 2699 { 2700 mddb_de_ic_t *dep; 2701 mddb_db_t *dbp; 2702 int err = 0; 2703 set_t setno; 2704 2705 /* 2706 * In a MN diskset, the master node is the only node that runs 2707 * fixoptrecords. If the master node changes anything, then the 2708 * master node sends PARSE message to the slave nodes. The slave 2709 * nodes will then re-read in the locator block or re-read in the 2710 * directory blocks and re-write the optimized resync records. 2711 */ 2712 setno = s->s_setno; 2713 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 2714 (md_set[setno].s_am_i_master == 0)) { 2715 return (0); 2716 } 2717 2718 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2719 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2720 if (! (dep->de_flags & MDDB_F_OPT)) 2721 continue; 2722 err = fixoptrecord(s, dep, dbp); 2723 if (err != 0) 2724 return (err); 2725 } 2726 } 2727 return (0); 2728 } 2729 2730 /* 2731 * Checks incore version of mddb data to mddb data ondisk. 
 *
 * The replica to check is selected by the locator index li.  The check
 * is done in two passes: first every directory block and its entries
 * are compared with the incore directory, then every record that is
 * neither optimized nor a change log record is read back and its
 * checksum and checksum fiddle are compared with the incore record.
 * All reads go through the per-set scratch buffer s->s_databuffer,
 * which is allocated on first use.
 *
 * Returns:
 * - 0 if the data was successfully read and is good.
 * - MDDB_F_EREAD if a read error occurred.
 * - 1 if the data read is bad (checksum failed, etc)
 */
static int
checkcopy
(
	mddb_set_t	*s,
	int		li
)
{
	mddb_db_t	*dbp;
	mddb_db32_t	*cdb32p;
	mddb_de_ic_t	*dep;
	mddb_de32_t	*cde32p;
	mddb_rb32_t	*rbp, *crbp;
	size_t		size;
	int		i;
	int		retval = 1;	/* "bad data" unless a read fails */

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	/*
	 * Size the scratch buffer for the largest non-optimized record,
	 * but never smaller than one directory block.
	 * NOTE(review): the buffer is only sized when s_databuffer_size
	 * is 0; presumably it is reset elsewhere when a larger record is
	 * created - confirm.
	 */
	if (s->s_databuffer_size == 0) {
		size_t maxrecsize = MDDB_BSIZE;

		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
				if (! (dep->de_flags & MDDB_F_OPT) &&
				    dep->de_recsize > maxrecsize)
					maxrecsize = dep->de_recsize;

		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
		s->s_databuffer_size = maxrecsize;
	}

	cdb32p = (mddb_db32_t *)s->s_databuffer;

	/*
	 * first go through and make sure all directory stuff
	 * is the same
	 */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
			retval = MDDB_F_EREAD;
			goto err;
		}
		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
			goto err;
		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
			goto err;
		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
			goto err;
		if (cdb32p->db32_nextblk != dbp->db_nextblk)
			goto err;
		if (cdb32p->db32_recsum != dbp->db_recsum)
			goto err;
		/*
		 * On disk the first entry is taken to start right behind
		 * the db32_firstentry field itself.
		 */
		if (cdb32p->db32_firstentry) {
			cde32p = (mddb_de32_t *)
			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
			    + sizeof (cdb32p->db32_firstentry)));
		} else
			cde32p = NULL;

		dep = dbp->db_firstentry;
		/*
		 * check if all directory entries are identical
		 */
		while (dep && cde32p) {
			if (dep->de_recid != cde32p->de32_recid)
				goto err;
			if (dep->de_type1 != cde32p->de32_type1)
				goto err;
			if (dep->de_type2 != cde32p->de32_type2)
				goto err;
			if (dep->de_reqsize != cde32p->de32_reqsize)
				goto err;
			if (dep->de_flags != cde32p->de32_flags)
				goto err;

			/* both optimized-copy locator indexes must agree */
			for (i = 0; i < 2; i++) {
				if (dep->de_optinfo[i].o_li !=
				    cde32p->de32_optinfo[i].o_li)
					break;
			}
			if (i != 2)
				goto err;
			size = sizeof (mddb_block_t) * dep->de_blkcount;
			if (bcmp((caddr_t)dep->de_blks,
			    (caddr_t)cde32p->de32_blks, size))
				goto err;
			dep = dep->de_next;
			if (cde32p->de32_next)
				cde32p = nextentry(cde32p);
			else
				cde32p = NULL;
		}
		/* one list ended before the other: entry counts differ */
		if (dep || cde32p)
			goto err;
	}
	/*
	 * If here, all directories are functionally identical
	 * check to make sure all records are identical
	 * the reason the records are not just bcmped is that the
	 * lock flag does not want to be compared.
	 */
	crbp = (mddb_rb32_t *)cdb32p;	/* reuse the scratch buffer */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if ((dep->de_flags & MDDB_F_OPT) ||
			    (dep->de_flags & MDDB_F_CHANGELOG))
				continue;
			rbp = (mddb_rb32_t *)dep->de_rb;
			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
			    dep->de_blkcount, li, 0)) {
				retval = MDDB_F_EREAD;
				goto err;
			}
			/* Check the crc for this record */
			if (rec_crcchk(s, dep, crbp))
				goto err;

			if (rbp->rb_checksum != crbp->rb_checksum ||
			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
				goto err;
		}
	}
	return (0);
err:
	return (retval);
}

/*
 * Determine if the location information for two mddbs is the same.
 * The device slice and block offset should match.
If both have devids then 2872 * use that for the comparison, otherwise we compare the dev_ts. 2873 * Comparing with the devid allows us to handle the case where a mddb was 2874 * relocated to a dead mddbs dev_t. The live mddb will have the dev_t of 2875 * the dead mddb but the devid comparison will catch this and not match. 2876 * 2877 * Return 1 if the location of the two mddbs match, 0 if not. 2878 */ 2879 static int 2880 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev, 2881 daddr32_t blkno) 2882 { 2883 if (rip->ri_flags & MDDB_F_EMASTER) { 2884 /* 2885 * If this element is errored then we don't try to match on it. 2886 * If we try to match we could erroneously match on the dev_t 2887 * of a relocated disk. 2888 */ 2889 return (0); 2890 } 2891 2892 if (rip->ri_devid && devid && minor) { 2893 /* 2894 * If old devid exists, then this is a replicated diskset 2895 * and both old and new devids must be checked. 2896 */ 2897 if (rip->ri_old_devid) { 2898 if (((ddi_devid_compare(rip->ri_devid, devid) != 0) && 2899 (ddi_devid_compare(rip->ri_old_devid, 2900 devid) != 0)) || 2901 (strcmp(rip->ri_minor_name, minor) != 0)) 2902 return (0); 2903 } else { 2904 if (ddi_devid_compare(rip->ri_devid, devid) != 0 || 2905 strcmp(rip->ri_minor_name, minor) != 0) 2906 return (0); 2907 } 2908 } else { 2909 if (rip->ri_dev != dev) 2910 return (0); 2911 } 2912 2913 if (rip->ri_blkno != blkno) 2914 return (0); 2915 2916 return (1); 2917 } 2918 2919 static int 2920 ridev( 2921 mddb_ri_t **rip, 2922 mddb_cfg_loc_t *clp, 2923 dev32_t *dev_2b_fixed, 2924 int flag) 2925 { 2926 mddb_ri_t *r, *r1; 2927 md_dev64_t ldev, ndev; 2928 major_t majordev; 2929 int sz; 2930 2931 if (MD_UPGRADE) { 2932 ldev = md_makedevice(md_targ_name_to_major(clp->l_driver), 2933 clp->l_mnum); 2934 } else { 2935 if (ddi_name_to_major(clp->l_driver) == (major_t)-1) 2936 return (EINVAL); 2937 2938 ldev = md_makedevice(ddi_name_to_major(clp->l_driver), 2939 clp->l_mnum); 2940 } 2941 2942 if 
(clp->l_devid != 0) { 2943 /* 2944 * Get dev associated with device id and minor name. 2945 * Setup correct driver name if dev is now different. 2946 * Don't change driver name if during upgrade. 2947 */ 2948 ndev = ldev; 2949 if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid, 2950 &ndev, clp->l_minor_name)) { 2951 if ((ndev != ldev) && (!(MD_UPGRADE))) { 2952 majordev = md_getmajor(ndev); 2953 (void) strcpy(clp->l_driver, 2954 ddi_major_to_name(majordev)); 2955 clp->l_mnum = md_getminor(ndev); 2956 clp->l_devid_flags |= MDDB_DEVID_VALID; 2957 ldev = ndev; 2958 } 2959 } else { 2960 /* Mark as invalid */ 2961 clp->l_devid_flags &= ~MDDB_DEVID_VALID; 2962 } 2963 } 2964 2965 clp->l_dev = md_cmpldev(ldev); 2966 if (dev_2b_fixed) 2967 *dev_2b_fixed = clp->l_dev; 2968 r = *rip; 2969 2970 while (r) { 2971 if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid, 2972 clp->l_minor_name, ldev, clp->l_blkno)) { 2973 if ((clp->l_devid != 0) && 2974 !(clp->l_devid_flags & MDDB_DEVID_VALID)) { 2975 r->ri_flags |= MDDB_F_EMASTER; 2976 } else { 2977 r->ri_flags |= flag; 2978 } 2979 return (0); /* already entered return success */ 2980 } 2981 r = r->ri_next; 2982 } 2983 2984 /* 2985 * This replica not represented in the current rip list, 2986 * so add it to the list. 
2987 */ 2988 r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP); 2989 r->ri_dev = ldev; 2990 r->ri_blkno = clp->l_blkno; 2991 (void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM); 2992 if (strlen(clp->l_driver) >= MD_MAXDRVNM) { 2993 r->ri_driver[(MD_MAXDRVNM -1)] = '\0'; 2994 } 2995 if (clp->l_devname != NULL) { 2996 (void) strcpy(r->ri_devname, clp->l_devname); 2997 } 2998 r->ri_flags |= flag; 2999 if (clp->l_devid != 0) { 3000 sz = clp->l_devid_sz; 3001 r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); 3002 bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz); 3003 3004 if (clp->l_old_devid != NULL) { 3005 sz = clp->l_old_devid_sz; 3006 r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz, 3007 KM_SLEEP); 3008 bcopy((char *)(uintptr_t)clp->l_old_devid, 3009 (char *)r->ri_old_devid, sz); 3010 } else { 3011 r->ri_old_devid = 0; 3012 } 3013 if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX) 3014 (void) strcpy(r->ri_minor_name, clp->l_minor_name); 3015 3016 if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) { 3017 /* 3018 * Devid is present, but not valid. This could 3019 * happen if device has been powered off or if 3020 * the device has been removed. Mark the device in 3021 * error. Don't allow any writes to this device 3022 * based on the dev_t since another device could 3023 * have been placed in its spot and be responding to 3024 * the dev_t accesses. 3025 */ 3026 r->ri_flags |= MDDB_F_EMASTER; 3027 } 3028 } else { 3029 r->ri_devid = 0; 3030 r->ri_old_devid = 0; 3031 } 3032 3033 /* 3034 * If the rip list is empty then this entry 3035 * is the list. 3036 */ 3037 if (*rip == NULL) { 3038 *rip = r; 3039 return (0); 3040 } 3041 3042 /* 3043 * Add this entry to the end of the rip list 3044 */ 3045 r1 = *rip; 3046 while (r1->ri_next) 3047 r1 = r1->ri_next; 3048 r1->ri_next = r; 3049 return (0); 3050 } 3051 3052 /* 3053 * writecopy writes the incore data blocks out to all of the replicas. 
3054 * This is called from writestart 3055 * - when a diskset is started or 3056 * - when an error has been enountered during the write to a mddb. 3057 * and from newdev when a new mddb is being added. 3058 * 3059 * flag can be 2 values: 3060 * MDDB_WRITECOPY_ALL - write all records to all mddbs. This is 3061 * always used for traditional and local disksets. 3062 * For MN diskset: 3063 * All nodes can call writecopy, but only the 3064 * master node actually writes data to the disk 3065 * except for optimized resync records. 3066 * An optimized resync record can only be written to 3067 * by the record owner. 3068 * MDDB_WRITECOPY_SYNC - special case for MN diskset. When a new 3069 * master has been chosen, the new master may need to 3070 * write its incore mddb to disk (this is the case where the 3071 * old master had executed a message but hadn't relayed it 3072 * to this slave yet). New master should not write the 3073 * change log records since new master would be overwriting 3074 * valuable data. Only used during a reconfig cycle. 
3075 */ 3076 static int 3077 writecopy( 3078 mddb_set_t *s, 3079 int li, 3080 int flag 3081 ) 3082 { 3083 mddb_db_t *dbp; 3084 mddb_db32_t *db32p; 3085 mddb_de_ic_t *dep; 3086 mddb_rb32_t *rbp; 3087 uint_t checksum; 3088 int err = 0; 3089 3090 #if defined(_ILP32) && !defined(lint) 3091 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 3092 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 3093 #endif 3094 3095 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 3096 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 3097 create_db32rec(db32p, dbp); 3098 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 3099 err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li, 3100 MDDB_WR_ONLY_MASTER); 3101 kmem_free((caddr_t)db32p, MDDB_BSIZE); 3102 if (err) 3103 return (err); 3104 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 3105 /* 3106 * In a multinode diskset, when a new master is 3107 * chosen the new master may need to write its 3108 * incore copy of the mddb to disk. In this case, 3109 * don't want to overwrite the change log records 3110 * so new master sets flag to MDDB_WRITECOPY_SYNC. 3111 */ 3112 if (flag == MDDB_WRITECOPY_SYNC) { 3113 if (dep->de_flags & MDDB_F_CHANGELOG) 3114 continue; 3115 } 3116 /* 3117 * In a multinode diskset, don't write out optimized 3118 * resync resyncs since only the mirror owner node 3119 * will have the correct data. If writecopy is 3120 * being called from writestart as a result of 3121 * an mddb failure, then writestart will handle 3122 * the optimized records when it calls fixoptrecords. 
3123 */ 3124 if ((MD_MNSET_SETNO(s->s_setno)) && 3125 (dep->de_flags & MDDB_F_OPT)) { 3126 continue; 3127 } 3128 3129 rbp = dep->de_rb; 3130 checksum = rbp->rb_checksum_fiddle; 3131 checksum ^= rbp->rb_checksum; 3132 /* Generate the crc for this record */ 3133 rec_crcgen(s, dep, rbp); 3134 checksum ^= rbp->rb_checksum; 3135 rbp->rb_checksum_fiddle = checksum; 3136 if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 3137 dep->de_blkcount, li, (mddb_bf_t **)0, 3138 MDDB_WR_ONLY_MASTER)) 3139 return (err); 3140 } 3141 } 3142 return (0); 3143 } 3144 3145 static int 3146 upd_med( 3147 mddb_set_t *s, 3148 char *tag 3149 ) 3150 { 3151 med_data_t meddb; 3152 int medok; 3153 mddb_lb_t *lbp = s->s_lbp; 3154 set_t setno = s->s_setno; 3155 int li; 3156 int alc; 3157 int lc; 3158 3159 3160 /* If no mediator hosts, nothing to do */ 3161 if (s->s_med.n_cnt == 0) 3162 return (0); 3163 3164 /* 3165 * If this is a MN set and we are not the master, then don't 3166 * update mediator hosts or mark mediator as golden since 3167 * only master node should do that. 3168 */ 3169 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 3170 (md_set[setno].s_am_i_master == 0)) { 3171 return (0); 3172 } 3173 3174 bzero((char *)&meddb, sizeof (med_data_t)); 3175 meddb.med_dat_mag = MED_DATA_MAGIC; 3176 meddb.med_dat_rev = MED_DATA_REV; 3177 meddb.med_dat_fl = 0; 3178 meddb.med_dat_sn = setno; 3179 meddb.med_dat_cc = lbp->lb_commitcnt; 3180 TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime); 3181 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); 3182 3183 /* count accessible mediators */ 3184 medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); 3185 3186 /* count accessible and existing replicas */ 3187 for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) { 3188 mddb_locator_t *lp = &lbp->lb_locators[li]; 3189 3190 if (lp->l_flags & MDDB_F_DELETED) 3191 continue; 3192 3193 lc++; 3194 3195 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 3196 (lp->l_flags & MDDB_F_EMASTER) || 3197 (lp->l_flags & MDDB_F_EWRITE)) 3198 continue; 3199 3200 alc++; 3201 } 3202 3203 /* 3204 * Mediator update quorum is >= 50%: check for less than 3205 * "mediator update" quorum. 3206 */ 3207 if ((medok * 2) < s->s_med.n_cnt) { 3208 /* panic if <= 50% of all replicas are accessible */ 3209 if ((lc > 0) && ((alc * 2) <= lc)) { 3210 cmn_err(CE_PANIC, 3211 "md: Update of 50%% of the mediator hosts failed"); 3212 /* NOTREACHED */ 3213 } 3214 3215 cmn_err(CE_WARN, 3216 "md: Update of 50%% of the mediator hosts failed"); 3217 } 3218 3219 /* 3220 * If we have mediator update quorum and exactly 50% of the replicas 3221 * are accessible then mark the mediator as golden. 3222 */ 3223 if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) && 3224 ((alc * 2) == lc)) { 3225 meddb.med_dat_fl = MED_DFL_GOLDEN; 3226 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); 3227 (void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); 3228 } 3229 3230 return (0); 3231 } 3232 3233 static int 3234 push_lb(mddb_set_t *s) 3235 { 3236 mddb_lb_t *lbp = s->s_lbp; 3237 3238 /* push the change to all the replicas */ 3239 uniqtime32(&lbp->lb_timestamp); 3240 if (MD_MNSET_SETNO(s->s_setno)) { 3241 lbp->lb_revision = MDDB_REV_MNLB; 3242 } else { 3243 lbp->lb_revision = MDDB_REV_LB; 3244 } 3245 /* 3246 * The updates to the mediator hosts are done 3247 * by the callers of this function. 
3248 */ 3249 return (writelocall(s)); 3250 } 3251 3252 /* Should not call for MN diskset since data tags are not supported */ 3253 static int 3254 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp) 3255 { 3256 int diff = 0; 3257 3258 diff = (int)(odtp->dt_setno - ndtp->dt_setno); 3259 if (diff) 3260 return (diff); 3261 3262 diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN); 3263 if (diff) 3264 return (diff); 3265 3266 diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1); 3267 if (diff) 3268 return (diff); 3269 3270 /*CSTYLED*/ 3271 return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=)); 3272 } 3273 3274 /* Should not call for MN diskset since data tags are not supported */ 3275 static int 3276 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp) 3277 { 3278 int nextid = 0; 3279 mddb_dtag_lst_t **dtlpp = &s->s_dtlp; 3280 3281 /* Run to the end of the list */ 3282 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) { 3283 if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0) 3284 return (0); 3285 nextid++; 3286 } 3287 3288 /* Add the new member */ 3289 *dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP); 3290 3291 /* Update the dtag portion of the list */ 3292 bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt), 3293 sizeof (mddb_dtag_t)); 3294 3295 /* Fix up the id value */ 3296 (*dtlpp)->dtl_dt.dt_id = ++nextid; 3297 3298 return (0); 3299 } 3300 3301 /* 3302 * Even though data tags are not supported in MN disksets, dt_cntl may 3303 * be called for a MN diskset since this routine is called even before 3304 * it is known the kind of diskset being read in from disk. 3305 * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned. 
3306 */ 3307 static int 3308 dtl_cntl(mddb_set_t *s) 3309 { 3310 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3311 int ndt = 0; 3312 3313 while (dtlp != NULL) { 3314 ndt++; 3315 dtlp = dtlp->dtl_nx; 3316 } 3317 3318 return (ndt); 3319 } 3320 3321 /* 3322 * Even though data tags are not supported in MN disksets, dt_cntl may 3323 * be called for a MN diskset since this routine is called even before 3324 * it is known the kind of diskset being read in from disk. 3325 * For a MNdiskset, s_dtlp is 0 so a 0 is returned. 3326 */ 3327 static mddb_dtag_t * 3328 dtl_findl(mddb_set_t *s, int id) 3329 { 3330 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3331 3332 while (dtlp != NULL) { 3333 if (dtlp->dtl_dt.dt_id == id) 3334 return (&dtlp->dtl_dt); 3335 dtlp = dtlp->dtl_nx; 3336 } 3337 return ((mddb_dtag_t *)NULL); 3338 } 3339 3340 /* Should not call for MN diskset since data tags are not supported */ 3341 static void 3342 dtl_freel(mddb_dtag_lst_t **dtlpp) 3343 { 3344 mddb_dtag_lst_t *dtlp; 3345 mddb_dtag_lst_t *tdtlp; 3346 3347 3348 for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) { 3349 dtlp = tdtlp->dtl_nx; 3350 kmem_free(tdtlp, sizeof (mddb_dtag_lst_t)); 3351 } 3352 *dtlpp = (mddb_dtag_lst_t *)NULL; 3353 } 3354 3355 /* 3356 * Even though data tags are not supported in MN disksets, dt_setup will 3357 * be called for a MN diskset since this routine is called even before 3358 * it is known the kind of diskset being read in from disk. 3359 * Once this set is known as a MN diskset, the dtp area will be freed. 
3360 */ 3361 static void 3362 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp) 3363 { 3364 mddb_dt_t *dtp; 3365 set_t setno = s->s_setno; 3366 3367 3368 if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) 3369 md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3370 else if (dtagp == (mddb_dtag_t *)NULL) 3371 bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 3372 3373 /* shorthand */ 3374 dtp = (mddb_dt_t *)md_set[setno].s_dtp; 3375 3376 dtp->dt_mag = MDDB_MAGIC_DT; 3377 dtp->dt_rev = MDDB_REV_DT; 3378 3379 if (dtagp != NULL) 3380 dtp->dt_dtag = *dtagp; /* structure assignment */ 3381 3382 /* Initialize the setno */ 3383 dtp->dt_dtag.dt_setno = setno; 3384 3385 /* Clear the id and flags, this is only used in user land */ 3386 dtp->dt_dtag.dt_id = 0; 3387 3388 /* Checksum it */ 3389 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 3390 } 3391 3392 /* Should not call for MN diskset since data tags are not supported */ 3393 static int 3394 set_dtag(mddb_set_t *s, md_error_t *ep) 3395 { 3396 mddb_lb_t *lbp = s->s_lbp; 3397 mddb_dtag_t tag; 3398 3399 if (lbp->lb_dtblkcnt == 0) { 3400 /* Data tags not used in a MN set - so no failure returned */ 3401 if (lbp->lb_flags & MDDB_MNSET) 3402 return (0); 3403 3404 cmn_err(CE_WARN, 3405 "No tag record allocated, unable to tag data"); 3406 (void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno); 3407 return (1); 3408 } 3409 3410 /* Clear the stack variable */ 3411 bzero((caddr_t)&tag, sizeof (mddb_dtag_t)); 3412 3413 /* Get the HW serial number for this host */ 3414 (void) strncpy(tag.dt_sn, hw_serial, MDDB_SN_LEN); 3415 tag.dt_sn[MDDB_SN_LEN - 1] = '\0'; 3416 3417 /* Get the nodename that this host goes by */ 3418 (void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME); 3419 tag.dt_hn[MD_MAX_NODENAME] = '\0'; 3420 3421 /* Get a time stamp for NOW */ 3422 uniqtime32(&tag.dt_tv); 3423 3424 /* Setup the data tag record */ 3425 dt_setup(s, &tag); 3426 3427 /* Free any list of tags if they exist */ 3428 
dtl_freel(&s->s_dtlp); 3429 3430 /* Put the new tag onto the tag list */ 3431 (void) dtl_addl(s, &tag); 3432 3433 return (0); 3434 } 3435 3436 /* 3437 * If called during upgrade, this routine expects a non-translated 3438 * (aka target) dev. 3439 * Should not call for MN diskset since data tags are not supported. 3440 */ 3441 static int 3442 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip) 3443 { 3444 int err = 0; 3445 md_dev64_t dev; 3446 caddr_t tbuf; 3447 daddr_t physblk; 3448 mddb_block_t blk; 3449 mddb_dt_t *dtp; 3450 mddb_dtag_t *dtagp; 3451 set_t setno = s->s_setno; 3452 3453 /* If have not allocated a data tag record, there is nothing to do */ 3454 if (lbp->lb_dtblkcnt == 0) 3455 return (1); 3456 3457 dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3458 3459 if (dtp == (mddb_dt_t *)NULL) 3460 return (1); 3461 3462 /* shorthand */ 3463 dev = md_xlate_targ_2_mini(rip->ri_dev); 3464 if (dev == NODEV64) { 3465 return (1); 3466 } 3467 3468 tbuf = (caddr_t)rip->ri_dtp; 3469 3470 for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) { 3471 physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip); 3472 err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0); 3473 /* error reading the tag */ 3474 if (err) { 3475 err = 1; 3476 goto out; 3477 } 3478 tbuf += MDDB_BSIZE; 3479 } 3480 3481 /* magic is valid? */ 3482 if (dtp->dt_mag != MDDB_MAGIC_DT) { 3483 err = 1; 3484 goto out; 3485 } 3486 3487 /* revision is valid? */ 3488 if (revchk(MDDB_REV_DT, dtp->dt_rev)) { 3489 err = 1; 3490 goto out; 3491 } 3492 3493 /* crc is valid? */ 3494 if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) { 3495 err = 1; 3496 goto out; 3497 } 3498 3499 /* shorthand */ 3500 dtagp = &dtp->dt_dtag; 3501 3502 /* set number match? */ 3503 if (dtagp->dt_setno != setno) { 3504 err = 1; 3505 goto out; 3506 } 3507 3508 /* tag is not empty? 
*/
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0) {
		err = 2;
		goto out;
	}

	/* Mark the locator as having tagged data */
	rip->ri_flags |= MDDB_F_TAGDATA;

out:
	if (err) {
		/* only a bad tag (err == 1) marks the set; an empty one doesn't */
		if (err == 1) {
			md_set_setstatus(setno, MD_SET_BADTAG);
			rip->ri_flags |= MDDB_F_BADTAG;
		}
		/* on any failure the tag buffer is released and detached */
		if (dtp != NULL) {
			kmem_free(dtp, MDDB_DT_BYTES);
			rip->ri_dtp = (mddb_dt_t *)NULL;
		}
	}

	return (err);
}

/*
 * dt_write writes the incore data tag record of the set to every locator
 * that is active, not deleted and not in write error, and updates the
 * tag flags of each locator that was written successfully.  Nothing is
 * written when no tag record is allocated or the set has MD_SET_BADTAG
 * set.  When all writes succeed and no written-to locator carries tagged
 * data any more, MD_SET_CLRTAG and MD_SET_TAGDATA are cleared.
 *
 * Returns 0 on success, otherwise the OR of the writeblks errors.
 *
 * Should not call for MN diskset since data tags are not supported
 */
static int
dt_write(mddb_set_t *s)
{
	int		li;
	int		err = 0;
	int		werr;
	int		empty_tag = 0;
	mddb_dtag_t	*dtagp;
	mddb_dt_t	*dtp;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);


	ASSERT(md_set[setno].s_dtp != NULL);

	/* Nowhere to write to */
	if (lbp->lb_dtblkcnt == 0)
		return (err);

	/* A bad tag record must be reallocated before it is written */
	if (set_status & MD_SET_BADTAG)
		return (err);

	/* shorthand */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
	dtagp = &dtp->dt_dtag;

	/* See if the tag is empty. */
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0)
		empty_tag = 1;

	/* Write the tag to the locators and reset appropriate flags. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);

		/* keep going; the other locators still get their tag */
		if (werr) {
			err |= werr;
			continue;
		}

		if (empty_tag)
			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
		else {
			lp->l_flags |= MDDB_F_TAGDATA;
			lp->l_flags &= ~MDDB_F_BADTAG;
		}
	}

	if (err)
		return (err);


	/* If the tags were written, check to see if any tags remain. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		if (lp->l_flags & MDDB_F_TAGDATA)
			break;
	}

	/* If there are no tags, then clear CLRTAG and TAGDATA */
	if (li == lbp->lb_loccnt) {
		md_clr_setstatus(setno, MD_SET_CLRTAG);
		md_clr_setstatus(setno, MD_SET_TAGDATA);
	}

	return (err);
}

/*
 * dt_alloc_if_needed makes sure the locator block describes a usable
 * data tag record.
 *
 * Returns 0 when nothing had to be done (record allocated and no bad tag
 * seen), when the block bitmap is not set up yet, or when no free blocks
 * could be found.  Returns 1 when the tag record was newly allocated or
 * moved (usable locators are then flagged MDDB_F_BADTAG so they get
 * written), or when the existing blocks of a bad tag were claimed.
 *
 * Should not call for MN diskset since data tags are not supported
 */
static int
dt_alloc_if_needed(mddb_set_t *s)
{
	int		i;
	int		li;
	int		moveit = 0;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);

	/*
	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
	 * not detected, there is nothing to do.
	 */
	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
		return (0);

	/* Bitmap not setup, checks can't be done */
	if (s->s_totalblkcnt == 0)
		return (0);

	/* While reading the tag(s) an invalid tag data record was seen */
	if (set_status & MD_SET_BADTAG)
		/* See if the invalid tag needs to be moved */
		for (i = 0; i < MDDB_DT_BLOCKS; i++)
			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
				moveit = 1;
				break;
			}

	/* Need to move or allocate the tag data record */
	if (moveit || blkcnt == 0) {
		/*
		 * NOTE(review): on failure lb_dtfirstblk is left at 0 while
		 * lb_dtblkcnt keeps its previous value - confirm this is
		 * intended.
		 */
		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
		if (lbp->lb_dtfirstblk == 0) {
			cmn_err(CE_WARN,
			    "Unable to allocate data tag record");
			return (0);
		}
		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;

		/* Mark the locators so that they get written to disk. */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			mddb_locator_t	*lp = &lbp->lb_locators[li];

			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
			    (lp->l_flags & MDDB_F_DELETED) ||
			    (lp->l_flags & MDDB_F_EWRITE))
				continue;

			lp->l_flags |= MDDB_F_BADTAG;
		}
		return (1);
	}

	/*
	 * Make sure the blocks are owned, since the calculation in
	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
	 */
	for (i = 0; i < MDDB_DT_BLOCKS; i++)
		blkbusy(s, (i + lbp->lb_dtfirstblk));

	return (1);
}

/*
 * Writestart writes the incore mddb out to all of the replicas.
 * This is called when a diskset is started and when an error has
 * been encountered during the write to a mddb.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		This is the normal path for MN disksets since the slave
 *		nodes aren't actually allowed to write to disk.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.
When a new 3696 * master has been chosen, the new master may need to 3697 * write its incore mddb to disk (this is the case where the 3698 * old master had executed a message but hadn't relayed it 3699 * to this slave yet). New master should not write the 3700 * change log records since new master would be overwriting 3701 * valuable data. Only used during a reconfig cycle. 3702 */ 3703 static int 3704 writestart( 3705 mddb_set_t *s, 3706 int flag 3707 ) 3708 { 3709 int li; 3710 mddb_locator_t *lp; 3711 mddb_lb_t *lbp; 3712 mddb_ln_t *lnp; 3713 int err = 0; 3714 uint_t set_status; 3715 3716 lbp = s->s_lbp; 3717 3718 for (li = 0; li < lbp->lb_loccnt; li++) { 3719 lp = &lbp->lb_locators[li]; 3720 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3721 continue; 3722 if (! (lp->l_flags & MDDB_F_SUSPECT)) 3723 continue; 3724 if (writecopy(s, li, flag)) 3725 return (1); 3726 lp->l_flags |= MDDB_F_UP2DATE; 3727 } 3728 3729 for (li = 0; li < lbp->lb_loccnt; li++) { 3730 lp = &lbp->lb_locators[li]; 3731 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3732 continue; 3733 if ((lp->l_flags & MDDB_F_UP2DATE)) 3734 continue; 3735 if (checkcopy(s, li)) 3736 if (err = writecopy(s, li, flag)) 3737 return (1); 3738 lp->l_flags |= MDDB_F_UP2DATE; 3739 } 3740 3741 /* 3742 * Call fixoptrecord even during a reconfig cycle since a replica 3743 * failure may force the master to re-assign the optimized 3744 * resync record to another replica. 
3745 */ 3746 if (fixoptrecords(s)) 3747 return (1); 3748 3749 set_status = md_get_setstatus(s->s_setno); 3750 3751 /* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */ 3752 for (li = 0; li < lbp->lb_loccnt; li++) { 3753 lp = &lbp->lb_locators[li]; 3754 3755 if (lp->l_flags & MDDB_F_DELETED) 3756 continue; 3757 3758 if (((lp->l_flags & MDDB_F_ACTIVE) != 0 && 3759 (lp->l_flags & MDDB_F_OLDACT) == 0) || 3760 ((lp->l_flags & MDDB_F_ACTIVE) == 0 && 3761 (lp->l_flags & MDDB_F_OLDACT) != 0)) 3762 break; 3763 3764 if ((set_status & MD_SET_TAGDATA) || 3765 (set_status & MD_SET_CLRTAG)) 3766 if ((lp->l_flags & MDDB_F_TAGDATA) || 3767 (lp->l_flags & MDDB_F_BADTAG)) 3768 break; 3769 } 3770 3771 /* 3772 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) 3773 * the lbp identifier and the set identifier doesn't match. 3774 */ 3775 if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) { 3776 3777 /* Only call for traditional and local sets */ 3778 if (!(lbp->lb_flags & MDDB_MNSET)) 3779 (void) dt_write(s); 3780 3781 setidentifier(s, &lbp->lb_ident); 3782 3783 if (err = push_lb(s)) { 3784 (void) upd_med(s, "writestart(0)"); 3785 return (err); 3786 } 3787 3788 (void) upd_med(s, "writestart(0)"); 3789 3790 if (err = push_lb(s)) { 3791 (void) upd_med(s, "writestart(1)"); 3792 return (err); 3793 } 3794 3795 (void) upd_med(s, "writestart(1)"); 3796 3797 lnp = s->s_lnp; 3798 uniqtime32(&lnp->ln_timestamp); 3799 if (lbp->lb_flags & MDDB_MNSET) 3800 lnp->ln_revision = MDDB_REV_MNLN; 3801 else 3802 lnp->ln_revision = MDDB_REV_LN; 3803 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 3804 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 3805 lbp->lb_lnblkcnt, 0); 3806 /* 3807 * If a MN diskset and this is the master, set the PARSE_LOCNM 3808 * flag in the mddb_set structure to show that the locator 3809 * names have changed. 
3810 * Don't set parseflags as a result of a new master sync 3811 * during reconfig cycle since slaves nodes are already 3812 * in-sync with the new master. 3813 */ 3814 3815 if ((lbp->lb_flags & MDDB_MNSET) && 3816 (md_set[s->s_setno].s_am_i_master) && 3817 (flag != MDDB_WRITECOPY_SYNC)) { 3818 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 3819 } 3820 3821 if (err) 3822 return (err); 3823 } 3824 3825 for (li = 0; li < lbp->lb_loccnt; li++) { 3826 lp = &lbp->lb_locators[li]; 3827 if (lp->l_flags & MDDB_F_DELETED) 3828 continue; 3829 if (lp->l_flags & MDDB_F_ACTIVE) { 3830 lp->l_flags |= MDDB_F_OLDACT; 3831 } else { 3832 lp->l_flags &= ~MDDB_F_OLDACT; 3833 } 3834 } 3835 3836 md_clr_setstatus(s->s_setno, MD_SET_STALE); 3837 3838 return (0); 3839 } 3840 3841 /* 3842 * selectreplicas selects the working replicas and may write the incore 3843 * version of the mddb out to the replicas ondisk. 3844 * 3845 * flag can be 3 values: 3846 * MDDB_RETRYSCAN - quick scan to see if there is an error. 3847 * If no new error, returns without writing mddb 3848 * to disks. If a new error is seen, writes out 3849 * mddb to disks. 3850 * MDDB_SCANALL - lengthy scan to check out mddbs and always writes 3851 * out mddb to the replica ondisk. Calls writecopy 3852 * with MDDB_WRITECOPY_ALL flag which writes out 3853 * all records to the replicas ondisk. 3854 * MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore 3855 * and ondisk mddbs by writing incore values to disk. 3856 * Calls writecopy with MDDB_WRITECOPY_SYNC flag so 3857 * that change log records are not written out. 3858 * Only used by MN disksets. 3859 * 3860 * Returns: 3861 * 0 - Successful 3862 * 1 - Unable to write incore mddb data to disk since < 50% replicas. 
3863 */ 3864 int 3865 selectreplicas( 3866 mddb_set_t *s, 3867 int flag 3868 ) 3869 { 3870 int li; 3871 int alc; 3872 int lc; 3873 mddb_locator_t *lp; 3874 mddb_lb_t *lbp = s->s_lbp; 3875 set_t setno = s->s_setno; 3876 int wc_flag; 3877 3878 /* 3879 * can never transition from stale to not stale 3880 */ 3881 if (md_get_setstatus(setno) & MD_SET_STALE) { 3882 for (li = 0; li < lbp->lb_loccnt; li++) { 3883 lp = &lbp->lb_locators[li]; 3884 if (lp->l_flags & MDDB_F_DELETED) 3885 continue; 3886 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3887 lp->l_flags |= MDDB_F_ACTIVE; 3888 } else { 3889 lp->l_flags &= ~MDDB_F_ACTIVE; 3890 } 3891 } 3892 return (1); 3893 } 3894 3895 if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) { 3896 for (li = 0; li < lbp->lb_loccnt; li++) { 3897 lp = &lbp->lb_locators[li]; 3898 if (lp->l_flags & MDDB_F_DELETED) 3899 continue; 3900 if (lp->l_flags & MDDB_F_ACTIVE) { 3901 lp->l_flags |= MDDB_F_OLDACT; 3902 lp->l_flags &= ~MDDB_F_SUSPECT; 3903 } else { 3904 lp->l_flags |= MDDB_F_SUSPECT; 3905 lp->l_flags &= ~MDDB_F_OLDACT; 3906 } 3907 3908 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3909 lp->l_flags |= MDDB_F_ACTIVE; 3910 lp->l_flags &= ~MDDB_F_EWRITE; 3911 lp->l_flags &= ~MDDB_F_TOOSMALL; 3912 } else { 3913 lp->l_flags &= ~MDDB_F_ACTIVE; 3914 } 3915 } 3916 computefreeblks(s); /* set up free block bits */ 3917 } else { 3918 for (li = 0; li < lbp->lb_loccnt; li++) { 3919 lp = &lbp->lb_locators[li]; 3920 if (! 
(lp->l_flags & MDDB_F_ACTIVE)) 3921 continue; 3922 if (lp->l_flags & MDDB_F_EWRITE) 3923 break; 3924 } 3925 3926 /* 3927 * if there are no errors this is error has already 3928 * been processed return current state 3929 */ 3930 if (li == lbp->lb_loccnt) 3931 return (md_get_setstatus(setno) & MD_SET_TOOFEW); 3932 3933 lp->l_flags &= ~MDDB_F_ACTIVE; 3934 do { 3935 lp = &lbp->lb_locators[li]; 3936 lp->l_flags &= ~MDDB_F_UP2DATE; 3937 } while (++li < lbp->lb_loccnt); 3938 } 3939 3940 alc = 0; 3941 lc = 0; 3942 for (li = 0; li < lbp->lb_loccnt; li++) { 3943 lp = &lbp->lb_locators[li]; 3944 if (lp->l_flags & MDDB_F_DELETED) 3945 continue; 3946 lc++; 3947 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3948 continue; 3949 alc++; 3950 } 3951 3952 if (alc < ((lc + 1) / 2)) { 3953 md_set_setstatus(setno, MD_SET_TOOFEW); 3954 return (1); 3955 } 3956 3957 /* Set wc_flag based on flag passed in. */ 3958 if (flag == MDDB_SCANALLSYNC) 3959 wc_flag = MDDB_WRITECOPY_SYNC; 3960 else 3961 wc_flag = MDDB_WRITECOPY_ALL; 3962 3963 do { 3964 if (! writestart(s, wc_flag)) { 3965 md_clr_setstatus(setno, MD_SET_TOOFEW); 3966 return (0); 3967 } 3968 alc = 0; 3969 for (li = 0; li < lbp->lb_loccnt; li++) { 3970 lp = &lbp->lb_locators[li]; 3971 if ((lp->l_flags & MDDB_F_DELETED) || 3972 (lp->l_flags & MDDB_F_EMASTER)) 3973 continue; 3974 3975 if (lp->l_flags & MDDB_F_EWRITE) { 3976 lp->l_flags &= ~MDDB_F_ACTIVE; 3977 lp->l_flags &= ~MDDB_F_UP2DATE; 3978 continue; 3979 } 3980 alc++; 3981 } 3982 } while (alc >= ((lc + 1) / 2)); 3983 md_set_setstatus(setno, MD_SET_TOOFEW); 3984 return (1); 3985 } 3986 3987 static int 3988 checkstate( 3989 mddb_set_t *s, 3990 int probe 3991 ) 3992 { 3993 int error; 3994 uint_t set_status = md_get_setstatus(s->s_setno); 3995 3996 ASSERT(s != NULL); 3997 3998 if (! (set_status & MD_SET_STALE) && ! 
(set_status & MD_SET_TOOFEW)) 3999 return (0); 4000 4001 if (probe == MDDB_NOPROBE) 4002 return (1); 4003 4004 single_thread_start(s); 4005 error = selectreplicas(s, MDDB_SCANALL); 4006 single_thread_end(s); 4007 4008 if (error == 0 && s->s_zombie != 0) { 4009 mutex_exit(SETMUTEX(s->s_setno)); 4010 error = mddb_deleterec(s->s_zombie); 4011 mutex_enter(SETMUTEX(s->s_setno)); 4012 if (error == 0) 4013 s->s_zombie = 0; 4014 } 4015 return (error); 4016 } 4017 4018 static int 4019 writeretry( 4020 mddb_set_t *s 4021 ) 4022 { 4023 if (selectreplicas(s, MDDB_RETRYSCAN)) 4024 if (selectreplicas(s, MDDB_SCANALL)) 4025 return (1); 4026 return (0); 4027 } 4028 4029 static void 4030 free_mbipp(mddb_mb_ic_t **mbipp) 4031 { 4032 mddb_mb_ic_t *mbip1, *mbip2; 4033 4034 for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) { 4035 mbip2 = mbip1->mbi_next; 4036 kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE); 4037 } 4038 *mbipp = (mddb_mb_ic_t *)NULL; 4039 } 4040 4041 static mddb_ri_t * 4042 save_rip(mddb_set_t *s) 4043 { 4044 mddb_ri_t *trip = s->s_rip; 4045 mddb_ri_t *nrip = NULL; 4046 mddb_ri_t **nripp = &nrip; 4047 mddb_ri_t *rip; 4048 4049 while (trip) { 4050 /* Run to the end of the list */ 4051 for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next) 4052 /* void */; 4053 4054 /* Add the new member */ 4055 *nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP); 4056 4057 ASSERT(*nripp != NULL); 4058 4059 /* shorthand */ 4060 rip = *nripp; 4061 4062 *rip = *trip; /* structure assignment */ 4063 4064 /* Clear the stuff that is not needed for hints */ 4065 rip->ri_flags = 0; 4066 rip->ri_commitcnt = 0; 4067 rip->ri_transplant = 0; 4068 rip->ri_mbip = (mddb_mb_ic_t *)NULL; 4069 rip->ri_dtp = (mddb_dt_t *)NULL; 4070 rip->ri_lbp = (mddb_lb_t *)NULL; 4071 rip->ri_did_icp = (mddb_did_ic_t *)NULL; 4072 rip->ri_devid = (ddi_devid_t)NULL; 4073 rip->ri_old_devid = (ddi_devid_t)NULL; 4074 rip->ri_next = (mddb_ri_t *)NULL; 4075 4076 trip = trip->ri_next; 4077 } 4078 return (nrip); 4079 } 4080 4081 
static void 4082 free_rip(mddb_ri_t **ripp) 4083 { 4084 mddb_ri_t *rip; 4085 mddb_ri_t *arip; 4086 4087 for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) { 4088 arip = rip->ri_next; 4089 if (rip->ri_devid != (ddi_devid_t)NULL) { 4090 ddi_devid_free(rip->ri_devid); 4091 rip->ri_devid = (ddi_devid_t)NULL; 4092 } 4093 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 4094 ddi_devid_free(rip->ri_old_devid); 4095 rip->ri_old_devid = (ddi_devid_t)NULL; 4096 } 4097 kmem_free((caddr_t)rip, sizeof (*rip)); 4098 } 4099 *ripp = (mddb_ri_t *)NULL; 4100 } 4101 4102 /* 4103 * this routine selects the correct replica to use 4104 * the rules are as follows 4105 * 1. if all replica has same init time select highest commit count 4106 * 2. if some but not all replicas are from another hostid discard 4107 * them. 4108 * 3. find which init time is present is most replicas 4109 * 4. discard all replicas which do not match most init times 4110 * 5. select replica with highest commit count 4111 */ 4112 4113 static mddb_lb_t * 4114 selectlocator( 4115 mddb_set_t *s 4116 ) 4117 { 4118 mddb_ri_t *rip = s->s_rip; 4119 mddb_ri_t *r, *r1; 4120 mddb_lb_t *lbp; 4121 struct timeval32 *tp = (struct timeval32 *)NULL; 4122 int different; 4123 int same; 4124 int count; 4125 int maxcount; 4126 set_t setno = s->s_setno; 4127 size_t sz; 4128 int mn_set = 0; 4129 4130 /* Clear the ri_transplant flag on all the rip entries. */ 4131 /* Set ri_commitcnt to locator's commitcnt - if available */ 4132 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4133 r->ri_transplant = 0; 4134 if (r->ri_lbp != (mddb_lb_t *)NULL) { 4135 r->ri_commitcnt = r->ri_lbp->lb_commitcnt; 4136 /* If any locators have MN bit set, set flag */ 4137 if (r->ri_lbp->lb_flags & MDDB_MNSET) 4138 mn_set = 1; 4139 } 4140 } 4141 4142 /* 4143 * A data tag is being used, so use it to limit the selection first. 4144 * Data tags not used in MN diskset. 
4145 */ 4146 if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) { 4147 mddb_dt_t *dtp = (mddb_dt_t *)md_set[setno].s_dtp; 4148 4149 /* 4150 * now toss any locators that have a different data tag 4151 */ 4152 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4153 if (r->ri_lbp == (mddb_lb_t *)NULL) 4154 continue; 4155 4156 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4157 /* If same tag, keep it */ 4158 if (dtl_cmp(&dtp->dt_dtag, 4159 &r->ri_dtp->dt_dtag) == 0) 4160 continue; 4161 } 4162 4163 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4164 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4165 r->ri_dtp = (mddb_dt_t *)NULL; 4166 } 4167 4168 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4169 if (!(md_get_setstatus(setno) & 4170 MD_SET_REPLICATED_IMPORT)) { 4171 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4172 sz = ddi_devid_sizeof(r->ri_old_devid); 4173 kmem_free((caddr_t)r->ri_old_devid, sz); 4174 r->ri_old_devid = (ddi_devid_t)NULL; 4175 } 4176 } 4177 4178 kmem_free((caddr_t)r->ri_lbp, 4179 dbtob(r->ri_lbp->lb_blkcnt)); 4180 r->ri_lbp = (mddb_lb_t *)NULL; 4181 4182 r->ri_transplant = 1; 4183 } 4184 4185 /* Tag used, clear the bit */ 4186 md_clr_setstatus(s->s_setno, MD_SET_USETAG); 4187 4188 if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) { 4189 /* 4190 * Get rid of the list of tags. 4191 */ 4192 dtl_freel(&s->s_dtlp); 4193 4194 /* 4195 * Re-create the list with the tag used. 4196 */ 4197 (void) dtl_addl(s, &dtp->dt_dtag); 4198 } 4199 } 4200 4201 /* 4202 * scan to see if all replicas have same time 4203 */ 4204 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4205 if (r->ri_lbp == (mddb_lb_t *)NULL) 4206 continue; 4207 if (tp == NULL) { 4208 tp = &r->ri_lbp->lb_inittime; 4209 continue; 4210 } 4211 /* CSTYLED */ 4212 if (timercmp(tp, &r->ri_lbp->lb_inittime, !=)) 4213 break; 4214 } 4215 4216 /* 4217 * if r == NULL then they were all them same. 
Choose highest 4218 * commit count 4219 */ 4220 if (r == (mddb_ri_t *)NULL) 4221 goto out; 4222 4223 /* 4224 * If here, a bogus replica is present and at least 1 lb_inittime 4225 * did not match. 4226 */ 4227 4228 /* 4229 * look and see if any but not all are from different id 4230 */ 4231 4232 different = 0; 4233 same = 0; 4234 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4235 if (r->ri_lbp == (mddb_lb_t *)NULL) 4236 continue; 4237 if (cmpidentifier(s, &r->ri_lbp->lb_ident)) 4238 different = 1; 4239 else 4240 same = 1; 4241 } 4242 4243 /* 4244 * now go through and throw out different if there are some 4245 * that are the same 4246 */ 4247 if (different != 0 && same != 0) { 4248 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4249 if (r->ri_lbp == (mddb_lb_t *)NULL) 4250 continue; 4251 4252 if (!cmpidentifier(s, &r->ri_lbp->lb_ident)) 4253 continue; 4254 4255 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4256 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4257 r->ri_dtp = (mddb_dt_t *)NULL; 4258 } 4259 4260 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4261 if (!(md_get_setstatus(setno) & 4262 MD_SET_REPLICATED_IMPORT)) { 4263 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4264 sz = ddi_devid_sizeof(r->ri_old_devid); 4265 kmem_free((caddr_t)r->ri_old_devid, sz); 4266 r->ri_old_devid = (ddi_devid_t)NULL; 4267 } 4268 } 4269 4270 kmem_free((caddr_t)r->ri_lbp, 4271 dbtob(r->ri_lbp->lb_blkcnt)); 4272 r->ri_lbp = (mddb_lb_t *)NULL; 4273 4274 r->ri_transplant = 1; 4275 } 4276 } 4277 4278 /* 4279 * go through and pick highest. 
Use n square because it is 4280 * simple and 40 some is max possible 4281 */ 4282 maxcount = 0; 4283 lbp = (mddb_lb_t *)NULL; 4284 for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) { 4285 if (r1->ri_lbp == (mddb_lb_t *)NULL) 4286 continue; 4287 count = 0; 4288 for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4289 if (r->ri_lbp == (mddb_lb_t *)NULL) 4290 continue; 4291 if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */ 4292 &r->ri_lbp->lb_inittime, ==)) 4293 count++; 4294 } 4295 if (count > maxcount) { 4296 maxcount = count; 4297 lbp = r1->ri_lbp; 4298 } 4299 } 4300 4301 /* 4302 * now go though and toss any that are of a different time stamp 4303 */ 4304 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4305 if (r->ri_lbp == (mddb_lb_t *)NULL) 4306 continue; 4307 if (timercmp(&lbp->lb_inittime, /* CSTYLED */ 4308 &r->ri_lbp->lb_inittime, ==)) 4309 continue; 4310 4311 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4312 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4313 r->ri_dtp = (mddb_dt_t *)NULL; 4314 } 4315 4316 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4317 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4318 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4319 sz = ddi_devid_sizeof(r->ri_old_devid); 4320 kmem_free((caddr_t)r->ri_old_devid, sz); 4321 r->ri_old_devid = (ddi_devid_t)NULL; 4322 } 4323 } 4324 4325 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4326 r->ri_lbp = (mddb_lb_t *)NULL; 4327 4328 r->ri_transplant = 1; 4329 } 4330 4331 out: 4332 /* 4333 * Find the locator with the highest commit count, and make it the 4334 * "chosen" one. 4335 */ 4336 lbp = (mddb_lb_t *)NULL; 4337 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4338 if (r->ri_lbp == (mddb_lb_t *)NULL) 4339 continue; 4340 4341 if (lbp == NULL) { 4342 lbp = r->ri_lbp; 4343 continue; 4344 } 4345 4346 if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt) 4347 lbp = r->ri_lbp; 4348 } 4349 4350 /* Toss all locator blocks, except the "chosen" one. 
*/ 4351 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4352 if (r->ri_lbp == (mddb_lb_t *)NULL) 4353 continue; 4354 4355 /* Get rid of all dtp's */ 4356 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4357 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4358 r->ri_dtp = (mddb_dt_t *)NULL; 4359 } 4360 4361 if (r->ri_lbp == lbp) 4362 continue; 4363 4364 /* Get rid of extra locator devid block info */ 4365 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4366 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4367 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4368 sz = ddi_devid_sizeof(r->ri_old_devid); 4369 kmem_free((caddr_t)r->ri_old_devid, sz); 4370 r->ri_old_devid = (ddi_devid_t)NULL; 4371 } 4372 } 4373 4374 /* Get rid of extra locators */ 4375 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4376 r->ri_lbp = (mddb_lb_t *)NULL; 4377 } 4378 return (lbp); 4379 } 4380 4381 static void 4382 locator2cfgloc( 4383 mddb_lb_t *lbp, 4384 mddb_cfg_loc_t *clp, 4385 int li, 4386 side_t sideno, 4387 mddb_did_ic_t *did_icp 4388 ) 4389 { 4390 mddb_drvnm_t *dn; 4391 mddb_locator_t *lp = &lbp->lb_locators[li]; 4392 mddb_sidelocator_t *slp; 4393 mddb_mnsidelocator_t *mnslp; 4394 mddb_did_info_t *did_info; 4395 int i, sz, szalloc; 4396 int mn_set = 0; 4397 mddb_mnlb_t *mnlbp; 4398 4399 if (lbp->lb_flags & MDDB_MNSET) { 4400 mn_set = 1; 4401 mnlbp = (mddb_mnlb_t *)lbp; 4402 for (i = 0; i < MD_MNMAXSIDES; i++) { 4403 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4404 if (mnslp->mnl_sideno == sideno) 4405 break; 4406 } 4407 if (i == MD_MNMAXSIDES) 4408 return; 4409 } else { 4410 slp = &lbp->lb_sidelocators[sideno][li]; 4411 } 4412 4413 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4414 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 4415 if (did_info->info_flags & MDDB_DID_EXISTS) { 4416 sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); 4417 if (clp->l_devid_flags & MDDB_DEVID_SPACE) { 4418 /* copy device id from mddb to cfg_loc structure */ 4419 szalloc = clp->l_devid_sz; 
4420 if (sz <= szalloc) { 4421 for (i = 0; i < sz; i++) { 4422 ((char *)(uintptr_t)clp->l_devid)[i] = 4423 ((char *)did_icp->did_ic_devid[li])[i]; 4424 } 4425 clp->l_devid_flags |= MDDB_DEVID_VALID; 4426 (void) strcpy(clp->l_minor_name, 4427 did_info->info_minor_name); 4428 } else { 4429 clp->l_devid_flags |= MDDB_DEVID_NOSPACE; 4430 } 4431 } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { 4432 clp->l_devid_flags = MDDB_DEVID_SZ; 4433 clp->l_devid_sz = sz; 4434 } 4435 } 4436 } 4437 4438 /* 4439 * Even if a devid exists, use the dev, drvnm and mnum in the locators 4440 * and sidelocators. During startup, the dev, drvnm and mnum in 4441 * these structures may not match the devid (the locators and 4442 * sidelocators will be updated to match the devid by the routine 4443 * load_old_replicas). Using out-of-sync values won't cause any 4444 * problems since ridev will re-derive these from the devid and mnum. 4445 * After startup, the dev, drvnm and mnum in these structures have 4446 * been updated and can be used. 4447 */ 4448 4449 clp->l_blkno = lp->l_blkno; 4450 clp->l_flags = lp->l_flags; 4451 clp->l_dev = lp->l_dev; 4452 4453 if (mn_set) { 4454 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 4455 clp->l_mnum = mnslp->mnl_mnum; 4456 } else { 4457 dn = &lbp->lb_drvnm[slp->l_drvnm_index]; 4458 clp->l_mnum = slp->l_mnum; 4459 } 4460 (void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM); 4461 } 4462 4463 /* 4464 * Find the index into the mnsidelocator where entry will go. 4465 * Then index can be fed into both splitname2locatorblocks and 4466 * cfgloc2locator so that those entries can be kept in sync. 
4467 * 4468 * Returns: 4469 * -1 if failed to find unused slot or if a traditional diskset 4470 * index, if successful (0 <= index <= MD_MNMAXSIDES) 4471 */ 4472 static int 4473 checklocator( 4474 mddb_lb_t *lbp, 4475 int li, 4476 side_t sideno 4477 ) 4478 { 4479 uchar_t i; 4480 mddb_mnsidelocator_t *mnslp; 4481 mddb_mnlb_t *mnlbp; 4482 int index = -1; 4483 4484 if (lbp->lb_flags & MDDB_MNSET) { 4485 /* 4486 * Checking side locator structure. First, check if 4487 * there is already an entry for this side. If so, 4488 * then use that entry. Otherwise, find an entry 4489 * that has a sideno of 0. 4490 */ 4491 mnlbp = (mddb_mnlb_t *)lbp; 4492 for (i = 0; i < MD_MNMAXSIDES; i++) { 4493 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4494 if (mnslp->mnl_sideno == sideno) { 4495 /* Found a match - stop looking */ 4496 index = i; 4497 break; 4498 } else if ((mnslp->mnl_sideno == 0) && (index == -1)) { 4499 /* Set first empty slot, but keep looking */ 4500 index = i; 4501 } 4502 } 4503 /* Didn't find empty slot or previously used slot */ 4504 if ((i == MD_MNMAXSIDES) && (index == -1)) { 4505 return (-1); 4506 } 4507 return (index); 4508 } else 4509 return (0); 4510 } 4511 4512 /* 4513 * Takes locator information (driver name, minor number, sideno) and 4514 * stores it in the locator block. 4515 * For traditional diskset, the sideno is the index into the sidelocator 4516 * array in the locator block. 4517 * For the MN diskset, the sideno is the nodeid which can be any number, 4518 * so the index passed in is the index into the mnsidelocator array 4519 * in the locator block. 
4520 */ 4521 static int 4522 cfgloc2locator( 4523 mddb_lb_t *lbp, 4524 mddb_cfg_loc_t *clp, 4525 int li, 4526 side_t sideno, 4527 int index /* Only useful in MNsets when > 1 */ 4528 ) 4529 { 4530 uchar_t i; 4531 mddb_sidelocator_t *slp; 4532 mddb_mnsidelocator_t *mnslp; 4533 mddb_set_t *s; 4534 int mn_set = 0; 4535 mddb_mnlb_t *mnlbp; 4536 4537 if (lbp->lb_flags & MDDB_MNSET) { 4538 mnlbp = (mddb_mnlb_t *)lbp; 4539 mn_set = 1; 4540 /* 4541 * Index will be the slot that has the given sideno or 4542 * the first empty slot if no match is found. 4543 * This was pre-checked out in check locator. 4544 */ 4545 mnslp = &mnlbp->lb_mnsidelocators[index][li]; 4546 } else { 4547 slp = &lbp->lb_sidelocators[sideno][li]; 4548 } 4549 4550 /* 4551 * Look for the driver name 4552 */ 4553 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4554 if (lbp->lb_drvnm[i].dn_len == 0) 4555 continue; 4556 if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4557 MD_MAXDRVNM) == 0) 4558 break; 4559 } 4560 4561 /* 4562 * Didn't find one, add a new one 4563 */ 4564 if (i == MDDB_DRVNMCNT) { 4565 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4566 if (lbp->lb_drvnm[i].dn_len == 0) 4567 break; 4568 } 4569 if (i == MDDB_DRVNMCNT) 4570 return (1); 4571 (void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4572 MD_MAXDRVNM); 4573 lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver); 4574 } 4575 4576 /* Fill in the drvnm index */ 4577 if (mn_set) { 4578 mnslp->mnl_drvnm_index = i; 4579 mnslp->mnl_mnum = clp->l_mnum; 4580 mnslp->mnl_sideno = sideno; 4581 } else { 4582 slp->l_drvnm_index = i; 4583 slp->l_mnum = clp->l_mnum; 4584 } 4585 4586 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4587 /* 4588 * This device id could already be associated with this index 4589 * if this is not the first side added to the set. 4590 * If device id is 0, there is no device id for this device. 
4591 */ 4592 if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0) 4593 return (0); 4594 s = (mddb_set_t *)md_set[lbp->lb_setno].s_db; 4595 if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid, 4596 clp->l_minor_name)) { 4597 return (1); 4598 } 4599 } 4600 4601 return (0); 4602 } 4603 4604 /* 4605 * See if there are mediator hosts and try to use the data. 4606 */ 4607 static int 4608 mediate( 4609 mddb_set_t *s 4610 ) 4611 { 4612 mddb_lb_t *lbp = s->s_lbp; 4613 med_data_lst_t *meddlp = NULL; 4614 med_data_lst_t *tmeddlp = NULL; 4615 med_data_t *meddp; 4616 int medok = 0; 4617 int medacc = 0; 4618 uint_t maxcc; 4619 int golden = 0; 4620 int err = 1; 4621 set_t setno = s->s_setno; 4622 4623 /* Do not have a mediator, then the state is stale */ 4624 if (s->s_med.n_cnt == 0) 4625 return (err); 4626 4627 /* Contact the mediator hosts for the data */ 4628 meddlp = get_med_host_data(&s->s_med, s->s_setname, setno); 4629 4630 /* No mediator data, stale */ 4631 if (meddlp == NULL) 4632 return (err); 4633 4634 /* Mark all the mediator data that is not for this set as errored */ 4635 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4636 struct timeval32 tmptime; 4637 meddp = tmeddlp->mdl_med; 4638 4639 /* Count the number of mediators contacted */ 4640 medacc++; 4641 4642 /* Paranoid check */ 4643 if (meddp->med_dat_sn != setno) 4644 meddp->med_dat_fl |= MED_DFL_ERROR; 4645 4646 TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id); 4647 4648 /*CSTYLED*/ 4649 if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=)) 4650 meddp->med_dat_fl |= MED_DFL_ERROR; 4651 } 4652 4653 /* Get the max commitcount */ 4654 maxcc = 0; 4655 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4656 meddp = tmeddlp->mdl_med; 4657 if (meddp->med_dat_fl & MED_DFL_ERROR) 4658 continue; 4659 if (meddp->med_dat_cc > maxcc) 4660 maxcc = meddp->med_dat_cc; 4661 } 4662 4663 /* Now mark the records that don't have the highest cc as errored */ 4664 for (tmeddlp = meddlp; 
tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4665 meddp = tmeddlp->mdl_med; 4666 if (meddp->med_dat_fl & MED_DFL_ERROR) 4667 continue; 4668 if (meddp->med_dat_cc != maxcc) 4669 meddp->med_dat_fl |= MED_DFL_ERROR; 4670 } 4671 4672 /* Now mark the records that don't match the lb commitcnt as errored */ 4673 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4674 meddp = tmeddlp->mdl_med; 4675 if (meddp->med_dat_fl & MED_DFL_ERROR) 4676 continue; 4677 if (meddp->med_dat_cc != lbp->lb_commitcnt) 4678 meddp->med_dat_fl |= MED_DFL_ERROR; 4679 } 4680 4681 /* Is there a "golden" copy and how many valid mediators */ 4682 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4683 meddp = tmeddlp->mdl_med; 4684 if (meddp->med_dat_fl & MED_DFL_ERROR) 4685 continue; 4686 4687 if (meddp->med_dat_fl & MED_DFL_GOLDEN) 4688 golden++; 4689 4690 medok++; 4691 } 4692 4693 /* No survivors, stale */ 4694 if (medok == 0) 4695 goto out; 4696 4697 /* No mediator quorum and no golden copies, stale */ 4698 if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) { 4699 /* Skip odd numbers, no exact 50% */ 4700 if (s->s_med.n_cnt & 1) 4701 goto out; 4702 /* Have 50%, allow an accept */ 4703 if (medacc == (s->s_med.n_cnt / 2)) 4704 md_set_setstatus(setno, MD_SET_ACCOK); 4705 goto out; 4706 } 4707 4708 /* We either have a quorum or a golden copy, or both */ 4709 err = 0; 4710 4711 out: 4712 if (meddlp) { 4713 for (/* void */; meddlp != NULL; meddlp = tmeddlp) { 4714 tmeddlp = meddlp->mdl_nx; 4715 kmem_free(meddlp->mdl_med, sizeof (med_data_t)); 4716 kmem_free(meddlp, sizeof (med_data_lst_t)); 4717 } 4718 } 4719 4720 return (err); 4721 } 4722 4723 /* 4724 * 1. read masterblks and locator blocks for all know database locations 4725 * a. keep track of which have good master blks 4726 * b. 
keep track of which have good locators
 *
 */
/*
 * get_mbs_n_lbs
 *
 * Walk every replica on s->s_rip and, for each one that can be opened:
 *	- read its master blocks (getmasters) and its locator block (getblks),
 *	- verify the locator block (magic, block count, revision, checksum
 *	  and, unless a set import is in progress, the set number),
 *	- if the locator block is in device ID style, read and verify the
 *	  device ID block and every device ID it references,
 *	- verify that the locator block describes the replica it was read from,
 *	- read the data tag (non-MN sets only) and add any other replica
 *	  locations named by the locator block to the rip list.
 * A replica that passes has its locator block saved in rip->ri_lbp and,
 * for device ID style, its incore device ID data saved in rip->ri_did_icp.
 * A replica that fails any step is skipped via "continue"; the buffers
 * (lbp, did_icp, did_blkp) are then reused or resized for the next replica.
 *
 * Parameters:
 *	s	 - pointer to mddb_set structure
 *	write_lb - set to 1 when md_devid_destroy caused device ID style
 *		   to be turned off in a locator block
 *
 * Returns 0, or MDDB_E_TAGDATA when more than one unique data tag was
 * found and MD_SET_USETAG is not set for the set.
 */
static int
get_mbs_n_lbs(
	mddb_set_t	*s,
	int		*write_lb
)
{
	mddb_lb_t		*lbp = NULL;	/* pointer to locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN set */
	mddb_did_ic_t		*did_icp = NULL; /* ptr to Device ID incore */
	mddb_did_blk_t		*did_blkp = NULL;
	int			did_blkp_sz = 0;
	mddb_did_db_t		*did_dbp;
	mddb_did_info_t		*did_info;
	caddr_t			did_block;
	mddb_ri_t		*rip;
	mddb_dtag_lst_t		*dtlp;
	mddb_locator_t		*lp;
	daddr_t			physblk;
	int			li;
	uint_t			blk;
	md_dev64_t		dev;
	caddr_t			buffer;
	uint_t			lb_blkcnt;
	int			retval = 0;
	int			err = 0;
	int			lb_ok = 0;
	int			lb_total = 0;
	int			lb_tagged = 0;
	int			lb_tags;
	set_t			setno = s->s_setno;
	int			cont_flag, i;
	mddb_did_db_t		*did_dbp1, *did_dbp2;
	int			mn_set = 0;
	mddb_cfg_loc_t		*cl;

	/*
	 * read in master blocks and locator block for all known locators.
	 * lb_blkcnt will be set correctly for MN set later once getmasters
	 * has determined that the set is a MN set.
	 */
	lb_blkcnt = ((setno == MD_LOCAL_SET) ?
	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);

	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
		    MDDB_F_EMASTER);
		rip->ri_lbp = (mddb_lb_t *)NULL;
		rip->ri_did_icp = (mddb_did_ic_t *)NULL;

		/*
		 * Translated dev is only used in calls to getmasters and
		 * getblks which expect a translated (aka miniroot) dev.
		 */
		dev = md_xlate_targ_2_mini(rip->ri_dev);
		if (dev == NODEV64) {
			/* Set error flag that getmasters would have set */
			/* if getmasters had been allowed to fail */
			rip->ri_flags |= MDDB_F_EMASTER;
		}

		/*
		 * Invalid device id on system (due to failed or
		 * removed device) or invalid devt during upgrade
		 * (due to powered off device) will cause this
		 * replica to be marked in error and not used.
		 */
		if (rip->ri_flags & MDDB_F_EMASTER)
			continue;

		/* get all master blocks, does mddb_devopen() */
		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
		    &rip->ri_flags, &mn_set);

		/* if invalid master block - try next replica */
		if (! rip->ri_mbip)
			continue;

		/*
		 * If lbp alloc'd to wrong size - reset it.
		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
		 */
		if (lbp) {
			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
				lbp = (mddb_lb_t *)NULL;
			}
		}

		if (lbp == (mddb_lb_t *)NULL) {
			/* If a MN set, set lb_blkcnt for MN loc blk size */
			if (mn_set)
				lb_blkcnt = MDDB_MNLBCNT;
			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
			    KM_SLEEP);
		}

		/*
		 * Read in all the sectors for the locator block
		 * NOTE: Need to use getblks, rather than readblklst.
		 *	 because it is too early and things are
		 *	 NOT set up yet for read*()'s
		 */
		buffer = (caddr_t)lbp;
		for (blk = 0; blk < lb_blkcnt; blk++) {
			physblk = getphysblk(blk, rip->ri_mbip);
			err = getblks(s, buffer, dev, physblk,
			    btodb(MDDB_BSIZE), 0);
			if (err) {
				rip->ri_flags |= err;
				break;
			}
			buffer += MDDB_BSIZE;
		}

		if (err)
			continue;

		/* Verify the locator block */
		if (blk != lb_blkcnt)
			continue;
		if (lbp->lb_magic != MDDB_MAGIC_LB)
			continue;
		if (lbp->lb_blkcnt != lb_blkcnt)
			continue;
		if (mn_set) {
			/* If a MN set, check for MNLB revision in lb. */
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
		} else {
			/* If not a MN set, check for LB revision in lb. */
			if (revchk(MDDB_REV_LB, lbp->lb_revision))
				continue;
		}
		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
			continue;

		/*
		 * With the addition of MultiNode Disksets, we must make sure
		 * to verify that this is the correct set.  A node could
		 * have been out of the config for awhile and this disk could
		 * have been moved to a different diskset and we don't want
		 * to accidentally start the wrong set.
		 *
		 * We don't do this check if we're in the middle of
		 * importing a set.
		 */
		if (!(md_get_setstatus(s->s_setno) &
		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
		    (lbp->lb_setno != s->s_setno))
			continue;

		rip->ri_flags |= MDDB_F_LOCACC;

		/*
		 * a commit count of zero means this locator has been deleted
		 */
		if (lbp->lb_commitcnt == 0)
			continue;

		/*
		 * If replica is in the device ID style and md_devid_destroy
		 * flag is set, turn off device id style.  This is only to be
		 * used in a catastrophic failure case.  Examples would be
		 * where the device id of all drives in the system
		 * (especially the mirror'd root drives) had been changed
		 * by firmware upgrade or by a patch to an existing disk
		 * driver.  Another example would be in the case of non-unique
		 * device ids due to a bug.  The device id would be valid on
		 * the system, but would return the wrong dev_t.
		 */
		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
			lbp->lb_didfirstblk = 0;
			lbp->lb_didblkcnt = 0;
			*write_lb = 1;
		}


		/*
		 * If replica is in device ID style, read in device ID
		 * block and verify device ID block information.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {

			/* Read in device ID block */
			if (did_icp == NULL) {
				did_icp = (mddb_did_ic_t *)
				    kmem_zalloc(sizeof (mddb_did_ic_t),
				    KM_SLEEP);
			} else {
				/* Reuse did_icp, but clear out data */
				if (did_icp->did_ic_blkp !=
				    (mddb_did_blk_t *)NULL) {
					kmem_free((caddr_t)did_icp->did_ic_blkp,
					    did_blkp_sz);
					did_blkp = (mddb_did_blk_t *)NULL;
					did_icp->did_ic_blkp =
					    (mddb_did_blk_t *)NULL;
				}
				if (did_icp->did_ic_dbp !=
				    (mddb_did_db_t *)NULL) {
					did_dbp1 = did_icp->did_ic_dbp;
					while (did_dbp1) {
						did_dbp2 = did_dbp1->db_next;
						kmem_free((caddr_t)did_dbp1->db_ptr,
						    dbtob(did_dbp1->db_blkcnt));
						kmem_free((caddr_t)did_dbp1,
						    sizeof (mddb_did_db_t));
						did_dbp1 = did_dbp2;
					}
					did_icp->did_ic_dbp =
					    (mddb_did_db_t *)NULL;
				}
				for (i = 0; i < MDDB_NLB; i++) {
					did_icp->did_ic_devid[i] =
					    (ddi_devid_t)NULL;
				}
			}

			/* Can't reuse blkp since size could be different */
			if (did_blkp != (mddb_did_blk_t *)NULL) {
				kmem_free(did_blkp, did_blkp_sz);
			}
			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
			    KM_SLEEP);
			did_icp->did_ic_blkp = did_blkp;
			buffer = (caddr_t)did_blkp;
			for (blk = lbp->lb_didfirstblk;
			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
			    blk++) {
				physblk = getphysblk(blk, rip->ri_mbip);
				err = getblks(s, buffer, dev, physblk,
				    btodb(MDDB_BSIZE), 0);
				if (err) {
					rip->ri_flags |= err;
					break;
				}
				buffer += MDDB_BSIZE;
			}
			if (err)
				continue;

			/* Verify the Device ID block */
			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
				continue;
			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
				continue;
			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
				continue;
			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
				continue;
			if (crcchk(did_blkp, &did_blkp->blk_checksum,
			    dbtob(lbp->lb_didblkcnt), NULL))
				continue;

			/*
			 * Check if device ID block is out of sync with the
			 * Locator Block by checking if the locator block
			 * commitcnt does not match the device id block
			 * commitcnt.  If an 'out of sync' condition
			 * exists, discard this replica since it has
			 * inconsistent data and can't be used in
			 * determining the best replica.
			 *
			 * An 'out of sync' condition could happen if old
			 * SDS code was running with new devid style replicas
			 * or if a failure occurred between the writing of
			 * the locator block's commitcnt and the device
			 * id block's commitcnt.
			 *
			 * If old SDS code had been running, the upgrade
			 * process should detect this situation and
			 * have removed all of the device id information
			 * via the md_devid_destroy flag in md.conf.
			 */
			if (did_blkp->blk_commitcnt !=
			    lbp->lb_commitcnt) {
				continue;
			}
		}


		/*
		 * If replica is still in device ID style, read in all
		 * of the device IDs, verify the checksum of the device IDs.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			/*
			 * Reset valid bit in device id info block flags. This
			 * flag is stored on disk, but the valid bit is reset
			 * when reading in the replica.  If the corresponding
			 * device id is valid (aka meaning that the system
			 * knows about this device id), the valid bit will
			 * be set at a later time.  The valid bit for this
			 * replica's device ID will be set in this routine.
			 * The valid bits for the rest of the device id's
			 * will be set after the 'best' replica has
			 * been selected in routine load_old_replicas.
			 * Reset updated bit in device id info block flags.
			 * This flag is also stored on disk, reset when read
			 * in and set when the locators and side locators
			 * have been updated to match this valid device
			 * id information.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				if (did_info->info_flags & MDDB_DID_EXISTS)
					did_info->info_flags &=
					    ~(MDDB_DID_VALID | MDDB_DID_UPDATED);
			}

			cont_flag = 0;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				did_block = (caddr_t)NULL;
				if (did_info->info_flags & MDDB_DID_EXISTS) {
					/* Check if block has already been read in */
					did_dbp = did_icp->did_ic_dbp;
					while (did_dbp != 0) {
						if (did_dbp->db_firstblk ==
						    did_info->info_firstblk)
							break;
						else
							did_dbp = did_dbp->db_next;
					}
					/* if block not found, read it in */
					if (did_dbp == NULL) {
						did_block = (caddr_t)(kmem_zalloc(dbtob
						    (did_info->info_blkcnt), KM_SLEEP));
						buffer = (caddr_t)did_block;
						for (blk = did_info->info_firstblk;
						    blk < (did_info->info_firstblk +
						    did_info->info_blkcnt); blk++) {
							physblk = getphysblk(blk, rip->ri_mbip);
							err = getblks(s, buffer, dev, physblk,
							    btodb(MDDB_BSIZE), 0);
							if (err) {
								rip->ri_flags |= err;
								break;
							}
							buffer += MDDB_BSIZE;
						}
						if (err) {
							kmem_free(did_block,
							    dbtob(did_info->info_blkcnt));
							did_block = (caddr_t)NULL;
							cont_flag = 1;
							break;
						}

						/*
						 * Block read in - alloc Disk Block area
						 */
						did_dbp = (mddb_did_db_t *)kmem_zalloc(
						    sizeof (mddb_did_db_t), KM_SLEEP);
						did_dbp->db_ptr = did_block;
						did_dbp->db_firstblk = did_info->info_firstblk;
						did_dbp->db_blkcnt = did_info->info_blkcnt;

						/* Add to front of dbp list */
						did_dbp->db_next = did_icp->did_ic_dbp;
						did_icp->did_ic_dbp = did_dbp;
					}
					/* Check validity of devid in block */
					if (crcchk(((char *)did_dbp->db_ptr +
					    did_info->info_offset),
					    &did_info->info_checksum,
					    did_info->info_length, NULL)) {
						cont_flag = 1;
						break;
					}

					/* Block now pointed to by did_dbp */
					did_icp->did_ic_devid[li] = (ddi_devid_t)
					    ((char *)did_dbp->db_ptr +
					    did_info->info_offset);
				}
			}
			if (cont_flag)
				continue;
		}

		/*
		 * All blocks containing devids are now in core.
		 */

		/*
		 * If we're doing a replicated import (also known as
		 * remote copy import), the device id in the locator
		 * block is incorrect and we need to fix it up here
		 * along with the l_dev otherwise we run into lots of
		 * trouble later on.
		 *
		 * Only do this while the locator block is still in device ID
		 * style: did_blkp and did_icp describe the current replica
		 * only in that case.  Otherwise they are either NULL or
		 * left over from an earlier replica that failed verification,
		 * and must not be dereferenced here.
		 */
		if ((lbp->lb_flags & MDDB_DEVID_STYLE) &&
		    (md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			mddb_ri_t	*trip;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];

				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (did_icp->did_ic_devid[li] == NULL)
					continue;

				for (trip = s->s_rip; trip != NULL;
				    trip = trip->ri_next) {
					if (trip->ri_old_devid == NULL)
						continue;
					if (ddi_devid_compare(
					    trip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0) {
						continue;
					}

					/* update l_dev and side mnum */
					lp->l_dev = md_cmpldev(trip->ri_dev);
					lbp->lb_sidelocators[0][li].l_mnum =
					    md_getminor(trip->ri_dev);
				}
			}
		}

		/*
		 * If there is a valid devid, verify that this locator
		 * block has information about itself by checking the
		 * device ID, minor_name and block
		 * number from this replica's incore data structure
		 * against the locator block information that has just
		 * been read in from disk.
		 *
		 * If not a valid devid, verify that this locator block
		 * has information about itself by checking the minor
		 * number, block number and driver name from this
		 * replica's incore data structure against the locator
		 * block information that has just been read in from disk.
		 */
		if ((rip->ri_devid != NULL) &&
		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.  Check against devid,
			 * slice part of minor number, and block number.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (((md_get_setstatus(setno) &
				    MD_SET_REPLICATED_IMPORT)) &&
				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
					if (ddi_devid_compare(rip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				} else {
					if (ddi_devid_compare(rip->ri_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				}

				if (strcmp(rip->ri_minor_name,
				    did_info->info_minor_name) != 0)
					continue;

				if (lp->l_blkno == rip->ri_blkno)
					break;
			}
		} else {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.
			 */
			if (!mn_set) {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_sidelocator_t	*slp;

					lp = &lbp->lb_locators[li];
					slp = &lbp->lb_sidelocators[s->s_sideno][li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (slp->l_mnum != md_getminor(rip->ri_dev))
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;
					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
					if (strncmp(dn->dn_data, rip->ri_driver,
					    MD_MAXDRVNM) == 0)
						break;
				}
			} else {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_mnsidelocator_t	*mnslp;
					mddb_mnlb_t		*mnlbp;

					/*
					 * Check all possible locators looking for
					 * match to the currently read-in locator,
					 * must match on:
					 *	- blkno
					 *	- side locator for this node's side
					 *	- side locator minor number
					 *	- side locator driver name
					 */

					/* Looking at sidelocs - cast lbp -> mnlbp */
					mnlbp = (mddb_mnlb_t *)lbp;
					lp = &mnlbp->lb_locators[li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;

					for (i = 0; i < MD_MNMAXSIDES; i++) {
						mnslp = &mnlbp->lb_mnsidelocators[i][li];
						if (mnslp->mnl_sideno == s->s_sideno) {
							break;
						}
					}
					/* No matching side found */
					if (i == MD_MNMAXSIDES)
						continue;
					if (mnslp->mnl_mnum != md_getminor(rip->ri_dev))
						continue;
					/*
					 * NOTE(review): the driver name table is
					 * indexed through lbp here while the
					 * locators above are read through mnlbp;
					 * confirm lb_drvnm sits at the same offset
					 * in mddb_lb_t and mddb_mnlb_t.
					 */
					dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
					if (strncmp(dn->dn_data, rip->ri_driver,
					    MD_MAXDRVNM) == 0)
						break;
				}
			}
		}

		/*
		 * Didn't find ourself in this locator block it means
		 * the locator block is a stale transplant. Probably from
		 * a user doing a dd.
		 */
		if (li == lbp->lb_loccnt)
			continue;

		/*
		 * Keep track of the number of accessed and valid
		 * locator blocks.
		 */
		lb_ok++;

		/*
		 * Read the tag in, skips invalid or blank tags.
		 * Only valid tags allocate storage
		 * Data tags are not used in MN disksets.
		 */
		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
			/*
			 * Keep track of the number of tagged
			 * locator blocks.
			 */
			lb_tagged++;

			/* Keep a list of unique tags. */
			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
		}

		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			/*
			 * go through locator block and add any other
			 * locations of the data base.
			 * For the replicated import case, this was done earlier
			 * and we really don't need or want to do so again
			 */
			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				/*
				 * First call asks for the devid size
				 * (MDDB_DEVID_GETSZ); the second call, made
				 * once space has been allocated, fills it in.
				 */
				cl->l_devid_flags = MDDB_DEVID_GETSZ;
				cl->l_devid = (uint64_t)0;
				cl->l_devid_sz = 0;
				cl->l_old_devid = (uint64_t)0;
				cl->l_old_devid_sz = 0;
				cl->l_minor_name[0] = '\0';
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
					if ((cl->l_devid = (uintptr_t)kmem_alloc
					    (cl->l_devid_sz, KM_SLEEP))
					    == NULL) {
						continue;
					} else {
						cl->l_devid_flags =
						    MDDB_DEVID_SPACE;
					}
				}
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);

				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
					kmem_free((caddr_t)(uintptr_t)
					    cl->l_devid, cl->l_devid_sz);
			}
			kmem_free(cl, sizeof (mddb_cfg_loc_t));
		}

		/*
		 * Save LB for later.  Ownership of lbp (and of did_icp for
		 * device ID style) passes to rip, so the local pointers are
		 * cleared and fresh buffers are allocated for the next
		 * replica.
		 */
		rip->ri_lbp = lbp;
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			rip->ri_did_icp = did_icp;
			did_icp = (mddb_did_ic_t *)NULL;
			did_blkp = (mddb_did_blk_t *)NULL;
		} else
			rip->ri_did_icp = NULL;
		lbp = (mddb_lb_t *)NULL;
	}

	/* Free whatever buffers were left over from a rejected replica */
	if (lbp != (mddb_lb_t *)NULL)
		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));

	if (did_icp != (mddb_did_ic_t *)NULL) {
		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
			did_blkp = (mddb_did_blk_t *)NULL;
		}
		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
			did_dbp1 = did_icp->did_ic_dbp;
			while (did_dbp1) {
				did_dbp2 = did_dbp1->db_next;
				kmem_free((caddr_t)did_dbp1->db_ptr,
				    dbtob(did_dbp1->db_blkcnt));
				kmem_free((caddr_t)did_dbp1,
				    sizeof (mddb_did_db_t));
				did_dbp1 = did_dbp2;
			}
		}
		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
	}

	if (did_blkp != (mddb_did_blk_t *)NULL) {
		kmem_free((caddr_t)did_blkp, did_blkp_sz);
	}

	/* No locator blocks were ok */
	if (lb_ok == 0)
		goto out;

	/* No tagged data was found - will be 0 for MN diskset */
	if (lb_tagged == 0)
		goto out;

	/*
	 * Find the highest non-deleted replica count.
	 * NOTE(review): lb_total is not read again in this routine.
	 */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		int	lb_tot = 0;

		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
			continue;

		if (rip->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
			lp = &rip->ri_lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			lb_tot++;
		}

		if (lb_tot > lb_total)
			lb_total = lb_tot;
	}

	/* Count the number of unique tags */
	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
		lb_tags++;

	/* Should have at least one tag at this point */
	ASSERT(lb_tags > 0);


	/*
	 * If the number of tagged locators is not the same as the number of
	 * OK locators OR more than one tag exists, then make sure the
	 * selected tag will be written out later.
	 */
	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
		md_set_setstatus(setno, MD_SET_TAGDATA);

	/* Only a single tag, take the tagged data */
	if (lb_tags == 1) {
		dt_setup(s, &s->s_dtlp->dtl_dt);
		md_set_setstatus(setno, MD_SET_USETAG);
		goto out;
	}

	/* Multiple tags, not selecting a tag, tag mode is on */
	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
		retval = MDDB_E_TAGDATA;

out:

	return (retval);
}

/*
 * 1. Select a locator.
 * 2. check if enough locators now have current copies
 * 3.
read in database from one of latest 5454 * 4. if known to have latest make all database the same 5455 * 5. if configuration has changed rewrite locators 5456 * 5457 * Parameters: 5458 * s - pointer to mddb_set structure 5459 * flag - used in MN disksets to tell if this node is being joined to 5460 * a diskset that is in the STALE state. If the flag is 5461 * MDDB_MN_STALE, then this node should be marked in the STALE 5462 * state even if > 50% mddbs are available. (The diskset can 5463 * only change from STALE->OK if all nodes withdraw from the 5464 * MN diskset and then rejoin). 5465 */ 5466 static int 5467 load_old_replicas( 5468 mddb_set_t *s, 5469 int flag 5470 ) 5471 { 5472 mddb_lb_t *lbp = NULL; 5473 mddb_mnlb_t *mnlbp = NULL; 5474 mddb_ri_t *rip; 5475 mddb_locator_t *lp; 5476 mddb_db_t *dbp; 5477 mddb_de_ic_t *dep; 5478 int li; 5479 int alc; 5480 int lc; 5481 int tlc; 5482 int retval = 0; 5483 caddr_t p; 5484 size_t maxrecsize; 5485 set_t setno = s->s_setno; 5486 mddb_did_db_t *did_dbp1; 5487 mddb_did_info_t *did_info; 5488 mddb_did_ic_t *did_icp = NULL; 5489 md_dev64_t *newdev; 5490 mddb_sidelocator_t *slp = 0; 5491 mddb_mnsidelocator_t *mnslp = 0; 5492 uchar_t i; 5493 char *name; 5494 ddi_devid_t ret_devid; 5495 md_dev64_t dev; 5496 uint_t len, sz; 5497 char *minor_name; 5498 int write_lb = 0; 5499 int rval; 5500 int stale_rtn = 0; 5501 5502 /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */ 5503 if (retval = get_mbs_n_lbs(s, &write_lb)) 5504 goto errout; 5505 5506 if ((lbp = s->s_lbp = selectlocator(s)) == NULL) { 5507 retval = MDDB_E_NOLOCBLK; 5508 goto errout; 5509 } 5510 5511 /* If a multi-node set, then set md_set.s_status flag */ 5512 if (lbp->lb_flags & MDDB_MNSET) { 5513 md_set_setstatus(setno, MD_SET_MNSET); 5514 /* 5515 * If data tag area had been allocated before set type was 5516 * known - free it now. 
5517 */ 5518 if (md_set[setno].s_dtp) { 5519 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 5520 md_set[setno].s_dtp = NULL; 5521 } 5522 } 5523 5524 /* 5525 * If the replica is in devid format, setup the devid incore ptr. 5526 */ 5527 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5528 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5529 if (rip->ri_lbp == s->s_lbp) { 5530 did_icp = s->s_did_icp = rip->ri_did_icp; 5531 break; 5532 } 5533 } 5534 /* 5535 * If no devid incore info found - something has gone 5536 * wrong so errout. 5537 */ 5538 if (rip == NULL) { 5539 retval = MDDB_E_NODEVID; 5540 goto errout; 5541 } 5542 5543 /* 5544 * Add all blocks containing devids to free list. 5545 * Then remove addresses that actually contain devids. 5546 */ 5547 did_dbp1 = did_icp->did_ic_dbp; 5548 while (did_dbp1) { 5549 if (mddb_devid_free_add(s, did_dbp1->db_firstblk, 5550 0, dbtob(did_dbp1->db_blkcnt))) { 5551 retval = MDDB_E_NOSPACE; 5552 goto errout; 5553 } 5554 5555 did_dbp1 = did_dbp1->db_next; 5556 } 5557 for (li = 0; li < lbp->lb_loccnt; li++) { 5558 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5559 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5560 continue; 5561 5562 if (mddb_devid_free_delete(s, did_info->info_firstblk, 5563 did_info->info_offset, did_info->info_length)) { 5564 /* unable to find disk block */ 5565 retval = MDDB_E_NODEVID; 5566 goto errout; 5567 } 5568 } 5569 } 5570 5571 /* 5572 * create mddb_mbaray, count all locators and active locators. 5573 */ 5574 alc = 0; 5575 lc = 0; 5576 for (li = 0; li < lbp->lb_loccnt; li++) { 5577 ddi_devid_t li_devid; 5578 5579 lp = &lbp->lb_locators[li]; 5580 5581 if (lp->l_flags & MDDB_F_DELETED) 5582 continue; 5583 5584 /* Count non-deleted replicas */ 5585 lc++; 5586 5587 /* 5588 * Use the devid of this locator to compare with the rip 5589 * list. 
The scenario to watch out for here is that this 5590 * locator could be on a disk that is dead and there could 5591 * be a valid entry in the rip list for a different disk 5592 * that has been moved to the dead disks dev_t. We don't 5593 * want to match with the moved disk. 5594 */ 5595 li_devid = NULL; 5596 (void) mddb_devid_get(s, li, &li_devid, &minor_name); 5597 5598 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5599 if (match_mddb(rip, li_devid, minor_name, 5600 md_expldev(lp->l_dev), lp->l_blkno)) { 5601 break; 5602 } 5603 } 5604 if (rip == NULL) { 5605 /* 5606 * If rip not found, then mark error in master block 5607 * so that no writes are later attempted to this 5608 * replica. rip may not be setup if ridev 5609 * failed due to un-found driver name. 5610 */ 5611 lp->l_flags |= MDDB_F_EMASTER; 5612 continue; 5613 } 5614 5615 s->s_mbiarray[li] = rip->ri_mbip; 5616 5617 lp->l_flags &= MDDB_F_ACTIVE; 5618 lp->l_flags |= (int)rip->ri_flags; 5619 5620 if (rip->ri_transplant) 5621 lp->l_flags &= ~MDDB_F_ACTIVE; 5622 5623 if (lp->l_flags & MDDB_F_LOCACC) 5624 alc++; 5625 } 5626 5627 /* Save on a divide - calculate 50% + 1 up front */ 5628 tlc = ((lc + 1) / 2); 5629 5630 if (alc > tlc) { /* alc > tlc - OK */ 5631 md_clr_setstatus(setno, MD_SET_STALE); 5632 } else if (alc < tlc) { /* alc < tlc - stale */ 5633 md_set_setstatus(setno, MD_SET_STALE); 5634 } else if (lc & 1) { /* alc == tlc && odd - OK */ 5635 md_clr_setstatus(setno, MD_SET_STALE); 5636 } else { /* alc == tlc && even - ? 
*/ 5637 /* Can do an accept, and are */ 5638 if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) { 5639 md_clr_setstatus(setno, MD_SET_STALE); 5640 } else { /* possibly has a mediator */ 5641 if (mediate(s)) { 5642 md_set_setstatus(setno, MD_SET_STALE); 5643 } else { 5644 md_clr_setstatus(setno, MD_SET_STALE); 5645 } 5646 } 5647 5648 /* 5649 * The mirrored_root_flag allows the sysadmin to decide to 5650 * start the local set in a read/write (non-stale) mode 5651 * when there are only 50% available mddbs on the system and 5652 * when the root file system is on a mirror. This is useful 5653 * in a 2 disk system where 1 disk failure would cause an mddb 5654 * quorum failure and subsequent boot failures since the root 5655 * filesystem would be in a read-only state. 5656 */ 5657 if (mirrored_root_flag == 1 && setno == 0 && 5658 svm_bootpath[0] != 0) { 5659 md_clr_setstatus(setno, MD_SET_STALE); 5660 } else { 5661 if (md_get_setstatus(setno) & MD_SET_STALE) { 5662 /* Allow half mode - CAREFUL! */ 5663 if (mddb_allow_half) 5664 md_clr_setstatus(setno, MD_SET_STALE); 5665 } 5666 } 5667 5668 /* 5669 * In a MN diskset, 5670 * - if 50% mddbs are unavailable and this 5671 * has been marked STALE above 5672 * - master node isn't in the STALE state 5673 * - this node isn't the master node (this node 5674 * isn't the first node to join the set) 5675 * then clear the STALE state and set TOOFEW. 5676 * 5677 * If this node is the master node and set was marked STALE, 5678 * then the set stays STALE. 5679 * 5680 * If this node is not the master and this node's state is 5681 * STALE and the master node is not marked STALE, 5682 * then master node must be in the TOOFEW state or the 5683 * master is panic'ing. A MN diskset can only be placed into 5684 * the STALE state by having the first node join the set 5685 * with <= 50% mddbs. 
There's no way for a MN diskset to 5686 * transition between STALE and not-STALE states unless all 5687 * nodes are withdrawn from the diskset or all nodes in the 5688 * diskset are rebooted at the same time. 5689 * 5690 * So, mark this node's state as TOOFEW instead of STALE. 5691 */ 5692 if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE)) 5693 == (MD_SET_MNSET | MD_SET_STALE)) && 5694 ((flag & MDDB_MN_STALE) == 0) && 5695 (!(md_set[setno].s_am_i_master))) { 5696 md_clr_setstatus(setno, MD_SET_STALE); 5697 md_set_setstatus(setno, MD_SET_TOOFEW); 5698 } 5699 } 5700 5701 /* 5702 * If a MN set is marked STALE on the other nodes, 5703 * mark it stale here. Override all other considerations 5704 * such as a mediator or > 50% mddbs available. 5705 */ 5706 if (md_get_setstatus(setno) & MD_SET_MNSET) { 5707 if (flag & MDDB_MN_STALE) 5708 md_set_setstatus(setno, MD_SET_STALE); 5709 } 5710 5711 /* 5712 * read a good copy of the locator names 5713 * if an error occurs reading what is suppose 5714 * to be a good copy continue looking for another 5715 * good copy 5716 */ 5717 s->s_lnp = NULL; 5718 for (li = 0; li < lbp->lb_loccnt; li++) { 5719 lp = &lbp->lb_locators[li]; 5720 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5721 (lp->l_flags & MDDB_F_EMASTER)) 5722 continue; 5723 5724 /* Find rip entry for this locator if one exists */ 5725 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5726 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5727 lp->l_blkno)) 5728 break; 5729 } 5730 5731 if (rip == NULL) { 5732 continue; 5733 } 5734 5735 /* 5736 * Use the rip commitcnt since the commitcnt in lbp could 5737 * been cleared by selectlocator. Looking for a replica with 5738 * the same commitcnt as the 'golden' copy in order to 5739 * get the same data. 
5740 */ 5741 if (rip->ri_commitcnt != lbp->lb_commitcnt) { 5742 continue; 5743 } 5744 5745 /* 5746 * Now have a copy of the database that is equivalent 5747 * to the chosen locator block with respect to 5748 * inittime, identifier and commitcnt. Trying the 5749 * equivalent databases in the order that they were 5750 * written will provide the most up to date data. 5751 */ 5752 lp->l_flags |= readlocnames(s, li); 5753 if (s->s_lnp) 5754 break; 5755 } 5756 5757 if (s->s_lnp == NULL) { 5758 retval = MDDB_E_NOLOCNMS; 5759 goto errout; 5760 } 5761 5762 /* 5763 * read a good copy of the data base 5764 * if an error occurs reading what is suppose 5765 * to be a good copy continue looking for another 5766 * good copy 5767 */ 5768 5769 s->s_dbp = NULL; 5770 for (li = 0; li < lbp->lb_loccnt; li++) { 5771 lp = &lbp->lb_locators[li]; 5772 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5773 (lp->l_flags & MDDB_F_EMASTER)) 5774 continue; 5775 5776 /* Find rip entry for this locator if one exists */ 5777 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5778 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5779 lp->l_blkno)) 5780 break; 5781 } 5782 5783 if (rip == NULL) { 5784 continue; 5785 } 5786 5787 /* 5788 * Use the rip commitcnt since the commitcnt in lbp could 5789 * been cleared by selectlocator. Looking for a replica with 5790 * the same commitcnt as the 'golden' copy in order to 5791 * get the same data. 5792 */ 5793 if (rip->ri_commitcnt != lbp->lb_commitcnt) { 5794 continue; 5795 } 5796 5797 /* 5798 * Now have a copy of the database that is equivalent 5799 * to the chosen locator block with respect to 5800 * inittime, identifier and commitcnt. Trying the 5801 * equivalent databases in the order that they were 5802 * written will provide the most up to date data. 
5803 */ 5804 lp->l_flags |= readcopy(s, li); 5805 5806 if (s->s_dbp) 5807 break; 5808 } 5809 5810 if (s->s_dbp == NULL) { 5811 retval = MDDB_E_NODIRBLK; 5812 goto errout; 5813 } 5814 5815 lp->l_flags |= MDDB_F_MASTER; 5816 lp->l_flags |= MDDB_F_UP2DATE; 5817 5818 /* 5819 * go through and find largest record; 5820 * Also fixup the user data area's 5821 */ 5822 maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size); 5823 5824 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) 5825 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 5826 if (dep->de_flags & MDDB_F_OPT) 5827 getoptrecord(s, dep); 5828 else { 5829 allocuserdata(dep); 5830 maxrecsize = MAX(dep->de_recsize, maxrecsize); 5831 } 5832 5833 if (maxrecsize > s->s_databuffer_size) { 5834 p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); 5835 if (s->s_databuffer_size) 5836 kmem_free(s->s_databuffer, s->s_databuffer_size); 5837 s->s_databuffer = p; 5838 s->s_databuffer_size = maxrecsize; 5839 } 5840 5841 /* If we can clear the tag data record, do it now. */ 5842 /* Data tags not supported on MN sets */ 5843 if ((md_get_setstatus(setno) & MD_SET_CLRTAG) && 5844 (!(md_get_setstatus(setno) & MD_SET_MNSET))) 5845 dt_setup(s, NULL); 5846 5847 /* This will return non-zero if STALE or TOOFEW */ 5848 /* This will write out chosen replica image to all replicas */ 5849 stale_rtn = selectreplicas(s, MDDB_SCANALL); 5850 5851 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5852 ddi_devid_t devidptr; 5853 5854 /* 5855 * ignore the return value from selectreplicas because we 5856 * may have a STALE or TOOFEW set in the case of a partial 5857 * replicated diskset. We will fix that up later. 
5858 */ 5859 5860 lbp = s->s_lbp; 5861 for (li = 0; li < lbp->lb_loccnt; li++) { 5862 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5863 5864 if (did_info->info_flags & MDDB_DID_EXISTS) { 5865 devidptr = s->s_did_icp->did_ic_devid[li]; 5866 lp = &lbp->lb_locators[li]; 5867 for (rip = s->s_rip; rip != NULL; 5868 rip = rip->ri_next) { 5869 if (rip->ri_old_devid == 0) 5870 continue; 5871 if (ddi_devid_compare(rip->ri_old_devid, 5872 devidptr) != 0) { 5873 continue; 5874 } 5875 if (update_locatorblock(s, 5876 md_expldev(lp->l_dev), 5877 rip->ri_devid, rip->ri_old_devid)) { 5878 goto errout; 5879 } 5880 } 5881 } 5882 } 5883 } else { 5884 if (stale_rtn) 5885 goto errout; 5886 } 5887 5888 /* 5889 * If the replica is in device id style - validate the device id's, 5890 * if present, in the locator block devid area. 5891 */ 5892 newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP); 5893 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5894 for (li = 0; li < lbp->lb_loccnt; li++) { 5895 newdev[li] = 0; 5896 lp = &lbp->lb_locators[li]; 5897 if (lp->l_flags & MDDB_F_DELETED) 5898 continue; 5899 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5900 dev = md_expldev(lp->l_dev); 5901 if (did_info->info_flags & MDDB_DID_EXISTS) { 5902 /* Validate device id on current system */ 5903 newdev[li] = dev; 5904 if (mddb_devid_validate( 5905 did_icp->did_ic_devid[li], 5906 &(newdev[li]), 5907 did_info->info_minor_name) == 0) { 5908 /* Set valid flag */ 5909 did_info->info_flags |= MDDB_DID_VALID; 5910 } else { 5911 lp->l_flags |= MDDB_F_EMASTER; 5912 } 5913 } else if (!(MD_UPGRADE)) { 5914 /* 5915 * If a device doesn't have a device id, 5916 * check if there is now a device ID 5917 * associated with device. If one exists, 5918 * add it to the locator block devid area. 5919 * If there's not enough space to add it, 5920 * print a warning. 5921 * Don't do this during upgrade. 
5922 */ 5923 dev_t ddi_dev = md_dev64_to_dev(dev); 5924 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == 5925 DDI_SUCCESS) { 5926 if (ddi_lyr_get_minor_name(ddi_dev, 5927 S_IFBLK, &minor_name) 5928 == DDI_SUCCESS) { 5929 if (mddb_devid_add(s, li, 5930 ret_devid, minor_name)) { 5931 cmn_err(CE_WARN, 5932 "Not enough space in" 5933 " metadevice state" 5934 " database\n"); 5935 cmn_err(CE_WARN, 5936 "to add relocation" 5937 " information for" 5938 " device:\n"); 5939 cmn_err(CE_WARN, 5940 " major = %d, " 5941 " minor = %d\n", 5942 getmajor(ddi_dev), 5943 getminor(ddi_dev)); 5944 } else { 5945 write_lb = 1; 5946 } 5947 kmem_free(minor_name, 5948 strlen(minor_name) + 1); 5949 } 5950 ddi_devid_free(ret_devid); 5951 } 5952 } 5953 } 5954 5955 /* 5956 * If a device has a valid device id and if the dev_t 5957 * associated with the device id has changed, update the 5958 * driver name, minor num and dev_t in the local and side 5959 * locators to match the dev_t that the system currently 5960 * associates with the device id. 5961 * 5962 * Don't do this during upgrade. 
5963 */ 5964 if (!(MD_UPGRADE)) { 5965 for (li = 0; li < lbp->lb_loccnt; li++) { 5966 lp = &lbp->lb_locators[li]; 5967 if (lp->l_flags & MDDB_F_DELETED) 5968 continue; 5969 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5970 if ((did_info->info_flags & MDDB_DID_VALID) && 5971 !(did_info->info_flags & MDDB_DID_UPDATED)) { 5972 if (lbp->lb_flags & MDDB_MNSET) { 5973 int j; 5974 int index = -1; 5975 mnlbp = (mddb_mnlb_t *)lbp; 5976 for (j = 0; j < MD_MNMAXSIDES; j++) { 5977 mnslp = &mnlbp-> 5978 lb_mnsidelocators[j][li]; 5979 if (mnslp->mnl_sideno == 5980 s->s_sideno) 5981 break; 5982 if (mnslp->mnl_sideno == 0) 5983 index = j; 5984 } 5985 if (j == MD_MNMAXSIDES) { 5986 /* No match found; take empty */ 5987 mnslp = &mnlbp-> 5988 lb_mnsidelocators[index][li]; 5989 write_lb = 1; 5990 mnslp->mnl_mnum = 5991 md_getminor(newdev[li]); 5992 } else if (mnslp->mnl_mnum != 5993 md_getminor(newdev[li])) { 5994 write_lb = 1; 5995 mnslp->mnl_mnum = 5996 md_getminor(newdev[li]); 5997 } 5998 } else { 5999 slp = &lbp-> 6000 lb_sidelocators[s->s_sideno][li]; 6001 if (slp->l_mnum != 6002 md_getminor(newdev[li])) { 6003 write_lb = 1; 6004 slp->l_mnum = 6005 md_getminor(newdev[li]); 6006 } 6007 } 6008 name = ddi_major_to_name( 6009 md_getmajor(newdev[li])); 6010 if (lbp->lb_flags & MDDB_MNSET) { 6011 i = mnslp->mnl_drvnm_index; 6012 } else { 6013 i = slp->l_drvnm_index; 6014 } 6015 if (strncmp(lbp->lb_drvnm[i].dn_data, name, 6016 lbp->lb_drvnm[i].dn_len) != 0) { 6017 /* Driver name has changed */ 6018 len = strlen(name); 6019 /* Look for the driver name */ 6020 for (i = 0; i < MDDB_DRVNMCNT; i++) { 6021 if (lbp->lb_drvnm[i].dn_len 6022 != len) 6023 continue; 6024 if (strncmp( 6025 lbp->lb_drvnm[i].dn_data, 6026 name, len) == 0) 6027 break; 6028 } 6029 /* Didn't find one, add it */ 6030 if (i == MDDB_DRVNMCNT) { 6031 for (i = 0; i < MDDB_DRVNMCNT; 6032 i++) { 6033 if (lbp->lb_drvnm[i].dn_len 6034 == 0) 6035 break; 6036 } 6037 if (i == MDDB_DRVNMCNT) { 6038 cmn_err(CE_WARN, 6039 "Unable 
to update driver" 6040 " name for dev: " 6041 "major = %d, " 6042 "minor = %d\n", 6043 md_getmajor(newdev[li]), 6044 md_getminor(newdev[li])); 6045 continue; 6046 } 6047 (void) strncpy( 6048 lbp->lb_drvnm[i].dn_data, 6049 name, MD_MAXDRVNM); 6050 lbp->lb_drvnm[i].dn_len = 6051 (uchar_t)strlen(name); 6052 } 6053 /* Fill in the drvnm index */ 6054 if (lbp->lb_flags & MDDB_MNSET) { 6055 mnslp->mnl_drvnm_index = i; 6056 } else { 6057 slp->l_drvnm_index = i; 6058 } 6059 write_lb = 1; 6060 } 6061 did_info->info_flags |= MDDB_DID_UPDATED; 6062 } 6063 } 6064 } 6065 } 6066 kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB); 6067 6068 /* 6069 * If locator block has been changed by get_mbs_n_lbs, 6070 * by addition of new device id, by updated minor name or 6071 * by updated driver name - write out locator block. 6072 */ 6073 if (write_lb) { 6074 rval = push_lb(s); 6075 (void) upd_med(s, "load_old_replicas(0)"); 6076 if (rval) 6077 goto errout; 6078 } 6079 6080 /* 6081 * If the tag was moved, allocated, or a BADTAG was seen for some other 6082 * reason, then make sure tags are written to all the replicas. 6083 * Data tags not supported on MN sets. 6084 */ 6085 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) { 6086 if (! (lc = dt_alloc_if_needed(s))) { 6087 for (li = 0; li < lbp->lb_loccnt; li++) { 6088 lp = &lbp->lb_locators[li]; 6089 6090 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 6091 (lp->l_flags & MDDB_F_EMASTER)) 6092 continue; 6093 6094 if (lp->l_flags & MDDB_F_BADTAG) { 6095 lc = 1; 6096 break; 6097 } 6098 } 6099 } 6100 6101 if (lc) { 6102 md_set_setstatus(setno, MD_SET_TAGDATA); 6103 md_clr_setstatus(setno, MD_SET_BADTAG); 6104 (void) selectreplicas(s, MDDB_SCANALL); 6105 } 6106 } 6107 6108 errout: 6109 6110 /* Free extraneous rip components. 
*/ 6111 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 6112 /* Get rid of lbp's and dtp's */ 6113 6114 if (rip->ri_lbp != lbp) { 6115 if (rip->ri_dtp != (mddb_dt_t *)NULL) { 6116 kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES); 6117 rip->ri_dtp = (mddb_dt_t *)NULL; 6118 } 6119 6120 if (rip->ri_devid != (ddi_devid_t)NULL) { 6121 sz = (int)ddi_devid_sizeof(rip->ri_devid); 6122 kmem_free((caddr_t)rip->ri_devid, sz); 6123 rip->ri_devid = (ddi_devid_t)NULL; 6124 } 6125 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 6126 sz = (int)ddi_devid_sizeof(rip->ri_old_devid); 6127 kmem_free((caddr_t)rip->ri_old_devid, sz); 6128 rip->ri_old_devid = (ddi_devid_t)NULL; 6129 } 6130 6131 if (rip->ri_lbp != (mddb_lb_t *)NULL) { 6132 mddb_devid_icp_free(&rip->ri_did_icp, 6133 rip->ri_lbp); 6134 6135 kmem_free((caddr_t)rip->ri_lbp, 6136 dbtob(rip->ri_lbp->lb_blkcnt)); 6137 rip->ri_lbp = (mddb_lb_t *)NULL; 6138 } 6139 } 6140 6141 if (lbp != NULL) { 6142 for (li = 0; li < lbp->lb_loccnt; li++) { 6143 lp = &lbp->lb_locators[li]; 6144 if (lp->l_flags & MDDB_F_DELETED) 6145 continue; 6146 if (rip->ri_dev == md_expldev(lp->l_dev) && 6147 rip->ri_blkno == lp->l_blkno) 6148 break; 6149 } 6150 if (li < lbp->lb_loccnt) 6151 continue; 6152 } 6153 6154 /* 6155 * Get rid of mbp's: 6156 * if lbp, those out of lb_loccnt bounds 6157 * if !lbp, all of them. 6158 */ 6159 if (rip->ri_mbip) { 6160 md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev); 6161 if (dev64 != NODEV64) { 6162 mddb_devclose(dev64); 6163 free_mbipp(&rip->ri_mbip); 6164 } 6165 } 6166 /* 6167 * Turn off MDDB_F_EMASTER flag in a diskset since diskset 6168 * code always ends up calling ridev for all replicas 6169 * before calling load_old_replicas. ridev will reset 6170 * MDDB_F_EMASTER flag if flag was due to unresolved devid. 6171 */ 6172 if (setno != MD_LOCAL_SET) 6173 rip->ri_flags &= ~MDDB_F_EMASTER; 6174 } 6175 return (retval); 6176 } 6177 6178 /* 6179 * Given the devt from the md.conf info, get the devid for the device. 
6180 */ 6181 static void 6182 lookup_db_devid(mddb_cfg_loc_t *cl) 6183 { 6184 dev_t ldev; 6185 ddi_devid_t devid; 6186 char *minor; 6187 6188 if (ddi_name_to_major(cl->l_driver) == (major_t)-1) { 6189 cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver); 6190 return; 6191 } 6192 6193 ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum); 6194 if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) { 6195 cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x", 6196 cl->l_driver, cl->l_mnum); 6197 return; 6198 } 6199 6200 if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) { 6201 cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x", 6202 cl->l_mnum); 6203 return; 6204 } 6205 6206 cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ; 6207 cl->l_devid_sz = (int)ddi_devid_sizeof(devid); 6208 cl->l_devid = (uint64_t)(uintptr_t)devid; 6209 (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX); 6210 6211 kmem_free(minor, strlen(minor) + 1); 6212 } 6213 6214 /* 6215 * grab driver name, minor, block and devid out of 6216 * strings like "driver:minor:block:devid" 6217 */ 6218 static int 6219 parse_db_loc( 6220 char *str, 6221 mddb_cfg_loc_t *clp 6222 ) 6223 { 6224 char *p, *e; 6225 char *minor_name; 6226 ddi_devid_t ret_devid; 6227 6228 clp->l_dev = 0; 6229 p = clp->l_driver; 6230 e = p + sizeof (clp->l_driver) - 1; 6231 while ((*str != ':') && (*str != '\0') && (p < e)) 6232 *p++ = *str++; 6233 *p = '\0'; 6234 if (*str++ != ':') 6235 return (-1); 6236 clp->l_mnum = 0; 6237 while (ISNUM(*str)) { 6238 clp->l_mnum *= 10; 6239 clp->l_mnum += *str++ - '0'; 6240 } 6241 if (*str++ != ':') 6242 return (-1); 6243 clp->l_blkno = 0; 6244 while (ISNUM(*str)) { 6245 clp->l_blkno *= 10; 6246 clp->l_blkno += *str++ - '0'; 6247 } 6248 if (*str++ != ':') 6249 return (-1); 6250 6251 /* 6252 * If the md_devid_destroy flag is set, ignore the device ids. 6253 * This is only to used in a catastrophic failure case. 
Examples 6254 * would be where the device id of all drives in the system 6255 * (especially the mirror'd root drives) had been changed 6256 * by firmware upgrade or by a patch to an existing disk 6257 * driver. Another example would be in the case of non-unique 6258 * device ids due to a bug. The device id would be valid on 6259 * the system, but would return the wrong dev_t. 6260 */ 6261 if (md_devid_destroy) { 6262 clp->l_devid_flags = 0; 6263 clp->l_devid = (uint64_t)NULL; 6264 clp->l_devid_sz = 0; 6265 clp->l_old_devid = (uint64_t)NULL; 6266 clp->l_old_devid_sz = 0; 6267 clp->l_minor_name[0] = '\0'; 6268 return (0); 6269 } 6270 6271 if (ddi_devid_str_decode(str, 6272 (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE) 6273 return (-1); 6274 6275 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 6276 clp->l_devid_flags = 0; 6277 clp->l_old_devid = (uint64_t)NULL; 6278 clp->l_old_devid_sz = 0; 6279 6280 /* If no device id associated with device, just return */ 6281 if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) { 6282 clp->l_devid_sz = 0; 6283 clp->l_minor_name[0] = '\0'; 6284 if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 && 6285 md_keep_repl_state == 0) { 6286 /* 6287 * No devid in md.conf; we're in recovery mode so 6288 * lookup the devid for the device as specified by 6289 * the devt in md.conf. 6290 */ 6291 lookup_db_devid(clp); 6292 } 6293 return (0); 6294 } 6295 6296 clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | 6297 MDDB_DEVID_SZ; 6298 clp->l_devid_sz = (int)ddi_devid_sizeof( 6299 (ddi_devid_t)(uintptr_t)clp->l_devid); 6300 (void) strcpy(clp->l_minor_name, minor_name); 6301 kmem_free(minor_name, strlen(minor_name) + 1); 6302 6303 return (0); 6304 } 6305 6306 /* 6307 * grab driver name, minor, and block out of 6308 * strings like "driver:minor:block:devid driver:minor:block:devid ..." 
6309 */ 6310 static void 6311 parse_db_string( 6312 char *str 6313 ) 6314 { 6315 char *p, *e; 6316 mddb_cfg_loc_t *cl; 6317 char restore_space; 6318 6319 /* CSTYLED */ 6320 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); 6321 for (p = str; (*p != '\0'); ) { 6322 for (; ((*p != '\0') && (ISWHITE(*p))); ++p) 6323 ; 6324 if (*p == '\0') 6325 break; 6326 for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e) 6327 ; 6328 /* 6329 * Only give parse_db_loc 1 entry, so stuff a null into 6330 * the string if we're not at the end. We need to save this 6331 * char and restore it after call. 6332 */ 6333 restore_space = '\0'; 6334 if (*e != '\0') { 6335 restore_space = *e; 6336 *e = '\0'; 6337 } 6338 if (parse_db_loc(p, cl) != 0) { 6339 cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p); 6340 } else { 6341 (void) ridev( 6342 &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip, 6343 cl, NULL, MDDB_F_PTCHED); 6344 if (cl->l_devid_flags & MDDB_DEVID_SPACE) { 6345 kmem_free((caddr_t)(uintptr_t)cl->l_devid, 6346 cl->l_devid_sz); 6347 } 6348 } 6349 if (restore_space != '\0') { 6350 *e = restore_space; 6351 } 6352 p = e; 6353 } 6354 kmem_free(cl, sizeof (mddb_cfg_loc_t)); 6355 } 6356 6357 /* 6358 * grab database locations supplied by md.conf as properties 6359 */ 6360 static void 6361 parse_db_strings(void) 6362 { 6363 int bootlist_id; 6364 int proplen; 6365 /* 6366 * size of _bootlist_name should match uses of line and entry in 6367 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c) 6368 */ 6369 char _bootlist_name[MDDB_BOOTLIST_MAX_LEN]; 6370 char *bootlist_name; 6371 caddr_t prop; 6372 6373 /* 6374 * Step through the bootlist properties one at a time by forming the 6375 * correct name, fetching the property, parsing the property and 6376 * then freeing the memory. If a property does not exist or returns 6377 * some form of error just ignore it. 
There is no guarantee that 6378 * the properties will always exist in sequence, for example 6379 * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with 6380 * mddb_bootlist3 existing. 6381 */ 6382 bootlist_name = &_bootlist_name[0]; 6383 for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) { 6384 6385 proplen = 0; 6386 (void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id); 6387 6388 if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo, 6389 DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop, 6390 &proplen) != DDI_PROP_SUCCESS) 6391 continue; 6392 6393 if (proplen <= 0) 6394 continue; 6395 6396 if (md_init_debug) 6397 cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop); 6398 6399 parse_db_string(prop); 6400 kmem_free(prop, proplen); 6401 } 6402 } 6403 6404 static int 6405 initit( 6406 set_t setno, 6407 int flag 6408 ) 6409 { 6410 int i; 6411 mddb_set_t *s; 6412 mddb_lb_t *lbp; /* pointer to locator block */ 6413 mddb_ln_t *lnp; /* pointer to locator names */ 6414 mddb_db_t *dbp; /* pointer to directory block */ 6415 mddb_did_blk_t *did_blkp; /* pointer to Device ID block */ 6416 mddb_did_ic_t *did_icp; /* pointer to Device ID incore area */ 6417 mddb_bf_t *bfp; 6418 side_t sideno; 6419 side_t maxsides; 6420 mddb_block_t lb_blkcnt; 6421 int retval = 0; 6422 md_dev64_t dev; 6423 mddb_mnlb_t *mnlbp; 6424 int devid_flag; 6425 6426 /* single thread's all loads/unloads of set's */ 6427 mutex_enter(&mddb_lock); 6428 mutex_enter(SETMUTEX(setno)); 6429 6430 if (((mddb_set_t *)md_set[setno].s_db) == NULL) { 6431 mutex_exit(SETMUTEX(setno)); 6432 mutex_exit(&mddb_lock); 6433 return (MDDB_E_NOTNOW); 6434 } 6435 6436 s = (mddb_set_t *)md_set[setno].s_db; 6437 6438 single_thread_start(s); 6439 6440 /* 6441 * init is already underway, block. Return success. 
6442 */ 6443 if (s->s_lbp) { 6444 single_thread_end(s); 6445 mutex_exit(SETMUTEX(setno)); 6446 mutex_exit(&mddb_lock); 6447 return (0); 6448 } 6449 6450 uniqtime32(&s->s_inittime); 6451 6452 /* grab database locations patched by /etc/system */ 6453 if (setno == MD_LOCAL_SET) 6454 parse_db_strings(); 6455 6456 s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc( 6457 sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP); 6458 6459 s->s_zombie = 0; 6460 s->s_staledeletes = 0; 6461 s->s_optcmtcnt = 0; 6462 s->s_opthavelck = 0; 6463 s->s_optwantlck = 0; 6464 s->s_optwaiterr = 0; 6465 s->s_opthungerr = 0; 6466 6467 /* 6468 * KEEPTAG can never be set for a MN diskset since no tags are 6469 * allowed to be stored in a MN diskset. No way to check 6470 * if this is a MN diskset or not at this point since the mddb 6471 * hasn't been read in from disk yet. (flag will only have 6472 * MUTLINODE bit set if a new set is being created.) 6473 */ 6474 if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG)) 6475 dt_setup(s, NULL); 6476 6477 md_clr_setstatus(s->s_setno, MD_SET_TOOFEW); 6478 6479 for (i = 0; i < mddb_maxbufheaders; i++) { 6480 bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP); 6481 sema_init(&bfp->bf_buf.b_io, 0, NULL, 6482 SEMA_DEFAULT, NULL); 6483 sema_init(&bfp->bf_buf.b_sem, 0, NULL, 6484 SEMA_DEFAULT, NULL); 6485 bfp->bf_buf.b_offset = -1; 6486 freebuffer(s, bfp); 6487 } 6488 6489 retval = load_old_replicas(s, flag); 6490 /* If 0 return value - success */ 6491 if (! retval) { 6492 single_thread_end(s); 6493 mutex_exit(SETMUTEX(setno)); 6494 mutex_exit(&mddb_lock); 6495 return (0); 6496 } 6497 6498 /* 6499 * If here, then the load_old_replicas() failed 6500 */ 6501 6502 6503 /* If the database was supposed to exist. */ 6504 if (flag & MDDB_MUSTEXIST) { 6505 if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) { 6506 for (i = 0; i < mddb_maxcopies; i++) { 6507 if (! 
s->s_mbiarray[i]) 6508 continue; 6509 dev = md_expldev( 6510 s->s_lbp->lb_locators[i].l_dev); 6511 dev = md_xlate_targ_2_mini(dev); 6512 if (dev != NODEV64) { 6513 mddb_devclose(dev); 6514 free_mbipp(&s->s_mbiarray[i]); 6515 } 6516 } 6517 6518 kmem_free((caddr_t)s->s_mbiarray, 6519 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 6520 s->s_mbiarray = NULL; 6521 } 6522 6523 if (s->s_lnp != (mddb_ln_t *)NULL) { 6524 kmem_free((caddr_t)s->s_lnp, 6525 dbtob(s->s_lbp->lb_lnblkcnt)); 6526 s->s_lnp = (mddb_ln_t *)NULL; 6527 } 6528 6529 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 6530 6531 if (s->s_lbp != (mddb_lb_t *)NULL) { 6532 kmem_free((caddr_t)s->s_lbp, 6533 dbtob(s->s_lbp->lb_blkcnt)); 6534 s->s_lbp = (mddb_lb_t *)NULL; 6535 } 6536 6537 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 6538 kmem_free((caddr_t)bfp, sizeof (*bfp)); 6539 6540 single_thread_end(s); 6541 mutex_exit(SETMUTEX(setno)); 6542 mutex_exit(&mddb_lock); 6543 6544 if (retval == MDDB_E_TAGDATA) 6545 return (retval); 6546 6547 /* Want a bit more detailed error messages */ 6548 if (mddb_db_err_detail) 6549 return (retval); 6550 6551 return (MDDB_E_NODB); 6552 } 6553 6554 6555 /* 6556 * MDDB_NOOLDOK set - Creating a new database, so do 6557 * more initialization. 6558 */ 6559 6560 lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6561 MDDB_LOCAL_LBCNT : MDDB_LBCNT); 6562 if (flag & MDDB_MULTINODE) { 6563 lb_blkcnt = MDDB_MNLBCNT; 6564 } 6565 6566 if (s->s_lbp == NULL) 6567 s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP); 6568 lbp = s->s_lbp; 6569 6570 bzero((caddr_t)lbp, dbtob(lb_blkcnt)); 6571 lbp->lb_setno = setno; 6572 lbp->lb_magic = MDDB_MAGIC_LB; 6573 if (flag & MDDB_MULTINODE) { 6574 lbp->lb_revision = MDDB_REV_MNLB; 6575 } else { 6576 lbp->lb_revision = MDDB_REV_LB; 6577 } 6578 lbp->lb_inittime = s->s_inittime; 6579 if (flag & MDDB_MULTINODE) { 6580 mnlbp = (mddb_mnlb_t *)lbp; 6581 for (i = 0; i < MDDB_NLB; i++) { 6582 for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) { 6583 mddb_mnsidelocator_t *mnslp; 6584 mnslp = &mnlbp->lb_mnsidelocators[sideno][i]; 6585 mnslp->mnl_mnum = NODEV32; 6586 mnslp->mnl_sideno = 0; 6587 mnslp->mnl_drvnm_index = 0; 6588 } 6589 } 6590 } else { 6591 maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES); 6592 for (i = 0; i < MDDB_NLB; i++) { 6593 for (sideno = 0; sideno < maxsides; sideno++) { 6594 mddb_sidelocator_t *slp; 6595 slp = &lbp->lb_sidelocators[sideno][i]; 6596 slp->l_mnum = NODEV32; 6597 } 6598 } 6599 } 6600 lbp->lb_blkcnt = lb_blkcnt; 6601 6602 /* lb starts on block 0 */ 6603 /* locator names starts after locator block */ 6604 lbp->lb_lnfirstblk = lb_blkcnt; 6605 if (flag & MDDB_MULTINODE) { 6606 lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT; 6607 } else { 6608 lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6609 MDDB_LOCAL_LNCNT : MDDB_LNCNT); 6610 } 6611 6612 if (flag & MDDB_MULTINODE) { 6613 /* Creating a multinode diskset */ 6614 md_set_setstatus(setno, MD_SET_MNSET); 6615 lbp->lb_flags |= MDDB_MNSET; 6616 } 6617 6618 /* Data portion of mddb located after locator names */ 6619 lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt; 6620 6621 /* the btodb that follows is converting the directory block size */ 6622 /* Data tag part of mddb located after first block of mddb data */ 6623 lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk + 6624 btodb(MDDB_BSIZE)); 6625 /* Data tags are not used in MN diskset - so set count to 0 */ 6626 if (flag & MDDB_MULTINODE) 6627 lbp->lb_dtblkcnt = (mddb_block_t)0; 6628 else 6629 lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS; 6630 6631 6632 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP); 6633 lnp->ln_magic = MDDB_MAGIC_LN; 6634 if (flag & MDDB_MULTINODE) { 6635 lnp->ln_revision = MDDB_REV_MNLN; 6636 } else { 6637 lnp->ln_revision = MDDB_REV_LN; 6638 } 6639 s->s_lnp = lnp; 6640 6641 /* 6642 * Set up Device ID portion of Locator Block. 6643 * Do not set locator to device id style if 6644 * md_devid_destroy is 1 and md_keep_repl_state is 1 6645 * (destroy all device id data and keep replica in 6646 * non device id mode). 6647 * 6648 * This is logically equivalent to set locator to 6649 * device id style if md_devid_destroy is 0 or 6650 * md_keep_repl_state is 0. 6651 * 6652 * In SunCluster environment, device id mode is disabled 6653 * which means diskset will be run in non-devid mode. For 6654 * localset, the behavior will remain intact and run in 6655 * device id mode. 6656 * 6657 * In multinode diskset devids are turned off. 
6658 */ 6659 devid_flag = 1; 6660 if (cluster_bootflags & CLUSTER_CONFIGURED) 6661 if (setno != MD_LOCAL_SET) 6662 devid_flag = 0; 6663 if (flag & MDDB_MULTINODE) 6664 devid_flag = 0; 6665 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) 6666 devid_flag = 0; 6667 /* 6668 * if we weren't devid style before and md_keep_repl_state=1 6669 * we need to stay non-devid 6670 */ 6671 if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) && 6672 (md_keep_repl_state == 1)) 6673 devid_flag = 0; 6674 if (devid_flag) { 6675 lbp->lb_didfirstblk = lbp->lb_dtfirstblk + 6676 lbp->lb_dtblkcnt; 6677 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 6678 lbp->lb_flags |= MDDB_DEVID_STYLE; 6679 6680 did_icp = (mddb_did_ic_t *)kmem_zalloc 6681 (sizeof (mddb_did_ic_t), KM_SLEEP); 6682 did_blkp = (mddb_did_blk_t *) 6683 kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); 6684 did_blkp->blk_magic = MDDB_MAGIC_DI; 6685 did_blkp->blk_revision = MDDB_REV_DI; 6686 did_icp->did_ic_blkp = did_blkp; 6687 s->s_did_icp = did_icp; 6688 } 6689 6690 setidentifier(s, &lbp->lb_ident); 6691 uniqtime32(&lbp->lb_timestamp); 6692 dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 6693 dbp->db_magic = MDDB_MAGIC_DB; 6694 dbp->db_revision = MDDB_REV_DB; 6695 uniqtime32(&dbp->db_timestamp); 6696 dbp->db_nextblk = 0; 6697 dbp->db_firstentry = NULL; 6698 dbp->db_blknum = lbp->lb_dbfirstblk; 6699 dbp->db_recsum = MDDB_GLOBAL_XOR; 6700 s->s_dbp = dbp; 6701 single_thread_end(s); 6702 mutex_exit(SETMUTEX(setno)); 6703 mutex_exit(&mddb_lock); 6704 return (0); 6705 } 6706 6707 mddb_set_t * 6708 mddb_setenter( 6709 set_t setno, 6710 int flag, 6711 int *errorcodep 6712 ) 6713 { 6714 mddb_set_t *s; 6715 int err = 0; 6716 size_t sz = sizeof (void *) * MD_MAXUNITS; 6717 6718 mutex_enter(SETMUTEX(setno)); 6719 if (! 
md_set[setno].s_db) { 6720 mutex_exit(SETMUTEX(setno)); 6721 if (errorcodep != NULL) 6722 *errorcodep = MDDB_E_NOTOWNER; 6723 return (NULL); 6724 } 6725 6726 /* Allocate s_un and s_ui arrays if not already present. */ 6727 if (md_set[setno].s_un == NULL) { 6728 md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP); 6729 if (md_set[setno].s_un == NULL) { 6730 mutex_exit(SETMUTEX(setno)); 6731 if (errorcodep != NULL) 6732 *errorcodep = MDDB_E_NOTOWNER; 6733 return (NULL); 6734 } 6735 } 6736 if (md_set[setno].s_ui == NULL) { 6737 md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP); 6738 if (md_set[setno].s_ui == NULL) { 6739 mutex_exit(&md_set[setno].s_dbmx); 6740 kmem_free(md_set[setno].s_un, sz); 6741 md_set[setno].s_un = NULL; 6742 if (errorcodep != NULL) 6743 *errorcodep = MDDB_E_NOTOWNER; 6744 return (NULL); 6745 } 6746 } 6747 s = (mddb_set_t *)md_set[setno].s_db; 6748 if (s->s_lbp) 6749 return (s); 6750 6751 if (flag & MDDB_NOINIT) 6752 return (s); 6753 6754 /* 6755 * Release the set mutex - it will be acquired and released in 6756 * initit after acquiring the mddb_lock. This is done to assure 6757 * that mutexes are always acquired in the same order to prevent 6758 * possible deadlock 6759 */ 6760 mutex_exit(SETMUTEX(setno)); 6761 6762 if ((err = initit(setno, flag)) != 0) { 6763 if (errorcodep != NULL) 6764 *errorcodep = err; 6765 return (NULL); 6766 } 6767 6768 mutex_enter(SETMUTEX(setno)); 6769 return ((mddb_set_t *)md_set[setno].s_db); 6770 } 6771 6772 /* 6773 * Release the set lock for a given set. 6774 * 6775 * In a MN diskset, this routine may send messages to the rpc.mdcommd 6776 * in order to have the slave nodes re-parse parts of the mddb. 6777 * Messages are only sent if the global ioctl lock is not held. 6778 * 6779 * With the introduction of multi-threaded ioctls, there is no way 6780 * to determine which thread(s) are holding the ioctl lock. 
So, if 6781 * the ioctl lock is held (by process X) process X will send the 6782 * messages to the slave nodes when process X releases the ioctl lock. 6783 */ 6784 void 6785 mddb_setexit( 6786 mddb_set_t *s 6787 ) 6788 { 6789 md_mn_msg_mddb_parse_t *mddb_parse_msg; 6790 md_mn_kresult_t *kresult; 6791 mddb_lb_t *lbp = s->s_lbp; 6792 int i; 6793 int rval = 1; 6794 6795 /* 6796 * If not a MN diskset OR 6797 * a MN diskset but this node isn't master, 6798 * then release the mutex. 6799 */ 6800 if (!(MD_MNSET_SETNO(s->s_setno)) || 6801 ((MD_MNSET_SETNO(s->s_setno)) && 6802 (!md_set[s->s_setno].s_am_i_master))) { 6803 mutex_exit(SETMUTEX(s->s_setno)); 6804 return; 6805 } 6806 6807 /* 6808 * If global ioctl lock is held, then send no messages, 6809 * just release mutex and return. 6810 * 6811 */ 6812 if (md_status & MD_GBL_IOCTL_LOCK) { 6813 mutex_exit(SETMUTEX(s->s_setno)); 6814 return; 6815 } 6816 6817 /* 6818 * This thread is not holding the ioctl lock, so drop the set 6819 * lock, send messages to slave nodes to reparse portions 6820 * of the mddb and return. 6821 * 6822 * If the block parse flag is set, do not send parse messages. 6823 * This flag is set when master is adding a new mddb that would 6824 * cause parse messages to be sent to the slaves, but the slaves 6825 * don't have knowledge of the new mddb yet since the mddb add 6826 * operation hasn't been run on the slave nodes yet. When the 6827 * master unblocks the parse flag, the parse messages will be 6828 * generated. 6829 * 6830 * If s_mn_parseflags_sending is non-zero, then another thread 6831 * is already currently sending a parse message, so just release 6832 * the mutex and return. If an mddb change occurred that results 6833 * in a parse message to be generated, the thread that is currently 6834 * sending a parse message would generate the additional parse message. 
6835 * 6836 * If s_mn_parseflags_sending is zero and parsing is not blocked, 6837 * then loop until s_mn_parseflags is 0 (until there are no more 6838 * messages to send). 6839 * While s_mn_parseflags is non-zero, 6840 * put snapshot of parse_flags in s_mn_parseflags_sending 6841 * set s_mn_parseflags to zero 6842 * release mutex 6843 * send message 6844 * re-grab mutex 6845 * set s_mn_parseflags_sending to zero 6846 */ 6847 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), 6848 KM_SLEEP); 6849 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && 6850 (s->s_mn_parseflags & MDDB_PARSE_MASK) && 6851 (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) { 6852 /* Grab snapshot of parse flags */ 6853 s->s_mn_parseflags_sending = s->s_mn_parseflags; 6854 s->s_mn_parseflags = 0; 6855 6856 mutex_exit(SETMUTEX(s->s_setno)); 6857 6858 /* 6859 * Send the message to the slaves to re-parse 6860 * the indicated portions of the mddb. Send the status 6861 * of the 50 mddbs in this set so that slaves know which 6862 * mddbs that the master node thinks are 'good'. 6863 * Otherwise, slave may reparse, but from wrong replica. 6864 */ 6865 mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; 6866 for (i = 0; i < MDDB_NLB; i++) { 6867 mddb_parse_msg->msg_lb_flags[i] = 6868 lbp->lb_locators[i].l_flags; 6869 } 6870 kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); 6871 while (rval != 0) { 6872 rval = mdmn_ksend_message(s->s_setno, 6873 MD_MN_MSG_MDDB_PARSE, 0, 6874 (char *)mddb_parse_msg, 6875 sizeof (mddb_parse_msg), kresult); 6876 if (rval != 0) 6877 cmn_err(CE_WARN, "mddb_setexit: Unable to send " 6878 "mddb update message to other nodes in " 6879 "diskset %s\n", s->s_setname); 6880 } 6881 kmem_free(kresult, sizeof (md_mn_kresult_t)); 6882 6883 /* 6884 * Re-grab mutex to clear sending field and to 6885 * see if another parse message needs to be generated. 
6886 */ 6887 mutex_enter(SETMUTEX(s->s_setno)); 6888 s->s_mn_parseflags_sending = 0; 6889 } 6890 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); 6891 mutex_exit(SETMUTEX(s->s_setno)); 6892 } 6893 6894 static void 6895 mddb_setexit_no_parse( 6896 mddb_set_t *s 6897 ) 6898 { 6899 mutex_exit(SETMUTEX(s->s_setno)); 6900 } 6901 6902 uint_t 6903 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt) 6904 { 6905 uint_t li; 6906 mddb_lb_t *lbp = s->s_lbp; 6907 mddb_locator_t *lp; 6908 ddi_devid_t ret_devid; 6909 uint_t devid_len; 6910 dev_t ddi_dev; 6911 mddb_did_ic_t *did_icp; 6912 mddb_did_blk_t *did_blkp; 6913 char *minor_name; 6914 size_t sz; 6915 int retval; 6916 int err; 6917 md_dev64_t dev64; /* tmp var to make code look better */ 6918 6919 6920 /* Need disk block(s) to hold mddb_did_blk_t */ 6921 *blk_cnt = MDDB_DID_BLOCKS; 6922 6923 if (doit) { 6924 /* 6925 * Alloc mddb_did_blk_t disk block and fill in header area. 6926 * Don't fill in did magic number until end of routine so 6927 * if machine panics in the middle of conversion, the 6928 * device id information will be thrown away at the 6929 * next snarfing of this set. 6930 * Need to set DEVID_STYLE so that mddb_devid_add will 6931 * function properly. 
6932 */ 6933 /* grab the mutex */ 6934 if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) { 6935 return (1); 6936 } 6937 single_thread_start(s); 6938 lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS); 6939 if (lbp->lb_didfirstblk == 0) { 6940 single_thread_end(s); 6941 mddb_setexit(s); 6942 return (1); 6943 } 6944 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 6945 did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t), 6946 KM_SLEEP); 6947 did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES, 6948 KM_SLEEP); 6949 6950 did_blkp->blk_revision = MDDB_REV_DI; 6951 did_icp->did_ic_blkp = did_blkp; 6952 s->s_did_icp = did_icp; 6953 lbp->lb_flags |= MDDB_DEVID_STYLE; 6954 } 6955 6956 /* Fill in information in mddb_did_info_t array */ 6957 for (li = 0; li < lbp->lb_loccnt; li++) { 6958 lp = &lbp->lb_locators[li]; 6959 if (lp->l_flags & MDDB_F_DELETED) 6960 continue; 6961 6962 dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 6963 ddi_dev = md_dev64_to_dev(dev64); 6964 if (ddi_dev == NODEV) { 6965 /* 6966 * No translation available for replica. 6967 * Could fail conversion to device id replica, 6968 * but instead will just continue with next 6969 * replica in list. 6970 */ 6971 continue; 6972 } 6973 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) { 6974 /* 6975 * Just count each devid as at least 1 block. This 6976 * is conservative since several device id's may fit 6977 * into 1 disk block, but it's better to overestimate 6978 * the number of blocks needed than to underestimate. 
6979 */ 6980 devid_len = (int)ddi_devid_sizeof(ret_devid); 6981 *blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1)); 6982 if (doit) { 6983 if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, 6984 &minor_name) == DDI_SUCCESS) { 6985 if (mddb_devid_add(s, li, ret_devid, 6986 minor_name)) { 6987 cmn_err(CE_WARN, 6988 "Not enough space in metadb" 6989 " to add device id for" 6990 " dev: major = %d, " 6991 "minor = %d\n", 6992 getmajor(ddi_dev), 6993 getminor(ddi_dev)); 6994 } 6995 sz = strlen(minor_name) + 1; 6996 kmem_free(minor_name, sz); 6997 } 6998 } 6999 ddi_devid_free(ret_devid); 7000 } 7001 } 7002 7003 if (doit) { 7004 did_blkp->blk_magic = MDDB_MAGIC_DI; 7005 retval = push_lb(s); 7006 (void) upd_med(s, "mddb_lb_did_convert(0)"); 7007 single_thread_end(s); 7008 mddb_setexit(s); 7009 if (retval != 0) 7010 return (1); 7011 } 7012 7013 return (0); 7014 } 7015 7016 static mddb_set_t * 7017 init_set( 7018 mddb_config_t *cp, 7019 int flag, 7020 int *errp 7021 ) 7022 { 7023 mddb_set_t *s; 7024 char *setname = NULL; 7025 set_t setno = MD_LOCAL_SET; 7026 side_t sideno = 0; 7027 struct timeval32 *created = NULL; 7028 7029 if (cp != NULL) { 7030 setname = cp->c_setname; 7031 setno = cp->c_setno; 7032 sideno = cp->c_sideno; 7033 created = &cp->c_timestamp; 7034 } 7035 7036 if (setno >= MD_MAXSETS) 7037 return ((mddb_set_t *)NULL); 7038 7039 if (md_set[setno].s_db) 7040 return (mddb_setenter(setno, flag, errp)); 7041 7042 s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP); 7043 7044 cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL); 7045 cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL); 7046 cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL); 7047 cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL); 7048 cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL); 7049 7050 s->s_setno = setno; 7051 s->s_sideno = sideno; 7052 if (setno == MD_LOCAL_SET) { 7053 (void) strcpy(s->s_ident.serial, hw_serial); 7054 } else { 7055 s->s_ident.createtime = *created; 7056 s->s_setname = (char 
*)kmem_alloc(strlen(setname) + 1, 7057 KM_SLEEP); 7058 (void) strcpy(s->s_setname, setname); 7059 } 7060 7061 /* have a config struct, copy mediator information */ 7062 if (cp != NULL) 7063 s->s_med = cp->c_med; /* structure assignment */ 7064 7065 md_set[setno].s_db = (void *) s; 7066 7067 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64); 7068 7069 return (mddb_setenter(setno, flag, errp)); 7070 } 7071 7072 void 7073 mddb_unload_set( 7074 set_t setno 7075 ) 7076 { 7077 7078 mddb_set_t *s; 7079 mddb_db_t *dbp, *adbp = NULL; 7080 mddb_de_ic_t *dep, *dep2; 7081 mddb_bf_t *bfp; 7082 int i; 7083 md_dev64_t dev; 7084 7085 if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL) 7086 return; 7087 7088 single_thread_start(s); 7089 7090 s->s_opthavequeuinglck = 0; 7091 s->s_optwantqueuinglck = 0; 7092 7093 for (dbp = s->s_dbp; dbp != 0; dbp = adbp) { 7094 for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) { 7095 if (dep->de_rb_userdata != NULL) { 7096 if (dep->de_icreqsize) 7097 kmem_free(dep->de_rb_userdata_ic, 7098 dep->de_icreqsize); 7099 else 7100 kmem_free(dep->de_rb_userdata, 7101 dep->de_reqsize); 7102 } 7103 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 7104 dep2 = dep->de_next; 7105 kmem_free((caddr_t)dep, sizeofde(dep)); 7106 } 7107 adbp = dbp->db_next; 7108 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 7109 } 7110 s->s_dbp = (mddb_db_t *)NULL; 7111 7112 free_rip(&s->s_rip); 7113 7114 for (i = 0; i < mddb_maxcopies; i++) { 7115 if (! s->s_mbiarray) 7116 break; 7117 7118 if (! 
s->s_mbiarray[i]) 7119 continue; 7120 7121 dev = md_expldev(s->s_lbp->lb_locators[i].l_dev); 7122 dev = md_xlate_targ_2_mini(dev); 7123 if (dev != NODEV64) { 7124 mddb_devclose(dev); 7125 free_mbipp(&s->s_mbiarray[i]); 7126 } 7127 } 7128 7129 if (s->s_mbiarray) { 7130 kmem_free((caddr_t)s->s_mbiarray, 7131 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 7132 s->s_mbiarray = (mddb_mb_ic_t **)NULL; 7133 } 7134 7135 if (s->s_lnp) { 7136 kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt)); 7137 s->s_lnp = (mddb_ln_t *)NULL; 7138 } 7139 7140 if (s->s_lbp) { 7141 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 7142 kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt)); 7143 s->s_lbp = (mddb_lb_t *)NULL; 7144 } 7145 7146 if (s->s_freebitmap) { 7147 kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 7148 s->s_freebitmap = NULL; 7149 s->s_freebitmapsize = 0; 7150 } 7151 7152 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 7153 kmem_free((caddr_t)bfp, sizeof (*bfp)); 7154 7155 if (s->s_databuffer_size) { 7156 kmem_free(s->s_databuffer, s->s_databuffer_size); 7157 s->s_databuffer_size = 0; 7158 } 7159 7160 if (s->s_setname != NULL) 7161 kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1); 7162 7163 /* Data tags not supported on MN sets. */ 7164 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) 7165 dtl_freel(&s->s_dtlp); 7166 7167 md_set[setno].s_db = NULL; 7168 ASSERT(s->s_singlelockwanted == 0); 7169 kmem_free(s, sizeof (mddb_set_t)); 7170 7171 /* Take care of things setup in the md_set array */ 7172 if (! 
(md_get_setstatus(setno) & MD_SET_KEEPTAG)) { 7173 if (md_set[setno].s_dtp) { 7174 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 7175 md_set[setno].s_dtp = NULL; 7176 } 7177 } 7178 7179 md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT | 7180 MD_SET_TAGDATA | MD_SET_USETAG | 7181 MD_SET_TOOFEW | MD_SET_STALE | 7182 MD_SET_OWNERSHIP | MD_SET_BADTAG | 7183 MD_SET_CLRTAG | MD_SET_MNSET | 7184 MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | 7185 MD_SET_MN_MIR_STATE_RC | MD_SET_IMPORT | 7186 MD_SET_REPLICATED_IMPORT); 7187 7188 mutex_exit(SETMUTEX(setno)); 7189 } 7190 7191 /* 7192 * returns 0 if name can be put into locator block 7193 * returns 1 if locator block prefixes are all used 7194 * 7195 * Takes splitname (suffix, prefix, sideno) and 7196 * stores it in the locator name structure. 7197 * For traditional diskset, the sideno is the index into the suffixes 7198 * array in the locator name structure. 7199 * For the MN diskset, the sideno is the nodeid which can be any number, 7200 * so the index passed in is the index into the mnsuffixes array 7201 * in the locator structure. This index was computed by the 7202 * routine checklocator which basically checked the locator block 7203 * mnside locator structure. 
7204 */ 7205 static int 7206 splitname2locatorblock( 7207 md_splitname *spn, 7208 mddb_ln_t *lnp, 7209 int li, 7210 side_t sideno, 7211 int index 7212 ) 7213 { 7214 uchar_t i; 7215 md_name_suffix *sn; 7216 md_mnname_suffix_t *mnsn; 7217 mddb_mnln_t *mnlnp; 7218 7219 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7220 if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len) 7221 continue; 7222 if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data, 7223 SPN_PREFIX(spn).pre_len) == 0) 7224 break; 7225 } 7226 if (i == MDDB_PREFIXCNT) { 7227 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7228 if (lnp->ln_prefixes[i].pre_len == 0) 7229 break; 7230 } 7231 if (i == MDDB_PREFIXCNT) 7232 return (1); 7233 bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data, 7234 SPN_PREFIX(spn).pre_len); 7235 lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len; 7236 } 7237 7238 if (lnp->ln_revision == MDDB_REV_MNLN) { 7239 /* If a MN diskset, use index */ 7240 mnlnp = (mddb_mnln_t *)lnp; 7241 mnsn = &mnlnp->ln_mnsuffixes[index][li]; 7242 mnsn->mn_ln_sideno = sideno; 7243 mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len; 7244 mnsn->mn_ln_suffix.suf_prefix = i; 7245 bcopy(SPN_SUFFIX(spn).suf_data, 7246 mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len); 7247 } else { 7248 sn = &lnp->ln_suffixes[sideno][li]; 7249 sn->suf_len = SPN_SUFFIX(spn).suf_len; 7250 sn->suf_prefix = i; 7251 bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data, 7252 SPN_SUFFIX(spn).suf_len); 7253 } 7254 return (0); 7255 } 7256 7257 /* 7258 * Find the locator name for the given sideno and convert the locator name 7259 * information into a splitname structure. 
7260 */ 7261 void 7262 mddb_locatorblock2splitname( 7263 mddb_ln_t *lnp, 7264 int li, 7265 side_t sideno, 7266 md_splitname *spn 7267 ) 7268 { 7269 int iprefix; 7270 md_name_suffix *sn; 7271 md_mnname_suffix_t *mnsn; 7272 int i; 7273 mddb_mnln_t *mnlnp; 7274 7275 if (lnp->ln_revision == MDDB_REV_MNLN) { 7276 mnlnp = (mddb_mnln_t *)lnp; 7277 for (i = 0; i < MD_MNMAXSIDES; i++) { 7278 mnsn = &mnlnp->ln_mnsuffixes[i][li]; 7279 if (mnsn->mn_ln_sideno == sideno) 7280 break; 7281 } 7282 if (i == MD_MNMAXSIDES) 7283 return; 7284 7285 SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len; 7286 bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data, 7287 SPN_SUFFIX(spn).suf_len); 7288 iprefix = mnsn->mn_ln_suffix.suf_prefix; 7289 } else { 7290 sn = &lnp->ln_suffixes[sideno][li]; 7291 SPN_SUFFIX(spn).suf_len = sn->suf_len; 7292 bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data, 7293 SPN_SUFFIX(spn).suf_len); 7294 iprefix = sn->suf_prefix; 7295 } 7296 SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len; 7297 bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data, 7298 SPN_PREFIX(spn).pre_len); 7299 } 7300 7301 static int 7302 getdeldev( 7303 mddb_config_t *cp, 7304 int command, 7305 md_error_t *ep 7306 ) 7307 { 7308 mddb_set_t *s; 7309 mddb_lb_t *lbp; 7310 mddb_locator_t *locators; 7311 uint_t loccnt; 7312 mddb_mb_ic_t *mbip; 7313 mddb_block_t blk; 7314 int err = 0; 7315 int i, j; 7316 int li; 7317 uint_t commitcnt; 7318 set_t setno = cp->c_setno; 7319 uint_t set_status; 7320 md_dev64_t dev; 7321 int flags = MDDB_MUSTEXIST; 7322 7323 cp->c_dbmax = MDDB_NLB; 7324 7325 /* 7326 * Data checking 7327 */ 7328 if (setno >= md_nsets || cp->c_id < 0 || 7329 cp->c_id > cp->c_dbmax) { 7330 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 7331 } 7332 7333 if (cp->c_flags & MDDB_C_STALE) 7334 flags |= MDDB_MN_STALE; 7335 7336 if ((s = mddb_setenter(setno, flags, &err)) == NULL) 7337 return (mddbstatus2error(ep, err, NODEV32, setno)); 7338 7339 cp->c_flags = 0; 7340 
7341 lbp = s->s_lbp; 7342 loccnt = lbp->lb_loccnt; 7343 locators = lbp->lb_locators; 7344 7345 /* shorthand */ 7346 set_status = md_get_setstatus(setno); 7347 7348 if (set_status & MD_SET_STALE) 7349 cp->c_flags |= MDDB_C_STALE; 7350 7351 if (set_status & MD_SET_TOOFEW) 7352 cp->c_flags |= MDDB_C_TOOFEW; 7353 7354 cp->c_sideno = s->s_sideno; 7355 7356 cp->c_dbcnt = 0; 7357 /* 7358 * go through and count active entries 7359 */ 7360 for (i = 0; i < loccnt; i++) { 7361 if (locators[i].l_flags & MDDB_F_DELETED) 7362 continue; 7363 cp->c_dbcnt++; 7364 } 7365 7366 /* 7367 * add the ability to accept a locator block index 7368 * which is not relative to previously deleted replicas. This 7369 * is for support of MD_DEBUG=STAT in metastat since it asks for 7370 * replica information specifically for each of the mirror resync 7371 * records. MDDB_CONFIG_SUBCMD uses one of the pad spares in 7372 * the mddb_config_t type. 7373 */ 7374 if (cp->c_subcmd == MDDB_CONFIG_ABS) { 7375 if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) { 7376 mddb_setexit(s); 7377 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7378 setno)); 7379 } 7380 li = cp->c_id; 7381 } else { 7382 if (cp->c_id >= cp->c_dbcnt) { 7383 mddb_setexit(s); 7384 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7385 setno)); 7386 } 7387 7388 /* CSTYLED */ 7389 for (li = 0, j = 0; /* void */; li++) { 7390 if (locators[li].l_flags & MDDB_F_DELETED) 7391 continue; 7392 j++; 7393 if (j > cp->c_id) 7394 break; 7395 } 7396 } 7397 7398 if (command == MDDB_ENDDEV) { 7399 daddr_t ib = 0, jb; 7400 7401 blk = 0; 7402 if ((s != NULL) && s->s_mbiarray[li]) { 7403 mbip = s->s_mbiarray[li]; 7404 while ((jb = getphysblk(blk++, mbip)) > 0) { 7405 if (jb > ib) 7406 ib = jb; 7407 } 7408 cp->c_dbend = (int)ib; 7409 } else { 7410 cp->c_dbend = 0; 7411 } 7412 } 7413 7414 locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp); 7415 mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname); 7416 7417 if (command != 
MDDB_DELDEV) { 7418 mddb_setexit(s); 7419 return (0); 7420 } 7421 7422 /* Currently don't allow addition/deletion of sides during upgrade */ 7423 if (MD_UPGRADE) { 7424 cmn_err(CE_WARN, 7425 "Deletion of replica not allowed during upgrade.\n"); 7426 mddb_setexit(s); 7427 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 7428 } 7429 7430 /* 7431 * If here, replica delete in progress. 7432 */ 7433 single_thread_start(s); 7434 7435 if ((! (locators[li].l_flags & MDDB_F_EMASTER)) && 7436 (locators[li].l_flags & MDDB_F_ACTIVE)) { 7437 commitcnt = lbp->lb_commitcnt; 7438 lbp->lb_commitcnt = 0; 7439 setidentifier(s, &lbp->lb_ident); 7440 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 7441 /* 7442 * Don't need to write out device id area, since locator 7443 * block on this replica is being deleted by setting the 7444 * commitcnt to 0. 7445 */ 7446 (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 7447 MDDB_WR_ONLY_MASTER); 7448 lbp->lb_commitcnt = commitcnt; 7449 } 7450 7451 if (s->s_mbiarray[li]) 7452 free_mbipp(&s->s_mbiarray[li]); 7453 7454 if (! 
(locators[li].l_flags & MDDB_F_EMASTER)) { 7455 dev = md_expldev(locators[li].l_dev); 7456 dev = md_xlate_targ_2_mini(dev); 7457 if (dev != NODEV64) 7458 mddb_devclose(dev); 7459 } 7460 7461 s->s_mbiarray[li] = 0; 7462 lbp->lb_locators[li].l_flags = MDDB_F_DELETED; 7463 7464 /* Only support data tags for traditional and local sets */ 7465 if ((md_get_setstatus(setno) & MD_SET_STALE) && 7466 (!(lbp->lb_flags & MDDB_MNSET)) && 7467 setno != MD_LOCAL_SET) 7468 if (set_dtag(s, ep)) 7469 mdclrerror(ep); 7470 7471 /* Write data tags to all accessible devices */ 7472 /* Only support data tags for traditional and local sets */ 7473 if (!(lbp->lb_flags & MDDB_MNSET)) { 7474 (void) dt_write(s); 7475 } 7476 7477 /* Delete device id of deleted replica */ 7478 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7479 (void) mddb_devid_delete(s, li); 7480 } 7481 /* write new locator to all devices */ 7482 err = writelocall(s); 7483 7484 (void) upd_med(s, "getdeldev(0)"); 7485 7486 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno, 7487 md_expldev(locators[li].l_dev)); 7488 7489 computefreeblks(s); /* recompute always it may be larger */ 7490 cp->c_dbcnt--; 7491 err |= fixoptrecords(s); 7492 if (err) { 7493 if (writeretry(s)) { 7494 single_thread_end(s); 7495 mddb_setexit(s); 7496 return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno)); 7497 } 7498 } 7499 7500 single_thread_end(s); 7501 mddb_setexit(s); 7502 return (0); 7503 } 7504 7505 static int 7506 getdriver( 7507 mddb_cfg_loc_t *clp 7508 ) 7509 { 7510 major_t majordev; 7511 7512 /* 7513 * Data checking 7514 */ 7515 if (clp->l_dev <= 0) 7516 return (EINVAL); 7517 7518 majordev = getmajor(expldev(clp->l_dev)); 7519 7520 if (ddi_major_to_name(majordev) == (char *)NULL) 7521 return (EINVAL); 7522 7523 if (MD_UPGRADE) 7524 (void) strcpy(clp->l_driver, md_targ_major_to_name(majordev)); 7525 else 7526 (void) strcpy(clp->l_driver, ddi_major_to_name(majordev)); 7527 return (0); 7528 } 7529 7530 /* 7531 * update_valid_replica - 
updates the locator block namespace (prefix 7532 * and/or suffix) with new pathname and devname. 7533 * RETURN 7534 * 1 Error 7535 * 0 Success 7536 */ 7537 static int 7538 update_valid_replica( 7539 side_t side, 7540 mddb_locator_t *lp, 7541 mddb_set_t *s, 7542 int li, 7543 char *devname, 7544 char *pathname, 7545 md_dev64_t devt 7546 ) 7547 { 7548 uchar_t pre_len, suf_len; 7549 md_name_suffix *sn; 7550 mddb_ln_t *lnp; 7551 uchar_t pre_index; 7552 uchar_t i; 7553 7554 if (md_expldev(lp->l_dev) != devt) { 7555 return (0); 7556 } 7557 7558 if (pathname[strlen(pathname) - 1] == '/') 7559 pathname[strlen(pathname) - 1] = '\0'; 7560 7561 pre_len = (uchar_t)strlen(pathname); 7562 suf_len = (uchar_t)strlen(devname); 7563 7564 if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX)) 7565 return (1); 7566 7567 lnp = s->s_lnp; 7568 7569 /* 7570 * Future note: Need to do something here for the MN diskset case 7571 * when device ids are supported in disksets. 7572 * Can't add until merging devids_in_diskset code into code base 7573 * Currently only called with side of 0. 7574 */ 7575 7576 sn = &lnp->ln_suffixes[side][li]; 7577 7578 /* 7579 * Check if prefix (Ex: /dev/dsk) needs to be changed. 7580 * If new prefix is the same as the previous prefix - no change. 7581 * 7582 * If new prefix is not the same, check if new prefix 7583 * matches an existing one. If so, use that one. 7584 * 7585 * If new prefix doesn't exist, add a new prefix. If not enough 7586 * space, return failure. 7587 */ 7588 pre_index = sn->suf_prefix; 7589 /* Check if new prefix is the same as the old prefix. */ 7590 if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) || 7591 (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname, 7592 pre_len) != 0)) { 7593 /* Check if new prefix is an already known prefix. 
*/ 7594 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7595 if (lnp->ln_prefixes[i].pre_len != pre_len) { 7596 continue; 7597 } 7598 if (bcmp(lnp->ln_prefixes[i].pre_data, pathname, 7599 pre_len) == 0) { 7600 break; 7601 } 7602 } 7603 /* If no match found for new prefix - add the new prefix */ 7604 if (i == MDDB_PREFIXCNT) { 7605 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7606 if (lnp->ln_prefixes[i].pre_len == 0) 7607 break; 7608 } 7609 /* No space to add new prefix - return failure */ 7610 if (i == MDDB_PREFIXCNT) { 7611 return (1); 7612 } 7613 bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len); 7614 lnp->ln_prefixes[i].pre_len = pre_len; 7615 } 7616 sn->suf_prefix = i; 7617 } 7618 7619 /* Now, update the suffix (Ex: c0t0d0s0) if needed */ 7620 if ((sn->suf_len != suf_len) || 7621 (bcmp(sn->suf_data, devname, suf_len) != 0)) { 7622 bcopy(devname, sn->suf_data, suf_len); 7623 sn->suf_len = suf_len; 7624 } 7625 return (0); 7626 } 7627 7628 7629 /* 7630 * md_update_locator_namespace - If in devid style and active and the devid's 7631 * exist and are valid update the locator namespace pathname 7632 * and devname. 
7633 * RETURN 7634 * 1 Error 7635 * 0 Success 7636 */ 7637 int 7638 md_update_locator_namespace( 7639 set_t setno, /* which set to get name from */ 7640 side_t side, 7641 char *dname, 7642 char *pname, 7643 md_dev64_t devt 7644 ) 7645 { 7646 mddb_set_t *s; 7647 mddb_lb_t *lbp; 7648 int li; 7649 uint_t flg; 7650 int err = 0; 7651 mddb_ln_t *lnp; 7652 7653 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 7654 return (1); 7655 single_thread_start(s); 7656 lbp = s->s_lbp; 7657 /* must be DEVID_STYLE */ 7658 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7659 for (li = 0; li < lbp->lb_loccnt; li++) { 7660 mddb_locator_t *lp = &lbp->lb_locators[li]; 7661 7662 if (lp->l_flags & MDDB_F_DELETED) { 7663 continue; 7664 } 7665 7666 /* replica also must be active */ 7667 if (lp->l_flags & MDDB_F_ACTIVE) { 7668 flg = s->s_did_icp->did_ic_blkp-> 7669 blk_info[li].info_flags; 7670 /* only update if did exists and is valid */ 7671 if ((flg & MDDB_DID_EXISTS) && 7672 (flg & MDDB_DID_VALID)) { 7673 if (update_valid_replica(side, lp, s, 7674 li, dname, pname, devt)) { 7675 err = 1; 7676 goto out; 7677 } 7678 } 7679 } 7680 } 7681 } 7682 lnp = s->s_lnp; 7683 uniqtime32(&lnp->ln_timestamp); 7684 if (lbp->lb_flags & MDDB_MNSET) 7685 lnp->ln_revision = MDDB_REV_MNLN; 7686 else 7687 lnp->ln_revision = MDDB_REV_LN; 7688 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 7689 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 7690 lbp->lb_lnblkcnt, 0); 7691 /* 7692 * If a MN diskset and this is the master, set the PARSE_LOCNM 7693 * flag in the mddb_set structure to show that the locator 7694 * names have changed. 
7695 */ 7696 7697 if ((lbp->lb_flags & MDDB_MNSET) && 7698 (md_set[s->s_setno].s_am_i_master)) { 7699 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 7700 } 7701 out: 7702 single_thread_end(s); 7703 mddb_setexit(s); 7704 if (err) 7705 return (1); 7706 return (0); 7707 } 7708 7709 /* 7710 * update_locatorblock - for active entries in the locator block, check 7711 * the devt to see if it matches the given devt. If so, and 7712 * there is an associated device id which is not the same 7713 * as the passed in devid, delete old devid and add a new one. 7714 * 7715 * During import of replicated disksets, old_didptr contains 7716 * the original disk's device id. Use this device id in 7717 * addition to the devt to determine if an entry is a match 7718 * and should be updated with the new device id of the 7719 * replicated disk. Specifically, this is the case being handled: 7720 * 7721 * Original_disk Replicated_disk Disk_Available_During_Import 7722 * c1t1d0 c1t3d0 no - so old name c1t1d0 shown 7723 * c1t2d0 c1t1d0 yes - name is c1t1d0 7724 * c1t3d0 c1t2d0 yes - name is c1t2d0 7725 * 7726 * Can't just match on devt since devt for the first and third 7727 * disks will be the same, but the original disk's device id 7728 * is known and can be used to distinguish which disk's 7729 * replicated device id should be updated. 
7730 * RETURN 7731 * MDDB_E_NODEVID 7732 * MDDB_E_NOLOCBLK 7733 * 1 Error 7734 * 0 Success 7735 */ 7736 static int 7737 update_locatorblock( 7738 mddb_set_t *s, 7739 md_dev64_t dev, 7740 ddi_devid_t didptr, 7741 ddi_devid_t old_didptr 7742 ) 7743 { 7744 mddb_lb_t *lbp = NULL; 7745 mddb_locator_t *lp; 7746 int li; 7747 uint_t flg; 7748 ddi_devid_t devid_ptr; 7749 int retval = 0; 7750 char *minor_name; 7751 int repl_import_flag; 7752 7753 /* Set replicated flag if this is a replicated import */ 7754 repl_import_flag = md_get_setstatus(s->s_setno) & 7755 MD_SET_REPLICATED_IMPORT; 7756 7757 lbp = s->s_lbp; 7758 /* find replicas that haven't been deleted */ 7759 for (li = 0; li < lbp->lb_loccnt; li++) { 7760 lp = &lbp->lb_locators[li]; 7761 7762 if ((lp->l_flags & MDDB_F_DELETED)) { 7763 continue; 7764 } 7765 /* 7766 * check to see if locator devt matches given dev 7767 * and if there is a device ID associated with it 7768 */ 7769 flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags; 7770 if ((md_expldev(lp->l_dev) == dev) && 7771 (flg & MDDB_DID_EXISTS)) { 7772 if (flg & MDDB_DID_VALID) { 7773 continue; /* cont to nxt active entry */ 7774 } 7775 devid_ptr = s->s_did_icp->did_ic_devid[li]; 7776 if (devid_ptr == NULL) { 7777 return (MDDB_E_NODEVID); 7778 } 7779 7780 /* 7781 * During a replicated import the old_didptr 7782 * must match the current devid before the 7783 * devid can be updated. 
7784 */ 7785 if (repl_import_flag) { 7786 if (ddi_devid_compare(devid_ptr, 7787 old_didptr) != 0) 7788 continue; 7789 } 7790 7791 if (ddi_devid_compare(devid_ptr, didptr) != 0) { 7792 /* 7793 * devid's not equal so 7794 * delete and add 7795 */ 7796 if (ddi_lyr_get_minor_name( 7797 md_dev64_to_dev(dev), 7798 S_IFBLK, &minor_name) == DDI_SUCCESS) { 7799 (void) mddb_devid_delete(s, li); 7800 (void) mddb_devid_add(s, li, didptr, 7801 minor_name); 7802 kmem_free(minor_name, 7803 strlen(minor_name)+1); 7804 break; 7805 } else { 7806 retval = 1; 7807 goto err_out; 7808 } 7809 } 7810 } 7811 } /* end for */ 7812 retval = push_lb(s); 7813 (void) upd_med(s, "update_locatorblock(0)"); 7814 err_out: 7815 return (retval); 7816 } 7817 7818 static int 7819 update_mb_devid( 7820 mddb_set_t *s, 7821 mddb_ri_t *rip, 7822 ddi_devid_t devidptr 7823 ) 7824 { 7825 mddb_mb_ic_t *mbip; 7826 mddb_mb_t *mb = NULL; 7827 daddr_t blkno; 7828 md_dev64_t device; 7829 uint_t sz; 7830 int mb2free = 0; 7831 int err = 0; 7832 7833 7834 /* 7835 * There is case where a disk may not have mddb, 7836 * and only has dummy mddb which contains 7837 * a valid devid we like to update and in this 7838 * case, the rip_lbp will be NULL but we still 7839 * like to update the devid embedded in the 7840 * dummy mb block. 
7841 * 7842 */ 7843 if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) { 7844 mbip = rip->ri_mbip; 7845 mb = &mbip->mbi_mddb_mb; 7846 } else { 7847 /* 7848 * Done if it is non-replicated set 7849 */ 7850 if (devidptr != (ddi_devid_t)NULL) { 7851 mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, 7852 KM_SLEEP); 7853 mb->mb_magic = MDDB_MAGIC_DU; 7854 mb->mb_revision = MDDB_REV_MB; 7855 mb2free = 1; 7856 } else { 7857 goto out; 7858 } 7859 } 7860 7861 blkno = rip->ri_blkno; 7862 device = rip->ri_dev; 7863 /* 7864 * Replace the mb_devid with the new/valid one 7865 */ 7866 if (devidptr != (ddi_devid_t)NULL) { 7867 /* 7868 * Zero out what we have previously 7869 */ 7870 if (mb->mb_devid_len) 7871 bzero(mb->mb_devid, mb->mb_devid_len); 7872 sz = ddi_devid_sizeof(devidptr); 7873 bcopy((char *)devidptr, (char *)mb->mb_devid, sz); 7874 mb->mb_devid_len = sz; 7875 } 7876 7877 mb->mb_setno = s->s_setno; 7878 uniqtime32(&mb->mb_timestamp); 7879 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 7880 /* 7881 * putblks will 7882 * 7883 * - drop the s_dbmx lock 7884 * - biowait 7885 * - regain the s_dbmx lock 7886 * 7887 * Need to update this if we wants to handle 7888 * mb_next != NULL which it is unlikely will happen 7889 */ 7890 err = putblks(s, (caddr_t)mb, blkno, 1, device, 0); 7891 7892 if (mb2free) { 7893 kmem_free(mb, MDDB_BSIZE); 7894 } 7895 out: 7896 return (err); 7897 } 7898 7899 static int 7900 setdid( 7901 mddb_config_t *cp 7902 ) 7903 { 7904 ddi_devid_t devidp; 7905 dev_t ddi_dev; 7906 mddb_set_t *s; 7907 int err = 0; 7908 mddb_ri_t *rip; 7909 7910 /* 7911 * Data integrity check 7912 */ 7913 if (cp->c_setno >= md_nsets || cp->c_devt <= 0) 7914 return (EINVAL); 7915 7916 if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE)) 7917 return (0); 7918 7919 ddi_dev = md_dev64_to_dev(cp->c_devt); 7920 if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) { 7921 return (-1); 7922 } 7923 if (devidp == NULL) { 7924 return (-1); 7925 } 7926 7927 if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, 
&err)) == NULL) 7928 return (-1); 7929 single_thread_start(s); 7930 7931 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 7932 if (rip->ri_lbp == (mddb_lb_t *)NULL) 7933 continue; 7934 /* 7935 * We only update what is asked 7936 */ 7937 if (rip->ri_dev == cp->c_devt) { 7938 if (update_mb_devid(s, rip, devidp) != 0) { 7939 err = -1; 7940 goto out; 7941 } 7942 } 7943 } 7944 7945 if (update_locatorblock(s, cp->c_devt, devidp, NULL)) { 7946 err = -1; 7947 goto out; 7948 } 7949 7950 out: 7951 single_thread_end(s); 7952 mddb_setexit(s); 7953 ddi_devid_free(devidp); 7954 return (err); 7955 } 7956 7957 static int 7958 delnewside( 7959 mddb_config_t *cp, 7960 int command, 7961 md_error_t *ep 7962 ) 7963 { 7964 mddb_set_t *s; 7965 int li; 7966 mddb_lb_t *lbp; /* pointer to locator block */ 7967 mddb_ln_t *lnp; /* pointer to locator names */ 7968 mddb_mnln_t *mnlnp; /* pointer to locator names */ 7969 mddb_locator_t *lp; 7970 mddb_sidelocator_t *slp; 7971 mddb_cfg_loc_t *clp; 7972 int err = 0; 7973 set_t setno = cp->c_setno; 7974 ddi_devid_t devid; 7975 ddi_devid_t ret_devid = NULL; 7976 char *minor_name; 7977 uint_t use_devid = 0; 7978 dev_t ddi_dev; 7979 md_mnname_suffix_t *mnsn; 7980 mddb_mnlb_t *mnlbp; 7981 mddb_mnsidelocator_t *mnslp; 7982 7983 /* Currently don't allow addition/deletion of sides during upgrade */ 7984 if (MD_UPGRADE) { 7985 cmn_err(CE_WARN, 7986 "Addition and deletion of sides not allowed" 7987 " during upgrade. 
\n"); 7988 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 7989 } 7990 7991 /* 7992 * Data integrity check 7993 */ 7994 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 7995 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 7996 7997 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 7998 return (mddbstatus2error(ep, err, NODEV32, setno)); 7999 8000 single_thread_start(s); 8001 clp = &cp->c_locator; 8002 8003 lbp = s->s_lbp; 8004 8005 if (lbp->lb_setno != setno) { 8006 single_thread_end(s); 8007 mddb_setexit(s); 8008 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 8009 } 8010 8011 /* 8012 * Find this device/blkno pair 8013 */ 8014 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 8015 ddi_dev = md_dev64_to_dev(clp->l_dev); 8016 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 8017 (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) 8018 == DDI_SUCCESS)) { 8019 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 8020 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 8021 use_devid = 1; 8022 (void) strcpy(clp->l_minor_name, minor_name); 8023 } 8024 kmem_free(minor_name, strlen(minor_name)+1); 8025 } 8026 if (use_devid != 1 && ret_devid != NULL) 8027 ddi_devid_free(ret_devid); 8028 } 8029 for (li = 0; li < lbp->lb_loccnt; li++) { 8030 lp = &lbp->lb_locators[li]; 8031 if (lp->l_flags & MDDB_F_DELETED) 8032 continue; 8033 if (use_devid) { 8034 if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0) 8035 continue; 8036 if ((ddi_devid_compare(devid, 8037 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 8038 (strcmp(clp->l_minor_name, minor_name) == 0) && 8039 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 8040 break; 8041 } 8042 } else { 8043 if (lp->l_dev == clp->l_dev && 8044 (daddr_t)lp->l_blkno == clp->l_blkno) { 8045 break; 8046 } 8047 } 8048 } 8049 8050 if (li == lbp->lb_loccnt) { 8051 if (use_devid) 8052 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8053 single_thread_end(s); 8054 mddb_setexit(s); 8055 return (mdmddberror(ep, 
MDE_DB_INVALID, NODEV32, setno)); 8056 } 8057 8058 lnp = s->s_lnp; 8059 if (command == MDDB_NEWSIDE) { 8060 int index = 0; 8061 /* 8062 * If a MN diskset, need to find the index where the new 8063 * locator information is to be stored in the mnsidelocator 8064 * field of the locator block so that the locator name can 8065 * be stored at the same array index in the mnsuffixes 8066 * field of the locator names structure. 8067 */ 8068 if (lbp->lb_flags & MDDB_MNSET) { 8069 if ((index = checklocator(lbp, li, 8070 cp->c_sideno)) == -1) { 8071 if (use_devid) { 8072 ddi_devid_free((ddi_devid_t) 8073 (uintptr_t)clp->l_devid); 8074 } 8075 single_thread_end(s); 8076 mddb_setexit(s); 8077 return (mdmddberror(ep, MDE_DB_TOOSMALL, 8078 NODEV32, setno)); 8079 } 8080 } 8081 8082 /* 8083 * Store the locator name before the sidelocator information 8084 * in case a panic occurs between these 2 steps. Must have 8085 * the locator name information in order to print reasonable 8086 * error information. 8087 */ 8088 if (splitname2locatorblock(&cp->c_devname, lnp, li, 8089 cp->c_sideno, index)) { 8090 if (use_devid) 8091 ddi_devid_free( 8092 (ddi_devid_t)(uintptr_t)clp->l_devid); 8093 single_thread_end(s); 8094 mddb_setexit(s); 8095 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8096 setno)); 8097 } 8098 8099 if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) { 8100 if (use_devid) 8101 ddi_devid_free( 8102 (ddi_devid_t)(uintptr_t)clp->l_devid); 8103 single_thread_end(s); 8104 mddb_setexit(s); 8105 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8106 setno)); 8107 } 8108 } 8109 8110 if (use_devid) 8111 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8112 8113 if (command == MDDB_DELSIDE) { 8114 int i; 8115 for (i = 0; i < lbp->lb_loccnt; i++) { 8116 if (lbp->lb_flags & MDDB_MNSET) { 8117 int j; 8118 mnlbp = (mddb_mnlb_t *)lbp; 8119 for (j = 0; j < MD_MNMAXSIDES; j++) { 8120 mnslp = &mnlbp->lb_mnsidelocators[j][i]; 8121 if (mnslp->mnl_sideno == cp->c_sideno) 8122 break; 8123 
} 8124 if (j < MD_MNMAXSIDES) { 8125 mnslp->mnl_mnum = NODEV32; 8126 mnslp->mnl_sideno = 0; 8127 mnlnp = (mddb_mnln_t *)lnp; 8128 mnsn = &(mnlnp->ln_mnsuffixes[j][i]); 8129 bzero((caddr_t)mnsn, 8130 sizeof (md_mnname_suffix_t)); 8131 } 8132 } else { 8133 slp = &lbp->lb_sidelocators[cp->c_sideno][i]; 8134 bzero((caddr_t)&lnp->ln_suffixes 8135 [cp->c_sideno][i], sizeof (md_name_suffix)); 8136 slp->l_mnum = NODEV32; 8137 } 8138 } 8139 } 8140 8141 /* write new locator names to all devices */ 8142 uniqtime32(&lnp->ln_timestamp); 8143 if (lbp->lb_flags & MDDB_MNSET) 8144 lnp->ln_revision = MDDB_REV_MNLN; 8145 else 8146 lnp->ln_revision = MDDB_REV_LN; 8147 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8148 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8149 lbp->lb_lnblkcnt, 0); 8150 /* 8151 * If a MN diskset and this is the master, set the PARSE_LOCNM 8152 * flag in the mddb_set structure to show that the locator 8153 * names have changed. 8154 */ 8155 8156 if ((lbp->lb_flags & MDDB_MNSET) && 8157 (md_set[s->s_setno].s_am_i_master)) { 8158 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8159 } 8160 if (err) { 8161 if (writeretry(s)) { 8162 single_thread_end(s); 8163 mddb_setexit(s); 8164 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8165 } 8166 } 8167 8168 uniqtime32(&lbp->lb_timestamp); 8169 /* write new locator to all devices */ 8170 err = writelocall(s); 8171 8172 (void) upd_med(s, "delnewside(0)"); 8173 8174 computefreeblks(s); /* recompute always it may be larger */ 8175 if (err) { 8176 if (writeretry(s)) { 8177 single_thread_end(s); 8178 mddb_setexit(s); 8179 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8180 } 8181 } 8182 8183 single_thread_end(s); 8184 mddb_setexit(s); 8185 8186 return (0); 8187 } 8188 8189 static int 8190 newdev( 8191 mddb_config_t *cp, 8192 int command, 8193 md_error_t *ep 8194 ) 8195 { 8196 mddb_set_t *s; 8197 mddb_mb_ic_t *mbip, *mbip1; 8198 int i, j; 8199 int li; 8200 mddb_lb_t *lbp; /* pointer to locator 
block */ 8201 mddb_ln_t *lnp; /* pointer to locator names */ 8202 mddb_locator_t *lp; 8203 mddb_cfg_loc_t *clp; 8204 int err = 0; 8205 set_t setno = cp->c_setno; 8206 ddi_devid_t devid2; 8207 ddi_devid_t ret_devid = NULL; 8208 char *minor_name; 8209 uint_t use_devid = 0; 8210 dev_t ddi_dev; 8211 int old_flags; 8212 int flags; 8213 int mn_set = 0; 8214 int index; 8215 8216 8217 /* Currently don't allow addition of new replica during upgrade */ 8218 if (MD_UPGRADE) { 8219 cmn_err(CE_WARN, 8220 "Addition of new replica not allowed during upgrade.\n"); 8221 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8222 } 8223 8224 /* 8225 * Data integrity check 8226 */ 8227 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 8228 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8229 8230 /* Determine the flag settings for multinode sets */ 8231 flags = MDDB_NOOLDOK; 8232 if (cp->c_multi_node) 8233 flags |= MDDB_MULTINODE; 8234 8235 if ((s = mddb_setenter(setno, flags, &err)) == NULL) { 8236 if (err != MDDB_E_NOTOWNER) 8237 return (mddbstatus2error(ep, err, NODEV32, setno)); 8238 s = init_set(cp, flags, &err); 8239 if (s == NULL) 8240 return (mddbstatus2error(ep, err, NODEV32, setno)); 8241 } 8242 8243 single_thread_start(s); 8244 8245 /* shorthand */ 8246 clp = &cp->c_locator; 8247 8248 /* shorthand */ 8249 lbp = s->s_lbp; 8250 8251 if (lbp->lb_setno != setno) { 8252 single_thread_end(s); 8253 mddb_setexit(s); 8254 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 8255 } 8256 8257 /* 8258 * See if this device/blkno pair is already a replica 8259 */ 8260 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 8261 ddi_dev = expldev(clp->l_dev); 8262 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 8263 (ddi_lyr_get_minor_name(ddi_dev, 8264 S_IFBLK, &minor_name) == DDI_SUCCESS)) { 8265 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 8266 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 8267 use_devid = 1; 8268 (void) strcpy(clp->l_minor_name, minor_name); 8269 } 
8270 kmem_free(minor_name, strlen(minor_name)+1); 8271 } 8272 if (use_devid != 1 && ret_devid != NULL) 8273 ddi_devid_free(ret_devid); 8274 } 8275 8276 for (i = 0; i < lbp->lb_loccnt; i++) { 8277 lp = &lbp->lb_locators[i]; 8278 if (lp->l_flags & MDDB_F_DELETED) 8279 continue; 8280 if (use_devid) { 8281 if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0) 8282 continue; 8283 if ((ddi_devid_compare(devid2, 8284 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 8285 (strcmp(clp->l_minor_name, minor_name) == 0) && 8286 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 8287 if (command == MDDB_NEWDEV) { 8288 ddi_devid_free((ddi_devid_t)(uintptr_t) 8289 clp->l_devid); 8290 single_thread_end(s); 8291 mddb_setexit(s); 8292 return (mdmddberror(ep, 8293 MDE_DB_EXISTS, NODEV32, setno)); 8294 } 8295 } 8296 } else { 8297 if (lp->l_dev == clp->l_dev && 8298 (daddr_t)lp->l_blkno == clp->l_blkno) { 8299 if (command == MDDB_NEWDEV) { 8300 single_thread_end(s); 8301 mddb_setexit(s); 8302 return (mdmddberror(ep, 8303 MDE_DB_EXISTS, NODEV32, setno)); 8304 } 8305 } 8306 } 8307 } 8308 8309 /* 8310 * Really is a new replica, go get the master blocks 8311 */ 8312 mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno, 8313 (uint_t *)0, &mn_set); 8314 if (! mbip) { 8315 if (use_devid) 8316 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8317 single_thread_end(s); 8318 mddb_setexit(s); 8319 return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno)); 8320 } 8321 8322 /* 8323 * Compute free blocks in replica. 
8324 */ 8325 computefreeblks(s); 8326 8327 /* 8328 * Check if this is large enough 8329 */ 8330 for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next) 8331 i += mbip1->mbi_mddb_mb.mb_blkcnt; 8332 for (j = i; j < s->s_totalblkcnt; j++) { 8333 if (blkcheck(s, j)) { 8334 while (mbip) { 8335 mbip1 = mbip->mbi_next; 8336 kmem_free((caddr_t)mbip, MDDB_IC_BSIZE); 8337 mbip = mbip1; 8338 } 8339 if (use_devid) 8340 ddi_devid_free( 8341 (ddi_devid_t)(uintptr_t)clp->l_devid); 8342 mddb_devclose(md_expldev(clp->l_dev)); 8343 single_thread_end(s); 8344 mddb_setexit(s); 8345 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8346 setno)); 8347 } 8348 } 8349 8350 /* Look for a deleted slot */ 8351 for (li = 0; li < lbp->lb_loccnt; li++) { 8352 lp = &lbp->lb_locators[li]; 8353 if (lp->l_flags & MDDB_F_DELETED) 8354 break; 8355 } 8356 8357 /* If no deleted slots, add a new one */ 8358 if (li == lbp->lb_loccnt) { 8359 /* Already have the max replicas, bail */ 8360 if (lbp->lb_loccnt == MDDB_NLB) { 8361 if (use_devid) 8362 ddi_devid_free((ddi_devid_t)(uintptr_t) 8363 clp->l_devid); 8364 mddb_devclose(md_expldev(clp->l_dev)); 8365 single_thread_end(s); 8366 mddb_setexit(s); 8367 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 8368 setno)); 8369 } 8370 lbp->lb_loccnt++; 8371 lp = &lbp->lb_locators[li]; 8372 } 8373 8374 /* Initialize the new or deleted slot */ 8375 old_flags = lp->l_flags; 8376 lp->l_dev = clp->l_dev; 8377 lp->l_blkno = (daddr32_t)clp->l_blkno; 8378 lp->l_flags = clp->l_flags; 8379 8380 /* shorthand */ 8381 lnp = s->s_lnp; 8382 8383 index = 0; 8384 if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) { 8385 /* 8386 * If a MN diskset, need to find the index where the new 8387 * locator information is to be stored in the mnsidelocator 8388 * field of the locator block so that the locator name can 8389 * be stored at the same array index in the mnsuffixes 8390 * field of the locator names structure. 
8391 */ 8392 lbp->lb_flags |= MDDB_MNSET; 8393 if ((index = checklocator(lbp, li, s->s_sideno)) == -1) { 8394 if (use_devid) 8395 ddi_devid_free((ddi_devid_t)(uintptr_t)clp-> 8396 l_devid); 8397 lp->l_flags = old_flags; 8398 lbp->lb_loccnt--; 8399 mddb_devclose(md_expldev(clp->l_dev)); 8400 single_thread_end(s); 8401 mddb_setexit(s); 8402 return (mdmddberror(ep, MDE_DB_TOOSMALL, 8403 NODEV32, setno)); 8404 } 8405 } 8406 /* 8407 * Store the locator name before the sidelocator information 8408 * in case a panic occurs between these 2 steps. Must have 8409 * the locator name information in order to print reasonable 8410 * error information. 8411 */ 8412 if (splitname2locatorblock(&cp->c_devname, lnp, li, 8413 s->s_sideno, index)) { 8414 if (use_devid) 8415 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8416 lp->l_flags = old_flags; 8417 lbp->lb_loccnt--; 8418 mddb_devclose(md_expldev(clp->l_dev)); 8419 single_thread_end(s); 8420 mddb_setexit(s); 8421 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); 8422 } 8423 8424 /* 8425 * Compute free blocks in replica before calling cfgloc2locator 8426 * since cfgloc2locator may attempt to alloc an unused block 8427 * to store the device id. 8428 * mbiarray needs to be setup before calling computefreeblks. 
8429 */ 8430 s->s_mbiarray[li] = mbip; 8431 computefreeblks(s); 8432 8433 if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) { 8434 if (use_devid) 8435 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8436 lp->l_flags = old_flags; 8437 lbp->lb_loccnt--; 8438 s->s_mbiarray[li] = 0; 8439 mddb_devclose(md_expldev(clp->l_dev)); 8440 single_thread_end(s); 8441 mddb_setexit(s); 8442 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); 8443 } 8444 8445 if (use_devid) 8446 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8447 8448 uniqtime32(&lbp->lb_timestamp); 8449 lp->l_flags = MDDB_F_ACTIVE; 8450 8451 /* write db copy to new device */ 8452 err = writecopy(s, li, MDDB_WRITECOPY_ALL); 8453 lp->l_flags |= MDDB_F_UP2DATE; 8454 8455 /* write new locator names to all devices */ 8456 uniqtime32(&lnp->ln_timestamp); 8457 if (lbp->lb_flags & MDDB_MNSET) 8458 lnp->ln_revision = MDDB_REV_MNLN; 8459 else 8460 lnp->ln_revision = MDDB_REV_LN; 8461 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8462 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8463 lbp->lb_lnblkcnt, 0); 8464 /* 8465 * If a MN diskset and this is the master, set the PARSE_LOCNM 8466 * flag in the mddb_set structure to show that the locator 8467 * names have changed. 
8468 */ 8469 8470 if ((lbp->lb_flags & MDDB_MNSET) && 8471 (md_set[s->s_setno].s_am_i_master)) { 8472 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8473 } 8474 if (err) { 8475 if (writeretry(s)) { 8476 single_thread_end(s); 8477 mddb_setexit(s); 8478 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8479 } 8480 } 8481 8482 /* Data tags not supported on MN sets */ 8483 if ((md_get_setstatus(setno) & MD_SET_STALE) && 8484 (!(lbp->lb_flags & MDDB_MNSET)) && 8485 setno != MD_LOCAL_SET) 8486 if (set_dtag(s, ep)) 8487 mdclrerror(ep); 8488 8489 /* Write data tags to all accessible devices */ 8490 /* Data tags not supported on MN sets */ 8491 if (!(lbp->lb_flags & MDDB_MNSET)) { 8492 (void) dt_write(s); 8493 } 8494 8495 /* write new locator to all devices */ 8496 err = writelocall(s); 8497 8498 (void) upd_med(s, "newdev(0)"); 8499 8500 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno, 8501 md_expldev(clp->l_dev)); 8502 8503 computefreeblks(s); /* recompute always it may be smaller */ 8504 if (err) { 8505 if (writeretry(s)) { 8506 single_thread_end(s); 8507 mddb_setexit(s); 8508 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8509 } 8510 } 8511 8512 single_thread_end(s); 8513 mddb_setexit(s); 8514 8515 return (0); 8516 } 8517 8518 #ifdef DEBUG 8519 static void 8520 mddb_check_set( 8521 set_t setno 8522 ) 8523 { 8524 mddb_set_t *s; 8525 mddb_db_t *dbp; 8526 mddb_de_ic_t *dep; 8527 mddb_rb32_t *rbp; 8528 8529 if (! 
md_set[setno].s_db) 8530 return; 8531 8532 s = (mddb_set_t *)md_set[setno].s_db; 8533 8534 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8535 for (dep = dbp->db_firstentry; 8536 dep != NULL; dep = dep->de_next) { 8537 rbp = dep->de_rb; 8538 ASSERT(rbp->rb_magic == MDDB_MAGIC_RB); 8539 if (dep->de_rb_userdata) 8540 ASSERT((uintptr_t)dep->de_rb_userdata > 2000); 8541 } 8542 } 8543 } 8544 #endif /* DEBUG */ 8545 8546 /* 8547 * Exported Entry Points 8548 */ 8549 #ifdef DEBUG 8550 void 8551 mddb_check(void) 8552 { 8553 int i; 8554 8555 for (i = 0; i < md_nsets; i++) { 8556 if (! md_set[i].s_db) 8557 return; 8558 8559 mddb_check_set(i); 8560 } 8561 8562 } 8563 #endif /* DEBUG */ 8564 8565 int 8566 mddb_configure( 8567 mddb_cfgcmd_t command, 8568 mddb_config_t *cp 8569 ) 8570 { 8571 mddb_set_t *s; 8572 md_error_t *ep = &cp->c_mde; 8573 int flag = 0; 8574 int err = 0; 8575 set_t setno = cp->c_setno; 8576 8577 mdclrerror(ep); 8578 8579 switch (command) { 8580 case MDDB_NEWDEV: 8581 err = newdev(cp, command, ep); 8582 break; 8583 8584 case MDDB_NEWSIDE: 8585 case MDDB_DELSIDE: 8586 err = delnewside(cp, command, ep); 8587 break; 8588 8589 case MDDB_GETDEV: 8590 case MDDB_DELDEV: 8591 case MDDB_ENDDEV: 8592 err = getdeldev(cp, command, ep); 8593 break; 8594 8595 case MDDB_GETDRVRNAME: 8596 err = getdriver(&cp->c_locator); 8597 break; 8598 8599 case MDDB_USEDEV: 8600 /* 8601 * Note: must allow USEDEV ioctl during upgrade to support 8602 * auto-take disksets. 
8603 * 8604 * Also during the set import if the md_devid_destroy 8605 * flag is set then error out 8606 */ 8607 8608 if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy) 8609 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8610 8611 if (setno >= md_nsets) 8612 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8613 8614 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 8615 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 8616 err = mddbstatus2error(ep, err, NODEV32, setno); 8617 break; 8618 } 8619 } 8620 if (setno == MD_LOCAL_SET) 8621 flag = MDDB_F_IOCTL; 8622 if (cp->c_locator.l_old_devid) { 8623 md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT); 8624 } 8625 err = ridev(&s->s_rip, &cp->c_locator, NULL, flag); 8626 mddb_setexit(s); 8627 break; 8628 8629 case MDDB_RELEASESET: 8630 mutex_enter(&mddb_lock); 8631 mddb_unload_set(cp->c_setno); 8632 mutex_exit(&mddb_lock); 8633 break; 8634 8635 case MDDB_SETDID: 8636 err = setdid(cp); 8637 break; 8638 8639 default: 8640 err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno); 8641 } 8642 8643 return (err); 8644 } 8645 8646 int 8647 mddb_getoptloc( 8648 mddb_optloc_t *ol 8649 ) 8650 { 8651 mddb_set_t *s; 8652 mddb_db_t *dbp; 8653 mddb_de_ic_t *dep; 8654 mddb_recid_t id; 8655 set_t setno; 8656 8657 ol->li[0] = -1; 8658 ol->li[1] = -1; 8659 8660 id = ol->recid; 8661 setno = DBSET(id); 8662 if (setno >= md_nsets) 8663 return (EINVAL); 8664 8665 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL) 8666 return (0); 8667 8668 id = DBID(id); 8669 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8670 for (dep = dbp->db_firstentry; 8671 dep != NULL; dep = dep->de_next) { 8672 if (dep->de_recid != id) 8673 continue; 8674 ol->li[0] = dep->de_optinfo[0].o_li; 8675 ol->li[1] = dep->de_optinfo[1].o_li; 8676 mddb_setexit(s); 8677 return (0); 8678 } 8679 } 8680 mddb_setexit(s); 8681 return (0); 8682 } 8683 8684 void 8685 mddb_init(void) 8686 { 8687 mddb_set_t *s; 8688 8689 
mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL); 8690 if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL) 8691 mddb_setexit(s); 8692 } 8693 8694 8695 void 8696 mddb_unload(void) 8697 { 8698 int i; 8699 8700 mutex_enter(&mddb_lock); 8701 8702 for (i = 0; i < md_nsets; i++) { 8703 md_clr_setstatus(i, MD_SET_KEEPTAG); 8704 mddb_unload_set(i); 8705 } 8706 8707 crcfreetab(); 8708 8709 mutex_exit(&mddb_lock); 8710 } 8711 8712 mddb_recid_t 8713 mddb_createrec( 8714 size_t usersize, /* size of db record */ 8715 mddb_type_t type, /* type1 of db record */ 8716 uint_t type2, /* type2 of db record */ 8717 md_create_rec_option_t options, /* options for this creation */ 8718 set_t setno /* set number to create record in */ 8719 ) 8720 { 8721 mddb_set_t *s; 8722 mddb_db_t *dbp, *prevdbp, *newdbp; 8723 mddb_db32_t *db32p; 8724 mddb_de_ic_t *dep; 8725 /* LINTED variable unused - used for sizeof calculations */ 8726 mddb_de32_t *de32p; 8727 mddb_rb32_t *rbp; 8728 size_t recsize; 8729 ulong_t blkcnt; 8730 ulong_t maxblocks; 8731 size_t desize, desize_ic; 8732 size_t used; 8733 mddb_recid_t newid; 8734 caddr_t tmppnt; 8735 int i, err = 0; 8736 void *userdata; 8737 uint_t flag_type; 8738 8739 #if defined(_ILP32) && !defined(lint) 8740 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 8741 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 8742 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 8743 #endif 8744 8745 /* 8746 * everyone is supposed to sepcify if it's a 8747 * 32 bit or a 64 bit record 8748 */ 8749 if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) { 8750 return (MDDB_E_INVALID); 8751 } 8752 8753 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 8754 return (err); 8755 8756 if (checkstate(s, MDDB_PROBE)) { 8757 mddb_setexit(s); 8758 return (MDDB_E_NOTNOW); 8759 } 8760 8761 recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) + 8762 usersize, MDDB_BSIZE); 8763 blkcnt = btodb(recsize); 8764 8765 if (mddb_maxblocks) 8766 maxblocks = mddb_maxblocks; 8767 
else 8768 maxblocks = (MDDB_BSIZE - 8769 (sizeof (*db32p) + sizeof (*de32p) - 8770 sizeof (de32p->de32_blks))) / sizeof (mddb_block_t); 8771 8772 if (blkcnt > maxblocks) { 8773 mddb_setexit(s); 8774 return (MDDB_E_INVALID); 8775 } 8776 /* 8777 * allocate record block 8778 * and new directory block so to avoid sleeping 8779 * after starting single_thread 8780 */ 8781 rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 8782 if ((options & MD_CRO_OPTIMIZE) == 0) 8783 userdata = kmem_zalloc(usersize, KM_SLEEP); 8784 newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP); 8785 8786 /* 8787 * if this is the largest record allocate new buffer for 8788 * checkcopy(); 8789 */ 8790 if (recsize > s->s_databuffer_size) { 8791 tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP); 8792 /* 8793 * this test is incase when to sleep during kmem_alloc 8794 * and some other task bumped max record size 8795 */ 8796 if (recsize > s->s_databuffer_size) { 8797 if (s->s_databuffer_size) 8798 kmem_free(s->s_databuffer, 8799 s->s_databuffer_size); 8800 s->s_databuffer = tmppnt; 8801 s->s_databuffer_size = recsize; 8802 } else { 8803 kmem_free(tmppnt, recsize); 8804 } 8805 } 8806 8807 single_thread_start(s); 8808 8809 newid = 0; 8810 do { 8811 newid++; 8812 if (DBID(newid) == 0) { 8813 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8814 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8815 if ((options & MD_CRO_OPTIMIZE) == 0) 8816 kmem_free(userdata, usersize); 8817 single_thread_end(s); 8818 mddb_setexit(s); 8819 return (MDDB_E_NOTNOW); 8820 } 8821 8822 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8823 for (dep = dbp->db_firstentry; dep; 8824 dep = dep->de_next) { 8825 if (dep->de_recid == newid) 8826 break; 8827 } 8828 if (dep != NULL) 8829 break; 8830 } 8831 } while (dbp); 8832 8833 desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) + 8834 (sizeof (mddb_block_t) * blkcnt); 8835 8836 /* 8837 * see if a directory block exists which will hold this entry 8838 */ 8839 for (dbp = 
s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8840 used = sizeof (*db32p); 8841 for (dep = dbp->db_firstentry; 8842 dep != NULL; dep = dep->de_next) { 8843 used += sizeof (*de32p) - sizeof (de32p->de32_blks); 8844 used += sizeof (mddb_block_t) * dep->de_blkcount; 8845 } 8846 if ((used + desize) < MDDB_BSIZE) 8847 break; 8848 } 8849 if (dbp) { 8850 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8851 if (blkcnt > s->s_freeblkcnt) { 8852 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8853 if ((options & MD_CRO_OPTIMIZE) == 0) 8854 kmem_free(userdata, usersize); 8855 single_thread_end(s); 8856 mddb_setexit(s); 8857 return (MDDB_E_NOSPACE); 8858 } 8859 prevdbp = NULL; 8860 } else { 8861 /* 8862 * need to add directory block 8863 */ 8864 if ((blkcnt + 1) > s->s_freeblkcnt) { 8865 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8866 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8867 if ((options & MD_CRO_OPTIMIZE) == 0) 8868 kmem_free(userdata, usersize); 8869 single_thread_end(s); 8870 mddb_setexit(s); 8871 return (MDDB_E_NOSPACE); 8872 } 8873 for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next); 8874 dbp->db_next = newdbp; 8875 bzero((caddr_t)dbp->db_next, sizeof (*newdbp)); 8876 dbp->db_nextblk = getfreeblks(s, 1); 8877 dbp->db_next->db_blknum = dbp->db_nextblk; 8878 prevdbp = dbp; 8879 dbp = dbp->db_next; 8880 dbp->db_nextblk = 0; 8881 dbp->db_firstentry = NULL; 8882 dbp->db_recsum = 0; 8883 dbp->db_magic = MDDB_MAGIC_DB; 8884 } 8885 /* 8886 * ready to add record 8887 */ 8888 desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) + 8889 (sizeof (mddb_block_t) * blkcnt); 8890 if (dbp->db_firstentry) { 8891 for (dep = dbp->db_firstentry; dep->de_next; 8892 dep = dep->de_next); 8893 dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); 8894 dep = dep->de_next; 8895 } else { 8896 dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); 8897 dbp->db_firstentry = dep; 8898 } 8899 bzero((caddr_t)dep, desize_ic); 8900 dep->de_recid = newid; 8901 /* 8902 * Optimized records 
have an owner node associated with them in 8903 * a MN diskset. The owner is only set on a node that is actively 8904 * writing to that record. The other nodes will show that record 8905 * as having an invalid owner. The owner for an optimized record 8906 * is used during fixoptrecord to determine which node should 8907 * write out the record when the replicas associated with that 8908 * optimized record have been changed. 8909 */ 8910 if (MD_MNSET_SETNO(s->s_setno)) { 8911 dep->de_owner_nodeid = MD_MN_INVALID_NID; 8912 } 8913 dep->de_type1 = type; 8914 dep->de_type2 = type2; 8915 dep->de_reqsize = usersize; 8916 dep->de_recsize = recsize; 8917 dep->de_blkcount = blkcnt; 8918 flag_type = options & 8919 (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID | 8920 MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG | 8921 MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG); 8922 switch (flag_type) { 8923 case MD_CRO_OPTIMIZE: 8924 dep->de_flags = MDDB_F_OPT; 8925 getoptdev(s, dep, 0); 8926 getoptdev(s, dep, 1); 8927 break; 8928 case MD_CRO_STRIPE: 8929 dep->de_flags = MDDB_F_STRIPE; 8930 break; 8931 case MD_CRO_MIRROR: 8932 dep->de_flags = MDDB_F_MIRROR; 8933 break; 8934 case MD_CRO_RAID: 8935 dep->de_flags = MDDB_F_RAID; 8936 break; 8937 case MD_CRO_SOFTPART: 8938 dep->de_flags = MDDB_F_SOFTPART; 8939 break; 8940 case MD_CRO_TRANS_MASTER: 8941 dep->de_flags = MDDB_F_TRANS_MASTER; 8942 break; 8943 case MD_CRO_TRANS_LOG: 8944 dep->de_flags = MDDB_F_TRANS_LOG; 8945 break; 8946 case MD_CRO_HOTSPARE: 8947 dep->de_flags = MDDB_F_HOTSPARE; 8948 break; 8949 case MD_CRO_HOTSPARE_POOL: 8950 dep->de_flags = MDDB_F_HOTSPARE_POOL; 8951 break; 8952 case MD_CRO_CHANGELOG: 8953 dep->de_flags = MDDB_F_CHANGELOG; 8954 break; 8955 } 8956 /* 8957 * try to get all blocks consecutive. 
If not possible 8958 * just get them one at a time 8959 */ 8960 dep->de_blks[0] = getfreeblks(s, blkcnt); 8961 if (dep->de_blks[0]) { 8962 for (i = 1; i < blkcnt; i++) 8963 dep->de_blks[i] = dep->de_blks[0] + i; 8964 } else { 8965 for (i = 0; i < blkcnt; i++) 8966 dep->de_blks[i] = getfreeblks(s, 1); 8967 } 8968 dep->de_rb = rbp; 8969 bzero((caddr_t)rbp, recsize); 8970 rbp->rb_magic = MDDB_MAGIC_RB; 8971 8972 /* Do we have to create an old style (32 bit) record? */ 8973 if (options & MD_CRO_32BIT) { 8974 if (options & MD_CRO_FN) 8975 rbp->rb_revision = MDDB_REV_RBFN; 8976 else 8977 rbp->rb_revision = MDDB_REV_RB; 8978 } else { 8979 if (options & MD_CRO_FN) 8980 rbp->rb_revision = MDDB_REV_RB64FN; 8981 else 8982 rbp->rb_revision = MDDB_REV_RB64; 8983 } 8984 8985 /* set de_rb_userdata for non optimization records */ 8986 if ((options & MD_CRO_OPTIMIZE) == 0) { 8987 dep->de_rb_userdata = userdata; 8988 } 8989 8990 uniqtime32(&rbp->rb_timestamp); 8991 /* Generate the crc for this record */ 8992 rec_crcgen(s, dep, rbp); 8993 tmppnt = (caddr_t)rbp; 8994 /* 8995 * the following code writes new records to all instances of 8996 * the data base. Writing one block at a time to each instance 8997 * is safe because they are not yet in a directory entry which 8998 * has been written to the data base 8999 */ 9000 err = 0; 9001 if ((options & MD_CRO_OPTIMIZE) == 0) { 9002 for (i = 0; i < blkcnt; i++) { 9003 err |= writeall(s, (caddr_t)tmppnt, 9004 dep->de_blks[i], 1, 0); 9005 tmppnt += MDDB_BSIZE; 9006 } 9007 } else { 9008 if ((MD_MNSET_SETNO(s->s_setno)) && 9009 md_set[s->s_setno].s_am_i_master) { 9010 /* 9011 * If a MN diskset then only master writes out newly 9012 * created optimized record. 
9013 */ 9014 err |= writeoptrecord(s, dep); 9015 } 9016 } 9017 uniqtime32(&dbp->db_timestamp); 9018 dbp->db_revision = MDDB_REV_DB; 9019 /* Don't include opt resync and change log records in global XOR */ 9020 if (!(dep->de_flags & MDDB_F_OPT) && 9021 !(dep->de_flags & MDDB_F_CHANGELOG)) 9022 dbp->db_recsum ^= rbp->rb_checksum; 9023 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 9024 create_db32rec(db32p, dbp); 9025 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9026 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); 9027 if (prevdbp) { 9028 dbp = prevdbp; 9029 uniqtime32(&dbp->db_timestamp); 9030 dbp->db_revision = MDDB_REV_DB; 9031 create_db32rec(db32p, dbp); 9032 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9033 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); 9034 } 9035 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9036 if (err) { 9037 if (writeretry(s)) { 9038 s->s_zombie = newid; 9039 single_thread_end(s); 9040 mddb_setexit(s); 9041 return (MDDB_E_NOTNOW); 9042 } 9043 } 9044 single_thread_end(s); 9045 mddb_setexit(s); 9046 9047 ASSERT((newid & MDDB_SETMASK) == 0); 9048 return (MAKERECID(setno, newid)); 9049 } 9050 9051 int 9052 mddb_deleterec( 9053 mddb_recid_t id 9054 ) 9055 { 9056 mddb_set_t *s; 9057 mddb_db_t *dbp; 9058 mddb_db32_t *db32p; 9059 mddb_de_ic_t *dep, *dep1; 9060 int i; 9061 9062 #if defined(_ILP32) && !defined(lint) 9063 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 9064 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 9065 #endif 9066 9067 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); 9068 ASSERT(s != NULL); 9069 9070 id = DBID(id); 9071 if (checkstate(s, MDDB_PROBE)) { 9072 mddb_setexit(s); 9073 return (MDDB_E_NOTNOW); 9074 } 9075 9076 ASSERT(s->s_lbp != NULL); 9077 single_thread_start(s); 9078 9079 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9080 dep1 = NULL; 9081 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9082 if (dep->de_recid == id) 9083 break; 9084 dep1 = 
dep; 9085 } 9086 if (dep != NULL) 9087 break; 9088 } 9089 /* 9090 * no such record 9091 */ 9092 if (dep == NULL) { 9093 single_thread_end(s); 9094 ASSERT(s->s_staledeletes != 0); 9095 s->s_staledeletes--; 9096 mddb_setexit(s); 9097 return (0); 9098 } 9099 9100 if (!(dep->de_flags & MDDB_F_OPT) && 9101 !(dep->de_flags & MDDB_F_CHANGELOG)) { 9102 dbp->db_recsum ^= dep->de_rb->rb_checksum; 9103 dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle; 9104 } 9105 9106 if (dep->de_rb_userdata != NULL) { 9107 if (dep->de_icreqsize) 9108 kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize); 9109 else 9110 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 9111 } 9112 9113 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 9114 9115 for (i = 0; i < dep->de_blkcount; i++) 9116 blkfree(s, dep->de_blks[i]); 9117 if (dep1) 9118 dep1->de_next = dep->de_next; 9119 else 9120 dbp->db_firstentry = dep->de_next; 9121 9122 kmem_free(dep, sizeofde(dep)); 9123 9124 uniqtime32(&dbp->db_timestamp); 9125 dbp->db_revision = MDDB_REV_DB; 9126 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 9127 create_db32rec(db32p, dbp); 9128 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9129 if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) { 9130 if (writeretry(s)) { 9131 /* 9132 * staledelete is used to mark deletes which failed. 
9133 * its only use is to not panic when the user retries 9134 * the delete once the database is active again 9135 */ 9136 single_thread_end(s); 9137 s->s_staledeletes++; 9138 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9139 mddb_setexit(s); 9140 return (MDDB_E_NOTNOW); 9141 } 9142 } 9143 single_thread_end(s); 9144 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9145 mddb_setexit(s); 9146 return (0); 9147 } 9148 9149 mddb_recid_t 9150 mddb_getnextrec( 9151 mddb_recid_t id, 9152 mddb_type_t typ, 9153 uint_t type2 9154 ) 9155 { 9156 mddb_set_t *s; 9157 mddb_db_t *dbp; 9158 mddb_de_ic_t *dep; 9159 int searching, err; 9160 set_t setno; 9161 9162 setno = DBSET(id); 9163 id = DBID(id); 9164 searching = id; 9165 9166 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 9167 return (err); 9168 9169 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9170 for (dep = dbp->db_firstentry; 9171 dep != NULL; dep = dep->de_next) { 9172 if (searching) { 9173 if (dep->de_recid == id) 9174 searching = 0; 9175 } else { 9176 if ((typ == MDDB_ALL || dep->de_type1 == typ) && 9177 (type2 == 0 || dep->de_type2 == type2)) { 9178 id = dep->de_recid; 9179 mddb_setexit(s); 9180 ASSERT((id & MDDB_SETMASK) == 0); 9181 return (MAKERECID(setno, id)); 9182 } 9183 } 9184 } 9185 } 9186 9187 mddb_setexit(s); 9188 9189 if (searching) 9190 return (MDDB_E_NORECORD); 9191 return (0); 9192 } 9193 9194 void * 9195 mddb_getrecaddr( 9196 mddb_recid_t id 9197 ) 9198 { 9199 mddb_set_t *s; 9200 mddb_db_t *dbp; 9201 mddb_de_ic_t *dep; 9202 void *rval; 9203 9204 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9205 return (NULL); 9206 9207 id = DBID(id); 9208 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9209 for (dep = dbp->db_firstentry; 9210 dep != NULL; dep = dep->de_next) { 9211 if (dep->de_recid != id) 9212 continue; 9213 if (dep->de_rb_userdata) 9214 rval = (void *)dep->de_rb_userdata; 9215 else 9216 rval = (void *)dep->de_rb->rb_data; 9217 mddb_setexit(s); 9218 return (rval); 
9219 } 9220 } 9221 9222 mddb_setexit(s); 9223 return (NULL); 9224 } 9225 9226 9227 mddb_de_ic_t * 9228 mddb_getrecdep( 9229 mddb_recid_t id 9230 ) 9231 { 9232 mddb_set_t *s; 9233 mddb_db_t *dbp; 9234 mddb_de_ic_t *dep; 9235 9236 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9237 return (NULL); 9238 9239 id = DBID(id); 9240 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9241 for (dep = dbp->db_firstentry; 9242 dep != NULL; dep = dep->de_next) { 9243 if (dep->de_recid != id) 9244 continue; 9245 mddb_setexit(s); 9246 return (dep); 9247 } 9248 } 9249 9250 mddb_setexit(s); 9251 return (NULL); 9252 } 9253 9254 void * 9255 mddb_getrecaddr_resize( 9256 mddb_recid_t id, 9257 size_t icsize, 9258 off_t off 9259 ) 9260 { 9261 mddb_set_t *s; 9262 mddb_db_t *dbp; 9263 mddb_de_ic_t *dep; 9264 void *rval = NULL; 9265 9266 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9267 return (NULL); 9268 9269 id = DBID(id); 9270 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9271 for (dep = dbp->db_firstentry; 9272 dep != NULL; dep = dep->de_next) { 9273 if (dep->de_recid != id) 9274 continue; 9275 if (dep->de_rb_userdata) 9276 rval = (void *)dep->de_rb_userdata; 9277 else 9278 rval = (void *)dep->de_rb->rb_data; 9279 break; 9280 } 9281 if (rval != NULL) 9282 break; 9283 } 9284 9285 if (rval == NULL) { 9286 mddb_setexit(s); 9287 return (NULL); 9288 } 9289 9290 if (dep->de_rb_userdata) { 9291 caddr_t nud; 9292 9293 if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) { 9294 mddb_setexit(s); 9295 return (rval); 9296 } 9297 ASSERT((dep->de_reqsize + off) <= icsize); 9298 nud = kmem_zalloc(icsize, KM_SLEEP); 9299 bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize); 9300 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 9301 dep->de_rb_userdata = nud + off; 9302 dep->de_rb_userdata_ic = nud; 9303 dep->de_icreqsize = icsize; 9304 rval = nud; 9305 } else { 9306 size_t recsize; 9307 /* LINTED variable unused - used for sizeof calculations */ 
9308 mddb_rb32_t *nrbp; 9309 9310 recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) + 9311 icsize, MDDB_BSIZE); 9312 if (dep->de_recsize < recsize) 9313 cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only " 9314 "nonoptimized records can be resized\n"); 9315 } 9316 9317 mddb_setexit(s); 9318 return (rval); 9319 } 9320 9321 int 9322 mddb_getrecprivate( 9323 mddb_recid_t id 9324 ) 9325 { 9326 mddb_set_t *s; 9327 mddb_db_t *dbp; 9328 mddb_de_ic_t *dep; 9329 int err = 0; 9330 int private; 9331 9332 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9333 return (err); 9334 9335 id = DBID(id); 9336 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9337 for (dep = dbp->db_firstentry; 9338 dep != NULL; dep = dep->de_next) { 9339 if (dep->de_recid != id) 9340 continue; 9341 private = (int)dep->de_rb->rb_private; 9342 mddb_setexit(s); 9343 return (private); 9344 } 9345 } 9346 9347 mddb_setexit(s); 9348 return (MDDB_E_NORECORD); 9349 } 9350 9351 void 9352 mddb_setrecprivate( 9353 mddb_recid_t id, 9354 uint_t private 9355 ) 9356 { 9357 mddb_set_t *s; 9358 mddb_db_t *dbp; 9359 mddb_de_ic_t *dep; 9360 9361 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) { 9362 ASSERT(0); 9363 return; 9364 } 9365 9366 id = DBID(id); 9367 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9368 for (dep = dbp->db_firstentry; 9369 dep != NULL; dep = dep->de_next) { 9370 if (dep->de_recid != id) 9371 continue; 9372 dep->de_rb->rb_private = private; 9373 mddb_setexit(s); 9374 return; 9375 } 9376 } 9377 9378 mddb_setexit(s); 9379 ASSERT(0); 9380 } 9381 9382 mddb_type_t 9383 mddb_getrectype1( 9384 mddb_recid_t id 9385 ) 9386 { 9387 mddb_set_t *s; 9388 mddb_db_t *dbp; 9389 mddb_de_ic_t *dep; 9390 int err = 0; 9391 mddb_type_t rval; 9392 9393 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9394 return (err); 9395 9396 id = DBID(id); 9397 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9398 for (dep = dbp->db_firstentry; 9399 dep 
!= NULL; dep = dep->de_next) { 9400 if (dep->de_recid != id) 9401 continue; 9402 rval = dep->de_type1; 9403 mddb_setexit(s); 9404 return (rval); 9405 } 9406 } 9407 9408 mddb_setexit(s); 9409 return (MDDB_E_NORECORD); 9410 } 9411 9412 int 9413 mddb_getrectype2( 9414 mddb_recid_t id 9415 ) 9416 { 9417 mddb_set_t *s; 9418 mddb_db_t *dbp; 9419 mddb_de_ic_t *dep; 9420 int err = 0; 9421 int rval; 9422 9423 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9424 return (err); 9425 9426 id = DBID(id); 9427 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9428 for (dep = dbp->db_firstentry; 9429 dep != NULL; dep = dep->de_next) { 9430 if (dep->de_recid != id) 9431 continue; 9432 rval = (int)dep->de_type2; 9433 mddb_setexit(s); 9434 return (rval); 9435 } 9436 } 9437 9438 mddb_setexit(s); 9439 return (MDDB_E_NORECORD); 9440 } 9441 9442 int 9443 mddb_getrecsize( 9444 mddb_recid_t id 9445 ) 9446 { 9447 mddb_set_t *s; 9448 mddb_db_t *dbp; 9449 mddb_de_ic_t *dep; 9450 int err = 0; 9451 int rval; 9452 9453 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9454 return (err); 9455 9456 id = DBID(id); 9457 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9458 for (dep = dbp->db_firstentry; 9459 dep != NULL; dep = dep->de_next) { 9460 if (dep->de_recid != id) 9461 continue; 9462 rval = (int)dep->de_reqsize; 9463 mddb_setexit(s); 9464 return (rval); 9465 } 9466 } 9467 9468 mddb_setexit(s); 9469 return (MDDB_E_NORECORD); 9470 } 9471 9472 9473 mddb_recstatus_t 9474 mddb_getrecstatus( 9475 mddb_recid_t id 9476 ) 9477 { 9478 mddb_set_t *s; 9479 mddb_db_t *dbp; 9480 mddb_de_ic_t *dep; 9481 int err = 0; 9482 mddb_recstatus_t e_err; 9483 9484 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9485 return ((mddb_recstatus_t)err); 9486 9487 id = DBID(id); 9488 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9489 for (dep = dbp->db_firstentry; 9490 dep != NULL; dep = dep->de_next) { 9491 if (dep->de_recid == id) 9492 break; 9493 
} 9494 if (dep) 9495 break; 9496 } 9497 9498 e_err = MDDB_OK; 9499 9500 if (! dep) 9501 e_err = MDDB_NORECORD; 9502 else if (! dep->de_rb->rb_commitcnt) 9503 e_err = MDDB_NODATA; 9504 else if (md_get_setstatus(s->s_setno) & MD_SET_STALE) 9505 e_err = MDDB_STALE; 9506 9507 mddb_setexit(s); 9508 return (e_err); 9509 } 9510 9511 /* 9512 * Commit given record to disk. 9513 * If committing an optimized record, do not call 9514 * with md ioctl lock held. 9515 */ 9516 int 9517 mddb_commitrec( 9518 mddb_recid_t id 9519 ) 9520 { 9521 mddb_set_t *s; 9522 mddb_db_t *dbp; 9523 mddb_de_ic_t *dep; 9524 mddb_recid_t ids[2]; 9525 mddb_rb32_t *rbp; 9526 static int err = 0; 9527 md_mn_msg_mddb_optrecerr_t *msg_recerr; 9528 md_mn_kresult_t *kres; 9529 mddb_lb_t *lbp; 9530 mddb_mnlb_t *mnlbp; 9531 mddb_locator_t *lp; 9532 mddb_mnsidelocator_t *mnslp; 9533 mddb_drvnm_t *dn; 9534 int li; 9535 md_replica_recerr_t *recerr; 9536 int i, j; 9537 int rval; 9538 int hit_err = 0; 9539 9540 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); 9541 ASSERT(s != NULL); 9542 9543 if (checkstate(s, MDDB_PROBE)) { 9544 mddb_setexit(s); 9545 return (MDDB_E_NOTNOW); 9546 } 9547 9548 if (DBID(id) == 0) { 9549 mddb_setexit(s); 9550 return (0); 9551 } 9552 9553 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9554 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9555 if (dep->de_recid == DBID(id)) 9556 break; 9557 } 9558 if (dep) 9559 break; 9560 } 9561 9562 if (dep == NULL) { 9563 mddb_setexit(s); 9564 return (MDDB_E_NORECORD); 9565 } 9566 9567 if (! (dep->de_flags & MDDB_F_OPT)) { 9568 ids[0] = id; 9569 ids[1] = 0; 9570 mddb_setexit(s); 9571 return (mddb_commitrecs(ids)); 9572 } 9573 9574 /* 9575 * following code allows multiple processes to be doing 9576 * optimization commits in parallel. 
9577 * NOTE: if lots of optimization commits then the lock 9578 * will not get released until it winds down 9579 */ 9580 if (s->s_optwaiterr) { 9581 while (s->s_optwaiterr) { 9582 s->s_opthungerr = 1; 9583 cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno)); 9584 } 9585 if (checkstate(s, MDDB_PROBE)) { 9586 mddb_setexit(s); 9587 return (MDDB_E_NOTNOW); 9588 } 9589 } 9590 if (s->s_optcmtcnt++ == 0) { 9591 single_thread_start(s); 9592 s->s_opthavelck = 1; 9593 if (s->s_optwantlck) { 9594 cv_broadcast(&s->s_optwantlck_cv); 9595 s->s_optwantlck = 0; 9596 } 9597 } else { 9598 while (! s->s_opthavelck) { 9599 s->s_optwantlck = 1; 9600 cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno)); 9601 } 9602 } 9603 9604 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9605 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9606 if (dep->de_recid == DBID(id)) 9607 break; 9608 } 9609 if (dep) 9610 break; 9611 } 9612 9613 if (dep == NULL) { 9614 if (! (--s->s_optcmtcnt)) { 9615 single_thread_end(s); 9616 s->s_opthavelck = 0; 9617 } 9618 mddb_setexit(s); 9619 return (MDDB_E_NORECORD); 9620 } 9621 9622 rbp = dep->de_rb; 9623 rbp->rb_commitcnt++; 9624 uniqtime32(&rbp->rb_timestamp); 9625 /* Generate the crc for this record */ 9626 rec_crcgen(s, dep, rbp); 9627 9628 if (writeoptrecord(s, dep)) { 9629 if (MD_MNSET_SETNO(s->s_setno)) { 9630 hit_err = 1; 9631 } 9632 s->s_optwaiterr++; 9633 } 9634 if (MD_MNSET_SETNO(s->s_setno)) { 9635 /* If last thread out, release single_thread_start */ 9636 if (! (--s->s_optcmtcnt)) { 9637 single_thread_end(s); 9638 s->s_opthavelck = 0; 9639 } 9640 /* 9641 * If this thread had a writeoptrecords failure, then 9642 * need to send message to master. 9643 * But, multiple threads could all be running on the 9644 * same single_thread_start, so serialize the threads 9645 * by making each thread grab single_thread_start. 
9646 * 9647 * After return from sending message to master message, 9648 * replicas associated with optimized record will havei 9649 * been changed (via a callback from the master to all 9650 * nodes), so retry call to writeoptrecord. 9651 * This code is replacing the call to writeretry that 9652 * occurs for the local and traditional disksets. 9653 */ 9654 if (hit_err) { 9655 single_thread_start(s); 9656 /* 9657 * If > 50% of replicas are alive then continue 9658 * to send message to master until writeoptrecord 9659 * succeeds. For now, assume that minor name, 9660 * major number on this node is the same as on 9661 * the master node. Once devids are turned on 9662 * for MN disksets, can send devid. 9663 */ 9664 kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); 9665 msg_recerr = kmem_zalloc( 9666 sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP); 9667 while (!(md_get_setstatus(s->s_setno) & 9668 MD_SET_TOOFEW)) { 9669 bzero((caddr_t)msg_recerr, 9670 sizeof (md_mn_msg_mddb_optrecerr_t)); 9671 lbp = s->s_lbp; 9672 mnlbp = (mddb_mnlb_t *)lbp; 9673 for (i = 0; i < 2; i++) { 9674 li = dep->de_optinfo[i].o_li; 9675 lp = &lbp->lb_locators[li]; 9676 for (j = 0; j < MD_MNMAXSIDES; j++) { 9677 mnslp = 9678 &mnlbp->lb_mnsidelocators[j][li]; 9679 if (mnslp->mnl_sideno == s->s_sideno) 9680 break; 9681 } 9682 if (j == MD_MNMAXSIDES) 9683 continue; 9684 9685 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 9686 recerr = &msg_recerr->msg_recerr[i]; 9687 recerr->r_li = li; 9688 recerr->r_flags = 9689 dep->de_optinfo[i].o_flags; 9690 recerr->r_blkno = lp->l_blkno; 9691 recerr->r_mnum = md_getminor(lp->l_dev); 9692 (void) strncpy(recerr->r_driver_name, 9693 dn->dn_data, MD_MAXDRVNM); 9694 } 9695 9696 /* Release locks */ 9697 single_thread_end(s); 9698 mutex_exit(SETMUTEX(s->s_setno)); 9699 9700 /* 9701 * Send message to master about optimized 9702 * record failure. 
After return, master 9703 * should have marked failed replicas 9704 * and sent parse message to slaves causing 9705 * slaves to have fixed up the optimized 9706 * record. 9707 * On return from ksend_message, retry 9708 * the write since this node should have fixed 9709 * the optimized resync records it owns. 9710 */ 9711 rval = mdmn_ksend_message(s->s_setno, 9712 MD_MN_MSG_MDDB_OPTRECERR, 9713 MD_MSGF_NO_BCAST, 9714 (char *)msg_recerr, 9715 sizeof (md_mn_msg_mddb_optrecerr_t), 9716 kres); 9717 if (!MDMN_KSEND_MSG_OK(rval, kres)) { 9718 cmn_err(CE_WARN, "mddb_commitrec: " 9719 "Unable to send optimized " 9720 "resync record failure " 9721 "message to other nodes in " 9722 "diskset %s\n", s->s_setname); 9723 mdmn_ksend_show_error(rval, kres, 9724 "MD_MN_MSG_MDDB_OPTRECERR"); 9725 } 9726 9727 /* Regrab locks */ 9728 mutex_enter(SETMUTEX(s->s_setno)); 9729 single_thread_start(s); 9730 9731 /* Start over in case mddb changed */ 9732 for (dbp = s->s_dbp; dbp != NULL; 9733 dbp = dbp->db_next) { 9734 for (dep = dbp->db_firstentry; dep; 9735 dep = dep->de_next) { 9736 if (dep->de_recid == DBID(id)) 9737 break; 9738 } 9739 if (dep) 9740 break; 9741 } 9742 if (dep) { 9743 rbp = dep->de_rb; 9744 rbp->rb_commitcnt++; 9745 uniqtime32(&rbp->rb_timestamp); 9746 /* Generate the crc for this record */ 9747 rec_crcgen(s, dep, rbp); 9748 9749 /* 9750 * If writeoptrecord succeeds, then 9751 * break out. 
9752 */ 9753 if (!(writeoptrecord(s, dep))) 9754 break; 9755 } 9756 } 9757 kmem_free(kres, sizeof (md_mn_kresult_t)); 9758 kmem_free(msg_recerr, 9759 sizeof (md_mn_msg_mddb_optrecerr_t)); 9760 9761 /* Resync record should be fixed - if possible */ 9762 s->s_optwaiterr--; 9763 if (s->s_optwaiterr == 0) { 9764 /* All errors have been handled */ 9765 if (s->s_opthungerr) { 9766 s->s_opthungerr = 0; 9767 cv_broadcast(&s->s_opthungerr_cv); 9768 } 9769 } 9770 single_thread_end(s); 9771 mddb_setexit(s); 9772 if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) { 9773 return (MDDB_E_NOTNOW); 9774 } else { 9775 return (0); 9776 } 9777 } 9778 } else { 9779 /* If set is a traditional or local set */ 9780 if (! (--s->s_optcmtcnt)) { 9781 err = 0; 9782 if (s->s_optwaiterr) { 9783 err = writeretry(s); 9784 s->s_optwaiterr = 0; 9785 if (s->s_opthungerr) { 9786 s->s_opthungerr = 0; 9787 cv_broadcast(&s->s_opthungerr_cv); 9788 } 9789 } 9790 single_thread_end(s); 9791 s->s_opthavelck = 0; 9792 mddb_setexit(s); 9793 if (err) 9794 return (MDDB_E_NOTNOW); 9795 return (0); 9796 } 9797 if (s->s_optwaiterr) { 9798 while (s->s_optwaiterr) { 9799 s->s_opthungerr = 1; 9800 cv_wait(&s->s_opthungerr_cv, 9801 SETMUTEX(s->s_setno)); 9802 } 9803 if (checkstate(s, MDDB_NOPROBE)) { 9804 mddb_setexit(s); 9805 return (MDDB_E_NOTNOW); 9806 } 9807 } 9808 } 9809 9810 mddb_setexit(s); 9811 return (0); 9812 } 9813 9814 int 9815 mddb_commitrecs( 9816 mddb_recid_t ids[] 9817 ) 9818 { 9819 mddb_set_t *s; 9820 mddb_db_t *dbp; 9821 mddb_de_ic_t *dep; 9822 mddb_rb32_t *rbp; 9823 mddb_rb32_t *saverbp; 9824 mddb_lb_t *lbp; 9825 int li; 9826 uint_t checksum; 9827 mddb_recid_t *idp; 9828 int err = 0; 9829 set_t setno; 9830 9831 if (panicstr) 9832 cmn_err(CE_PANIC, "md: mddb: commit not allowed"); 9833 9834 /* 9835 * scan through and make sure ids are from the same set 9836 */ 9837 setno = DBSET(ids[0]); 9838 for (idp = ids; *idp != NULL; idp++) 9839 ASSERT(DBSET(*idp) == setno); 9840 9841 s = mddb_setenter(setno, 
MDDB_MUSTEXIST, NULL); 9842 9843 if (checkstate(s, MDDB_PROBE)) { 9844 mddb_setexit(s); 9845 return (MDDB_E_NOTNOW); 9846 } 9847 9848 ASSERT(s->s_lbp != NULL); 9849 err = 0; 9850 9851 if (! ids[0]) { 9852 mddb_setexit(s); 9853 return (0); 9854 } 9855 9856 single_thread_start(s); 9857 /* 9858 * scan through and make sure ids all exist 9859 */ 9860 for (idp = ids; *idp != NULL; idp++) { 9861 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9862 for (dep = dbp->db_firstentry; dep; 9863 dep = dep->de_next) { 9864 if (dep->de_recid == DBID(*idp)) 9865 break; 9866 } 9867 if (dep != NULL) 9868 break; 9869 } 9870 if (dep == NULL) { 9871 single_thread_end(s); 9872 mddb_setexit(s); 9873 return (MDDB_E_NORECORD); 9874 } 9875 } 9876 9877 /* 9878 * scan through records fix commit counts and 9879 * zero fiddles and update time stamp and rechecksum record 9880 */ 9881 checksum = 0; 9882 idp = ids; 9883 saverbp = NULL; 9884 while (*idp) { 9885 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9886 for (dep = dbp->db_firstentry; dep; 9887 dep = dep->de_next) { 9888 if (dep->de_recid == DBID(*idp)) 9889 break; 9890 } 9891 if (dep != NULL) 9892 break; 9893 } 9894 rbp = dep->de_rb; 9895 ASSERT(! 
(dep->de_flags & MDDB_F_OPT)); 9896 9897 getuserdata(setno, dep); 9898 /* Don't do fiddles for CHANGE LOG records */ 9899 if (!(dep->de_flags & MDDB_F_CHANGELOG)) { 9900 checksum ^= rbp->rb_checksum_fiddle; 9901 rbp->rb_checksum_fiddle = 0; 9902 checksum ^= rbp->rb_checksum; 9903 saverbp = rbp; 9904 } 9905 rbp->rb_commitcnt++; 9906 uniqtime32(&rbp->rb_timestamp); 9907 /* Generate the crc for this record */ 9908 rec_crcgen(s, dep, rbp); 9909 9910 /* Don't do fiddles for CHANGE LOG records */ 9911 if (!(dep->de_flags & MDDB_F_CHANGELOG)) { 9912 checksum ^= rbp->rb_checksum; 9913 } 9914 idp++; 9915 } 9916 9917 if (saverbp) 9918 saverbp->rb_checksum_fiddle = checksum; 9919 9920 /* 9921 * If this is a MN set but we are not the master, then we are not 9922 * supposed to update the mddb on disk. So we finish at this point. 9923 */ 9924 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 9925 (md_set[setno].s_am_i_master == 0)) { 9926 single_thread_end(s); 9927 mddb_setexit(s); 9928 return (0); 9929 } 9930 9931 lbp = s->s_lbp; 9932 for (li = 0; li < lbp->lb_loccnt; li++) { 9933 if (! 
(lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE)) 9934 continue; 9935 9936 idp = ids; 9937 while (*idp) { 9938 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9939 dep = dbp->db_firstentry; 9940 while (dep && (dep->de_recid != DBID(*idp))) 9941 dep = dep->de_next; 9942 if (dep != NULL) 9943 break; 9944 } 9945 rbp = dep->de_rb; 9946 err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 9947 dep->de_blkcount, li, (mddb_bf_t **)0, 9948 MDDB_WR_ONLY_MASTER); 9949 if (err) 9950 break; 9951 idp++; 9952 } 9953 if (err) 9954 break; 9955 } 9956 if (err) { 9957 if (writeretry(s)) { 9958 single_thread_end(s); 9959 mddb_setexit(s); 9960 return (MDDB_E_NOTNOW); 9961 } 9962 } 9963 single_thread_end(s); 9964 mddb_setexit(s); 9965 return (0); 9966 } 9967 9968 mddb_recid_t 9969 mddb_makerecid( 9970 set_t setno, 9971 mddb_recid_t id 9972 ) 9973 { 9974 return (MAKERECID(setno, id)); 9975 } 9976 9977 set_t 9978 mddb_getsetnum( 9979 mddb_recid_t id 9980 ) 9981 { 9982 return (DBSET(id)); 9983 } 9984 9985 char * 9986 mddb_getsetname( 9987 set_t setno 9988 ) 9989 { 9990 return (((mddb_set_t *)md_set[setno].s_db)->s_setname); 9991 } 9992 9993 side_t 9994 mddb_getsidenum( 9995 set_t setno 9996 ) 9997 { 9998 if (md_set[setno].s_db) 9999 return (((mddb_set_t *)md_set[setno].s_db)->s_sideno); 10000 return (0); 10001 } 10002 10003 int 10004 mddb_ownset( 10005 set_t setno 10006 ) 10007 { 10008 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db) 10009 return (1); 10010 10011 if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp) 10012 return (1); 10013 10014 return (0); 10015 } 10016 10017 /*ARGSUSED*/ 10018 int 10019 getmed_ioctl(mddb_med_parm_t *medpp, int mode) 10020 { 10021 mddb_set_t *s; 10022 int err = 0; 10023 set_t setno = medpp->med_setno; 10024 md_error_t *ep = &medpp->med_mde; 10025 10026 mdclrerror(ep); 10027 10028 if (setno >= md_nsets) 10029 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10030 10031 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 
10032 return (0); 10033 10034 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 10035 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 10036 10037 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10038 return (mddbstatus2error(ep, err, NODEV32, setno)); 10039 10040 medpp->med = s->s_med; /* structure assignment */ 10041 10042 mddb_setexit(s); 10043 10044 return (0); 10045 } 10046 10047 int 10048 setmed_ioctl(mddb_med_parm_t *medpp, int mode) 10049 { 10050 10051 mddb_set_t *s; 10052 int err = 0; 10053 set_t setno = medpp->med_setno; 10054 md_error_t *ep = &medpp->med_mde; 10055 10056 mdclrerror(ep); 10057 10058 if ((mode & FWRITE) == 0) 10059 return (mdsyserror(ep, EACCES)); 10060 10061 /* 10062 * This should be the only thing that prevents LOCAL sets from having 10063 * mediators, at least in the kernel, userland needs to have some code 10064 * written. 10065 */ 10066 if (setno == MD_LOCAL_SET) 10067 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10068 10069 if (setno >= md_nsets) 10070 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10071 10072 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10073 return (0); 10074 10075 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 10076 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 10077 10078 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10079 return (mddbstatus2error(ep, err, NODEV32, setno)); 10080 10081 s->s_med = medpp->med; /* structure assignment */ 10082 10083 mddb_setexit(s); 10084 10085 return (0); 10086 } 10087 10088 int 10089 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode) 10090 { 10091 10092 mddb_set_t *s; 10093 int err = 0; 10094 set_t setno = medpp->med_setno; 10095 md_error_t *ep = &medpp->med_mde; 10096 10097 mdclrerror(ep); 10098 10099 if ((mode & FWRITE) == 0) 10100 return (mdsyserror(ep, EACCES)); 10101 10102 if (setno >= md_nsets) 10103 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10104 10105 if (md_snarf_db_set(MD_LOCAL_SET, 
ep) != 0) 10106 return (0); 10107 10108 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 10109 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 10110 10111 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10112 return (mddbstatus2error(ep, err, NODEV32, setno)); 10113 10114 single_thread_start(s); 10115 (void) upd_med(s, "updmed_ioctl()"); 10116 single_thread_end(s); 10117 10118 mddb_setexit(s); 10119 10120 return (0); 10121 } 10122 10123 int 10124 take_set(mddb_config_t *cp, int mode) 10125 { 10126 int err = 0; 10127 mddb_med_upd_parm_t medup; 10128 set_t setno = cp->c_setno; 10129 md_error_t *ep = &cp->c_mde; 10130 int snarf_ok = 0; 10131 10132 if (md_get_setstatus(setno) & MD_SET_SNARFED) 10133 return (0); 10134 10135 err = mddb_configure(MDDB_GETDEV, cp); 10136 if (! err && mdisok(ep)) { 10137 if (md_snarf_db_set(setno, ep) != 0) 10138 goto out; 10139 snarf_ok = 1; 10140 } 10141 10142 /* 10143 * Clear replicated import flag since this is 10144 * used during the take of a diskset with 10145 * previously unresolved replicated disks. 10146 */ 10147 if (md_get_setstatus(setno) & 10148 MD_SET_REPLICATED_IMPORT) { 10149 md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT); 10150 } 10151 10152 if (! err && mdisok(ep)) { 10153 if (! cp->c_flags) { 10154 medup.med_setno = setno; 10155 mdclrerror(&medup.med_mde); 10156 10157 err = updmed_ioctl(&medup, mode); 10158 if (! mdisok(&medup.med_mde)) 10159 (void) mdstealerror(ep, &medup.med_mde); 10160 } 10161 } 10162 10163 out: 10164 /* 10165 * In the case that the snarf failed, the diskset is 10166 * left with s_db set, but s_lbp not set. The node is not 10167 * an owner of the set and won't be allowed to release the 10168 * diskset in order to cleanup. With s_db set, any call to the 10169 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist) 10170 * will cause the diskset to be loaded. So, cleanup the diskset so 10171 * that an inadvertent start of the diskset doesn't happen later. 
10172 */ 10173 if ((snarf_ok == 0) && md_set[setno].s_db && 10174 (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) { 10175 mutex_enter(&mddb_lock); 10176 mddb_unload_set(setno); 10177 mutex_exit(&mddb_lock); 10178 } 10179 return (err); 10180 } 10181 10182 /*ARGSUSED*/ 10183 int 10184 release_set(mddb_config_t *cp, int mode) 10185 { 10186 int err = 0; 10187 set_t setno = cp->c_setno; 10188 md_error_t *ep = &cp->c_mde; 10189 10190 /* 10191 * Data integrity check 10192 */ 10193 if (setno >= md_nsets) 10194 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10195 10196 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 10197 md_haltsnarf_enter(setno); 10198 /* 10199 * Attempt to mark set as HOLD. If it is marked as HOLD, this means 10200 * that the mirror code is currently searching all mirrors for a 10201 * errored component that needs a hotspare. While this search is in 10202 * progress, we cannot release the set and thgerefore we return EBUSY. 10203 * Once we have set HOLD, the mirror function (check_4_hotspares) will 10204 * block before the search until the set is released. 10205 */ 10206 if (md_holdset_testandenter(setno) != 0) { 10207 md_haltsnarf_exit(setno); 10208 rw_exit(&md_unit_array_rw.lock); 10209 return (EBUSY); 10210 } 10211 10212 if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0) 10213 err = mddb_configure(MDDB_RELEASESET, cp); 10214 10215 md_holdset_exit(setno); 10216 md_haltsnarf_exit(setno); 10217 rw_exit(&md_unit_array_rw.lock); 10218 10219 if (! 
err && mdisok(ep)) { 10220 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno, 10221 NODEV64); 10222 } 10223 10224 return (err); 10225 } 10226 10227 int 10228 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode) 10229 { 10230 mddb_set_t *s; 10231 int err = 0; 10232 mddb_dtag_lst_t *dtlp; 10233 set_t setno = dtgpp->dtgp_setno; 10234 md_error_t *ep = &dtgpp->dtgp_mde; 10235 10236 mdclrerror(ep); 10237 10238 if ((mode & FREAD) == 0) 10239 return (mdsyserror(ep, EACCES)); 10240 10241 if (setno >= md_nsets) 10242 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10243 10244 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10245 return (0); 10246 10247 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) 10248 return (mddbstatus2error(ep, err, NODEV32, setno)); 10249 10250 /* 10251 * Data tags not supported on MN sets so return invalid operation. 10252 * This ioctl could be called before the mddb has been read in so 10253 * the set status may not yet be set to MNSET, so code following 10254 * this check must handle a MN diskset properly. 
10255 */ 10256 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10257 mddb_setexit(s); 10258 return (mderror(ep, MDE_INVAL_MNOP)); 10259 } 10260 10261 /* s_dtlp is NULL for MN diskset */ 10262 dtlp = s->s_dtlp; 10263 while (dtlp != NULL) { 10264 if (dtgpp->dtgp_dt.dt_id == 0 || 10265 dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) { 10266 bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt, 10267 sizeof (mddb_dtag_t)); 10268 break; 10269 } 10270 dtlp = dtlp->dtl_nx; 10271 } 10272 10273 /* Walked the whole list and id not found, return error */ 10274 if (dtlp == (mddb_dtag_lst_t *)NULL) { 10275 mddb_setexit(s); 10276 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10277 } 10278 10279 mddb_setexit(s); 10280 10281 return (0); 10282 } 10283 10284 int 10285 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode) 10286 { 10287 mddb_set_t *s; 10288 int err = 0; 10289 mddb_config_t *cp; 10290 mddb_ri_t *trip = NULL; 10291 mddb_dtag_t *dtagp = NULL; 10292 set_t setno = dtupp->dtup_setno; 10293 md_error_t *ep = &dtupp->dtup_mde; 10294 10295 mdclrerror(ep); 10296 10297 if ((mode & FWRITE) == 0) 10298 return (mdsyserror(ep, EACCES)); 10299 10300 if (setno >= md_nsets) 10301 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10302 10303 if (dtupp->dtup_id < 0) 10304 return (mdsyserror(ep, EINVAL)); 10305 else if (dtupp->dtup_id == 0) 10306 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10307 10308 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10309 return (0); 10310 10311 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0) 10312 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno)); 10313 10314 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) 10315 return (mddbstatus2error(ep, err, NODEV32, setno)); 10316 10317 /* 10318 * Data tags not supported on MN sets so return invalid operation. 
10319 * This ioctl could be called before the mddb has been read in so 10320 * the set status may not yet be set to MNSET, so code following 10321 * this check must handle a MN diskset properly. 10322 */ 10323 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10324 mddb_setexit(s); 10325 return (mderror(ep, MDE_INVAL_MNOP)); 10326 } 10327 10328 /* Validate and find the id requested - nothing found if MN diskset */ 10329 if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) { 10330 mddb_setexit(s); 10331 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10332 } 10333 10334 /* Usetag is only valid when more than one tag exists */ 10335 if (dtl_cntl(s) < 2) { 10336 mddb_setexit(s); 10337 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno)); 10338 } 10339 10340 /* Put the selected tag in place */ 10341 dt_setup(s, dtagp); 10342 10343 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); 10344 10345 /* Save the hint information */ 10346 trip = save_rip(s); 10347 10348 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ 10349 cp->c_setno = setno; 10350 cp->c_sideno = s->s_sideno; 10351 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); 10352 cp->c_setname[MD_MAX_SETNAME] = '\0'; 10353 cp->c_med = s->s_med; /* struct assignment */ 10354 10355 mddb_setexit(s); 10356 10357 s = NULL; 10358 10359 /* shorthand */ 10360 setno = cp->c_setno; 10361 10362 /* Let unload know not to free the tag */ 10363 md_set_setstatus(setno, MD_SET_KEEPTAG); 10364 10365 /* Release the set */ 10366 if (err = release_set(cp, mode)) 10367 goto out; 10368 10369 if (! 
mdisok(&cp->c_mde)) { 10370 (void) mdstealerror(ep, &cp->c_mde); 10371 err = 1; 10372 goto out; 10373 } 10374 10375 /* Re-init set using the saved mddb_config_t structure */ 10376 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 10377 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 10378 err = mddbstatus2error(ep, err, NODEV32, setno); 10379 goto out; 10380 } 10381 } 10382 10383 ASSERT(s->s_rip == (mddb_ri_t *)NULL); 10384 10385 /* use the saved rip structure */ 10386 s->s_rip = trip; 10387 trip = (mddb_ri_t *)NULL; 10388 10389 /* Let the take code know a tag is being used */ 10390 md_set_setstatus(setno, MD_SET_USETAG); 10391 10392 mddb_setexit(s); 10393 10394 s = NULL; 10395 10396 /* Take the set */ 10397 if (err = take_set(cp, mode)) 10398 goto out; 10399 10400 if (! mdisok(&cp->c_mde)) 10401 (void) mdstealerror(ep, &cp->c_mde); 10402 10403 out: 10404 md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG)); 10405 10406 kmem_free(cp, sizeof (mddb_config_t)); 10407 10408 if (trip) 10409 free_rip(&trip); 10410 10411 if (s) 10412 mddb_setexit(s); 10413 10414 return (err); 10415 } 10416 10417 int 10418 accept_ioctl(mddb_accept_parm_t *accpp, int mode) 10419 { 10420 mddb_set_t *s; 10421 int err = 0; 10422 mddb_config_t *cp; 10423 mddb_ri_t *trip = NULL; 10424 set_t setno = accpp->accp_setno; 10425 md_error_t *ep = &accpp->accp_mde; 10426 10427 mdclrerror(ep); 10428 10429 if ((mode & FWRITE) == 0) 10430 return (mdsyserror(ep, EACCES)); 10431 10432 if (setno >= md_nsets) 10433 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10434 10435 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10436 return (0); 10437 10438 if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0) 10439 return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno)); 10440 10441 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10442 return (mddbstatus2error(ep, err, NODEV32, setno)); 10443 10444 /* 10445 * Data tags not supported on MN sets so return invalid operation. 
10446 * mddb is guaranteed to be incore at this point, so this 10447 * check will catch all MN disksets. 10448 */ 10449 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10450 mddb_setexit(s); 10451 return (mderror(ep, MDE_INVAL_MNOP)); 10452 } 10453 10454 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); 10455 10456 trip = save_rip(s); 10457 10458 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ 10459 cp->c_setno = setno; 10460 cp->c_sideno = s->s_sideno; 10461 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); 10462 cp->c_setname[MD_MAX_SETNAME] = '\0'; 10463 cp->c_med = s->s_med; /* struct assignment */ 10464 10465 /* Tag the data */ 10466 if (err = set_dtag(s, ep)) { 10467 err = mdsyserror(ep, err); 10468 goto out; 10469 } 10470 10471 /* If we had a BADTAG, it will be re-written, so clear the bit. */ 10472 if (md_get_setstatus(setno) & MD_SET_BADTAG) 10473 md_clr_setstatus(setno, MD_SET_BADTAG); 10474 10475 if (err = dt_write(s)) { 10476 err = mdsyserror(ep, err); 10477 goto out; 10478 } 10479 10480 mddb_setexit(s); 10481 10482 s = NULL; 10483 10484 /* shorthand */ 10485 setno = cp->c_setno; 10486 10487 /* Clear the keeptag */ 10488 md_clr_setstatus(setno, MD_SET_KEEPTAG); 10489 10490 /* Release the set */ 10491 if (err = release_set(cp, mode)) 10492 goto out; 10493 10494 if (! 
mdisok(&cp->c_mde)) { 10495 (void) mdstealerror(ep, &cp->c_mde); 10496 goto out; 10497 } 10498 10499 /* Re-init set using the saved mddb_config_t structure */ 10500 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 10501 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 10502 err = mddbstatus2error(ep, err, NODEV32, setno); 10503 goto out; 10504 } 10505 } 10506 10507 ASSERT(s->s_rip == (mddb_ri_t *)NULL); 10508 10509 /* Free the allocated rip structure */ 10510 if (s->s_rip != (mddb_ri_t *)NULL) 10511 free_rip(&s->s_rip); 10512 10513 /* use the saved rip structure */ 10514 s->s_rip = trip; 10515 trip = (mddb_ri_t *)NULL; 10516 10517 /* Let the set init code know an accept is in progress */ 10518 md_set_setstatus(setno, MD_SET_ACCEPT); 10519 10520 mddb_setexit(s); 10521 10522 s = NULL; 10523 10524 /* Take the set */ 10525 if (err = take_set(cp, mode)) 10526 goto out; 10527 10528 if (! mdisok(&cp->c_mde)) 10529 (void) mdstealerror(ep, &cp->c_mde); 10530 10531 out: 10532 md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT)); 10533 10534 kmem_free(cp, sizeof (mddb_config_t)); 10535 10536 if (trip) 10537 free_rip(&trip); 10538 10539 if (s) 10540 mddb_setexit(s); 10541 10542 return (err); 10543 } 10544 10545 /* 10546 * mddb_getinvlb_devid - cycles through the locator block and determines 10547 * if the device id's for any of the replica disks are invalid. 10548 * If so, it returns the diskname in the ctdptr. 
10549 * RETURN 10550 * -1 Error 10551 * cnt number of invalid device id's 10552 */ 10553 int 10554 mddb_getinvlb_devid( 10555 set_t setno, 10556 int count, 10557 int size, 10558 char **ctdptr 10559 ) 10560 { 10561 mddb_set_t *s; 10562 int err = 0; 10563 mddb_lb_t *lbp; 10564 int li; 10565 mddb_did_blk_t *did_blk; 10566 mddb_did_info_t *did_info; 10567 int len; 10568 int cnt = 0; 10569 char *cptr; 10570 md_name_suffix *sn; 10571 int i, dont_add_it; 10572 char *tmpctd, *diskname; 10573 char *tmpname; 10574 10575 cptr = *ctdptr; 10576 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 10577 return (-1); 10578 } 10579 10580 single_thread_start(s); 10581 lbp = s->s_lbp; 10582 10583 if (lbp->lb_setno != setno) { 10584 single_thread_end(s); 10585 mddb_setexit(s); 10586 return (-1); 10587 } 10588 10589 /* check for lb being devid style */ 10590 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 10591 did_blk = s->s_did_icp->did_ic_blkp; 10592 for (li = 0; li < lbp->lb_loccnt; li++) { 10593 did_info = &(did_blk->blk_info[li]); 10594 /* Only if devid exists and isn't valid */ 10595 if ((did_info->info_flags & MDDB_DID_EXISTS) && 10596 !(did_info->info_flags & MDDB_DID_VALID)) { 10597 /* 10598 * if we count more invalid did's than 10599 * was passed in there's an error somewhere 10600 */ 10601 if (cnt++ > count) { 10602 single_thread_end(s); 10603 mddb_setexit(s); 10604 return (-1); 10605 } 10606 10607 /* 10608 * Future note: Need to do something here 10609 * for the MN diskset case when device ids 10610 * are supported in disksets. 10611 * Can't add until merging devids_in_diskset 10612 * code into code base. 
10613 */ 10614 10615 sn = &s->s_lnp->ln_suffixes[0][li]; 10616 /* 10617 * check to make sure length of device name is 10618 * not greater than computed first time through 10619 */ 10620 len = sn->suf_len; 10621 if (len > size) { 10622 single_thread_end(s); 10623 mddb_setexit(s); 10624 return (-1); 10625 } 10626 tmpctd = *ctdptr; 10627 /* strip off slice part */ 10628 diskname = md_strdup(sn->suf_data); 10629 tmpname = strrchr(diskname, 's'); 10630 *tmpname = '\0'; 10631 dont_add_it = 0; 10632 /* look to see if diskname is already in list */ 10633 for (i = 0; i < (cnt-1); i++) { 10634 if (strcmp(diskname, tmpctd) == 0) { 10635 /* already there, don't add */ 10636 dont_add_it = 1; 10637 break; 10638 } 10639 /* point to next diskname in list */ 10640 tmpctd += size; 10641 } 10642 if (dont_add_it == 0) { 10643 /* add diskname to list */ 10644 (void) strcpy(cptr, diskname); 10645 cptr += size; 10646 } 10647 kmem_free(diskname, strlen(sn->suf_data) + 1); 10648 } 10649 } 10650 } 10651 /* null terminate the list */ 10652 *cptr = '\0'; 10653 /* 10654 * need to save the new pointer so that calling routine can continue 10655 * to add information onto the end. 10656 */ 10657 *ctdptr = cptr; 10658 single_thread_end(s); 10659 mddb_setexit(s); 10660 return (cnt); 10661 } 10662 10663 /* 10664 * mddb_validate_lb - count the number of lb's with invalid device id's. Keep 10665 * track of length of longest devicename. 
10666 * RETURN 10667 * -1 error 10668 * cnt number of lb's with invalid devid's 10669 */ 10670 int 10671 mddb_validate_lb( 10672 set_t setno, 10673 int *rmaxsz 10674 ) 10675 { 10676 mddb_set_t *s; 10677 int err = 0; 10678 mddb_lb_t *lbp; 10679 int li; 10680 mddb_did_blk_t *did_blk; 10681 mddb_did_info_t *did_info; 10682 int len; 10683 int cnt = 0; 10684 10685 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10686 return (-1); 10687 10688 single_thread_start(s); 10689 lbp = s->s_lbp; 10690 10691 if (lbp->lb_setno != setno) { 10692 single_thread_end(s); 10693 mddb_setexit(s); 10694 return (-1); 10695 } 10696 10697 /* lb must be in devid style */ 10698 if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) 10699 goto mvl_out; 10700 10701 did_blk = s->s_did_icp->did_ic_blkp; 10702 for (li = 0; li < lbp->lb_loccnt; li++) { 10703 char *minor_name; 10704 mddb_locator_t *lp; 10705 dev_t ddi_dev; 10706 ddi_devid_t devid; 10707 ddi_devid_t rtn_devid = NULL; 10708 int get_rval; 10709 10710 did_info = &(did_blk->blk_info[li]); 10711 if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) || 10712 (did_info->info_flags & MDDB_DID_VALID)) 10713 continue; 10714 10715 /* Here we know, did exists but isn't valid */ 10716 10717 lp = &lbp->lb_locators[li]; 10718 ddi_dev = expldev(lp->l_dev); 10719 get_rval = mddb_devid_get(s, li, &devid, &minor_name); 10720 ASSERT(get_rval == 1); 10721 if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) && 10722 (ddi_devid_compare(rtn_devid, devid) == 0)) { 10723 did_info->info_flags = MDDB_DID_VALID | 10724 MDDB_DID_EXISTS | 10725 MDDB_DID_UPDATED; 10726 } else { 10727 cnt++; 10728 /* 10729 * Future note: Need to do something here 10730 * for the MN diskset case when device ids 10731 * are supported in disksets. 10732 * Can't add until merging devids_in_diskset 10733 * code into code base. 
10734 */ 10735 len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len; 10736 if (*rmaxsz < len) 10737 *rmaxsz = len; 10738 } 10739 if (rtn_devid != NULL) 10740 ddi_devid_free(rtn_devid); 10741 } 10742 10743 mvl_out: 10744 10745 if (push_lb(s) != 0) 10746 cnt = -1; 10747 (void) upd_med(s, "mddb_validate_lb(0)"); 10748 single_thread_end(s); 10749 mddb_setexit(s); 10750 return (cnt); 10751 } 10752 10753 int 10754 check_active_locators() 10755 { 10756 mddb_set_t *s; 10757 mddb_lb_t *lbp; 10758 int li; 10759 int active = 0; 10760 10761 mutex_enter(&mddb_lock); 10762 /* there is nothing here..so we can unload */ 10763 if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) { 10764 mutex_exit(&mddb_lock); 10765 return (0); 10766 } 10767 s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db; 10768 lbp = s->s_lbp; 10769 if (lbp == NULL) { 10770 mutex_exit(&mddb_lock); 10771 return (0); 10772 } 10773 10774 for (li = 0; li < lbp->lb_loccnt; li++) { 10775 mddb_locator_t *lp = &lbp->lb_locators[li]; 10776 if (lp->l_flags & MDDB_F_ACTIVE) { 10777 active = 1; 10778 break; 10779 } 10780 } 10781 mutex_exit(&mddb_lock); 10782 return (active); 10783 } 10784 10785 /* 10786 * regetoptrecord: 10787 * -------------- 10788 * Update the in-core optimized resync record contents by re-reading the 10789 * record from the on-disk metadb. 10790 * The contents of the resync record will be overwritten by calling this 10791 * routine. This means that callers that require the previous contents to 10792 * be preserved must save the data before calling this routine. 10793 * Return values: 10794 * 0 - successfully read in resync record from a mddb 10795 * 1 - failure. Unable to read resync record from either mddb. 
10796 */ 10797 static int 10798 regetoptrecord( 10799 mddb_set_t *s, 10800 mddb_de_ic_t *dep 10801 ) 10802 { 10803 mddb_lb_t *lbp; 10804 mddb_locator_t *lp; 10805 mddb_rb32_t *rbp, *crbp; 10806 int li; 10807 int i; 10808 int err = 0; 10809 size_t recsize; 10810 10811 #if defined(_ILP32) && !defined(lint) 10812 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 10813 #endif 10814 10815 recsize = dep->de_recsize; 10816 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 10817 10818 single_thread_start(s); 10819 rbp = dep->de_rb; 10820 10821 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 10822 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 10823 10824 lbp = s->s_lbp; 10825 10826 for (i = 0; i < 2; i++) { 10827 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 10828 continue; 10829 li = dep->de_optinfo[i].o_li; 10830 lp = &lbp->lb_locators[li]; 10831 10832 if (! (lp->l_flags & MDDB_F_ACTIVE) || 10833 (lp->l_flags & MDDB_F_EMASTER)) 10834 continue; 10835 10836 /* 10837 * re-read the optimized resync record with failfast set 10838 * since a failed disk could lead to a very long wait. 
10839 */ 10840 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 10841 dep->de_blkcount, li, B_FAILFAST); 10842 10843 if (err) 10844 continue; 10845 10846 if (rbp->rb_magic != MDDB_MAGIC_RB) 10847 continue; 10848 10849 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 10850 continue; 10851 10852 /* Check the crc for this record */ 10853 if (rec_crcchk(s, dep, rbp)) { 10854 continue; 10855 } 10856 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 10857 10858 if (rbp == crbp) { 10859 if (rbp->rb_checksum != crbp->rb_checksum) 10860 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 10861 break; 10862 } 10863 rbp = crbp; 10864 } 10865 10866 single_thread_end(s); 10867 10868 if (rbp == crbp) { 10869 rbp->rb_private = 0; 10870 kmem_free((caddr_t)crbp, recsize); 10871 return (0); 10872 } 10873 uniqtime32(&rbp->rb_timestamp); 10874 /* Generate the crc for this record */ 10875 rec_crcgen(s, dep, rbp); 10876 kmem_free((caddr_t)crbp, recsize); 10877 return (1); 10878 } 10879 10880 /* 10881 * mddb_reread_rr: 10882 * Re-read the resync record from the on-disk copy. This is required for 10883 * multi-node support so that a new mirror-owner can determine if a resync 10884 * operation is required to guarantee data integrity. 
10885 * 10886 * Arguments: 10887 * setno Associated set 10888 * id Resync record ID 10889 * 10890 * Return Value: 10891 * 0 successful reread 10892 * -1 invalid set (not multi-node or non-existant) 10893 * >0 metadb state invalid, failed to reread 10894 */ 10895 int 10896 mddb_reread_rr( 10897 set_t setno, 10898 mddb_recid_t id 10899 ) 10900 { 10901 mddb_set_t *s; 10902 int err = 0; 10903 mddb_db_t *dbp; 10904 mddb_de_ic_t *dep; 10905 10906 if (setno >= md_nsets) 10907 return (-1); 10908 10909 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10910 return (-1); 10911 10912 if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) { 10913 mddb_setexit(s); 10914 return (-1); 10915 } 10916 10917 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 10918 dep = dbp->db_firstentry; 10919 while (dep && (dep->de_recid != DBID(id))) 10920 dep = dep->de_next; 10921 if (dep != NULL) 10922 break; 10923 } 10924 10925 if (dep != NULL) { 10926 err = regetoptrecord(s, dep); 10927 } else { 10928 err = -1; 10929 } 10930 mddb_setexit(s); 10931 return (err); 10932 } 10933 10934 /* 10935 * Set owner associated with MN optimized resync record. 10936 * 10937 * Optimized records have an owner node associated with them in 10938 * a MN diskset. The owner is only set on a node that is actively 10939 * writing to that record. The other nodes will show that record 10940 * as having an invalid owner. The owner for an optimized record 10941 * is used during fixoptrecord to determine which node should 10942 * write out the record when the replicas associated with that 10943 * optimized record have been changed. 10944 * 10945 * Called directly from mirror driver and not from an ioctl. 10946 * 10947 * Returns 10948 * NULL if successful. 10949 * MDDB_E_NORECORD if record not found. 
10950 */ 10951 int 10952 mddb_setowner( 10953 mddb_recid_t id, 10954 md_mn_nodeid_t owner 10955 ) 10956 { 10957 mddb_set_t *s; 10958 mddb_db_t *dbp; 10959 mddb_de_ic_t *dep; 10960 int found = 0; 10961 10962 10963 if (DBSET(id) >= md_nsets) 10964 return (MDDB_E_NORECORD); 10965 10966 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 10967 return (MDDB_E_NORECORD); 10968 10969 id = DBID(id); 10970 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 10971 for (dep = dbp->db_firstentry; 10972 dep != NULL; dep = dep->de_next) { 10973 if (dep->de_recid != id) 10974 continue; 10975 dep->de_owner_nodeid = owner; 10976 found = 1; 10977 break; 10978 } 10979 if (found) 10980 break; 10981 } 10982 10983 mddb_setexit(s); 10984 10985 if (!found) { 10986 return (MDDB_E_NORECORD); 10987 } 10988 10989 return (NULL); 10990 } 10991 10992 /* 10993 * mddb_parse re-reads portions of the mddb from disk given a list 10994 * of good replicas to read from and flags describing 10995 * which portion of the mddb to read in. 10996 * 10997 * Used in a MN diskset when the master has made a change to some part 10998 * of the mddb and wants to relay this information to the slaves. 
10999 */ 11000 int 11001 mddb_parse(mddb_parse_parm_t *mpp) 11002 { 11003 mddb_set_t *s; 11004 int err = 0; 11005 mddb_locator_t *lp, *old_lp; 11006 mddb_lb_t *lbp, *old_lbp; 11007 int rval = 0; 11008 int i, li; 11009 int found_good_one = 0; 11010 mddb_ln_t *lnp; 11011 mddb_block_t ln_blkcnt; 11012 md_error_t *ep = &mpp->c_mde; 11013 11014 if (mpp->c_setno >= md_nsets) 11015 return (EINVAL); 11016 11017 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11018 return (0); 11019 11020 if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11021 return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno)); 11022 } 11023 11024 if (!(MD_MNSET_SETNO(mpp->c_setno))) { 11025 mddb_setexit_no_parse(s); 11026 return (EINVAL); 11027 } 11028 11029 /* 11030 * Master node initiated this request, so there's no work for 11031 * the master node to do. 11032 */ 11033 if (md_set[mpp->c_setno].s_am_i_master) { 11034 mddb_setexit_no_parse(s); 11035 return (rval); 11036 } 11037 11038 single_thread_start(s); 11039 11040 if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) { 11041 lbp = 0; 11042 for (i = 0; i < MDDB_NLB; i++) { 11043 /* Walk through master's active list */ 11044 if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE)) 11045 continue; 11046 if (s->s_mbiarray[i] == NULL) 11047 continue; 11048 11049 /* Assumes master blocks are already setup */ 11050 if (lbp == (mddb_lb_t *)NULL) { 11051 lbp = (mddb_lb_t *)kmem_zalloc( 11052 dbtob(MDDB_MNLBCNT), KM_SLEEP); 11053 } 11054 err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i); 11055 11056 if (err) 11057 continue; 11058 11059 if (lbp->lb_magic != MDDB_MAGIC_LB) 11060 continue; 11061 if (lbp->lb_blkcnt != MDDB_MNLBCNT) 11062 continue; 11063 if (revchk(MDDB_REV_MNLB, lbp->lb_revision)) 11064 continue; 11065 if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT), 11066 NULL)) 11067 continue; 11068 if (lbp->lb_setno != s->s_setno) 11069 continue; 11070 /* 11071 * a commit count of zero means this locator has 11072 * been deleted 11073 */ 11074 if 
(lbp->lb_commitcnt == 0) { 11075 continue; 11076 } 11077 /* Found a good locator - keep it */ 11078 found_good_one = 1; 11079 break; 11080 } 11081 11082 /* 11083 * If found a good copy of the mddb, then read it into 11084 * this node's locator block. Fix up the set's s_mbiarray 11085 * pointer (master block incore array pointer) to be 11086 * in sync with the newly read in locator block. If a 11087 * new mddb was added, read in the master blocks associated 11088 * with the new mddb. If an mddb was deleted, free the 11089 * master blocks associated with deleted mddb. 11090 */ 11091 if (found_good_one) { 11092 /* Compare old and new view of mddb locator blocks */ 11093 old_lbp = s->s_lbp; 11094 for (li = 0; li < lbp->lb_loccnt; li++) { 11095 int mn_set; 11096 11097 lp = &lbp->lb_locators[li]; 11098 old_lp = &old_lbp->lb_locators[li]; 11099 11100 /* If old and new views match, continue */ 11101 if ((lp->l_flags & MDDB_F_ACTIVE) == 11102 (old_lp->l_flags & MDDB_F_ACTIVE)) 11103 continue; 11104 11105 if (lp->l_flags & MDDB_F_ACTIVE) { 11106 /* 11107 * If new mddb has been added - delete 11108 * old mbiarray and get new one. 11109 * 11110 * When devids are supported, will 11111 * need to get dev from devid. 11112 */ 11113 if (s->s_mbiarray[li]) { 11114 free_mbipp(&s->s_mbiarray[li]); 11115 } 11116 /* 11117 * If getmasters fails, getmasters 11118 * will set appropriate error flags. 11119 */ 11120 s->s_mbiarray[li] = getmasters(s, 11121 md_expldev(lp->l_dev), lp->l_blkno, 11122 (uint_t *)&(lp->l_flags), &mn_set); 11123 } else if (lp->l_flags & MDDB_F_DELETED) { 11124 /* 11125 * If old one has been deleted - 11126 * delete old mbiarray. 
11127 */ 11128 if (s->s_mbiarray[li]) { 11129 free_mbipp(&s->s_mbiarray[li]); 11130 } 11131 } 11132 } 11133 11134 /* Free this node's old view of mddb locator blocks */ 11135 kmem_free((caddr_t)s->s_lbp, 11136 dbtob(s->s_lbp->lb_blkcnt)); 11137 s->s_lbp = lbp; 11138 } else { 11139 if (lbp) 11140 kmem_free(lbp, dbtob(MDDB_MNLBCNT)); 11141 } 11142 } 11143 11144 if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) { 11145 lnp = s->s_lnp; 11146 lbp = s->s_lbp; 11147 ln_blkcnt = lbp->lb_lnblkcnt; 11148 s->s_lnp = NULL; /* readlocnames does this anyway */ 11149 for (li = 0; li < lbp->lb_loccnt; li++) { 11150 lp = &lbp->lb_locators[li]; 11151 11152 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 11153 (lp->l_flags & MDDB_F_EMASTER)) 11154 continue; 11155 11156 /* Successfully read the locator names */ 11157 if (readlocnames(s, li) == 0) 11158 break; 11159 } 11160 11161 if (li == lbp->lb_loccnt) { 11162 /* Did not successfully read locnames; restore lnp */ 11163 s->s_lnp = lnp; 11164 } else { 11165 /* readlocnames successful, free old struct */ 11166 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); 11167 } 11168 } 11169 11170 if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) { 11171 mddb_de_ic_t *dep, *tdep, *first_dep, *dep2; 11172 mddb_db_t *dbp; 11173 mddb_db32_t *db32p; 11174 mddb_de32_t *de32p, *de32p2; 11175 int writeout; 11176 11177 lbp = s->s_lbp; 11178 /* 11179 * Walk through directory block and directory entry incore 11180 * linked list looking for optimized resync records. 11181 * For each opt record found, re-read in directory block. 11182 * The directoy block consists of a number of directory 11183 * entries. The directory entry for this opt record will 11184 * describe which 2 mddbs actually contain the resync record 11185 * since it could have been relocated by the master node 11186 * due to mddb failure or mddb deletion. 
If this node 11187 * is the record owner for this opt record, then write out 11188 * the record to the 2 mddbs listed in the directory entry 11189 * if the mddbs locations are different than previously known. 11190 */ 11191 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11192 for (dep = dbp->db_firstentry; dep; 11193 dep = dep->de_next) { 11194 /* Found an opt record */ 11195 if (dep->de_flags & MDDB_F_OPT) 11196 break; 11197 } 11198 /* If no opt records found, go to next dbp */ 11199 if (dep == NULL) 11200 continue; 11201 11202 /* 11203 * Reread directory block from disk since 11204 * master could have rewritten in during fixoptrecord. 11205 */ 11206 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, 11207 KM_SLEEP); 11208 create_db32rec(db32p, dbp); 11209 for (li = 0; li < lbp->lb_loccnt; li++) { 11210 lp = &lbp->lb_locators[li]; 11211 11212 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 11213 (lp->l_flags & MDDB_F_EMASTER)) 11214 continue; 11215 11216 err = readblks(s, (caddr_t)db32p, 11217 db32p->db32_blknum, 1, li); 11218 if (err) 11219 continue; 11220 11221 /* Reverify db; go to next mddb if bad */ 11222 if ((db32p->db32_magic != MDDB_MAGIC_DB) || 11223 (revchk(MDDB_REV_DB, 11224 db32p->db32_revision)) || 11225 (crcchk(db32p, &db32p->db32_checksum, 11226 MDDB_BSIZE, NULL))) { 11227 continue; 11228 } else { 11229 break; 11230 } 11231 } 11232 /* 11233 * If all mddbs are unavailable then panic since 11234 * this slave cannot be allowed to continue out-of-sync 11235 * with the master node. Since the optimized resync 11236 * records are written by all nodes, all nodes must 11237 * stay in sync with the master. 11238 * 11239 * This also handles the case when all storage 11240 * connectivity to a slave node has failed. The 11241 * slave node will send an MDDB_OPTRECERR message to 11242 * the master node when the slave node has been unable 11243 * to write an optimized resync record to both 11244 * designated mddbs. 
After the master has fixed the 11245 * optimized records to be on available mddbs, the 11246 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS) 11247 * is sent to all slave nodes. If a slave node is 11248 * unable to access any mddb in order to read in the 11249 * relocated optimized resync record, then the slave 11250 * node must panic. 11251 */ 11252 if (li == lbp->lb_loccnt) { 11253 kmem_free((caddr_t)db32p, MDDB_BSIZE); 11254 cmn_err(CE_PANIC, "md: mddb: Node unable to " 11255 "access any SVM state database " 11256 "replicas for diskset %s\n", 11257 s->s_setname); 11258 } 11259 /* 11260 * Setup temp copy of linked list of de's. 11261 * Already have an incore copy, but need to walk 11262 * the directory entry list contained in the 11263 * new directory block that was just read in above. 11264 * After finding the directory entry of an opt record 11265 * by walking the incore list, find the corresponding 11266 * entry in the temporary list and then update 11267 * the incore directory entry record with 11268 * the (possibly changed) mddb location stored 11269 * for the optimized resync records. 11270 */ 11271 de32p = (mddb_de32_t *) 11272 ((void *) ((caddr_t) 11273 (&db32p->db32_firstentry) 11274 + sizeof (db32p->db32_firstentry))); 11275 tdep = (mddb_de_ic_t *) 11276 kmem_zalloc(sizeof (mddb_de_ic_t) - 11277 sizeof (mddb_block_t) + 11278 sizeof (mddb_block_t) * 11279 de32p->de32_blkcount, KM_SLEEP); 11280 de32tode(de32p, tdep); 11281 first_dep = tdep; 11282 while (de32p && de32p->de32_next) { 11283 de32p2 = nextentry(de32p); 11284 dep2 = (mddb_de_ic_t *)kmem_zalloc( 11285 sizeof (mddb_de_ic_t) - 11286 sizeof (mddb_block_t) + 11287 sizeof (mddb_block_t) * 11288 de32p2->de32_blkcount, KM_SLEEP); 11289 de32tode(de32p2, dep2); 11290 tdep->de_next = dep2; 11291 tdep = dep2; 11292 de32p = de32p2; 11293 } 11294 11295 /* Now, walk the incore directory entry list */ 11296 for (dep = dbp->db_firstentry; dep; 11297 dep = dep->de_next) { 11298 if (! 
(dep->de_flags & MDDB_F_OPT)) 11299 continue; 11300 /* 11301 * Found an opt record in the incore copy. 11302 * Find the corresponding entry in the temp 11303 * list. If anything has changed in the 11304 * opt record info between the incore copy 11305 * and the temp copy, update the incore copy 11306 * and set a flag to writeout the opt record 11307 * to the new mddb locations. 11308 */ 11309 for (tdep = first_dep; tdep; 11310 tdep = tdep->de_next) { 11311 if (dep->de_recid == tdep->de_recid) { 11312 writeout = 0; 11313 /* Check first mddb location */ 11314 if ((dep->de_optinfo[0].o_li != 11315 tdep->de_optinfo[0].o_li) || 11316 (dep->de_optinfo[0].o_flags != 11317 tdep->de_optinfo[0].o_flags)) { 11318 dep->de_optinfo[0] = 11319 tdep->de_optinfo[0]; 11320 writeout = 1; 11321 } 11322 /* Check second mddb location */ 11323 if ((dep->de_optinfo[1].o_li != 11324 tdep->de_optinfo[1].o_li) || 11325 (dep->de_optinfo[1].o_flags != 11326 tdep->de_optinfo[1].o_flags)) { 11327 dep->de_optinfo[1] = 11328 tdep->de_optinfo[1]; 11329 writeout = 1; 11330 } 11331 /* Record owner should rewrite it */ 11332 if ((writeout) && 11333 (dep->de_owner_nodeid == 11334 md_set[mpp->c_setno]. 11335 s_nodeid)) { 11336 (void) writeoptrecord(s, 11337 dep); 11338 } 11339 break; 11340 } 11341 } 11342 } 11343 /* 11344 * Update the incore checksum information for this 11345 * directory block to match the newly read in checksum. 11346 * This should have only changed if the incore and 11347 * temp directory entries differed, but it takes 11348 * more code to do the check than to just update 11349 * the information everytime. 
11350 */ 11351 dbp->db_checksum = db32p->db32_checksum; 11352 11353 /* Now free everything */ 11354 tdep = first_dep; 11355 while (tdep) { 11356 dep2 = tdep->de_next; 11357 kmem_free((caddr_t)tdep, 11358 sizeofde(tdep)); 11359 tdep = dep2; 11360 } 11361 kmem_free((caddr_t)db32p, MDDB_BSIZE); 11362 } 11363 rval = 0; 11364 } 11365 out: 11366 single_thread_end(s); 11367 mddb_setexit_no_parse(s); 11368 return (rval); 11369 } 11370 11371 int 11372 mddb_block(mddb_block_parm_t *mbp) 11373 { 11374 mddb_set_t *s; 11375 int err = 0; 11376 md_error_t *ep = &mbp->c_mde; 11377 11378 if (mbp->c_setno >= md_nsets) 11379 return (EINVAL); 11380 11381 /* 11382 * If the new_master flag is set for this setno we are in the middle 11383 * of a reconfig cycle, and blocking or unblocking is not needed. 11384 * Hence we can return success immediately 11385 */ 11386 if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) { 11387 return (0); 11388 } 11389 11390 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11391 return (0); 11392 11393 if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11394 return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno)); 11395 } 11396 11397 if (!(MD_MNSET_SETNO(mbp->c_setno))) { 11398 mddb_setexit_no_parse(s); 11399 return (EINVAL); 11400 } 11401 11402 single_thread_start(s); 11403 11404 if (mbp->c_blk_flags & MDDB_BLOCK_PARSE) 11405 md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); 11406 11407 if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE) 11408 md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); 11409 11410 single_thread_end(s); 11411 mddb_setexit_no_parse(s); 11412 return (err); 11413 } 11414 11415 /* 11416 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords 11417 * to relocate any optimized resync records to available mddbs. 11418 * This routine is only called on the master node. 11419 * 11420 * Used in a MN diskset when a slave node has failed to write an optimized 11421 * resync record. 
The failed mddb information is sent to the master node 11422 * so the master can relocate the optimized records, if possible. If the 11423 * failed mddb information has a mddb marked as failed that was previously 11424 * marked active on the master, the master sets its incore mddb state to 11425 * EWRITE and sets the PARSE_LOCBLK flag. The master node then attempts 11426 * to relocate any optimized records on the newly failed mddbs by calling 11427 * fixoptrecords. (fixoptrecords will set the PARSE_OPTRECS flag if any 11428 * optimized records are relocated.) 11429 * 11430 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE 11431 * flags and will send a PARSE message to the slave nodes. The PARSE_LOCBLK 11432 * flag causes the slave node to re-read in the locator block from disk. 11433 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory 11434 * blocks and write out any optimized resync records that have been 11435 * relocated to a different mddb. 
11436 */ 11437 int 11438 mddb_optrecfix(mddb_optrec_parm_t *mop) 11439 { 11440 mddb_set_t *s; 11441 int err = 0; 11442 mddb_lb_t *lbp; 11443 mddb_mnlb_t *mnlbp; 11444 mddb_locator_t *lp; 11445 int li; 11446 mddb_mnsidelocator_t *mnslp; 11447 mddb_drvnm_t *dn; 11448 int i, j; 11449 md_replica_recerr_t *recerr; 11450 md_error_t *ep = &mop->c_mde; 11451 int something_changed = 0; 11452 int alc, lc; 11453 int setno; 11454 11455 setno = mop->c_setno; 11456 if (mop->c_setno >= md_nsets) 11457 return (EINVAL); 11458 11459 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11460 return (0); 11461 11462 if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11463 return (mddbstatus2error(ep, err, NODEV32, mop->c_setno)); 11464 } 11465 11466 if (!(MD_MNSET_SETNO(mop->c_setno))) { 11467 mddb_setexit(s); 11468 return (EINVAL); 11469 } 11470 11471 single_thread_start(s); 11472 lbp = s->s_lbp; 11473 mnlbp = (mddb_mnlb_t *)lbp; 11474 11475 /* 11476 * If slave node has seen an mddb failure, but the master node 11477 * hasn't encountered this failure, mark the mddb as failed on 11478 * the master node and set the something_changed flag to 1. 
11479 */ 11480 for (i = 0; i < 2; i++) { 11481 recerr = &mop->c_recerr[i]; 11482 if (recerr->r_flags & MDDB_F_EWRITE) { 11483 li = recerr->r_li; 11484 lp = &lbp->lb_locators[li]; 11485 for (j = 0; j < MD_MNMAXSIDES; j++) { 11486 mnslp = &mnlbp->lb_mnsidelocators[j][li]; 11487 if (mnslp->mnl_sideno == s->s_sideno) 11488 break; 11489 } 11490 /* Do quick check using li */ 11491 if (j != MD_MNMAXSIDES) 11492 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 11493 11494 if ((j != MD_MNMAXSIDES) && 11495 (strncmp(dn->dn_data, recerr->r_driver_name, 11496 MD_MAXDRVNM) == 0) && 11497 (recerr->r_blkno == lp->l_blkno) && 11498 (recerr->r_mnum == mnslp->mnl_mnum)) { 11499 if ((lp->l_flags & MDDB_F_ACTIVE) || 11500 ((lp->l_flags & MDDB_F_EWRITE) == 0)) { 11501 something_changed = 1; 11502 lp->l_flags |= MDDB_F_EWRITE; 11503 lp->l_flags &= ~MDDB_F_ACTIVE; 11504 } 11505 } else { 11506 /* 11507 * Passed in li from slave does not match 11508 * the replica in the master's structures. 11509 * This could have occurred if a delete 11510 * mddb command was running when the 11511 * optimized resync record had a failure. 11512 * Search all replicas for this entry. 11513 * If no match, just ignore. 11514 * If a match, set replica in error. 
11515 */ 11516 for (li = 0; li < lbp->lb_loccnt; li++) { 11517 lp = &lbp->lb_locators[li]; 11518 if (lp->l_flags & MDDB_F_DELETED) 11519 continue; 11520 11521 for (j = 0; j < MD_MNMAXSIDES; j++) { 11522 mnslp = 11523 &mnlbp->lb_mnsidelocators[j][li]; 11524 if (mnslp->mnl_sideno == s->s_sideno) 11525 break; 11526 } 11527 if (j == MD_MNMAXSIDES) 11528 continue; 11529 11530 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 11531 if ((strncmp(dn->dn_data, recerr->r_driver_name, 11532 MD_MAXDRVNM) == 0) && 11533 (recerr->r_blkno == lp->l_blkno) && 11534 (recerr->r_mnum == mnslp->mnl_mnum)) { 11535 if ((lp->l_flags & MDDB_F_ACTIVE) || 11536 ((lp->l_flags & MDDB_F_EWRITE) 11537 == 0)) { 11538 something_changed = 1; 11539 lp->l_flags |= MDDB_F_EWRITE; 11540 lp->l_flags &= ~MDDB_F_ACTIVE; 11541 } 11542 break; 11543 } 11544 } 11545 } 11546 } 11547 } 11548 11549 /* 11550 * If this message changed nothing, then we're done since this 11551 * failure has already been handled. 11552 * If some mddb state has been changed, send a parse message to 11553 * the slave nodes so that the slaves will re-read the locator 11554 * block from disk. 11555 */ 11556 if (something_changed == 0) { 11557 single_thread_end(s); 11558 mddb_setexit(s); 11559 return (0); 11560 } else { 11561 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 11562 } 11563 11564 /* 11565 * Scan replicas setting MD_SET_TOOFEW if 11566 * 50% or more of the mddbs have seen errors. 11567 * Note: Don't call selectreplicas or writeretry 11568 * since these routines may end up setting the ACTIVE flag 11569 * on a failed mddb if the master is able to access the mddb 11570 * but the slave node couldn't. Need to have the ACTIVE flag 11571 * turned off in order to relocate the optimized records to 11572 * mddbs that are (hopefully) available on all nodes. 
11573 */ 11574 alc = 0; 11575 lc = 0; 11576 for (li = 0; li < lbp->lb_loccnt; li++) { 11577 lp = &lbp->lb_locators[li]; 11578 if (lp->l_flags & MDDB_F_DELETED) 11579 continue; 11580 lc++; 11581 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11582 continue; 11583 alc++; 11584 } 11585 11586 /* 11587 * If more than 50% mddbs have failed, then don't relocate opt recs. 11588 * The node sending the mddb failure information will detect TOOFEW 11589 * and will panic when it attempts to re-write the optimized record. 11590 */ 11591 if (alc < ((lc + 1) / 2)) { 11592 md_set_setstatus(setno, MD_SET_TOOFEW); 11593 (void) push_lb(s); 11594 (void) upd_med(s, "mddb_optrecfix(0)"); 11595 single_thread_end(s); 11596 mddb_setexit(s); 11597 return (0); 11598 } 11599 11600 /* Attempt to relocate optimized records that are on failed mddbs */ 11601 (void) fixoptrecords(s); 11602 11603 /* Push changed locator block out to disk */ 11604 (void) push_lb(s); 11605 (void) upd_med(s, "mddb_optrecfix(1)"); 11606 11607 /* Recheck for TOOFEW after writing out locator blocks */ 11608 alc = 0; 11609 lc = 0; 11610 for (li = 0; li < lbp->lb_loccnt; li++) { 11611 lp = &lbp->lb_locators[li]; 11612 if (lp->l_flags & MDDB_F_DELETED) 11613 continue; 11614 lc++; 11615 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11616 continue; 11617 alc++; 11618 } 11619 11620 /* If more than 50% mddbs have failed, then don't relocate opt recs */ 11621 if (alc < ((lc + 1) / 2)) { 11622 md_set_setstatus(setno, MD_SET_TOOFEW); 11623 single_thread_end(s); 11624 mddb_setexit(s); 11625 return (0); 11626 } 11627 11628 single_thread_end(s); 11629 mddb_setexit(s); 11630 return (0); 11631 } 11632 11633 /* 11634 * Check if incore mddb on master node matches ondisk mddb. 11635 * If not, master writes out incore view to all mddbs. 11636 * Have previously verified that master is an owner of the 11637 * diskset (master has snarfed diskset) and that diskset is 11638 * not stale. 
11639 * 11640 * Meant to be called during reconfig cycle during change of master. 11641 * Previous master in diskset may have changed the mddb and 11642 * panic'd before relaying information to slave nodes. New 11643 * master node just writes out its incore view of the mddb and 11644 * the replay of the change log will resync all the nodes. 11645 * 11646 * Only supported for MN disksets. 11647 * 11648 * Return values: 11649 * 0 - success 11650 * non-zero - failure 11651 */ 11652 int 11653 mddb_check_write_ioctl(mddb_config_t *info) 11654 { 11655 int err = 0; 11656 set_t setno = info->c_setno; 11657 mddb_set_t *s; 11658 int li; 11659 mddb_locator_t *lp; 11660 mddb_lb_t *lbp; 11661 mddb_mnlb_t *mnlbp_od; 11662 mddb_ln_t *lnp; 11663 mddb_mnln_t *mnlnp_od; 11664 mddb_db_t *dbp; 11665 mddb_de_ic_t *dep; 11666 int write_out_mddb; 11667 md_error_t *ep = &info->c_mde; 11668 int mddb_err = 0; 11669 int prev_li = 0; 11670 int rval = 0; 11671 int alc, lc; 11672 int mddbs_present = 0; 11673 11674 /* Verify that setno is in valid range */ 11675 if (setno >= md_nsets) 11676 return (EINVAL); 11677 11678 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11679 return (0); 11680 11681 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 11682 return (mddbstatus2error(ep, err, NODEV32, setno)); 11683 } 11684 11685 /* Calling diskset must be a MN diskset */ 11686 if (!(MD_MNSET_SETNO(setno))) { 11687 mddb_setexit(s); 11688 return (EINVAL); 11689 } 11690 11691 /* Re-verify that set is not stale */ 11692 if (md_get_setstatus(setno) & MD_SET_STALE) { 11693 mddb_setexit(s); 11694 return (mdmddberror(ep, MDE_DB_STALE, 11695 NODEV32, setno)); 11696 } 11697 11698 lbp = s->s_lbp; 11699 lnp = s->s_lnp; 11700 11701 /* 11702 * Previous master could have died during the write of data to 11703 * the mddbs so that the ondisk mddbs may not be consistent. 11704 * So, need to check the contents of the first and last active mddb 11705 * to see if the mddbs need to be rewritten. 
11706 */ 11707 for (li = 0; li < lbp->lb_loccnt; li++) { 11708 int checkcopy_err; 11709 11710 lp = &lbp->lb_locators[li]; 11711 /* Find replica that is active */ 11712 if (lp->l_flags & MDDB_F_DELETED) 11713 continue; 11714 mddbs_present = 1; 11715 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11716 continue; 11717 if (s->s_mbiarray[li] == NULL) 11718 continue; 11719 /* Check locator block */ 11720 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), 11721 KM_SLEEP); 11722 /* read in on-disk locator block */ 11723 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); 11724 11725 /* If err, try next mddb */ 11726 if (err) { 11727 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11728 continue; 11729 } 11730 11731 /* 11732 * We resnarf all changelog entries for this set. 11733 * They may have been altered by the previous master 11734 */ 11735 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11736 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 11737 if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) { 11738 continue; 11739 } 11740 /* This has been alloc'ed while joining the set */ 11741 if (dep->de_rb) { 11742 kmem_free(dep->de_rb, dep->de_recsize); 11743 dep->de_rb = (mddb_rb32_t *)NULL; 11744 } 11745 if (dep->de_rb_userdata) { 11746 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 11747 dep->de_rb_userdata = (caddr_t)NULL; 11748 } 11749 11750 err = getrecord(s, dep, li); 11751 if (err) { 11752 /* 11753 * When we see on error while reading the 11754 * changelog entries, we move on to the next 11755 * mddb 11756 */ 11757 err = 1; 11758 break; /* out of inner for-loop */ 11759 } 11760 allocuserdata(dep); 11761 } 11762 if (err) 11763 break; /* out of outer for-loop */ 11764 } 11765 11766 /* If err, try next mddb */ 11767 if (err) { 11768 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11769 continue; 11770 } 11771 11772 /* Is incore locator block same as ondisk? 
*/ 11773 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) 11774 == 1) { 11775 write_out_mddb = 1; 11776 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11777 break; 11778 } 11779 11780 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11781 11782 /* If lb ok, check locator names */ 11783 mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT), 11784 KM_SLEEP); 11785 /* read in on-disk locator names */ 11786 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, 11787 lbp->lb_lnblkcnt, li); 11788 11789 /* If err, try next mddb */ 11790 if (err) { 11791 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); 11792 continue; 11793 } 11794 11795 /* Are incore locator names same as ondisk? */ 11796 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) 11797 == 1) { 11798 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11799 write_out_mddb = 1; 11800 break; 11801 } 11802 11803 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11804 11805 /* 11806 * Check records in mddb. 11807 * If a read error is encountered, set the error flag and 11808 * continue to the next mddb. Otherwise, if incore data is 11809 * different from ondisk, then set the flag to write out 11810 * the mddb and break out. 11811 */ 11812 checkcopy_err = checkcopy(s, li); 11813 if (checkcopy_err == MDDB_F_EREAD) { 11814 lp->l_flags |= MDDB_F_EREAD; 11815 mddb_err = 1; 11816 continue; 11817 } else if (checkcopy_err == 1) { 11818 write_out_mddb = 1; 11819 break; 11820 } 11821 /* 11822 * Have found first active mddb and the data is the same as 11823 * incore - break out of loop 11824 */ 11825 write_out_mddb = 0; 11826 break; 11827 } 11828 11829 /* 11830 * Skip checking for last active mddb if: 11831 * - already found a mismatch in the first active mddb 11832 * (write_out_mddb is 1) OR 11833 * - didn't find a readable mddb when looking for first 11834 * active mddb (there are mddbs present but all failed 11835 * when read was attempted). 
11836 * 11837 * In either case, go to write_out_mddb label in order to attempt 11838 * to write out the data. If < 50% mddbs are available, panic. 11839 */ 11840 if ((write_out_mddb == 1) || 11841 ((li == lbp->lb_loccnt) && mddbs_present)) { 11842 write_out_mddb = 1; 11843 goto write_out_mddb; 11844 } 11845 11846 /* 11847 * Save which index was checked for the first active mddb. If only 1 11848 * active mddb, don't want to recheck the same mddb when looking for 11849 * last active mddb. 11850 */ 11851 prev_li = li; 11852 11853 /* 11854 * Now, checking for last active mddb. If found same index as before 11855 * (only 1 active mddb), then skip. 11856 */ 11857 for (li = (lbp->lb_loccnt - 1); li >= 0; li--) { 11858 int checkcopy_err; 11859 11860 lp = &lbp->lb_locators[li]; 11861 /* Find replica that is active */ 11862 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11863 continue; 11864 if (lp->l_flags & MDDB_F_DELETED) 11865 continue; 11866 if (s->s_mbiarray[li] == NULL) 11867 continue; 11868 /* If already checked mddb, bail out */ 11869 if (li == prev_li) 11870 break; 11871 /* Check locator block */ 11872 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), 11873 KM_SLEEP); 11874 /* read in on-disk locator block */ 11875 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); 11876 11877 /* If err, try next mddb */ 11878 if (err) { 11879 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11880 continue; 11881 } 11882 11883 11884 /* Is incore locator block same as ondisk? 
*/ 11885 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) 11886 == 1) { 11887 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11888 write_out_mddb = 1; 11889 break; 11890 } 11891 11892 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11893 11894 /* If lb ok, check locator names */ 11895 mnlnp_od = (mddb_mnln_t *) 11896 kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP); 11897 11898 /* read in on-disk locator names */ 11899 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, 11900 lbp->lb_lnblkcnt, li); 11901 11902 /* If err, try next mddb */ 11903 if (err) { 11904 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); 11905 continue; 11906 } 11907 11908 /* Are incore locator names same as ondisk? */ 11909 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) 11910 == 1) { 11911 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11912 write_out_mddb = 1; 11913 break; 11914 } 11915 11916 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11917 11918 /* 11919 * Check records in mddb. 11920 * If a read error is encountered, set the error flag and 11921 * continue to the next mddb. Otherwise, if incore data is 11922 * different from ondisk, then set the flag to write out 11923 * the mddb and break out. 11924 */ 11925 checkcopy_err = checkcopy(s, li); 11926 if (checkcopy_err == MDDB_F_EREAD) { 11927 lp->l_flags |= MDDB_F_EREAD; 11928 mddb_err = 1; 11929 continue; 11930 } else if (checkcopy_err == 1) { 11931 write_out_mddb = 1; 11932 break; 11933 } 11934 /* 11935 * Have found last active mddb and the data is the same as 11936 * incore - break out of loop 11937 */ 11938 write_out_mddb = 0; 11939 break; 11940 } 11941 11942 /* 11943 * If ondisk and incore versions of the mddb don't match, then 11944 * write out this node's incore version to disk. 11945 * Or, if unable to read a copy of the mddb, attempt to write 11946 * out a new one. 
11947 */ 11948 write_out_mddb: 11949 if (write_out_mddb) { 11950 /* Recompute free blocks based on incore information */ 11951 computefreeblks(s); /* set up free block bits */ 11952 11953 /* 11954 * Write directory entries and record blocks. 11955 * Use flag MDDB_WRITECOPY_SYNC so that writecopy 11956 * routine won't write out change log records. 11957 */ 11958 for (li = 0; li < lbp->lb_loccnt; li++) { 11959 lp = &lbp->lb_locators[li]; 11960 /* Don't write to inactive or deleted mddbs */ 11961 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11962 continue; 11963 if (lp->l_flags & MDDB_F_DELETED) 11964 continue; 11965 if (s->s_mbiarray[li] == NULL) 11966 continue; 11967 /* If encounter a write error, save it for later */ 11968 if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) { 11969 lp->l_flags |= MDDB_F_EWRITE; 11970 mddb_err = 1; 11971 } 11972 } 11973 11974 /* 11975 * Write out locator blocks to all replicas. 11976 * push_lb will set MDDB_F_EWRITE on replicas that fail. 11977 */ 11978 if (push_lb(s)) 11979 mddb_err = 1; 11980 (void) upd_med(s, "mddb_check_write_ioctl(0)"); 11981 11982 /* Write out locator names to all replicas */ 11983 lnp = s->s_lnp; 11984 uniqtime32(&lnp->ln_timestamp); 11985 lnp->ln_revision = MDDB_REV_MNLN; 11986 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 11987 11988 /* writeall sets MDDB_F_EWRITE if writes fails to replica */ 11989 if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 11990 lbp->lb_lnblkcnt, 0)) 11991 mddb_err = 1; 11992 11993 /* 11994 * The writes to the replicas above would have set 11995 * the MDDB_F_EWRITE flags if any write error was 11996 * encountered. 11997 * If < 50% of the mddbs are available, panic. 
11998 */ 11999 lc = alc = 0; 12000 for (li = 0; li < lbp->lb_loccnt; li++) { 12001 lp = &lbp->lb_locators[li]; 12002 if (lp->l_flags & MDDB_F_DELETED) 12003 continue; 12004 lc++; 12005 /* 12006 * If mddb: 12007 * - is not active (previously had an error) 12008 * - had an error reading the master blocks or 12009 * - had an error in writing to the mddb 12010 * then don't count this mddb in the active count. 12011 */ 12012 if (! (lp->l_flags & MDDB_F_ACTIVE) || 12013 (lp->l_flags & MDDB_F_EMASTER) || 12014 (lp->l_flags & MDDB_F_EWRITE)) 12015 continue; 12016 alc++; 12017 } 12018 if (alc < ((lc + 1) / 2)) { 12019 cmn_err(CE_PANIC, 12020 "md: Panic due to lack of DiskSuite state\n" 12021 " database replicas. Fewer than 50%% of " 12022 "the total were available,\n so panic to " 12023 "ensure data integrity."); 12024 } 12025 } 12026 12027 /* 12028 * If encountered an error during checking or writing of 12029 * mddbs, call selectreplicas so that replica error can 12030 * be properly handled. This will involve another attempt 12031 * to write the mddb out to any mddb marked MDDB_F_EWRITE. 12032 * If mddb still fails, it will have the MDDB_F_ACTIVE bit 12033 * turned off. Set the MDDB_SCANALLSYNC flag so that 12034 * selectreplicas doesn't overwrite the change log entries. 12035 * 12036 * Set the PARSE_LOCBLK flag in the mddb_set structure to show 12037 * that the locator block has been changed. 12038 */ 12039 if (mddb_err) { 12040 (void) selectreplicas(s, MDDB_SCANALLSYNC); 12041 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 12042 } 12043 12044 write_out_end: 12045 mddb_setexit(s); 12046 return (rval); 12047 } 12048 12049 /* 12050 * Set/reset/get set flags in set structure. 12051 * Used during reconfig cycle 12052 * Only supported for MN disksets. 
12053 * 12054 * Return values: 12055 * 0 - success 12056 * non-zero - failure 12057 */ 12058 int 12059 mddb_setflags_ioctl(mddb_setflags_config_t *info) 12060 { 12061 set_t setno = info->sf_setno; 12062 12063 /* Verify that setno is in valid range */ 12064 if (setno >= md_nsets) 12065 return (EINVAL); 12066 12067 /* 12068 * When setting the flags, the set may not 12069 * be snarfed yet. So, don't check for SNARFED or MNset 12070 * and don't call mddb_setenter. 12071 * In order to discourage bad ioctl calls, 12072 * verify that magic field in structure is set correctly. 12073 */ 12074 if (info->sf_magic != MDDB_SETFLAGS_MAGIC) 12075 return (EINVAL); 12076 12077 switch (info->sf_flags) { 12078 case MDDB_NM_SET: 12079 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) 12080 md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC); 12081 if (info->sf_setflags & MD_SET_MN_START_RC) 12082 md_set_setstatus(setno, MD_SET_MN_START_RC); 12083 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) 12084 md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC); 12085 break; 12086 12087 case MDDB_NM_RESET: 12088 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) 12089 md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC); 12090 if (info->sf_setflags & MD_SET_MN_START_RC) 12091 md_clr_setstatus(setno, MD_SET_MN_START_RC); 12092 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) 12093 md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC); 12094 break; 12095 12096 case MDDB_NM_GET: 12097 info->sf_setflags = md_get_setstatus(setno) & 12098 (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC| 12099 MD_SET_MN_MIR_STATE_RC); 12100 break; 12101 } 12102 12103 return (0); 12104 } 12105 12106 /* 12107 * md_update_minor 12108 * 12109 * This function updates the minor in the namespace entry for an 12110 * underlying metadevice. The function is called in mod_imp_set 12111 * where mod is sp, stripe, mirror and raid. 
12112 * 12113 */ 12114 int 12115 md_update_minor( 12116 set_t setno, 12117 side_t side, 12118 mdkey_t key 12119 ) 12120 { 12121 struct nm_next_hdr *nh; 12122 struct nm_name *n; 12123 char *shn; 12124 int retval = 1; 12125 12126 /* 12127 * Load the devid name space if it exists 12128 */ 12129 (void) md_load_namespace(setno, NULL, NM_DEVID); 12130 if (! md_load_namespace(setno, NULL, 0L)) { 12131 /* 12132 * Unload the devid namespace 12133 */ 12134 (void) md_unload_namespace(setno, NM_DEVID); 12135 return (0); 12136 } 12137 12138 rw_enter(&nm_lock.lock, RW_READER); 12139 12140 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 12141 retval = 0; 12142 goto out; 12143 } 12144 12145 /* 12146 * Look up the key 12147 */ 12148 if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) { 12149 /* 12150 * Find the entry, update its n_minor if metadevice 12151 */ 12152 if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) 12153 == NULL) { 12154 retval = 0; 12155 goto out; 12156 } 12157 12158 if (strcmp(shn, "md") == 0) { 12159 n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); 12160 } 12161 } 12162 12163 out: 12164 rw_exit(&nm_lock.lock); 12165 return (retval); 12166 } 12167 12168 /* 12169 * md_update_top_device_minor 12170 * 12171 * This function updates the minor in the namespace entry for a top 12172 * level metadevice. The function is called in mod_imp_set where 12173 * mod is sp, stripe, mirror and raid. 12174 * 12175 */ 12176 int 12177 md_update_top_device_minor( 12178 set_t setno, 12179 side_t side, 12180 md_dev64_t dev 12181 ) 12182 { 12183 struct nm_next_hdr *nh; 12184 struct nm_name *n; 12185 char *shn; 12186 int retval = 1; 12187 12188 /* 12189 * Load the devid name space if it exists 12190 */ 12191 (void) md_load_namespace(setno, NULL, NM_DEVID); 12192 if (! 
md_load_namespace(setno, NULL, 0L)) { 12193 /* 12194 * Unload the devid namespace 12195 */ 12196 (void) md_unload_namespace(setno, NM_DEVID); 12197 return (0); 12198 } 12199 12200 rw_enter(&nm_lock.lock, RW_READER); 12201 12202 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 12203 retval = 0; 12204 goto out; 12205 } 12206 12207 /* 12208 * Look up the key 12209 */ 12210 if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) { 12211 /* 12212 * Find the entry, update its n_minor if metadevice 12213 */ 12214 if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) 12215 == NULL) { 12216 retval = 0; 12217 goto out; 12218 } 12219 12220 if (strcmp(shn, "md") == 0) { 12221 n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); 12222 } 12223 } 12224 12225 out: 12226 rw_exit(&nm_lock.lock); 12227 return (retval); 12228 } 12229 12230 static void 12231 md_imp_nm( 12232 mddb_set_t *s 12233 ) 12234 { 12235 mddb_db_t *dbp; 12236 mddb_de_ic_t *dep; 12237 struct nm_rec_hdr *hdr; 12238 struct nm_header *hhdr; 12239 set_t setno = s->s_setno; 12240 12241 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 12242 for (dep = dbp->db_firstentry; dep != NULL; 12243 dep = dep->de_next) { 12244 switch (dep->de_type1) { 12245 12246 case MDDB_NM_HDR: 12247 case MDDB_DID_NM_HDR: 12248 12249 hhdr = (struct nm_header *) 12250 dep->de_rb_userdata; 12251 12252 hdr = &hhdr->h_names; 12253 if (hdr->r_next_recid > 0) { 12254 hdr->r_next_recid = MAKERECID(setno, 12255 DBID(hdr->r_next_recid)); 12256 } 12257 12258 hdr = &hhdr->h_shared; 12259 if (hdr->r_next_recid > 0) { 12260 hdr->r_next_recid = MAKERECID(setno, 12261 DBID(hdr->r_next_recid)); 12262 } 12263 break; 12264 12265 case MDDB_NM: 12266 case MDDB_DID_NM: 12267 case MDDB_SHR_NM: 12268 case MDDB_DID_SHR_NM: 12269 12270 hdr = (struct nm_rec_hdr *) 12271 dep->de_rb_userdata; 12272 12273 if (hdr->r_next_recid > 0) { 12274 hdr->r_next_recid = MAKERECID 12275 (setno, DBID(hdr->r_next_recid)); 12276 } 12277 break; 
12278 12279 default: 12280 break; 12281 } 12282 } 12283 } 12284 } 12285 12286 static int 12287 update_db_rec( 12288 mddb_set_t *s 12289 ) 12290 { 12291 mddb_db_t *dbp; 12292 mddb_de_ic_t *dep; 12293 mddb_recid_t ids[2]; 12294 12295 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 12296 for (dep = dbp->db_firstentry; dep != NULL; 12297 dep = dep->de_next) { 12298 if (! (dep->de_flags & MDDB_F_OPT)) { 12299 ids[0] = MAKERECID(s->s_setno, dep->de_recid); 12300 ids[1] = 0; 12301 if (mddb_commitrecs(ids)) { 12302 return (MDDB_E_NORECORD); 12303 } 12304 } 12305 } 12306 } 12307 return (0); 12308 } 12309 12310 static int 12311 update_mb( 12312 mddb_set_t *s 12313 ) 12314 { 12315 mddb_ri_t *rip; 12316 int err = 0; 12317 12318 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 12319 if (rip->ri_flags & MDDB_F_EMASTER) 12320 /* disk is powered off or not there */ 12321 continue; 12322 12323 if (md_get_setstatus(s->s_setno) & 12324 MD_SET_REPLICATED_IMPORT) { 12325 /* 12326 * It is a replicated set 12327 */ 12328 if (rip->ri_devid == (ddi_devid_t)NULL) { 12329 return (-1); 12330 } 12331 err = update_mb_devid(s, rip, rip->ri_devid); 12332 } else { 12333 /* 12334 * It is a non-replicated set 12335 * and there is no need to update 12336 * devid 12337 */ 12338 err = update_mb_devid(s, rip, NULL); 12339 } 12340 12341 if (err) 12342 return (err); 12343 } 12344 12345 return (0); 12346 } 12347 12348 static int 12349 update_setname( 12350 set_t setno 12351 ) 12352 { 12353 struct nm_next_hdr *nh; 12354 struct nm_shared_name *shn, *new_shn; 12355 char *prefix = "/dev/md/"; 12356 char *shrname; 12357 int len; 12358 mdkey_t o_key; 12359 uint32_t o_count, o_data; 12360 mddb_recid_t recid, ids[3]; 12361 int err = 0; 12362 mddb_set_t *dbp; 12363 12364 /* Import setname */ 12365 dbp = (mddb_set_t *)md_set[setno].s_db; 12366 len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1; 12367 shrname = kmem_zalloc(len, KM_SLEEP); 12368 (void) sprintf(shrname, "%s%s%s", prefix, 
dbp->s_setname, "/dsk/"); 12369 12370 rw_enter(&nm_lock.lock, RW_WRITER); 12371 if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) { 12372 /* 12373 * No namespace is okay 12374 */ 12375 err = 0; 12376 goto out; 12377 } 12378 12379 if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh, 12380 0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) { 12381 /* 12382 * No metadevice is okay 12383 */ 12384 err = 0; 12385 goto out; 12386 } 12387 12388 /* 12389 * We have it, go ahead and update the namespace. 12390 */ 12391 o_key = shn->sn_key; 12392 o_count = shn->sn_count; 12393 o_data = shn->sn_data; 12394 12395 if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED | 12396 NM_NOCOMMIT)) { 12397 err = MDDB_E_NORECORD; 12398 goto out; 12399 } 12400 if ((new_shn = (struct nm_shared_name *)alloc_entry( 12401 nh, md_set[setno].s_nmid, len, NM_SHARED | 12402 NM_NOCOMMIT, &recid)) == NULL) { 12403 err = MDDB_E_NORECORD; 12404 goto out; 12405 } 12406 12407 new_shn->sn_key = o_key; 12408 new_shn->sn_count = o_count; 12409 new_shn->sn_data = o_data; 12410 new_shn->sn_namlen = (ushort_t)len; 12411 (void) strcpy(new_shn->sn_name, shrname); 12412 12413 ids[0] = recid; 12414 ids[1] = md_set[setno].s_nmid; 12415 ids[2] = 0; 12416 err = mddb_commitrecs(ids); 12417 12418 out: 12419 if (shrname) 12420 kmem_free(shrname, len); 12421 rw_exit(&nm_lock.lock); 12422 return (err); 12423 } 12424 12425 /* 12426 * Returns 0 on success. 12427 * Returns -1 on failure with ep filled in. 
12428 */ 12429 static int 12430 md_imp_db( 12431 set_t setno, 12432 int stale_flag, 12433 md_error_t *ep 12434 ) 12435 { 12436 mddb_set_t *s; 12437 int err = 0; 12438 mddb_dt_t *dtp; 12439 mddb_lb_t *lbp; 12440 int i; 12441 int loccnt; 12442 12443 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 12444 return (mddbstatus2error(ep, err, NODEV32, setno)); 12445 } 12446 12447 /* Update dt */ 12448 if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) { 12449 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 12450 } 12451 12452 if ((err = dt_write(s)) != 0) { 12453 err = mdsyserror(ep, err); 12454 mddb_setexit(s); 12455 return (err); 12456 } 12457 12458 /* 12459 * Update lb, no need to update the mediator because 12460 * the diskset will only exist on the importing node 12461 * and as such a mediator adds no value. 12462 */ 12463 12464 /* Update lb */ 12465 if (stale_flag & MD_IMP_STALE_SET) { 12466 lbp = s->s_lbp; 12467 loccnt = lbp->lb_loccnt; 12468 for (i = 0; i < loccnt; i++) { 12469 mddb_locator_t *lp = &lbp->lb_locators[i]; 12470 md_dev64_t ndev = md_expldev(lp->l_dev); 12471 ddi_devid_t devid_ptr; 12472 12473 devid_ptr = s->s_did_icp->did_ic_devid[i]; 12474 if (devid_ptr == NULL) { 12475 /* 12476 * Already deleted, go to next one. 
12477 */ 12478 continue; 12479 } 12480 if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev, 12481 NULL)) { 12482 /* disk unavailable, mark deleted */ 12483 lp->l_flags = MDDB_F_DELETED; 12484 /* then remove the device id from the list */ 12485 free_mbipp(&s->s_mbiarray[i]); 12486 s->s_mbiarray[i] = 0; 12487 (void) mddb_devid_delete(s, i); 12488 } 12489 } 12490 md_clr_setstatus(setno, MD_SET_STALE); 12491 } 12492 12493 if ((err = writelocall(s)) != 0) { 12494 err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno); 12495 mddb_setexit(s); 12496 return (err); 12497 } 12498 12499 mddb_setexit(s); 12500 12501 /* Update db records */ 12502 if ((err = update_db_rec(s)) != 0) { 12503 return (mddbstatus2error(ep, err, NODEV32, setno)); 12504 } 12505 12506 /* Update setname embedded in the namespace */ 12507 if ((err = update_setname(setno)) != 0) 12508 return (mddbstatus2error(ep, err, NODEV32, setno)); 12509 12510 return (err); 12511 } 12512 12513 static void 12514 md_dr_add( 12515 md_set_record *sr, 12516 md_drive_record *dr 12517 ) 12518 { 12519 md_drive_record *drv; 12520 12521 if (sr->sr_driverec == 0) { 12522 sr->sr_driverec = dr->dr_selfid; 12523 return; 12524 } 12525 12526 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); 12527 drv->dr_nextrec != 0; 12528 drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec)) 12529 ; 12530 drv->dr_nextrec = dr->dr_selfid; 12531 } 12532 12533 static void 12534 md_setup_recids( 12535 md_set_record *sr, 12536 mddb_recid_t **ids, 12537 size_t size 12538 ) 12539 { 12540 md_drive_record *drv; 12541 int cnt; 12542 mddb_recid_t *recids; 12543 12544 recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t) 12545 * size, KM_SLEEP); 12546 recids[0] = sr->sr_selfid; 12547 cnt = 1; 12548 12549 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); 12550 /* CSTYLED */ 12551 drv != NULL;) { 12552 recids[cnt++] = drv->dr_selfid; 12553 if (drv->dr_nextrec != 0) 12554 drv = (md_drive_record *)mddb_getrecaddr 12555 
(drv->dr_nextrec); 12556 else 12557 drv = NULL; 12558 } 12559 recids[cnt] = 0; 12560 *ids = &recids[0]; 12561 } 12562 12563 /* 12564 * The purpose of this function is to replace the old_devid with the 12565 * new_devid in the given namespace. This is used for importing 12566 * remotely replicated drives. 12567 */ 12568 int 12569 md_update_namespace_rr_did( 12570 mddb_config_t *cp 12571 ) 12572 { 12573 set_t setno = cp->c_setno; 12574 struct nm_next_hdr *nh; 12575 mdkey_t key = MD_KEYWILD; 12576 side_t side = MD_SIDEWILD; 12577 mddb_recid_t recids[3]; 12578 struct did_min_name *n; 12579 struct nm_next_hdr *did_shr_nh; 12580 struct did_shr_name *shr_n; 12581 mdkey_t ent_did_key; 12582 uint32_t ent_did_count; 12583 uint32_t ent_did_data; 12584 size_t ent_size, size; 12585 ddi_devid_t devid = NULL; 12586 struct did_shr_name *shn; 12587 size_t offset; 12588 struct nm_next_hdr *this_did_shr_nh; 12589 void *old_devid, *new_devid; 12590 12591 if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED)) 12592 return (EIO); 12593 12594 old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid; 12595 new_devid = (void *)(uintptr_t)cp->c_locator.l_devid; 12596 12597 /* 12598 * It is okay if we dont have any configuration 12599 */ 12600 offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name)); 12601 if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED)) 12602 == NULL) { 12603 return (0); 12604 } 12605 while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) { 12606 /* check out every entry in the namespace */ 12607 if ((n = (struct did_min_name *)lookup_entry(nh, setno, 12608 side, key, NODEV64, NM_DEVID)) == NULL) { 12609 continue; 12610 } else { 12611 did_shr_nh = get_first_record(setno, 0, NM_DEVID | 12612 NM_SHARED); 12613 if (did_shr_nh == NULL) { 12614 return (ENOENT); 12615 } 12616 this_did_shr_nh = did_shr_nh->nmn_nextp; 12617 shr_n = (struct did_shr_name *)lookup_shared_entry( 12618 did_shr_nh, n->min_devid_key, (char *)0, 12619 &recids[0], 
NM_DEVID); 12620 if (shr_n == NULL) { 12621 return (ENOENT); 12622 } 12623 rw_enter(&nm_lock.lock, RW_WRITER); 12624 devid = (ddi_devid_t)shr_n->did_devid; 12625 /* find this devid in the incore replica */ 12626 if (ddi_devid_compare(devid, old_devid) == 0) { 12627 /* 12628 * found the corresponding entry 12629 * update with new devid 12630 */ 12631 /* first remove old devid info */ 12632 ent_did_key = shr_n ->did_key; 12633 ent_did_count = shr_n->did_count; 12634 ent_did_data = shr_n->did_data; 12635 ent_size = DID_SHR_NAMSIZ(shr_n); 12636 size = ((struct nm_rec_hdr *) 12637 this_did_shr_nh->nmn_record)-> 12638 r_used_size - offset - ent_size; 12639 if (size == 0) { 12640 (void) bzero(shr_n, ent_size); 12641 } else { 12642 (void) ovbcopy((caddr_t)shr_n + 12643 ent_size, shr_n, size); 12644 (void) bzero((caddr_t)shr_n + 12645 size, ent_size); 12646 } 12647 ((struct nm_rec_hdr *)this_did_shr_nh-> 12648 nmn_record)->r_used_size -= 12649 ent_size; 12650 /* add in new devid info */ 12651 if ((shn = (struct did_shr_name *) 12652 alloc_entry(did_shr_nh, 12653 md_set[setno].s_did_nmid, 12654 cp->c_locator.l_devid_sz, 12655 NM_DEVID | NM_SHARED | NM_NOCOMMIT, 12656 &recids[0])) == NULL) { 12657 rw_exit(&nm_lock.lock); 12658 return (ENOMEM); 12659 } 12660 shn->did_key = ent_did_key; 12661 shn->did_count = ent_did_count; 12662 ent_did_data |= NM_DEVID_VALID; 12663 shn->did_data = ent_did_data; 12664 shn->did_size = ddi_devid_sizeof( 12665 new_devid); 12666 bcopy((void *)new_devid, (void *) 12667 shn->did_devid, shn->did_size); 12668 recids[1] = md_set[setno].s_nmid; 12669 recids[2] = 0; 12670 mddb_commitrecs_wrapper(recids); 12671 } 12672 rw_exit(&nm_lock.lock); 12673 } 12674 } 12675 12676 return (0); 12677 } 12678 12679 /* 12680 * namespace is loaded before this is called. 12681 * This function is a wrapper for md_update_namespace_rr_did. 
12682 * 12683 * md_update_namespace_rr_did may be called twice if attempting to 12684 * resolve a replicated device id during the take of a diskset - once 12685 * for the diskset namespace and a second time for the local namespace. 12686 * The local namespace would need to be updated when a drive has been 12687 * found during a take of the diskset that hadn't been resolved during 12688 * the import (aka partial replicated import). 12689 * 12690 * If being called during the import of the diskset (IMPORT flag set) 12691 * md_update_namespace_rr_did will only be called once with the disket 12692 * namespace. 12693 */ 12694 int 12695 md_update_nm_rr_did_ioctl( 12696 mddb_config_t *cp 12697 ) 12698 { 12699 int rval = 0; 12700 12701 /* If update of diskset namespace fails, stop and return failure */ 12702 if ((rval = md_update_namespace_rr_did(cp)) != 0) 12703 return (rval); 12704 12705 if (cp->c_flags & MDDB_C_IMPORT) 12706 return (0); 12707 12708 /* If update of local namespace fails, return failure */ 12709 cp->c_setno = MD_LOCAL_SET; 12710 rval = md_update_namespace_rr_did(cp); 12711 return (rval); 12712 } 12713 12714 /*ARGSUSED*/ 12715 int 12716 md_imp_snarf_set( 12717 mddb_config_t *cp 12718 ) 12719 { 12720 set_t setno; 12721 int stale_flag; 12722 mddb_set_t *s; 12723 int i, err = 0; 12724 md_ops_t *ops; 12725 md_error_t *ep = &cp->c_mde; 12726 12727 setno = cp->c_setno; 12728 stale_flag = cp->c_flags; 12729 12730 mdclrerror(ep); 12731 if (setno >= md_nsets) { 12732 return (mdsyserror(ep, EINVAL)); 12733 } 12734 12735 md_haltsnarf_enter(setno); 12736 if (md_get_setstatus(setno) & MD_SET_IMPORT) { 12737 goto out; 12738 } 12739 12740 /* Set the bit first otherwise load_old_replicas can fail */ 12741 md_set_setstatus(setno, MD_SET_IMPORT); 12742 12743 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 12744 err = mddbstatus2error(ep, err, NODEV32, setno); 12745 goto out; 12746 } 12747 12748 /* 12749 * Upon completion of load_old_replicas, the old setno is 
12750 * restored from the disk so we need to reset 12751 */ 12752 s->s_lbp->lb_setno = setno; 12753 12754 /* 12755 * Fixup the NM records before loading namespace 12756 */ 12757 (void) md_imp_nm(s); 12758 mddb_setexit(s); 12759 12760 /* 12761 * Load the devid name space if it exists 12762 * and ask each module to fixup unit records 12763 */ 12764 if (!md_load_namespace(setno, NULL, NM_DEVID)) { 12765 err = mdsyserror(ep, ENOENT); 12766 goto cleanup; 12767 } 12768 if (!md_load_namespace(setno, NULL, 0L)) { 12769 (void) md_unload_namespace(setno, NM_DEVID); 12770 err = mdsyserror(ep, ENOENT); 12771 goto cleanup; 12772 } 12773 12774 do { 12775 i = 0; 12776 for (ops = md_opslist; ops != NULL; ops = ops->md_next) 12777 if (ops->md_imp_set != NULL) 12778 i += ops->md_imp_set(setno); 12779 } while (i); 12780 12781 /* 12782 * Fixup 12783 * (1) locator block 12784 * (2) locator name block if necessary 12785 * (3) master block 12786 * (4) directory block 12787 * calls appropriate writes to push changes out 12788 */ 12789 if ((err = md_imp_db(setno, stale_flag, ep)) != 0) { 12790 goto cleanup; 12791 } 12792 12793 /* 12794 * Don't unload namespace if importing a replicated diskset. 12795 * Namespace will be unloaded with an explicit RELEASE_SET ioctl. 12796 */ 12797 if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) { 12798 md_haltsnarf_exit(setno); 12799 return (err); 12800 } 12801 12802 cleanup: 12803 /* 12804 * Halt the set 12805 */ 12806 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 12807 (void) md_halt_set(setno, MD_HALT_ALL); 12808 rw_exit(&md_unit_array_rw.lock); 12809 12810 /* 12811 * Unload the namespace for the imported set 12812 */ 12813 mutex_enter(&mddb_lock); 12814 mddb_unload_set(setno); 12815 mutex_exit(&mddb_lock); 12816 12817 out: 12818 md_haltsnarf_exit(setno); 12819 md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); 12820 return (err); 12821 } 12822 #endif /* MDDB_FAKE */ 12823