1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/types.h> 30 #include <sys/conf.h> 31 #include <sys/time.h> 32 #include <sys/uio.h> 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/systeminfo.h> 36 #include <sys/sysmacros.h> 37 #include <sys/buf.h> 38 #include <sys/kmem.h> 39 #include <sys/file.h> 40 #include <sys/open.h> 41 #include <sys/debug.h> 42 #include <sys/stat.h> 43 #include <sys/lvm/mdvar.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/lvm/md_convert.h> 46 #include <sys/types.h> 47 #include <sys/kmem.h> 48 #include <sys/lvm/mdmn_commd.h> 49 #include <sys/cladm.h> 50 51 mhd_mhiargs_t defmhiargs = { 52 1000, 53 { 6000, 6000, 30000 } 54 }; 55 56 #define MDDB 57 58 #include <sys/lvm/mdvar.h> 59 #include <sys/lvm/mdmed.h> 60 #include <sys/lvm/md_names.h> 61 #include <sys/cred.h> 62 #include <sys/ddi.h> 63 #include <sys/sunddi.h> 64 #include <sys/esunddi.h> 65 66 #include <sys/sysevent/eventdefs.h> 67 #include <sys/sysevent/svm.h> 68 69 extern char 
svm_bootpath[]; 70 71 int md_maxbootlist = MAXBOOTLIST; 72 static ulong_t mddb_maxblocks = 0; /* tune for small records */ 73 static int mddb_maxbufheaders = 50; 74 static uint_t mddb_maxcopies = MDDB_NLB; 75 76 /* 77 * If this is set, more detailed messages about DB init will be given, instead 78 * of just the MDE_DB_NODB. 79 */ 80 static int mddb_db_err_detail = 0; 81 82 /* 83 * This lock is used to single-thread load/unload of all sets 84 */ 85 static kmutex_t mddb_lock; 86 87 /* 88 * You really do NOT want to change this boolean. 89 * It can be VERY dangerous to do so. Loss of 90 * data may occur. USE AT YOUR OWN RISK!!!! 91 */ 92 static int mddb_allow_half = 0; 93 /* 94 * For mirrored root allow reboot with only half the replicas available 95 * Flag inserted for Santa Fe project. 96 */ 97 int mirrored_root_flag; 98 99 #define ISWHITE(c) (((c) == ' ') || ((c) == '\t') || \ 100 ((c) == '\r') || ((c) == '\n')) 101 #define ISNUM(c) (((c) >= '0') && ((c) <= '9')) 102 103 #define SETMUTEX(setno) (&md_set[setno].s_dbmx) 104 105 extern md_krwlock_t md_unit_array_rw; /* md.c */ 106 extern set_t md_nsets; /* md.c */ 107 extern int md_nmedh; /* md.c */ 108 extern md_set_t md_set[]; /* md.c */ 109 extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*); 110 extern dev_info_t *md_devinfo; 111 extern int md_init_debug; 112 extern int md_status; 113 extern md_ops_t *md_opslist; 114 extern md_krwlock_t nm_lock; 115 116 static int update_locatorblock(mddb_set_t *s, md_dev64_t dev, 117 ddi_devid_t didptr); 118 119 /* 120 * Defines for crc calculation for records 121 * rec_crcgen generates a crc checksum for a record block 122 * rec_crcchk checks the crc checksum for a record block 123 */ 124 #define REC_CRCGEN 0 125 #define REC_CRCCHK 1 126 #define rec_crcgen(s, dep, rbp) \ 127 (void) rec_crcfunc(s, dep, rbp, REC_CRCGEN) 128 #define rec_crcchk(s, dep, rbp) \ 129 rec_crcfunc(s, dep, rbp, REC_CRCCHK) 130 131 /* 132 * During upgrade, SVM basically runs with the devt from the target 
133 * being upgraded. Translations are made from the target devt to the 134 * miniroot devt when writing data out to the disk. This is done by 135 * the following routines: 136 * wrtblklst 137 * writeblks 138 * readblklst 139 * readblks 140 * dt_read 141 * 142 * The following routines are used by the routines listed above and 143 * expect a translated (aka miniroot) devt: 144 * getblks 145 * getmasters 146 * 147 * Also, when calling any system routines, such as ddi_lyr_get_devid, 148 * the translated (aka miniroot) devt must be used. 149 * 150 * By the same token, the major number and major name conversion operations 151 * need to use the name_to_major file from the target system instead 152 * of the name_to_major file on the miniroot. So, calls to 153 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major 154 * when running on an upgrade. Same is true with calls to 155 * ddi_major_to_name. 156 */ 157 158 159 #ifndef MDDB_FAKE 160 161 static int 162 mddb_rwdata( 163 mddb_set_t *s, /* incore db set structure */ 164 int flag, /* B_ASYNC or 0 passed in here */ 165 buf_t *bp 166 ) 167 { 168 int err = 0; 169 170 bp->b_flags = (flag | B_BUSY) & (~B_ASYNC); 171 172 mutex_exit(SETMUTEX(s->s_setno)); 173 if (mdv_strategy_tstpnt == NULL || 174 (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0) 175 (void) bdev_strategy(bp); 176 177 if (flag & B_ASYNC) { 178 mutex_enter(SETMUTEX(s->s_setno)); 179 return (0); 180 } 181 182 err = biowait(bp); 183 mutex_enter(SETMUTEX(s->s_setno)); 184 return (err); 185 } 186 187 static void 188 setidentifier( 189 mddb_set_t *s, 190 identifier_t *ident 191 ) 192 { 193 if (s->s_setno == MD_LOCAL_SET) 194 (void) strcpy(&ident->serial[0], s->s_ident.serial); 195 else 196 ident->createtime = s->s_ident.createtime; 197 } 198 199 static int 200 cmpidentifier( 201 mddb_set_t *s, 202 identifier_t *ident 203 ) 204 { 205 if (s->s_setno == MD_LOCAL_SET) 206 return (strcmp(ident->serial, s->s_ident.serial)); 207 else 208 return 
(timercmp(&ident->createtime, 209 /*CSTYLED*/ 210 &s->s_ident.createtime, !=)); 211 } 212 213 static int 214 mddb_devopen( 215 md_dev64_t dev 216 ) 217 { 218 dev_t ddi_dev = md_dev64_to_dev(dev); 219 220 if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0) 221 return (0); 222 return (1); 223 } 224 225 static void 226 mddb_devclose( 227 md_dev64_t dev 228 ) 229 { 230 (void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred); 231 } 232 233 /* 234 * stripe_skip_ts 235 * 236 * Returns a list of fields to be skipped in the stripe record structure. 237 * These fields are ms_timestamp in the component structure. 238 * Used to skip these fields when calculating the checksum. 239 */ 240 static crc_skip_t * 241 stripe_skip_ts(void *un, uint_t revision) 242 { 243 struct ms_row32_od *small_mdr; 244 struct ms_row *big_mdr; 245 uint_t row, comp, ncomps, compoff; 246 crc_skip_t *skip; 247 crc_skip_t *skip_prev; 248 crc_skip_t skip_start = {0, 0, 0}; 249 ms_unit_t *big_un; 250 ms_unit32_od_t *small_un; 251 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 252 253 if (revision == MDDB_REV_RB) { 254 small_un = (ms_unit32_od_t *)un; 255 skip_prev = &skip_start; 256 257 if (small_un->un_nrows == 0) 258 return (NULL); 259 /* 260 * walk through all rows to find the total number 261 * of components 262 */ 263 small_mdr = &small_un->un_row[0]; 264 ncomps = 0; 265 for (row = 0; (row < small_un->un_nrows); row++) { 266 ncomps += small_mdr[row].un_ncomp; 267 } 268 269 /* Now walk through the components */ 270 compoff = small_un->un_ocomp + rb_off; 271 for (comp = 0; (comp < ncomps); ++comp) { 272 uint_t mdcp = compoff + 273 (comp * sizeof (ms_comp32_od_t)); 274 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 275 KM_SLEEP); 276 skip->skip_offset = mdcp + 277 offsetof(ms_comp32_od_t, un_mirror.ms_timestamp); 278 skip->skip_size = sizeof (md_timeval32_t); 279 skip_prev->skip_next = skip; 280 skip_prev = skip; 281 } 282 } else { 283 big_un = (ms_unit_t *)un; 284 
skip_prev = &skip_start; 285 286 if (big_un->un_nrows == 0) 287 return (NULL); 288 /* 289 * walk through all rows to find the total number 290 * of components 291 */ 292 big_mdr = &big_un->un_row[0]; 293 ncomps = 0; 294 for (row = 0; (row < big_un->un_nrows); row++) { 295 ncomps += big_mdr[row].un_ncomp; 296 } 297 298 /* Now walk through the components */ 299 compoff = big_un->un_ocomp + rb_off; 300 for (comp = 0; (comp < ncomps); ++comp) { 301 uint_t mdcp = compoff + 302 (comp * sizeof (ms_comp_t)); 303 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 304 KM_SLEEP); 305 skip->skip_offset = mdcp + 306 offsetof(ms_comp_t, un_mirror.ms_timestamp); 307 skip->skip_size = sizeof (md_timeval32_t); 308 skip_prev->skip_next = skip; 309 skip_prev = skip; 310 } 311 } 312 /* Return the start of the list of fields to skip */ 313 return (skip_start.skip_next); 314 } 315 316 /* 317 * mirror_skip_ts 318 * 319 * Returns a list of fields to be skipped in the mirror record structure. 320 * This includes un_last_read and sm_timestamp for each submirror 321 * Used to skip these fields when calculating the checksum. 
322 */ 323 static crc_skip_t * 324 mirror_skip_ts(uint_t revision) 325 { 326 int i; 327 crc_skip_t *skip; 328 crc_skip_t *skip_prev; 329 crc_skip_t skip_start = {0, 0, 0}; 330 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 331 332 skip_prev = &skip_start; 333 334 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 335 if (revision == MDDB_REV_RB) { 336 skip->skip_offset = offsetof(mm_unit32_od_t, 337 un_last_read) + rb_off; 338 } else { 339 skip->skip_offset = offsetof(mm_unit_t, 340 un_last_read) + rb_off; 341 } 342 skip->skip_size = sizeof (int); 343 skip_prev->skip_next = skip; 344 skip_prev = skip; 345 346 for (i = 0; i < NMIRROR; i++) { 347 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 348 if (revision == MDDB_REV_RB) { 349 skip->skip_offset = offsetof(mm_unit32_od_t, 350 un_sm[i].sm_timestamp) + rb_off; 351 } else { 352 skip->skip_offset = offsetof(mm_unit_t, 353 un_sm[i].sm_timestamp) + rb_off; 354 } 355 skip->skip_size = sizeof (md_timeval32_t); 356 skip_prev->skip_next = skip; 357 skip_prev = skip; 358 } 359 /* Return the start of the list of fields to skip */ 360 return (skip_start.skip_next); 361 } 362 363 /* 364 * hotspare_skip_ts 365 * 366 * Returns a list of the timestamp fields in the hotspare record structure. 367 * Used to skip these fields when calculating the checksum. 
368 */ 369 static crc_skip_t * 370 hotspare_skip_ts(uint_t revision) 371 { 372 crc_skip_t *skip; 373 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]); 374 375 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 376 if (revision == MDDB_REV_RB) { 377 skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) + 378 rb_off; 379 } else { 380 skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) + 381 rb_off; 382 } 383 skip->skip_size = sizeof (md_timeval32_t); 384 return (skip); 385 } 386 387 /* 388 * rec_crcfunc 389 * 390 * Calculate or check the checksum for a record 391 * Calculate the crc if check == 0, Check the crc if check == 1 392 * 393 * Record block may be written by different nodes in a multi-owner diskset 394 * (in case of master change), the function rec_crcchk excludes timestamp 395 * fields in crc computation of record data. 396 * Otherwise, timestamp fields will cause each node to have a different 397 * checksum for same record block causing the exclusive-or of all record block 398 * checksums and data block record sums to be non-zero after new master writes 399 * at least one record block. 400 */ 401 static uint_t 402 rec_crcfunc( 403 mddb_set_t *s, 404 mddb_de_ic_t *dep, 405 mddb_rb32_t *rbp, 406 int check 407 ) 408 { 409 crc_skip_t *skip; 410 crc_skip_t *skip_tail; 411 mddb_type_t type = dep->de_type1; 412 uint_t ret; 413 414 /* 415 * Generate a list of the areas to be skipped when calculating 416 * the checksum. 417 * First skip rb_checksum, rb_private and rb_userdata. 
418 */ 419 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP); 420 skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle); 421 skip->skip_size = 3 * sizeof (uint_t); 422 skip_tail = skip; 423 if (MD_MNSET_SETNO(s->s_setno)) { 424 /* For a MN set, skip rb_timestamp */ 425 skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), 426 KM_SLEEP); 427 skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp); 428 skip_tail->skip_size = sizeof (md_timeval32_t); 429 skip->skip_next = skip_tail; 430 431 /* Now add a list of timestamps to be skipped */ 432 if (type >= MDDB_FIRST_MODID) { 433 switch (dep->de_flags) { 434 case MDDB_F_STRIPE: 435 skip_tail->skip_next = 436 stripe_skip_ts((void *)rbp->rb_data, 437 rbp->rb_revision); 438 break; 439 case MDDB_F_MIRROR: 440 skip_tail->skip_next = 441 mirror_skip_ts(rbp->rb_revision); 442 break; 443 case MDDB_F_HOTSPARE: 444 skip_tail->skip_next = 445 hotspare_skip_ts(rbp->rb_revision); 446 break; 447 default: 448 break; 449 } 450 } 451 } 452 453 if (check) { 454 ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 455 } else { 456 crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip); 457 ret = rbp->rb_checksum; 458 } 459 while (skip) { 460 crc_skip_t *skip_save = skip; 461 462 skip = skip->skip_next; 463 kmem_free(skip_save, sizeof (crc_skip_t)); 464 } 465 return (ret); 466 } 467 468 static mddb_bf_t * 469 allocbuffer( 470 mddb_set_t *s, 471 int sleepflag 472 ) 473 { 474 mddb_bf_t *bfp; 475 476 while ((bfp = s->s_freebufhead) == NULL) { 477 if (sleepflag == MDDB_NOSLEEP) 478 return ((mddb_bf_t *)NULL); 479 ++s->s_bufmisses; 480 #ifdef DEBUG 481 if (s->s_bufmisses == 1) 482 cmn_err(CE_NOTE, 483 "md: mddb: set %u sleeping for buffer", s->s_setno); 484 #endif 485 s->s_bufwakeup = 1; 486 cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno)); 487 } 488 s->s_freebufhead = bfp->bf_next; 489 bzero((caddr_t)bfp, sizeof (*bfp)); 490 bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf; 491 bfp->bf_buf.b_flags = 
B_BUSY; /* initialize flags */ 492 return (bfp); 493 } 494 495 static void 496 freebuffer( 497 mddb_set_t *s, 498 mddb_bf_t *bfp 499 ) 500 { 501 bfp->bf_next = s->s_freebufhead; 502 s->s_freebufhead = bfp; 503 if (s->s_bufwakeup) { 504 cv_broadcast(&s->s_buf_cv); 505 s->s_bufwakeup = 0; 506 } 507 } 508 509 int 510 revchk( 511 uint_t mine, 512 uint_t data 513 ) 514 { 515 if ((MDDB_REV_MAJOR & mine) != (MDDB_REV_MAJOR & data)) 516 return (1); 517 if ((MDDB_REV_MINOR & mine) < (MDDB_REV_MINOR & data)) 518 return (1); 519 return (0); 520 } 521 522 static void 523 blkbusy( 524 mddb_set_t *s, 525 mddb_block_t blk 526 ) 527 { 528 int bit, byte; 529 530 s->s_freeblkcnt--; 531 byte = blk / 8; 532 bit = 1 << (blk & 7); 533 ASSERT(! (s->s_freebitmap[byte] & bit)); 534 s->s_freebitmap[byte] |= bit; 535 } 536 537 static void 538 blkfree( 539 mddb_set_t *s, 540 mddb_block_t blk 541 ) 542 { 543 int bit, byte; 544 545 s->s_freeblkcnt++; 546 byte = blk / 8; 547 bit = 1 << (blk & 7); 548 ASSERT(s->s_freebitmap[byte] & bit); 549 s->s_freebitmap[byte] &= ~bit; 550 } 551 552 static int 553 blkcheck( 554 mddb_set_t *s, 555 mddb_block_t blk 556 ) 557 { 558 int bit, byte; 559 560 byte = blk / 8; 561 bit = 1 << (blk & 7); 562 return (s->s_freebitmap[byte] & bit); 563 } 564 565 /* 566 * not fast but simple 567 */ 568 static mddb_block_t 569 getfreeblks( 570 mddb_set_t *s, 571 size_t count 572 ) 573 { 574 int i; 575 size_t contig; 576 577 contig = 0; 578 for (i = 0; i < s->s_totalblkcnt; i++) { 579 if (blkcheck(s, i)) { 580 contig = 0; 581 } else { 582 contig++; 583 if (contig == count) { 584 contig = i - count + 1; 585 for (i = (int)contig; i < contig + count; i++) 586 blkbusy(s, i); 587 return ((mddb_block_t)contig); 588 } 589 } 590 } 591 return (0); 592 } 593 594 static void 595 computefreeblks( 596 mddb_set_t *s 597 ) 598 { 599 mddb_db_t *dbp; 600 mddb_de_ic_t *dep; 601 int i; 602 int minblks; 603 int freeblks; 604 mddb_mb_ic_t *mbip; 605 mddb_lb_t *lbp; 606 mddb_block_t maxblk; 607 
mddb_did_db_t *did_dbp; 608 int nblks; 609 610 minblks = 0; 611 lbp = s->s_lbp; 612 maxblk = 0; 613 614 /* 615 * Determine the max number of blocks. 616 */ 617 nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS; 618 /* 619 * go through and find highest logical block 620 */ 621 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 622 if (dbp->db_blknum > maxblk) 623 maxblk = dbp->db_blknum; 624 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 625 for (i = 0; i < dep->de_blkcount; i++) 626 if (dep->de_blks[i] > maxblk) 627 maxblk = dep->de_blks[i]; 628 } 629 630 for (i = 0; i < lbp->lb_loccnt; i++) { 631 mddb_locator_t *lp = &lbp->lb_locators[i]; 632 633 if ((lp->l_flags & MDDB_F_DELETED) || 634 (lp->l_flags & MDDB_F_EMASTER)) 635 continue; 636 637 freeblks = 0; 638 for (mbip = s->s_mbiarray[i]; mbip != NULL; 639 mbip = mbip->mbi_next) { 640 freeblks += mbip->mbi_mddb_mb.mb_blkcnt; 641 } 642 if (freeblks == 0) /* this happen when there is no */ 643 continue; /* master blk */ 644 645 if (freeblks <= maxblk) { 646 lp->l_flags |= MDDB_F_TOOSMALL; 647 lp->l_flags &= ~MDDB_F_ACTIVE; 648 } 649 650 if (freeblks < minblks || minblks == 0) 651 minblks = freeblks; 652 } 653 /* 654 * set up reasonable freespace if no 655 * data bases exist 656 */ 657 if (minblks == 0) 658 minblks = 100; 659 if (minblks > nblks) 660 minblks = nblks; 661 s->s_freeblkcnt = minblks; 662 s->s_totalblkcnt = minblks; 663 if (! 
s->s_freebitmapsize) { 664 s->s_freebitmapsize = nblks / 8; 665 s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize, 666 KM_SLEEP); 667 } 668 bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 669 670 /* locator block sectors */ 671 for (i = 0; i < s->s_lbp->lb_blkcnt; i++) 672 blkbusy(s, i); 673 674 /* locator name sectors */ 675 for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++) 676 blkbusy(s, (s->s_lbp->lb_lnfirstblk + i)); 677 678 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 679 /* locator block device id information */ 680 for (i = 0; i < s->s_lbp->lb_didblkcnt; i++) 681 blkbusy(s, (s->s_lbp->lb_didfirstblk + i)); 682 683 /* disk blocks containing actual device ids */ 684 did_dbp = s->s_did_icp->did_ic_dbp; 685 while (did_dbp) { 686 for (i = 0; i < did_dbp->db_blkcnt; i++) { 687 blkbusy(s, did_dbp->db_firstblk + i); 688 } 689 did_dbp = did_dbp->db_next; 690 } 691 } 692 693 /* Only use data tags if not a MN set */ 694 if (!(lbp->lb_flags & MDDB_MNSET)) { 695 /* Found a bad tag, do NOT mark the data tag blks busy here */ 696 if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) { 697 for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++) 698 blkbusy(s, (s->s_lbp->lb_dtfirstblk + i)); 699 } 700 } 701 702 /* directory block/entry sectors */ 703 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) { 704 blkbusy(s, dbp->db_blknum); 705 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next) 706 for (i = 0; i < dep->de_blkcount; i++) 707 blkbusy(s, dep->de_blks[i]); 708 } 709 } 710 711 /* 712 * Add free space to the device id incore free list. 713 * Called: 714 * - During startup when all devid blocks are temporarily placed on the 715 * free list 716 * - After a devid has been deleted via the metadb command. 
717 * - When mddb_devid_free_get adds unused space from a disk block 718 * to free list 719 */ 720 static int 721 mddb_devid_free_add( 722 mddb_set_t *s, 723 uint_t firstblk, 724 uint_t offset, 725 uint_t length 726 ) 727 { 728 mddb_did_free_t *did_freep; 729 730 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 731 return (0); 732 } 733 734 did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t), 735 KM_SLEEP); 736 did_freep->free_blk = firstblk; 737 did_freep->free_offset = offset; 738 did_freep->free_length = length; 739 did_freep->free_next = s->s_did_icp->did_ic_freep; 740 s->s_did_icp->did_ic_freep = did_freep; 741 742 return (0); 743 } 744 745 /* 746 * Remove specific free space from the device id incore free list. 747 * Called at startup (after all devid blocks have been placed on 748 * free list) in order to remove the free space from the list that 749 * contains actual devids. 750 * Returns 0 if area successfully removed. 751 * Returns 1 if no matching area is found - so nothing removed. 752 */ 753 static int 754 mddb_devid_free_delete( 755 mddb_set_t *s, 756 uint_t firstblk, 757 uint_t offset, 758 uint_t length 759 ) 760 { 761 int block_found = 0; 762 mddb_did_free_t *did_freep1; /* next free block */ 763 mddb_did_free_t *did_freep2 = 0; /* previous free block */ 764 mddb_did_free_t *did_freep_before; /* area before offset, len */ 765 mddb_did_free_t *did_freep_after; /* area after offset, len */ 766 uint_t old_length; 767 768 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 769 return (1); 770 } 771 772 /* find free block for this devid */ 773 did_freep1 = s->s_did_icp->did_ic_freep; 774 while (did_freep1) { 775 /* 776 * Look through free list of <block, offset, length> to 777 * find our entry in the free list. Our entry should 778 * exist since the entire devid block was placed into 779 * this free list at startup. 
This code is just removing 780 * the non-free (in-use) portions of the devid block so 781 * that the remaining linked list does indeed just 782 * contain a free list. 783 * 784 * Our entry has been found if 785 * - the blocks match, 786 * - the offset (starting address) in the free list is 787 * less than the offset of our entry and 788 * - the length+offset (ending address) in the free list is 789 * greater than the length+offset of our entry. 790 */ 791 if ((did_freep1->free_blk == firstblk) && 792 (did_freep1->free_offset <= offset) && 793 ((did_freep1->free_length + did_freep1->free_offset) >= 794 (length + offset))) { 795 /* Have found our entry - remove from list */ 796 block_found = 1; 797 did_freep_before = did_freep1; 798 old_length = did_freep1->free_length; 799 /* did_freep1 - pts to next free block */ 800 did_freep1 = did_freep1->free_next; 801 if (did_freep2) { 802 did_freep2->free_next = did_freep1; 803 } else { 804 s->s_did_icp->did_ic_freep = did_freep1; 805 } 806 807 /* 808 * did_freep_before points to area in block before 809 * offset, length. 810 */ 811 did_freep_before->free_length = offset - 812 did_freep_before->free_offset; 813 /* 814 * did_freep_after points to area in block after 815 * offset, length. 816 */ 817 did_freep_after = (mddb_did_free_t *)kmem_zalloc 818 (sizeof (mddb_did_free_t), KM_SLEEP); 819 did_freep_after->free_blk = did_freep_before->free_blk; 820 did_freep_after->free_offset = offset + length; 821 did_freep_after->free_length = old_length - length - 822 did_freep_before->free_length; 823 /* 824 * Add before and after areas to free list 825 * If area before or after offset, length has length 826 * of 0, that entry is not added. 
827 */ 828 if (did_freep_after->free_length) { 829 did_freep_after->free_next = did_freep1; 830 if (did_freep2) { 831 did_freep2->free_next = did_freep_after; 832 } else { 833 s->s_did_icp->did_ic_freep = 834 did_freep_after; 835 } 836 did_freep1 = did_freep_after; 837 } else { 838 kmem_free(did_freep_after, 839 sizeof (mddb_did_free_t)); 840 } 841 842 if (did_freep_before->free_length) { 843 did_freep_before->free_next = did_freep1; 844 if (did_freep2) { 845 did_freep2->free_next = did_freep_before; 846 } else { 847 s->s_did_icp->did_ic_freep = 848 did_freep_before; 849 } 850 } else { 851 kmem_free(did_freep_before, 852 sizeof (mddb_did_free_t)); 853 } 854 break; 855 } else { 856 did_freep2 = did_freep1; 857 did_freep1 = did_freep1->free_next; 858 } 859 } 860 if (block_found == 0) { 861 return (1); 862 } else { 863 return (0); 864 } 865 } 866 867 /* 868 * Find free space of devid length and remove free space from list. 869 * Return a pointer to the previously free area. 870 * 871 * If there's not enough free space on the free list, get an empty 872 * disk block, put the empty disk block on the did_ic_dbp linked list, 873 * and add the disk block space not used for devid to the free list. 874 * 875 * Return pointer to address (inside disk block) of free area for devid. 876 * Return 0 if error. 
877 */ 878 static caddr_t 879 mddb_devid_free_get( 880 mddb_set_t *s, 881 uint_t len, 882 uint_t *blk, 883 uint_t *cnt, 884 uint_t *offset 885 ) 886 { 887 mddb_did_free_t *freep, *freep2; 888 mddb_did_db_t *dbp; 889 uint_t blk_cnt, blk_num; 890 ddi_devid_t devid_ptr = NULL; 891 892 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 893 return (0); 894 } 895 896 freep = s->s_did_icp->did_ic_freep; 897 freep2 = (mddb_did_free_t *)NULL; 898 while (freep) { 899 /* found a free area - remove from free list */ 900 if (len <= freep->free_length) { 901 *blk = freep->free_blk; 902 *offset = freep->free_offset; 903 /* find disk block pointer that contains free area */ 904 dbp = s->s_did_icp->did_ic_dbp; 905 while (dbp) { 906 if (dbp->db_firstblk == *blk) 907 break; 908 else 909 dbp = dbp->db_next; 910 } 911 /* 912 * If a disk block pointer can't be found - something 913 * is wrong, so don't use this free space. 914 */ 915 if (dbp == NULL) { 916 freep2 = freep; 917 freep = freep->free_next; 918 continue; 919 } 920 921 devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset); 922 *cnt = dbp->db_blkcnt; 923 924 /* Update free list information */ 925 freep->free_offset += len; 926 freep->free_length -= len; 927 if (freep->free_length == 0) { 928 if (freep2) { 929 freep2->free_next = 930 freep->free_next; 931 } else { 932 s->s_did_icp->did_ic_freep = 933 freep->free_next; 934 } 935 kmem_free(freep, sizeof (mddb_did_free_t)); 936 } 937 break; 938 } 939 freep2 = freep; 940 freep = freep->free_next; 941 } 942 943 /* Didn't find a free spot */ 944 if (freep == NULL) { 945 /* get free logical disk blk in replica */ 946 blk_cnt = btodb(len + (MDDB_BSIZE - 1)); 947 blk_num = getfreeblks(s, blk_cnt); 948 if (blk_num == 0) 949 return (0); 950 951 /* Add disk block to disk block linked list */ 952 dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP); 953 dbp->db_firstblk = blk_num; 954 dbp->db_blkcnt = blk_cnt; 955 dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP); 956 dbp->db_next = 
s->s_did_icp->did_ic_dbp; 957 s->s_did_icp->did_ic_dbp = dbp; 958 devid_ptr = (ddi_devid_t)dbp->db_ptr; 959 960 /* Update return values */ 961 *blk = blk_num; 962 *offset = 0; 963 *cnt = blk_cnt; 964 965 /* Add unused part of block to free list */ 966 (void) mddb_devid_free_add(s, blk_num, 967 len, (dbtob(blk_cnt) - len)); 968 } 969 970 return ((caddr_t)devid_ptr); 971 } 972 973 /* 974 * Add device id information for locator index to device id area in set. 975 * Get free area to store device id from free list. Update checksum 976 * for mddb_did_blk. 977 * 978 * This routine does not write any data out to disk. 979 * After this routine has been called, the routine, writelocall, should 980 * be called to write both the locator block and device id area out 981 * to disk. 982 */ 983 static int 984 mddb_devid_add( 985 mddb_set_t *s, 986 uint_t index, 987 ddi_devid_t devid, 988 char *minor_name 989 ) 990 { 991 uint_t devid_len; 992 uint_t blk, offset; 993 ddi_devid_t devid_ptr; 994 mddb_did_info_t *did_info; 995 uint_t blkcnt, i; 996 mddb_did_blk_t *did_blk; 997 998 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 999 return (1); 1000 } 1001 if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1)) 1002 return (1); 1003 1004 /* Check if device id has already been added */ 1005 did_blk = s->s_did_icp->did_ic_blkp; 1006 did_info = &(did_blk->blk_info[index]); 1007 if (did_info->info_flags & MDDB_DID_EXISTS) 1008 return (0); 1009 1010 devid_len = ddi_devid_sizeof(devid); 1011 devid_ptr = (ddi_devid_t) 1012 mddb_devid_free_get(s, devid_len, &blk, &blkcnt, 1013 &offset); 1014 if (devid_ptr == NULL) { 1015 return (1); 1016 } 1017 1018 /* Copy devid into devid free area */ 1019 for (i = 0; i < devid_len; i++) 1020 ((char *)devid_ptr)[i] = ((char *)devid)[i]; 1021 1022 /* Update mddb_did_info area for new device id */ 1023 did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID | 1024 MDDB_DID_UPDATED; 1025 did_info->info_firstblk = blk; 1026 did_info->info_blkcnt = blkcnt; 1027 
did_info->info_offset = offset; 1028 did_info->info_length = devid_len; 1029 (void) strcpy(did_info->info_minor_name, minor_name); 1030 crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL); 1031 1032 /* Add device id pointer to did_ic_devid array */ 1033 s->s_did_icp->did_ic_devid[index] = devid_ptr; 1034 1035 return (0); 1036 } 1037 1038 1039 /* 1040 * Delete device id information for locator index from device id area in set. 1041 * Add device id space to free area. 1042 * 1043 * This routine does not write any data out to disk. 1044 * After this routine has been called, the routine, writelocall, should 1045 * be called to write both the locator block and device id area out 1046 * to disk. 1047 */ 1048 static int 1049 mddb_devid_delete(mddb_set_t *s, uint_t index) 1050 { 1051 mddb_did_info_t *did_info; 1052 mddb_did_blk_t *did_blk; 1053 1054 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1055 return (1); 1056 } 1057 1058 /* Get device id information from mddb_did_blk */ 1059 did_blk = s->s_did_icp->did_ic_blkp; 1060 did_info = &(did_blk->blk_info[index]); 1061 1062 /* 1063 * Ensure that the underlying device supports device ids 1064 * before arbitrarily removing them. 1065 */ 1066 if (!(did_info->info_flags & MDDB_DID_EXISTS)) { 1067 return (1); 1068 } 1069 1070 /* Remove device id information from mddb_did_blk */ 1071 did_info->info_flags = 0; 1072 1073 /* Remove device id from incore area */ 1074 s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL; 1075 1076 /* Add new free space in disk block to free list */ 1077 (void) mddb_devid_free_add(s, did_info->info_firstblk, 1078 did_info->info_offset, did_info->info_length); 1079 1080 return (0); 1081 } 1082 1083 /* 1084 * Check if there is a device id for a locator index. 1085 * 1086 * Caller of this routine should not free devid or minor_name since 1087 * these will point to internal data structures that should not 1088 * be freed. 
1089 */ 1090 static int 1091 mddb_devid_get( 1092 mddb_set_t *s, 1093 uint_t index, 1094 ddi_devid_t *devid, 1095 char **minor_name 1096 ) 1097 { 1098 mddb_did_info_t *did_info; 1099 1100 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) { 1101 return (0); 1102 } 1103 did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]); 1104 1105 if (did_info->info_flags & MDDB_DID_EXISTS) { 1106 *devid = s->s_did_icp->did_ic_devid[index]; 1107 *minor_name = 1108 s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name; 1109 return (1); 1110 } else 1111 return (0); 1112 1113 1114 } 1115 1116 /* 1117 * Check if device id is valid on current system. 1118 * Needs devid, previously known dev_t and current minor_name. 1119 * 1120 * Success: 1121 * Returns 0 if valid device id is found and updates 1122 * dev_t if the dev_t associated with the device id is 1123 * different than dev_t. 1124 * Failure: 1125 * Returns 1 if device id not valid on current system. 1126 */ 1127 static int 1128 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name) 1129 { 1130 int retndevs; 1131 dev_t *ddi_devs; 1132 int devid_flag = 0; 1133 int cnt; 1134 1135 if (dev == 0) 1136 return (1); 1137 /* 1138 * See if devid is valid in the current system. 1139 * If so, set dev to match the devid. 
1140 */ 1141 if (ddi_lyr_devid_to_devlist(devid, minor_name, 1142 &retndevs, &ddi_devs) == DDI_SUCCESS) { 1143 if (retndevs > 0) { 1144 /* devid is valid to use */ 1145 devid_flag = 1; 1146 /* does dev_t in list match dev */ 1147 cnt = 0; 1148 while (cnt < retndevs) { 1149 if (*dev == md_expldev(ddi_devs[cnt])) 1150 break; 1151 cnt++; 1152 } 1153 /* 1154 * If a different dev_t, then setup 1155 * new dev and new major name 1156 */ 1157 if (cnt == retndevs) { 1158 *dev = md_expldev(ddi_devs[0]); 1159 } 1160 ddi_lyr_free_devlist(ddi_devs, retndevs); 1161 } 1162 } 1163 if (devid_flag) 1164 return (0); 1165 else 1166 return (1); 1167 } 1168 1169 1170 /* 1171 * Free the devid incore data areas 1172 */ 1173 static void 1174 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp) 1175 { 1176 mddb_did_free_t *did_freep1, *did_freep2; 1177 mddb_did_db_t *did_dbp1, *did_dbp2; 1178 mddb_did_ic_t *icp = *did_icp; 1179 1180 if (icp) { 1181 if (icp->did_ic_blkp) { 1182 kmem_free((caddr_t)icp->did_ic_blkp, 1183 dbtob(lbp->lb_didblkcnt)); 1184 icp->did_ic_blkp = (mddb_did_blk_t *)NULL; 1185 } 1186 1187 if (icp->did_ic_dbp) { 1188 did_dbp1 = icp->did_ic_dbp; 1189 while (did_dbp1) { 1190 did_dbp2 = did_dbp1->db_next; 1191 kmem_free((caddr_t)did_dbp1->db_ptr, 1192 dbtob(did_dbp1->db_blkcnt)); 1193 kmem_free((caddr_t)did_dbp1, 1194 sizeof (mddb_did_db_t)); 1195 did_dbp1 = did_dbp2; 1196 } 1197 } 1198 1199 if (icp->did_ic_freep) { 1200 did_freep1 = icp->did_ic_freep; 1201 while (did_freep1) { 1202 did_freep2 = did_freep1->free_next; 1203 kmem_free((caddr_t)did_freep1, 1204 sizeof (mddb_did_free_t)); 1205 did_freep1 = did_freep2; 1206 } 1207 } 1208 1209 kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t)); 1210 *did_icp = (mddb_did_ic_t *)NULL; 1211 } 1212 1213 } 1214 1215 static daddr_t 1216 getphysblk( 1217 mddb_block_t blk, 1218 mddb_mb_ic_t *mbip 1219 ) 1220 { 1221 mddb_mb_t *mbp = &(mbip->mbi_mddb_mb); 1222 1223 while (blk >= mbp->mb_blkcnt) { 1224 if (! 
mbip->mbi_next) 1225 return ((daddr_t)-1); /* no such block */ 1226 blk -= mbp->mb_blkcnt; 1227 mbip = mbip->mbi_next; 1228 mbp = &(mbip->mbi_mddb_mb); 1229 } 1230 1231 if (blk >= mbp->mb_blkmap.m_consecutive) 1232 return ((daddr_t)-1); /* no such block */ 1233 1234 return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk)); 1235 } 1236 1237 /* 1238 * when a buf header is passed in the new buffer must be 1239 * put on the front of the chain. writerec counts on it 1240 */ 1241 static int 1242 putblks( 1243 mddb_set_t *s, /* incore db set structure */ 1244 caddr_t buffer, /* adr of buffer to be written */ 1245 daddr_t blk, /* block number for first block */ 1246 int cnt, /* number of blocks to be written */ 1247 md_dev64_t device, /* device to be written to */ 1248 mddb_bf_t **bufhead /* if non-zero then ASYNC I/O */ 1249 /* and put buf address here */ 1250 ) 1251 { 1252 buf_t *bp; 1253 mddb_bf_t *bfp; 1254 int err = 0; 1255 1256 bfp = allocbuffer(s, MDDB_SLEEPOK); 1257 bp = &bfp->bf_buf; 1258 bp->b_bcount = MDDB_BSIZE * cnt; 1259 bp->b_un.b_addr = buffer; 1260 bp->b_blkno = blk; 1261 bp->b_edev = md_dev64_to_dev(device); 1262 /* 1263 * if a header for a buf chain is passed in this is async io. 1264 * currently only done for optimize records 1265 */ 1266 if (bufhead) { 1267 bfp->bf_next = *bufhead; 1268 *bufhead = bfp; 1269 (void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp); 1270 return (0); 1271 } 1272 err = mddb_rwdata(s, B_WRITE, bp); 1273 freebuffer(s, bfp); 1274 if (err) { 1275 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1276 s->s_setno, device); 1277 return (MDDB_F_EWRITE); 1278 } 1279 return (0); 1280 } 1281 1282 /* 1283 * wrtblklst - takes an array of logical block numbers 1284 * and writes the buffer to those blocks (scatter). 1285 * If called during upgrade, this routine expects a 1286 * non-translated (aka target) dev. 
1287 */ 1288 static int 1289 wrtblklst( 1290 mddb_set_t *s, /* incore set structure */ 1291 caddr_t buffer, /* buffer to be written (record blk) */ 1292 mddb_block_t blka[], /* list of logical blks for record */ 1293 daddr_t cnt, /* number of logical blks */ 1294 const int li, /* locator index */ 1295 mddb_bf_t **bufhead, /* if non-zero then ASYNC I/O */ 1296 /* and put buf address here */ 1297 int master_only /* allow only master node to write */ 1298 ) 1299 { 1300 daddr_t blk; 1301 daddr_t blk1; 1302 int err = 0; 1303 int cons; 1304 mddb_lb_t *lbp = s->s_lbp; 1305 mddb_locator_t *lp = &lbp->lb_locators[li]; 1306 md_dev64_t dev; 1307 mddb_mb_ic_t *mbip = s->s_mbiarray[li]; 1308 1309 /* 1310 * If a MN diskset and only the master can write, 1311 * then a non-master node will just return success. 1312 */ 1313 if ((lbp->lb_flags & MDDB_MNSET) && 1314 (master_only == MDDB_WR_ONLY_MASTER)) { 1315 1316 /* return successfully if we aren't the master */ 1317 if (!(md_set[s->s_setno].s_am_i_master)) { 1318 return (0); 1319 } 1320 } 1321 1322 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 1323 if (dev == NODEV64) { 1324 return (1); 1325 } 1326 1327 blk = getphysblk(blka[0], mbip); 1328 ASSERT(blk >= 0); 1329 1330 cons = 1; 1331 while (cnt) { 1332 if (cons != cnt) { 1333 blk1 = getphysblk(blka[cons], mbip); 1334 ASSERT(blk1 >= 0); 1335 if ((blk + cons) == blk1) { 1336 cons++; 1337 continue; 1338 } 1339 } 1340 if (err = putblks(s, buffer, blk, cons, dev, bufhead)) { 1341 /* 1342 * If an MN diskset and any_node_can_write 1343 * then this request is coming from writeoptrecord 1344 * and l_flags field should not be updated. 1345 * l_flags will be updated as a result of sending 1346 * a class1 message to the master. Setting l_flags 1347 * here will cause slave to be out of sync with 1348 * master. 1349 * 1350 * Otherwise, set the error in l_flags 1351 * (this occurs if this is not a MN diskset or 1352 * only_master_can_write is set). 
1353 */ 1354 if ((!(lbp->lb_flags & MDDB_MNSET)) || 1355 (master_only == MDDB_WR_ONLY_MASTER)) { 1356 lp->l_flags |= MDDB_F_EWRITE; 1357 } 1358 return (err); 1359 } 1360 if (bufhead) 1361 (*bufhead)->bf_locator = lp; 1362 1363 buffer += MDDB_BSIZE * cons; 1364 cnt -= cons; 1365 blka += cons; 1366 if (cnt) { 1367 blk = getphysblk(blka[0], mbip); 1368 ASSERT(blk >= 0); 1369 } 1370 cons = 1; 1371 } 1372 1373 return (0); 1374 } 1375 1376 /* 1377 * writeblks - takes a logical block number/block count pair 1378 * and writes the buffer to those contiguous logical blocks. 1379 * If called during upgrade, this routine expects a non-translated 1380 * (aka target) dev. 1381 */ 1382 static int 1383 writeblks( 1384 mddb_set_t *s, /* incore set structure */ 1385 caddr_t buffer, /* buffer to be written */ 1386 mddb_block_t blk, /* starting logical block number */ 1387 int cnt, /* number of log blocks to be written */ 1388 const int li, /* locator index */ 1389 int master_only /* allow only master node to write */ 1390 ) 1391 { 1392 daddr_t physblk; 1393 int err = 0; 1394 int i; 1395 mddb_lb_t *lbp = s->s_lbp; 1396 mddb_locator_t *lp = &lbp->lb_locators[li]; 1397 md_dev64_t dev; 1398 mddb_block_t *blkarray; 1399 int size; 1400 int ret; 1401 1402 /* 1403 * If a MN diskset and only the master can write, 1404 * then a non-master node will just return success. 
1405 */ 1406 if ((lbp->lb_flags & MDDB_MNSET) && 1407 (master_only == MDDB_WR_ONLY_MASTER)) { 1408 /* return successfully if we aren't the master */ 1409 if (!(md_set[s->s_setno].s_am_i_master)) { 1410 return (0); 1411 } 1412 } 1413 1414 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 1415 if (dev == NODEV64) { 1416 return (1); 1417 } 1418 1419 if (cnt > 1) { 1420 size = sizeof (mddb_block_t) * cnt; 1421 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1422 for (i = 0; i < cnt; i++) 1423 blkarray[i] = blk + i; 1424 ret = wrtblklst(s, buffer, blkarray, cnt, 1425 li, 0, MDDB_WR_ONLY_MASTER); 1426 kmem_free(blkarray, size); 1427 return (ret); 1428 } 1429 physblk = getphysblk(blk, s->s_mbiarray[li]); 1430 ASSERT(physblk > 0); 1431 if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) { 1432 lp->l_flags |= MDDB_F_EWRITE; 1433 return (err); 1434 } 1435 return (0); 1436 } 1437 1438 /* 1439 * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas. 1440 */ 1441 static int 1442 writeall( 1443 mddb_set_t *s, /* incore set structure */ 1444 caddr_t buffer, /* buffer to be written */ 1445 mddb_block_t block, /* starting logical block number */ 1446 int cnt, /* number of log blocks to be written */ 1447 int master_only /* allow only master node to write */ 1448 ) 1449 { 1450 int li; 1451 int err = 0; 1452 mddb_lb_t *lbp = s->s_lbp; 1453 1454 for (li = 0; li < lbp->lb_loccnt; li++) { 1455 mddb_locator_t *lp = &lbp->lb_locators[li]; 1456 1457 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1458 (lp->l_flags & MDDB_F_EWRITE)) 1459 continue; 1460 1461 err |= writeblks(s, buffer, block, cnt, li, master_only); 1462 } 1463 1464 return (err); 1465 } 1466 1467 /* 1468 * writelocall - write the locator block and device id information (if 1469 * replica is in device id format) to all ACTIVE/NON-ERRORER replicas. 1470 * 1471 * Increments the locator block's commitcnt. Updates the device id area's 1472 * commitcnt if the replica is in device id format. 
Regenerates the 1473 * checksums after updating the commitcnt(s). 1474 */ 1475 static int 1476 writelocall( 1477 mddb_set_t *s /* incore set structure */ 1478 ) 1479 { 1480 int li; 1481 int err = 0; 1482 mddb_lb_t *lbp = s->s_lbp; 1483 mddb_did_blk_t *did_blk; 1484 mddb_did_db_t *did_dbp; 1485 1486 s->s_lbp->lb_commitcnt++; 1487 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1488 did_blk = s->s_did_icp->did_ic_blkp; 1489 did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt; 1490 crcgen(did_blk, &did_blk->blk_checksum, 1491 dbtob(lbp->lb_didblkcnt), NULL); 1492 } 1493 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 1494 1495 for (li = 0; li < lbp->lb_loccnt; li++) { 1496 mddb_locator_t *lp = &lbp->lb_locators[li]; 1497 1498 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 1499 (lp->l_flags & MDDB_F_EWRITE)) 1500 continue; 1501 1502 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 1503 /* write out blocks containing actual device ids */ 1504 did_dbp = s->s_did_icp->did_ic_dbp; 1505 while (did_dbp) { 1506 err |= writeblks(s, (caddr_t)did_dbp->db_ptr, 1507 did_dbp->db_firstblk, 1508 did_dbp->db_blkcnt, li, 1509 MDDB_WR_ONLY_MASTER); 1510 did_dbp = did_dbp->db_next; 1511 } 1512 1513 /* write out device id area block */ 1514 err |= writeblks(s, (caddr_t)did_blk, 1515 lbp->lb_didfirstblk, lbp->lb_didblkcnt, li, 1516 MDDB_WR_ONLY_MASTER); 1517 } 1518 /* write out locator block */ 1519 err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 1520 MDDB_WR_ONLY_MASTER); 1521 } 1522 1523 /* 1524 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag 1525 * in the mddb_set structure to show that the locator block has 1526 * been changed. 1527 */ 1528 1529 if ((lbp->lb_flags & MDDB_MNSET) && 1530 (md_set[s->s_setno].s_am_i_master)) { 1531 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 1532 } 1533 return (err); 1534 } 1535 1536 /* 1537 * If called during upgrade, this routine expects a translated 1538 * (aka miniroot) dev. 
1539 */ 1540 static int 1541 getblks( 1542 mddb_set_t *s, /* incore db set structure */ 1543 caddr_t buffer, /* buffer to read data into */ 1544 md_dev64_t device, /* device to read from */ 1545 daddr_t blk, /* physical block number to read */ 1546 int cnt /* number of blocks to read */ 1547 ) 1548 { 1549 buf_t *bp; 1550 mddb_bf_t *bfp; 1551 int err = 0; 1552 1553 bfp = allocbuffer(s, MDDB_SLEEPOK); /* this will never sleep */ 1554 bp = &bfp->bf_buf; 1555 bp->b_bcount = MDDB_BSIZE * cnt; 1556 bp->b_un.b_addr = buffer; 1557 bp->b_blkno = blk; 1558 bp->b_edev = md_dev64_to_dev(device); 1559 err = mddb_rwdata(s, B_READ, bp); 1560 freebuffer(s, bfp); 1561 if (err) { 1562 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA, 1563 s->s_setno, device); 1564 return (MDDB_F_EREAD); 1565 } 1566 return (0); 1567 } 1568 1569 /* 1570 * readblklst - takes an array of logical block numbers 1571 * and reads those blocks (gather) into the buffer. 1572 * If called during upgrade, this routine expects a non-translated 1573 * (aka target) dev. 
1574 */ 1575 static int 1576 readblklst( 1577 mddb_set_t *s, /* incore set structure */ 1578 caddr_t buffer, /* buffer to be read (record block) */ 1579 mddb_block_t blka[], /* list of logical blocks to be read */ 1580 daddr_t cnt, /* number of logical blocks */ 1581 int li /* locator index */ 1582 ) 1583 { 1584 daddr_t blk; 1585 daddr_t blk1; 1586 int err = 0; 1587 int cons; 1588 md_dev64_t dev; 1589 mddb_mb_ic_t *mbip; 1590 1591 mbip = s->s_mbiarray[li]; 1592 dev = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1593 dev = md_xlate_targ_2_mini(dev); 1594 if (dev == NODEV64) { 1595 return (1); 1596 } 1597 1598 blk = getphysblk(blka[0], mbip); 1599 ASSERT(blk >= 0); 1600 1601 cons = 1; 1602 while (cnt) { 1603 if (cons != cnt) { 1604 blk1 = getphysblk(blka[cons], mbip); 1605 ASSERT(blk1 >= 0); 1606 if ((blk + cons) == blk1) { 1607 cons++; 1608 continue; 1609 } 1610 } 1611 if (err = getblks(s, buffer, dev, blk, cons)) 1612 return (err); 1613 buffer += MDDB_BSIZE * cons; 1614 cnt -= cons; 1615 blka += cons; 1616 if (cnt) { 1617 blk = getphysblk(blka[0], mbip); 1618 ASSERT(blk >= 0); 1619 } 1620 cons = 1; 1621 } 1622 return (0); 1623 } 1624 1625 /* 1626 * readblks - takes a logical block number/block count pair 1627 * and reads those contiguous logical blocks into the buffer. 1628 * If called during upgrade, this routine expects a non-translated 1629 * (aka target) dev. 
1630 */ 1631 static int 1632 readblks( 1633 mddb_set_t *s, /* incore set structure */ 1634 caddr_t buffer, /* buffer to be read into */ 1635 mddb_block_t blk, /* logical block number to be read */ 1636 int cnt, /* number of logical blocks to be read */ 1637 int li /* locator index */ 1638 ) 1639 { 1640 daddr_t physblk; 1641 md_dev64_t device; 1642 int i; 1643 mddb_block_t *blkarray; 1644 int size; 1645 int ret; 1646 1647 if (cnt > 1) { 1648 size = sizeof (mddb_block_t) * cnt; 1649 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP); 1650 for (i = 0; i < cnt; i++) 1651 blkarray[i] = blk + i; 1652 ret = readblklst(s, buffer, blkarray, cnt, li); 1653 kmem_free(blkarray, size); 1654 return (ret); 1655 } 1656 physblk = getphysblk(blk, s->s_mbiarray[li]); 1657 ASSERT(physblk > 0); 1658 device = md_expldev(s->s_lbp->lb_locators[li].l_dev); 1659 device = md_xlate_targ_2_mini(device); 1660 if (device == NODEV64) { 1661 return (1); 1662 } 1663 return (getblks(s, buffer, device, physblk, 1)); 1664 } 1665 1666 static void 1667 single_thread_start( 1668 mddb_set_t *s 1669 ) 1670 { 1671 while (s->s_singlelockgotten) { 1672 s->s_singlelockwanted++; 1673 cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno)); 1674 } 1675 s->s_singlelockgotten++; 1676 } 1677 1678 static void 1679 single_thread_end( 1680 mddb_set_t *s 1681 ) 1682 { 1683 ASSERT(s->s_singlelockgotten); 1684 s->s_singlelockgotten = 0; 1685 if (s->s_singlelockwanted) { 1686 s->s_singlelockwanted = 0; 1687 cv_broadcast(&s->s_single_thread_cv); 1688 } 1689 } 1690 1691 static size_t 1692 sizeofde( 1693 mddb_de_ic_t *dep 1694 ) 1695 { 1696 size_t size; 1697 1698 size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) + 1699 sizeof (mddb_block_t) * dep->de_blkcount; 1700 return (size); 1701 } 1702 1703 static size_t 1704 sizeofde32( 1705 mddb_de32_t *dep 1706 ) 1707 { 1708 size_t size; 1709 1710 size = sizeof (*dep) - sizeof (dep->de32_blks) + 1711 sizeof (mddb_block_t) * dep->de32_blkcount; 1712 return (size); 1713 } 1714 
1715 static mddb_de32_t * 1716 nextentry( 1717 mddb_de32_t *dep 1718 ) 1719 { 1720 mddb_de32_t *ret; 1721 1722 ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep))); 1723 return (ret); 1724 } 1725 1726 static void 1727 create_db32rec( 1728 mddb_db32_t *db32p, 1729 mddb_db_t *dbp 1730 ) 1731 { 1732 mddb_de_ic_t *dep; 1733 mddb_de32_t *de32p; 1734 1735 #if defined(_ILP32) && !defined(lint) 1736 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 1737 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 1738 #endif 1739 1740 dbtodb32(dbp, db32p); 1741 if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0)) 1742 db32p->db32_firstentry = 0x4; 1743 de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry) 1744 + sizeof (db32p->db32_firstentry))); 1745 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 1746 detode32(dep, de32p); 1747 if ((dep->de_next != NULL) && (de32p->de32_next == 0)) 1748 de32p->de32_next = 0x4; 1749 de32p = nextentry(de32p); 1750 } 1751 ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE); 1752 } 1753 1754 /* 1755 * If called during upgrade, this routine expects a translated 1756 * (aka miniroot) dev. 1757 * If master blocks are found, set the mn_set parameter to 1 if the 1758 * the master block revision number is MDDB_REV_MNMB; otherwise, 1759 * set it to 0. 1760 * If master blocks are not found, do not change the mnset parameter. 
1761 */ 1762 static mddb_mb_ic_t * 1763 getmasters( 1764 mddb_set_t *s, 1765 md_dev64_t dev, 1766 daddr_t blkno, 1767 uint_t *flag, 1768 int *mn_set 1769 ) 1770 { 1771 mddb_mb_ic_t *mbi = NULL; 1772 mddb_mb_t *mb; 1773 int error = 0; 1774 ddi_devid_t devid; 1775 1776 1777 if (mddb_devopen(dev)) { 1778 if (flag) 1779 *flag |= MDDB_F_EMASTER; 1780 return ((mddb_mb_ic_t *)NULL); 1781 } 1782 1783 1784 mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP); 1785 mb = &(mbi->mbi_mddb_mb); 1786 if (error = getblks(s, (caddr_t)mb, dev, blkno, 1787 btodb(MDDB_BSIZE))) { 1788 error |= MDDB_F_EMASTER; 1789 } 1790 if (mb->mb_magic != MDDB_MAGIC_MB) { 1791 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1792 } 1793 /* Check for MDDB_REV_MNMB and lower */ 1794 if (revchk(MDDB_REV_MNMB, mb->mb_revision)) { 1795 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1796 } 1797 if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) { 1798 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1799 } 1800 if (!(md_get_setstatus(s->s_setno) & MD_SET_IMPORT) && 1801 (mb->mb_setno != s->s_setno)) { 1802 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1803 } 1804 if (mb->mb_blkno != blkno) { 1805 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1806 } 1807 mb->mb_next = NULL; 1808 mbi->mbi_next = NULL; 1809 1810 if (error) 1811 goto out; 1812 1813 /* 1814 * Check the md_devid_destroy and md_keep_repl_state flags 1815 * to see if we need to regen the devid or not. 1816 * 1817 * Don't care about devid in local set since it is not used 1818 * and this should not be part of set importing 1819 */ 1820 if ((s->s_setno != MD_LOCAL_SET) && !(md_get_setstatus(s->s_setno) & 1821 MD_SET_IMPORT)) { 1822 /* 1823 * Now check the destroy flag. 
We also need to handle 1824 * the case where the destroy flag is reset after the 1825 * destroy 1826 */ 1827 if (md_devid_destroy || (mb->mb_devid_len == 0)) { 1828 1829 if (md_devid_destroy) { 1830 bzero(mb->mb_devid, mb->mb_devid_len); 1831 mb->mb_devid_len = 0; 1832 } 1833 1834 /* 1835 * Try to regenerate it if the 'keep' flag is not set 1836 */ 1837 if (!md_keep_repl_state) { 1838 if (ddi_lyr_get_devid(md_dev64_to_dev(dev), 1839 &devid) == DDI_SUCCESS) { 1840 mb->mb_devid_len = 1841 ddi_devid_sizeof(devid); 1842 bcopy(devid, mb->mb_devid, 1843 mb->mb_devid_len); 1844 ddi_devid_free(devid); 1845 } else { 1846 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1847 } 1848 } 1849 1850 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 1851 1852 /* 1853 * Push 1854 */ 1855 if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) { 1856 error = MDDB_F_EFMT | MDDB_F_EMASTER; 1857 } 1858 } 1859 } 1860 1861 if (! error) { 1862 /* Set mn_set parameter to 1 if a MN set */ 1863 if (mb->mb_revision == MDDB_REV_MNMB) 1864 *mn_set = 1; 1865 else 1866 *mn_set = 0; 1867 return (mbi); 1868 } 1869 1870 out: 1871 /* Error Out */ 1872 if (flag) 1873 *flag |= error; 1874 1875 kmem_free((caddr_t)mbi, MDDB_IC_BSIZE); 1876 mddb_devclose(dev); 1877 return ((mddb_mb_ic_t *)NULL); 1878 } 1879 1880 static int 1881 getrecord( 1882 mddb_set_t *s, 1883 mddb_de_ic_t *dep, 1884 int li 1885 ) 1886 { 1887 int err = 0; 1888 mddb_rb32_t *rbp; 1889 1890 #if defined(_ILP32) && !defined(lint) 1891 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 1892 #endif 1893 1894 1895 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP); 1896 rbp = dep->de_rb; 1897 1898 err = readblklst(s, (caddr_t)rbp, dep->de_blks, dep->de_blkcount, li); 1899 if (err) { 1900 return (MDDB_F_EDATA | err); 1901 } 1902 if (rbp->rb_magic != MDDB_MAGIC_RB) { 1903 return (MDDB_F_EFMT | MDDB_F_EDATA); 1904 } 1905 if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) && 1906 (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0)) { 1907 return 
(MDDB_F_EFMT | MDDB_F_EDATA); 1908 } 1909 /* Check crc for this record */ 1910 if (rec_crcchk(s, dep, rbp)) { 1911 return (MDDB_F_EFMT | MDDB_F_EDATA); 1912 } 1913 return (0); 1914 } 1915 1916 /* 1917 * Code to read in the locator name information 1918 */ 1919 static int 1920 readlocnames( 1921 mddb_set_t *s, 1922 int li 1923 ) 1924 { 1925 mddb_ln_t *lnp; 1926 int err = 0; 1927 mddb_block_t ln_blkcnt, ln_blkno; 1928 1929 /* 1930 * read in the locator name blocks 1931 */ 1932 s->s_lnp = NULL; 1933 1934 ln_blkno = s->s_lbp->lb_lnfirstblk; 1935 ln_blkcnt = s->s_lbp->lb_lnblkcnt; 1936 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP); 1937 1938 err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li); 1939 if (err) { 1940 err |= MDDB_F_EDATA; 1941 goto out; 1942 } 1943 if (lnp->ln_magic != MDDB_MAGIC_LN) { 1944 err = MDDB_F_EDATA | MDDB_F_EFMT; 1945 goto out; 1946 } 1947 if (s->s_lbp->lb_flags & MDDB_MNSET) { 1948 if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) { 1949 err = MDDB_F_EDATA | MDDB_F_EFMT; 1950 goto out; 1951 } 1952 } else { 1953 if (revchk(MDDB_REV_LN, lnp->ln_revision)) { 1954 err = MDDB_F_EDATA | MDDB_F_EFMT; 1955 goto out; 1956 } 1957 } 1958 if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) { 1959 err = MDDB_F_EDATA | MDDB_F_EFMT; 1960 goto out; 1961 } 1962 out: 1963 /* 1964 * if error occurred in locator name blocks free them 1965 * and return 1966 */ 1967 if (err) { 1968 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); 1969 return (err); 1970 } 1971 s->s_lnp = lnp; 1972 return (0); 1973 } 1974 1975 /* 1976 * code to read in a copy of the database. 
1977 */ 1978 1979 static int 1980 readcopy( 1981 mddb_set_t *s, 1982 int li 1983 ) 1984 { 1985 uint_t blk; 1986 mddb_db_t *dbp, *dbp1, *dbhp; 1987 mddb_db32_t *db32p; 1988 mddb_de_ic_t *dep, *dep2; 1989 mddb_de32_t *de32p, *de32p2; 1990 int err = 0; 1991 uint_t checksum; 1992 1993 1994 #if defined(_ILP32) && !defined(lint) 1995 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 1996 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 1997 #endif 1998 1999 dbp = NULL; 2000 dbhp = NULL; 2001 /* 2002 * read in all the directory blocks 2003 */ 2004 blk = s->s_lbp->lb_dbfirstblk; 2005 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2006 2007 for (; blk != 0; blk = dbp->db_nextblk) { 2008 dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 2009 if (! dbhp) { 2010 dbhp = dbp1; 2011 } else { 2012 dbp->db_next = dbp1; 2013 } 2014 dbp = dbp1; 2015 2016 err = readblks(s, (caddr_t)db32p, blk, 1, li); 2017 if (err) { 2018 err |= MDDB_F_EDATA; 2019 break; 2020 } 2021 db32todb(db32p, dbp); 2022 if (db32p->db32_magic != MDDB_MAGIC_DB) { 2023 err = MDDB_F_EDATA | MDDB_F_EFMT; 2024 break; 2025 } 2026 if (revchk(MDDB_REV_DB, db32p->db32_revision)) { 2027 err = MDDB_F_EDATA | MDDB_F_EFMT; 2028 break; 2029 } 2030 if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) { 2031 err = MDDB_F_EDATA | MDDB_F_EFMT; 2032 break; 2033 } 2034 /* 2035 * first go through and fix up all de_next pointers 2036 */ 2037 if (dbp->db_firstentry) { 2038 2039 de32p = (mddb_de32_t *) 2040 ((void *) ((caddr_t)(&db32p->db32_firstentry) 2041 + sizeof (db32p->db32_firstentry))); 2042 2043 dep = (mddb_de_ic_t *) 2044 kmem_zalloc(sizeof (mddb_de_ic_t) - 2045 sizeof (mddb_block_t) + 2046 sizeof (mddb_block_t) * de32p->de32_blkcount, 2047 KM_SLEEP); 2048 de32tode(de32p, dep); 2049 2050 dbp->db_firstentry = dep; 2051 while (de32p && de32p->de32_next) { 2052 2053 de32p2 = nextentry(de32p); 2054 2055 dep2 = (mddb_de_ic_t *)kmem_zalloc( 2056 sizeof (mddb_de_ic_t) - 2057 sizeof (mddb_block_t) + 
2058 sizeof (mddb_block_t) * 2059 de32p2->de32_blkcount, KM_SLEEP); 2060 2061 de32tode(de32p2, dep2); 2062 2063 dep->de_next = dep2; 2064 dep = dep2; 2065 de32p = de32p2; 2066 } 2067 } 2068 /* 2069 * go through and make all of the pointer to record blocks 2070 * are null; 2071 */ 2072 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 2073 dep->de_rb = NULL; 2074 } 2075 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2076 dbp->db_next = NULL; 2077 /* 2078 * if error occurred in directory blocks free them 2079 * and return 2080 */ 2081 if (err) { 2082 dbp = dbhp; 2083 while (dbp) { 2084 dep = dbp->db_firstentry; 2085 while (dep) { 2086 /* No mddb_rb32_t structures yet */ 2087 dep2 = dep->de_next; 2088 kmem_free((caddr_t)dep, sizeofde(dep)); 2089 dep = dep2; 2090 } 2091 dbp1 = dbp->db_next; 2092 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2093 dbp = dbp1; 2094 } 2095 s->s_dbp = NULL; 2096 return (err); 2097 2098 } 2099 /* 2100 */ 2101 err = 0; 2102 checksum = MDDB_GLOBAL_XOR; 2103 for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) { 2104 checksum ^= dbp->db_recsum; 2105 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2106 if (dep->de_flags & MDDB_F_OPT) 2107 continue; 2108 err = getrecord(s, dep, li); 2109 if (err) 2110 break; 2111 /* Don't include CHANGELOG in big XOR */ 2112 if (dep->de_flags & MDDB_F_CHANGELOG) 2113 continue; 2114 checksum ^= dep->de_rb->rb_checksum; 2115 checksum ^= dep->de_rb->rb_checksum_fiddle; 2116 } 2117 if (err) 2118 break; 2119 } 2120 if (checksum) { 2121 if (! 
err) 2122 err = MDDB_F_EDATA | MDDB_F_EFMT; 2123 } 2124 if (err) { 2125 dbp = dbhp; 2126 dbhp = NULL; 2127 while (dbp) { 2128 dep = dbp->db_firstentry; 2129 while (dep) { 2130 if (dep->de_rb) 2131 kmem_free((caddr_t)dep->de_rb, 2132 dep->de_recsize); 2133 dep2 = dep->de_next; 2134 kmem_free((caddr_t)dep, sizeofde(dep)); 2135 dep = dep2; 2136 } 2137 dbp1 = dbp->db_next; 2138 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 2139 dbp = dbp1; 2140 } 2141 } 2142 s->s_dbp = dbhp; 2143 return (err); 2144 } 2145 2146 static int 2147 getoptcnt( 2148 mddb_set_t *s, 2149 int li) 2150 { 2151 int result; 2152 mddb_de_ic_t *dep; 2153 mddb_db_t *dbp; 2154 2155 #if defined(_ILP32) && !defined(lint) 2156 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2157 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2158 #endif 2159 2160 result = 0; 2161 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2162 dep = dbp->db_firstentry; 2163 for (; dep != NULL; dep = dep->de_next) { 2164 if (! (dep->de_flags & MDDB_F_OPT)) 2165 continue; 2166 if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) && 2167 (li == dep->de_optinfo[0].o_li)) || 2168 ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) && 2169 (li == dep->de_optinfo[1].o_li))) 2170 result++; 2171 } 2172 } 2173 return (result); 2174 } 2175 2176 static void 2177 getoptdev( 2178 mddb_set_t *s, 2179 mddb_de_ic_t *rdep, 2180 int opti 2181 ) 2182 { 2183 mddb_lb_t *lbp; 2184 mddb_locator_t *lp; 2185 mddb_optinfo_t *otherop; 2186 mddb_optinfo_t *resultop; 2187 int li; 2188 dev_t otherdev; 2189 int blkonly = 0; 2190 int mincnt; 2191 int thiscnt; 2192 2193 lbp = s->s_lbp; 2194 2195 resultop = &rdep->de_optinfo[opti]; 2196 otherop = &rdep->de_optinfo[1-opti]; 2197 2198 resultop->o_flags = 0; 2199 2200 /* 2201 * scan through and see if data bases have to vary by only device 2202 */ 2203 2204 if (otherop->o_flags & MDDB_F_ACTIVE) { 2205 blkonly = 1; 2206 otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev); 2207 for (li = 0; li < lbp->lb_loccnt; 
li++) { 2208 lp = &lbp->lb_locators[li]; 2209 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2210 continue; 2211 if (expldev(lp->l_dev) != otherdev) { 2212 blkonly = 0; 2213 break; 2214 } 2215 } 2216 } 2217 2218 mincnt = 999999; 2219 for (li = 0; li < lbp->lb_loccnt; li++) { 2220 dev_info_t *devi; 2221 int removable = 0; 2222 2223 lp = &lbp->lb_locators[li]; 2224 if (! (lp->l_flags & MDDB_F_ACTIVE)) 2225 continue; 2226 if (otherop->o_flags & MDDB_F_ACTIVE) { 2227 if (blkonly) { 2228 if (otherop->o_li == li) 2229 continue; 2230 } else { 2231 if (otherdev == expldev(lp->l_dev)) 2232 continue; 2233 } 2234 } 2235 2236 /* 2237 * Check if this is a removable device. If it is we 2238 * assume it is something like a USB flash disk, a zip disk 2239 * or even a floppy that is being used to help maintain 2240 * mddb quorum. We don't want to put any optimized resync 2241 * records on these kinds of disks since they are usually 2242 * slower or don't have the same read/write lifetimes as 2243 * a regular fixed disk. 
2244 */ 2245 if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) { 2246 int error; 2247 struct cb_ops *cb; 2248 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF; 2249 int propvalue = 0; 2250 int proplength = sizeof (int); 2251 2252 if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops) 2253 != NULL) { 2254 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi, 2255 prop_op, 2256 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS, 2257 "removable-media", 2258 (caddr_t)&propvalue, &proplength); 2259 2260 if (error == DDI_PROP_SUCCESS) 2261 removable = 1; 2262 } 2263 2264 ddi_release_devi(devi); 2265 } 2266 2267 if (removable) 2268 continue; 2269 2270 thiscnt = getoptcnt(s, li); 2271 if (thiscnt < mincnt) { 2272 resultop->o_li = li; 2273 mincnt = thiscnt; 2274 resultop->o_flags = MDDB_F_ACTIVE; 2275 } 2276 } 2277 } 2278 2279 static void 2280 allocuserdata( 2281 mddb_de_ic_t *dep 2282 ) 2283 { 2284 mddb_rb32_t *rbp; 2285 2286 #if defined(_ILP32) && !defined(lint) 2287 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2288 #endif 2289 2290 rbp = dep->de_rb; 2291 rbp->rb_private = 0; 2292 dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP); 2293 rbp->rb_userdata = 0x4; /* Make sure this is non-zero */ 2294 bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize); 2295 } 2296 2297 2298 static void 2299 getuserdata( 2300 set_t setno, 2301 mddb_de_ic_t *dep 2302 ) 2303 { 2304 mddb_rb32_t *rbp; 2305 2306 2307 mddb_type_t type = dep->de_type1; 2308 caddr_t data, udata; 2309 2310 #if defined(_ILP32) && !defined(lint) 2311 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2312 #endif 2313 rbp = dep->de_rb; 2314 data = (caddr_t)rbp->rb_data; 2315 udata = (caddr_t)dep->de_rb_userdata; 2316 2317 /* 2318 * If it's a driver record, and an old style record, and not a DRL 2319 * record, we must convert it because it was incore as a 64 bit 2320 * structure but its on disk layout has only 32 bit for block sizes 2321 */ 2322 if (!(md_get_setstatus(setno) & MD_SET_IMPORT) && 2323 (type >= 
MDDB_FIRST_MODID) && 2324 (rbp->rb_revision == MDDB_REV_RB)) { 2325 2326 switch (dep->de_flags) { 2327 2328 case MDDB_F_STRIPE: 2329 stripe_convert(data, udata, BIG_2_SMALL); 2330 break; 2331 2332 case MDDB_F_MIRROR: 2333 mirror_convert(data, udata, BIG_2_SMALL); 2334 break; 2335 2336 case MDDB_F_RAID: 2337 raid_convert(data, udata, BIG_2_SMALL); 2338 break; 2339 2340 case MDDB_F_SOFTPART: 2341 softpart_convert(data, udata, BIG_2_SMALL); 2342 break; 2343 2344 case MDDB_F_TRANS_MASTER: 2345 trans_master_convert(data, udata, BIG_2_SMALL); 2346 break; 2347 2348 case MDDB_F_TRANS_LOG: 2349 trans_log_convert(data, udata, BIG_2_SMALL); 2350 break; 2351 2352 case MDDB_F_HOTSPARE: 2353 hs_convert(data, udata, BIG_2_SMALL); 2354 break; 2355 2356 case MDDB_F_OPT: 2357 default: 2358 bcopy(udata, data, dep->de_reqsize); 2359 } 2360 } else { 2361 bcopy(udata, data, dep->de_reqsize); 2362 } 2363 } 2364 2365 static void 2366 getoptrecord( 2367 mddb_set_t *s, 2368 mddb_de_ic_t *dep 2369 ) 2370 { 2371 mddb_lb_t *lbp; 2372 mddb_locator_t *lp; 2373 mddb_rb32_t *rbp, *crbp; 2374 int li; 2375 int i; 2376 int err = 0; 2377 size_t recsize; 2378 2379 #if defined(_ILP32) && !defined(lint) 2380 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2381 #endif 2382 2383 lbp = s->s_lbp; 2384 2385 recsize = dep->de_recsize; 2386 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2387 rbp = dep->de_rb; 2388 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 2389 2390 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 2391 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2392 2393 for (i = 0; i < 2; i++) { 2394 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2395 continue; 2396 li = dep->de_optinfo[i].o_li; 2397 lp = &lbp->lb_locators[li]; 2398 2399 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 2400 (lp->l_flags & MDDB_F_EMASTER)) 2401 continue; 2402 2403 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 2404 dep->de_blkcount, li); 2405 2406 if (err) 2407 continue; 2408 2409 if (rbp->rb_magic != MDDB_MAGIC_RB) 2410 continue; 2411 2412 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 2413 continue; 2414 2415 /* Check the crc for this record */ 2416 if (rec_crcchk(s, dep, rbp)) { 2417 continue; 2418 } 2419 2420 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 2421 2422 if (rbp == crbp) { 2423 if (rbp->rb_checksum != crbp->rb_checksum) 2424 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 2425 break; 2426 } 2427 rbp = crbp; 2428 } 2429 2430 if (rbp == crbp) { 2431 rbp->rb_private = 0; 2432 kmem_free((caddr_t)crbp, recsize); 2433 return; 2434 } 2435 bzero((caddr_t)rbp, recsize); 2436 rbp->rb_magic = MDDB_MAGIC_RB; 2437 rbp->rb_revision = MDDB_REV_RB; 2438 uniqtime32(&rbp->rb_timestamp); 2439 /* Generate the crc for this record */ 2440 rec_crcgen(s, dep, rbp); 2441 kmem_free((caddr_t)crbp, recsize); 2442 } 2443 2444 /* 2445 * writeoptrecord writes out an optimized record. 2446 */ 2447 static int 2448 writeoptrecord( 2449 mddb_set_t *s, 2450 mddb_de_ic_t *dep 2451 ) 2452 { 2453 mddb_rb32_t *rbp; 2454 int li; 2455 int err = 0, wrt_err = 0; 2456 mddb_bf_t *bufhead, *bfp; 2457 mddb_lb_t *lbp = s->s_lbp; 2458 mddb_locator_t *lp; 2459 int i; 2460 2461 #if defined(_ILP32) && !defined(lint) 2462 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2463 #endif 2464 2465 bufhead = NULL; 2466 err = 0; 2467 2468 while (s->s_opthavequeuinglck) { 2469 s->s_optwantqueuinglck++; 2470 cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno)); 2471 } 2472 s->s_opthavequeuinglck++; 2473 rbp = dep->de_rb; 2474 for (i = 0; i < 2; i++) { 2475 /* 2476 * only possible error is xlate. This can 2477 * occur if a replica was off line and came 2478 * back. During the mean time the database grew 2479 * large than the now on line replica can store 2480 */ 2481 if (! 
(dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 2482 continue; 2483 li = dep->de_optinfo[i].o_li; 2484 /* 2485 * In a MN diskset, any node can write optimized record(s). 2486 */ 2487 wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 2488 dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE); 2489 /* 2490 * For MN diskset, set error in optinfo structure so 2491 * that mddb_commitrec knows which replica failed. 2492 */ 2493 if ((MD_MNSET_SETNO(s->s_setno)) && 2494 (wrt_err & MDDB_F_EWRITE)) { 2495 dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE; 2496 } 2497 err |= wrt_err; 2498 } 2499 s->s_opthavequeuinglck = 0; 2500 if (s->s_optwantqueuinglck) { 2501 s->s_optwantqueuinglck = 0; 2502 cv_broadcast(&s->s_optqueuing_cv); 2503 } 2504 for (bfp = bufhead; bfp; bfp = bufhead) { 2505 mutex_exit(SETMUTEX(s->s_setno)); 2506 (void) biowait(&bfp->bf_buf); 2507 mutex_enter(SETMUTEX(s->s_setno)); 2508 if (bfp->bf_buf.b_flags & B_ERROR) { 2509 /* 2510 * If an MN diskset, don't set replica 2511 * in error since this hasn't been set in master. 2512 * Setting replica in error before master could 2513 * leave the nodes with different views of the 2514 * world since a class 1 configuration change 2515 * could occur in mddb_commitrec as soon as 2516 * all locks are dropped. Must keep this 2517 * node the same as master and can't afford a 2518 * failure from the class 1 config change 2519 * if master succeeded. 2520 */ 2521 if (!(MD_MNSET_SETNO(s->s_setno))) { 2522 bfp->bf_locator->l_flags |= MDDB_F_EWRITE; 2523 } else { 2524 /* 2525 * Find which de_optinfo (which replica) 2526 * had a failure and set the failure in 2527 * the o_flags field. 
2528 */ 2529 lp = &lbp->lb_locators[dep->de_optinfo[0].o_li]; 2530 if (lp == bfp->bf_locator) { 2531 dep->de_optinfo[0].o_flags |= 2532 MDDB_F_EWRITE; 2533 } else { 2534 dep->de_optinfo[1].o_flags |= 2535 MDDB_F_EWRITE; 2536 } 2537 } 2538 err |= MDDB_F_EWRITE; 2539 } 2540 bufhead = bfp->bf_next; 2541 freebuffer(s, bfp); 2542 } 2543 return (err); 2544 } 2545 2546 /* 2547 * Fix up the optimized resync record. Used in the traditional and local 2548 * disksets to move an optimized record from a failed or deleted mddb 2549 * to an active one. 2550 * 2551 * In a MN diskset, the fixing of the optimized record is split between 2552 * the master and slave nodes. If the master node moves the optimized 2553 * resync record, then the master node will send a MDDB_PARSE_OPTRECS 2554 * message to the slave nodes causing the slave nodes to reget the 2555 * directory entry containing the location of the optimized resync record. 2556 * After the record is reread from disk, then writeoptrecord is called 2557 * if the location of the optimized resync record or flags have changed. 2558 * When writeoptrecord is called, the node that is the owner of this record 2559 * will write the optimized record to the location specified in the directory 2560 * entry. Since the master node uses the highest class message (PARSE) 2561 * the record owner node is guaranteed to already have an updated 2562 * directory entry incore. 2563 * 2564 * The other difference between the traditional/local set and MN diskset 2565 * is that the directory entry can be written to disk before the optimized 2566 * record in a MN diskset if the record is owned by a slave node. So, 2567 * the users of an optimized record must handle the failure case when no 2568 * data is available from an optimized record since the master node could 2569 * have failed during the relocation of the optimized record to another mddb. 
2570 */ 2571 static int 2572 fixoptrecord( 2573 mddb_set_t *s, 2574 mddb_de_ic_t *dep, 2575 mddb_db_t *dbp 2576 ) 2577 { 2578 int changed; 2579 int writedata; 2580 int err = 0; 2581 int i; 2582 mddb_lb_t *lbp; 2583 mddb_optinfo_t *op; 2584 mddb_db32_t *db32p; 2585 int rec_owner; /* Is node owner of record? */ 2586 2587 #if defined(_ILP32) && !defined(lint) 2588 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2589 #endif 2590 2591 lbp = s->s_lbp; 2592 changed = 0; 2593 writedata = 0; 2594 for (i = 0; i < 2; i++) { 2595 op = &dep->de_optinfo[i]; 2596 2597 if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE)) 2598 op->o_flags = 0; 2599 2600 /* 2601 * If optimized record has seen a replica failure, 2602 * assign new replica to record and re-write data 2603 * to new record. 2604 */ 2605 if (! (op->o_flags & MDDB_F_ACTIVE)) { 2606 getoptdev(s, dep, i); 2607 writedata++; 2608 changed++; 2609 /* Set flag for slaves to reread dep and write rec */ 2610 if (lbp->lb_flags & MDDB_MNSET) { 2611 s->s_mn_parseflags |= MDDB_PARSE_OPTRECS; 2612 } 2613 } 2614 2615 /* 2616 * If just an error in the data was seen, set 2617 * the optimized record's replica flag to active (ok) 2618 * and try again. 2619 */ 2620 if (op->o_flags & MDDB_F_EDATA) { 2621 dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE; 2622 writedata++; 2623 } 2624 } 2625 2626 rec_owner = 0; 2627 if (lbp->lb_flags & MDDB_MNSET) { 2628 /* 2629 * If a MN diskset then check the owner of optimized record. 2630 * If the master node owns the record or if there is 2631 * no owner of the record, then the master can write the 2632 * optimized record to disk. 2633 * Master node can write the optimized record now, but 2634 * slave nodes write their records during handling of 2635 * the MDDB_PARSE_OPTRECS message. 
2636 */ 2637 if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) || 2638 (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) { 2639 rec_owner = 1; 2640 } 2641 } else { 2642 /* 2643 * In traditional diskset and local set, this node 2644 * is always the record owner and always the master. 2645 */ 2646 rec_owner = 1; 2647 } 2648 2649 /* 2650 * If this node is the record owner, write out record. 2651 */ 2652 if ((writedata) && (rec_owner)) { 2653 if (err = writeoptrecord(s, dep)) { 2654 return (err); 2655 } 2656 } 2657 if (! changed) 2658 return (0); 2659 uniqtime32(&dbp->db_timestamp); 2660 dbp->db_revision = MDDB_REV_DB; 2661 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 2662 create_db32rec(db32p, dbp); 2663 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 2664 err = writeall(s, (caddr_t)db32p, db32p->db32_blknum, 2665 1, MDDB_WR_ONLY_MASTER); 2666 kmem_free((caddr_t)db32p, MDDB_BSIZE); 2667 return (err); 2668 } 2669 2670 static int 2671 fixoptrecords( 2672 mddb_set_t *s 2673 ) 2674 { 2675 mddb_de_ic_t *dep; 2676 mddb_db_t *dbp; 2677 int err = 0; 2678 set_t setno; 2679 2680 /* 2681 * In a MN diskset, the master node is the only node that runs 2682 * fixoptrecords. If the master node changes anything, then the 2683 * master node sends PARSE message to the slave nodes. The slave 2684 * nodes will then re-read in the locator block or re-read in the 2685 * directory blocks and re-write the optimized resync records. 2686 */ 2687 setno = s->s_setno; 2688 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 2689 (md_set[setno].s_am_i_master == 0)) { 2690 return (0); 2691 } 2692 2693 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2694 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2695 if (! (dep->de_flags & MDDB_F_OPT)) 2696 continue; 2697 err = fixoptrecord(s, dep, dbp); 2698 if (err != 0) 2699 return (err); 2700 } 2701 } 2702 return (0); 2703 } 2704 2705 /* 2706 * Checks incore version of mddb data to mddb data ondisk. 
2707 * 2708 * Returns: 2709 * - 0 if the data was successfully read and is good. 2710 * - MDDB_F_EREAD if a read error occurred. 2711 * - 1 if the data read is bad (checksum failed, etc) 2712 */ 2713 static int 2714 checkcopy 2715 ( 2716 mddb_set_t *s, 2717 int li 2718 ) 2719 { 2720 mddb_db_t *dbp; 2721 mddb_db32_t *cdb32p; 2722 mddb_de_ic_t *dep; 2723 mddb_de32_t *cde32p; 2724 mddb_rb32_t *rbp, *crbp; 2725 size_t size; 2726 int i; 2727 int retval = 1; 2728 2729 #if defined(_ILP32) && !defined(lint) 2730 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 2731 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 2732 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 2733 #endif 2734 2735 if (s->s_databuffer_size == 0) { 2736 size_t maxrecsize = MDDB_BSIZE; 2737 2738 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) 2739 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) 2740 if (! (dep->de_flags & MDDB_F_OPT) && 2741 dep->de_recsize > maxrecsize) 2742 maxrecsize = dep->de_recsize; 2743 2744 s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); 2745 s->s_databuffer_size = maxrecsize; 2746 } 2747 2748 cdb32p = (mddb_db32_t *)s->s_databuffer; 2749 2750 /* 2751 * first go through and make sure all directory stuff 2752 * is the same 2753 */ 2754 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2755 if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) { 2756 retval = MDDB_F_EREAD; 2757 goto err; 2758 } 2759 if (cdb32p->db32_magic != MDDB_MAGIC_DB) 2760 goto err; 2761 if (revchk(MDDB_REV_DB, cdb32p->db32_revision)) 2762 goto err; 2763 if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL)) 2764 goto err; 2765 if (cdb32p->db32_nextblk != dbp->db_nextblk) 2766 goto err; 2767 if (cdb32p->db32_recsum != dbp->db_recsum) 2768 goto err; 2769 if (cdb32p->db32_firstentry) { 2770 cde32p = (mddb_de32_t *) 2771 ((void *)((caddr_t)(&cdb32p->db32_firstentry) 2772 + sizeof (cdb32p->db32_firstentry))); 2773 } else 2774 cde32p = NULL; 2775 2776 dep = 
dbp->db_firstentry; 2777 /* 2778 * check if all directory entries are identical 2779 */ 2780 while (dep && cde32p) { 2781 if (dep->de_recid != cde32p->de32_recid) 2782 goto err; 2783 if (dep->de_type1 != cde32p->de32_type1) 2784 goto err; 2785 if (dep->de_type2 != cde32p->de32_type2) 2786 goto err; 2787 if (dep->de_reqsize != cde32p->de32_reqsize) 2788 goto err; 2789 if (dep->de_flags != cde32p->de32_flags) 2790 goto err; 2791 2792 for (i = 0; i < 2; i++) { 2793 if (dep->de_optinfo[i].o_li != 2794 cde32p->de32_optinfo[i].o_li) 2795 break; 2796 } 2797 if (i != 2) 2798 goto err; 2799 size = sizeof (mddb_block_t) * dep->de_blkcount; 2800 if (bcmp((caddr_t)dep->de_blks, 2801 (caddr_t)cde32p->de32_blks, size)) 2802 goto err; 2803 dep = dep->de_next; 2804 if (cde32p->de32_next) 2805 cde32p = nextentry(cde32p); 2806 else 2807 cde32p = NULL; 2808 } 2809 if (dep || cde32p) 2810 goto err; 2811 } 2812 /* 2813 * If here, all directories are functionally identical 2814 * check to make sure all records are identical 2815 * the reason the records are not just bcmped is that the 2816 * lock flag does not want to be compared. 2817 */ 2818 crbp = (mddb_rb32_t *)cdb32p; 2819 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 2820 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 2821 if ((dep->de_flags & MDDB_F_OPT) || 2822 (dep->de_flags & MDDB_F_CHANGELOG)) 2823 continue; 2824 rbp = (mddb_rb32_t *)dep->de_rb; 2825 if (readblklst(s, (caddr_t)crbp, dep->de_blks, 2826 dep->de_blkcount, li)) { 2827 retval = MDDB_F_EREAD; 2828 goto err; 2829 } 2830 /* Check the crc for this record */ 2831 if (rec_crcchk(s, dep, crbp)) 2832 goto err; 2833 2834 if (rbp->rb_checksum != crbp->rb_checksum || 2835 rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle) 2836 goto err; 2837 } 2838 } 2839 return (0); 2840 err: 2841 return (retval); 2842 } 2843 2844 /* 2845 * Determine if the location information for two mddbs is the same. 2846 * The device slice and block offset should match. 
If both have devids then 2847 * use that for the comparison, otherwise we compare the dev_ts. 2848 * Comparing with the devid allows us to handle the case where a mddb was 2849 * relocated to a dead mddbs dev_t. The live mddb will have the dev_t of 2850 * the dead mddb but the devid comparison will catch this and not match. 2851 * 2852 * Return 1 if the location of the two mddbs match, 0 if not. 2853 */ 2854 static int 2855 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev, 2856 daddr32_t blkno) 2857 { 2858 if (rip->ri_flags & MDDB_F_EMASTER) { 2859 /* 2860 * If this element is errored then we don't try to match on it. 2861 * If we try to match we could erroneously match on the dev_t 2862 * of a relocated disk. 2863 */ 2864 return (0); 2865 } 2866 2867 if (rip->ri_devid && devid && minor) { 2868 if (ddi_devid_compare(rip->ri_devid, devid) != 0 || 2869 strcmp(rip->ri_minor_name, minor) != 0) 2870 return (0); 2871 } else { 2872 if (rip->ri_dev != dev) 2873 return (0); 2874 } 2875 2876 if (rip->ri_blkno != blkno) 2877 return (0); 2878 2879 return (1); 2880 } 2881 2882 static int 2883 ridev( 2884 mddb_ri_t **rip, 2885 mddb_cfg_loc_t *clp, 2886 dev32_t *dev_2b_fixed, 2887 int flag) 2888 { 2889 mddb_ri_t *r, *r1; 2890 md_dev64_t ldev, ndev; 2891 major_t majordev; 2892 int sz; 2893 2894 if (MD_UPGRADE) { 2895 ldev = md_makedevice(md_targ_name_to_major(clp->l_driver), 2896 clp->l_mnum); 2897 } else { 2898 if (ddi_name_to_major(clp->l_driver) == (major_t)-1) 2899 return (EINVAL); 2900 2901 ldev = md_makedevice(ddi_name_to_major(clp->l_driver), 2902 clp->l_mnum); 2903 } 2904 2905 if (clp->l_devid != 0) { 2906 /* 2907 * Get dev associated with device id and minor name. 2908 * Setup correct driver name if dev is now different. 2909 * Don't change driver name if during upgrade. 
2910 */ 2911 ndev = ldev; 2912 if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid, 2913 &ndev, clp->l_minor_name)) { 2914 if ((ndev != ldev) && (!(MD_UPGRADE))) { 2915 majordev = md_getmajor(ndev); 2916 (void) strcpy(clp->l_driver, 2917 ddi_major_to_name(majordev)); 2918 clp->l_mnum = md_getminor(ndev); 2919 clp->l_devid_flags |= MDDB_DEVID_VALID; 2920 ldev = ndev; 2921 } 2922 } else { 2923 /* Mark as invalid */ 2924 clp->l_devid_flags &= ~MDDB_DEVID_VALID; 2925 } 2926 } 2927 2928 clp->l_dev = md_cmpldev(ldev); 2929 if (dev_2b_fixed) 2930 *dev_2b_fixed = clp->l_dev; 2931 r = *rip; 2932 2933 while (r) { 2934 if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid, 2935 clp->l_minor_name, ldev, clp->l_blkno)) { 2936 if ((clp->l_devid != 0) && 2937 !(clp->l_devid_flags & MDDB_DEVID_VALID)) { 2938 r->ri_flags |= MDDB_F_EMASTER; 2939 } else { 2940 r->ri_flags |= flag; 2941 } 2942 return (0); /* already entered return success */ 2943 } 2944 r = r->ri_next; 2945 } 2946 2947 /* 2948 * This replica not represented in the current rip list, 2949 * so add it to the list. 
2950 */ 2951 r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP); 2952 r->ri_dev = ldev; 2953 r->ri_blkno = clp->l_blkno; 2954 (void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM); 2955 if (strlen(clp->l_driver) >= MD_MAXDRVNM) { 2956 r->ri_driver[(MD_MAXDRVNM -1)] = '\0'; 2957 } 2958 if (clp->l_devname != NULL) { 2959 (void) strcpy(r->ri_devname, clp->l_devname); 2960 } 2961 r->ri_flags |= flag; 2962 if (clp->l_devid != 0) { 2963 sz = clp->l_devid_sz; 2964 r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP); 2965 bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz); 2966 2967 if (clp->l_old_devid != NULL) { 2968 sz = clp->l_old_devid_sz; 2969 r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz, 2970 KM_SLEEP); 2971 bcopy((char *)(uintptr_t)clp->l_old_devid, 2972 (char *)r->ri_old_devid, sz); 2973 } else { 2974 r->ri_old_devid = 0; 2975 } 2976 if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX) 2977 (void) strcpy(r->ri_minor_name, clp->l_minor_name); 2978 2979 if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) { 2980 /* 2981 * Devid is present, but not valid. This could 2982 * happen if device has been powered off or if 2983 * the device has been removed. Mark the device in 2984 * error. Don't allow any writes to this device 2985 * based on the dev_t since another device could 2986 * have been placed in its spot and be responding to 2987 * the dev_t accesses. 2988 */ 2989 r->ri_flags |= MDDB_F_EMASTER; 2990 } 2991 } else { 2992 r->ri_devid = 0; 2993 r->ri_old_devid = 0; 2994 } 2995 2996 /* 2997 * If the rip list is empty then this entry 2998 * is the list. 2999 */ 3000 if (*rip == NULL) { 3001 *rip = r; 3002 return (0); 3003 } 3004 3005 /* 3006 * Add this entry to the end of the rip list 3007 */ 3008 r1 = *rip; 3009 while (r1->ri_next) 3010 r1 = r1->ri_next; 3011 r1->ri_next = r; 3012 return (0); 3013 } 3014 3015 /* 3016 * writecopy writes the incore data blocks out to all of the replicas. 
* This is called from writestart
 *	- when a diskset is started or
 *	- when an error has been encountered during the write to a mddb.
 * and from newdev when a new mddb is being added.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		For MN diskset:
 *			All nodes can call writecopy, but only the
 *			master node actually writes data to the disk
 *			except for optimized resync records.
 *			An optimized resync record can only be written to
 *			by the record owner.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
3038 */ 3039 static int 3040 writecopy( 3041 mddb_set_t *s, 3042 int li, 3043 int flag 3044 ) 3045 { 3046 mddb_db_t *dbp; 3047 mddb_db32_t *db32p; 3048 mddb_de_ic_t *dep; 3049 mddb_rb32_t *rbp; 3050 uint_t checksum; 3051 int err = 0; 3052 3053 #if defined(_ILP32) && !defined(lint) 3054 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 3055 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 3056 #endif 3057 3058 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 3059 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 3060 create_db32rec(db32p, dbp); 3061 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 3062 err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li, 3063 MDDB_WR_ONLY_MASTER); 3064 kmem_free((caddr_t)db32p, MDDB_BSIZE); 3065 if (err) 3066 return (err); 3067 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 3068 /* 3069 * In a multinode diskset, when a new master is 3070 * chosen the new master may need to write its 3071 * incore copy of the mddb to disk. In this case, 3072 * don't want to overwrite the change log records 3073 * so new master sets flag to MDDB_WRITECOPY_SYNC. 3074 */ 3075 if (flag == MDDB_WRITECOPY_SYNC) { 3076 if (dep->de_flags & MDDB_F_CHANGELOG) 3077 continue; 3078 } 3079 /* 3080 * In a multinode diskset, don't write out optimized 3081 * resync resyncs since only the mirror owner node 3082 * will have the correct data. If writecopy is 3083 * being called from writestart as a result of 3084 * an mddb failure, then writestart will handle 3085 * the optimized records when it calls fixoptrecords. 
3086 */ 3087 if ((MD_MNSET_SETNO(s->s_setno)) && 3088 (dep->de_flags & MDDB_F_OPT)) { 3089 continue; 3090 } 3091 3092 rbp = dep->de_rb; 3093 checksum = rbp->rb_checksum_fiddle; 3094 checksum ^= rbp->rb_checksum; 3095 /* Generate the crc for this record */ 3096 rec_crcgen(s, dep, rbp); 3097 checksum ^= rbp->rb_checksum; 3098 rbp->rb_checksum_fiddle = checksum; 3099 if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 3100 dep->de_blkcount, li, (mddb_bf_t **)0, 3101 MDDB_WR_ONLY_MASTER)) 3102 return (err); 3103 } 3104 } 3105 return (0); 3106 } 3107 3108 static int 3109 upd_med( 3110 mddb_set_t *s, 3111 char *tag 3112 ) 3113 { 3114 med_data_t meddb; 3115 int medok; 3116 mddb_lb_t *lbp = s->s_lbp; 3117 set_t setno = s->s_setno; 3118 int li; 3119 int alc; 3120 int lc; 3121 3122 3123 /* If no mediator hosts, nothing to do */ 3124 if (s->s_med.n_cnt == 0) 3125 return (0); 3126 3127 /* 3128 * If this is a MN set and we are not the master, then don't 3129 * update mediator hosts or mark mediator as golden since 3130 * only master node should do that. 3131 */ 3132 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 3133 (md_set[setno].s_am_i_master == 0)) { 3134 return (0); 3135 } 3136 3137 bzero((char *)&meddb, sizeof (med_data_t)); 3138 meddb.med_dat_mag = MED_DATA_MAGIC; 3139 meddb.med_dat_rev = MED_DATA_REV; 3140 meddb.med_dat_fl = 0; 3141 meddb.med_dat_sn = setno; 3142 meddb.med_dat_cc = lbp->lb_commitcnt; 3143 TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime); 3144 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); 3145 3146 /* count accessible mediators */ 3147 medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); 3148 3149 /* count accessible and existing replicas */ 3150 for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) { 3151 mddb_locator_t *lp = &lbp->lb_locators[li]; 3152 3153 if (lp->l_flags & MDDB_F_DELETED) 3154 continue; 3155 3156 lc++; 3157 3158 if (! 
(lp->l_flags & MDDB_F_ACTIVE) || 3159 (lp->l_flags & MDDB_F_EMASTER) || 3160 (lp->l_flags & MDDB_F_EWRITE)) 3161 continue; 3162 3163 alc++; 3164 } 3165 3166 /* 3167 * Mediator update quorum is >= 50%: check for less than 3168 * "mediator update" quorum. 3169 */ 3170 if ((medok * 2) < s->s_med.n_cnt) { 3171 /* panic if <= 50% of all replicas are accessible */ 3172 if ((lc > 0) && ((alc * 2) <= lc)) { 3173 cmn_err(CE_PANIC, 3174 "md: Update of 50%% of the mediator hosts failed"); 3175 /* NOTREACHED */ 3176 } 3177 3178 cmn_err(CE_WARN, 3179 "md: Update of 50%% of the mediator hosts failed"); 3180 } 3181 3182 /* 3183 * If we have mediator update quorum and exactly 50% of the replicas 3184 * are accessible then mark the mediator as golden. 3185 */ 3186 if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) && 3187 ((alc * 2) == lc)) { 3188 meddb.med_dat_fl = MED_DFL_GOLDEN; 3189 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL); 3190 (void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag); 3191 } 3192 3193 return (0); 3194 } 3195 3196 static int 3197 push_lb(mddb_set_t *s) 3198 { 3199 mddb_lb_t *lbp = s->s_lbp; 3200 3201 /* push the change to all the replicas */ 3202 uniqtime32(&lbp->lb_timestamp); 3203 if (MD_MNSET_SETNO(s->s_setno)) { 3204 lbp->lb_revision = MDDB_REV_MNLB; 3205 } else { 3206 lbp->lb_revision = MDDB_REV_LB; 3207 } 3208 return (writelocall(s)); 3209 } 3210 3211 /* Should not call for MN diskset since data tags are not supported */ 3212 static int 3213 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp) 3214 { 3215 int diff = 0; 3216 3217 diff = (int)(odtp->dt_setno - ndtp->dt_setno); 3218 if (diff) 3219 return (diff); 3220 3221 diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN); 3222 if (diff) 3223 return (diff); 3224 3225 diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1); 3226 if (diff) 3227 return (diff); 3228 3229 /*CSTYLED*/ 3230 return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=)); 3231 } 3232 3233 /* Should not 
call for MN diskset since data tags are not supported */ 3234 static int 3235 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp) 3236 { 3237 int nextid = 0; 3238 mddb_dtag_lst_t **dtlpp = &s->s_dtlp; 3239 3240 /* Run to the end of the list */ 3241 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) { 3242 if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0) 3243 return (0); 3244 nextid++; 3245 } 3246 3247 /* Add the new member */ 3248 *dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP); 3249 3250 /* Update the dtag portion of the list */ 3251 bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt), 3252 sizeof (mddb_dtag_t)); 3253 3254 /* Fix up the id value */ 3255 (*dtlpp)->dtl_dt.dt_id = ++nextid; 3256 3257 return (0); 3258 } 3259 3260 /* 3261 * Even though data tags are not supported in MN disksets, dt_cntl may 3262 * be called for a MN diskset since this routine is called even before 3263 * it is known the kind of diskset being read in from disk. 3264 * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned. 3265 */ 3266 static int 3267 dtl_cntl(mddb_set_t *s) 3268 { 3269 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3270 int ndt = 0; 3271 3272 while (dtlp != NULL) { 3273 ndt++; 3274 dtlp = dtlp->dtl_nx; 3275 } 3276 3277 return (ndt); 3278 } 3279 3280 /* 3281 * Even though data tags are not supported in MN disksets, dt_cntl may 3282 * be called for a MN diskset since this routine is called even before 3283 * it is known the kind of diskset being read in from disk. 3284 * For a MNdiskset, s_dtlp is 0 so a 0 is returned. 
3285 */ 3286 static mddb_dtag_t * 3287 dtl_findl(mddb_set_t *s, int id) 3288 { 3289 mddb_dtag_lst_t *dtlp = s->s_dtlp; 3290 3291 while (dtlp != NULL) { 3292 if (dtlp->dtl_dt.dt_id == id) 3293 return (&dtlp->dtl_dt); 3294 dtlp = dtlp->dtl_nx; 3295 } 3296 return ((mddb_dtag_t *)NULL); 3297 } 3298 3299 /* Should not call for MN diskset since data tags are not supported */ 3300 static void 3301 dtl_freel(mddb_dtag_lst_t **dtlpp) 3302 { 3303 mddb_dtag_lst_t *dtlp; 3304 mddb_dtag_lst_t *tdtlp; 3305 3306 3307 for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) { 3308 dtlp = tdtlp->dtl_nx; 3309 kmem_free(tdtlp, sizeof (mddb_dtag_lst_t)); 3310 } 3311 *dtlpp = (mddb_dtag_lst_t *)NULL; 3312 } 3313 3314 /* 3315 * Even though data tags are not supported in MN disksets, dt_setup will 3316 * be called for a MN diskset since this routine is called even before 3317 * it is known the kind of diskset being read in from disk. 3318 * Once this set is known as a MN diskset, the dtp area will be freed. 3319 */ 3320 static void 3321 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp) 3322 { 3323 mddb_dt_t *dtp; 3324 set_t setno = s->s_setno; 3325 3326 3327 if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) 3328 md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3329 else if (dtagp == (mddb_dtag_t *)NULL) 3330 bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 3331 3332 /* shorthand */ 3333 dtp = (mddb_dt_t *)md_set[setno].s_dtp; 3334 3335 dtp->dt_mag = MDDB_MAGIC_DT; 3336 dtp->dt_rev = MDDB_REV_DT; 3337 3338 if (dtagp != NULL) 3339 dtp->dt_dtag = *dtagp; /* structure assignment */ 3340 3341 /* Initialize the setno */ 3342 dtp->dt_dtag.dt_setno = setno; 3343 3344 /* Clear the id and flags, this is only used in user land */ 3345 dtp->dt_dtag.dt_id = 0; 3346 3347 /* Checksum it */ 3348 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 3349 } 3350 3351 /* Should not call for MN diskset since data tags are not supported */ 3352 static int 3353 set_dtag(mddb_set_t *s, md_error_t *ep) 3354 { 3355 
mddb_lb_t *lbp = s->s_lbp; 3356 mddb_dtag_t tag; 3357 3358 if (lbp->lb_dtblkcnt == 0) { 3359 /* Data tags not used in a MN set - so no failure returned */ 3360 if (lbp->lb_flags & MDDB_MNSET) 3361 return (0); 3362 3363 cmn_err(CE_WARN, 3364 "No tag record allocated, unable to tag data"); 3365 (void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno); 3366 return (1); 3367 } 3368 3369 /* Clear the stack variable */ 3370 bzero((caddr_t)&tag, sizeof (mddb_dtag_t)); 3371 3372 /* Get the HW serial number for this host */ 3373 (void) strncpy(tag.dt_sn, hw_serial, MDDB_SN_LEN); 3374 tag.dt_sn[MDDB_SN_LEN - 1] = '\0'; 3375 3376 /* Get the nodename that this host goes by */ 3377 (void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME); 3378 tag.dt_hn[MD_MAX_NODENAME] = '\0'; 3379 3380 /* Get a time stamp for NOW */ 3381 uniqtime32(&tag.dt_tv); 3382 3383 /* Setup the data tag record */ 3384 dt_setup(s, &tag); 3385 3386 /* Free any list of tags if they exist */ 3387 dtl_freel(&s->s_dtlp); 3388 3389 /* Put the new tag onto the tag list */ 3390 (void) dtl_addl(s, &tag); 3391 3392 return (0); 3393 } 3394 3395 /* 3396 * If called during upgrade, this routine expects a non-translated 3397 * (aka target) dev. 3398 * Should not call for MN diskset since data tags are not supported. 
3399 */ 3400 static int 3401 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip) 3402 { 3403 int err = 0; 3404 md_dev64_t dev; 3405 caddr_t tbuf; 3406 daddr_t physblk; 3407 mddb_block_t blk; 3408 mddb_dt_t *dtp; 3409 mddb_dtag_t *dtagp; 3410 set_t setno = s->s_setno; 3411 3412 /* If have not allocated a data tag record, there is nothing to do */ 3413 if (lbp->lb_dtblkcnt == 0) 3414 return (1); 3415 3416 dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP); 3417 3418 if (dtp == (mddb_dt_t *)NULL) 3419 return (1); 3420 3421 /* shorthand */ 3422 dev = md_xlate_targ_2_mini(rip->ri_dev); 3423 if (dev == NODEV64) { 3424 return (1); 3425 } 3426 3427 tbuf = (caddr_t)rip->ri_dtp; 3428 3429 for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) { 3430 physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip); 3431 err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE)); 3432 /* error reading the tag */ 3433 if (err) { 3434 err = 1; 3435 goto out; 3436 } 3437 tbuf += MDDB_BSIZE; 3438 } 3439 3440 /* magic is valid? */ 3441 if (dtp->dt_mag != MDDB_MAGIC_DT) { 3442 err = 1; 3443 goto out; 3444 } 3445 3446 /* revision is valid? */ 3447 if (revchk(MDDB_REV_DT, dtp->dt_rev)) { 3448 err = 1; 3449 goto out; 3450 } 3451 3452 /* crc is valid? */ 3453 if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) { 3454 err = 1; 3455 goto out; 3456 } 3457 3458 /* shorthand */ 3459 dtagp = &dtp->dt_dtag; 3460 3461 /* set number match? */ 3462 if (dtagp->dt_setno != setno) { 3463 err = 1; 3464 goto out; 3465 } 3466 3467 /* tag is not empty? 
*/ 3468 if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' && 3469 (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) && 3470 dtagp->dt_id == 0) { 3471 err = 2; 3472 goto out; 3473 } 3474 3475 /* Mark the locator as having tagged data */ 3476 rip->ri_flags |= MDDB_F_TAGDATA; 3477 3478 out: 3479 if (err) { 3480 if (err == 1) { 3481 md_set_setstatus(setno, MD_SET_BADTAG); 3482 rip->ri_flags |= MDDB_F_BADTAG; 3483 } 3484 if (dtp != NULL) { 3485 kmem_free(dtp, MDDB_DT_BYTES); 3486 rip->ri_dtp = (mddb_dt_t *)NULL; 3487 } 3488 } 3489 3490 return (err); 3491 } 3492 3493 /* Should not call for MN diskset since data tags are not supported */ 3494 static int 3495 dt_write(mddb_set_t *s) 3496 { 3497 int li; 3498 int err = 0; 3499 int werr; 3500 int empty_tag = 0; 3501 mddb_dtag_t *dtagp; 3502 mddb_dt_t *dtp; 3503 mddb_lb_t *lbp = s->s_lbp; 3504 set_t setno = s->s_setno; 3505 uint_t set_status = md_get_setstatus(setno); 3506 3507 3508 ASSERT(md_set[setno].s_dtp != NULL); 3509 3510 /* Nowhere to write to */ 3511 if (lbp->lb_dtblkcnt == 0) 3512 return (err); 3513 3514 if (set_status & MD_SET_BADTAG) 3515 return (err); 3516 3517 /* shorthand */ 3518 dtp = (mddb_dt_t *)md_set[setno].s_dtp; 3519 dtagp = &dtp->dt_dtag; 3520 3521 /* See if the tag is empty. */ 3522 if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' && 3523 (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) && 3524 dtagp->dt_id == 0) 3525 empty_tag = 1; 3526 3527 /* Write the tag to the locators and reset appropriate flags. */ 3528 for (li = 0; li < lbp->lb_loccnt; li++) { 3529 mddb_locator_t *lp = &lbp->lb_locators[li]; 3530 3531 if ((! 
(lp->l_flags & MDDB_F_ACTIVE)) || 3532 (lp->l_flags & MDDB_F_DELETED) || 3533 (lp->l_flags & MDDB_F_EWRITE)) 3534 continue; 3535 3536 werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk, 3537 MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER); 3538 3539 if (werr) { 3540 err |= werr; 3541 continue; 3542 } 3543 3544 if (empty_tag) 3545 lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA); 3546 else { 3547 lp->l_flags |= MDDB_F_TAGDATA; 3548 lp->l_flags &= ~MDDB_F_BADTAG; 3549 } 3550 } 3551 3552 if (err) 3553 return (err); 3554 3555 3556 /* If the tags were written, check to see if any tags remain. */ 3557 for (li = 0; li < lbp->lb_loccnt; li++) { 3558 mddb_locator_t *lp = &lbp->lb_locators[li]; 3559 3560 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 3561 (lp->l_flags & MDDB_F_DELETED) || 3562 (lp->l_flags & MDDB_F_EWRITE)) 3563 continue; 3564 3565 if (lp->l_flags & MDDB_F_TAGDATA) 3566 break; 3567 } 3568 3569 /* If there are no tags, then clear CLRTAG and TAGDATA */ 3570 if (li == lbp->lb_loccnt) { 3571 md_clr_setstatus(setno, MD_SET_CLRTAG); 3572 md_clr_setstatus(setno, MD_SET_TAGDATA); 3573 } 3574 3575 return (err); 3576 } 3577 3578 /* Should not call for MN diskset since data tags are not supported */ 3579 static int 3580 dt_alloc_if_needed(mddb_set_t *s) 3581 { 3582 int i; 3583 int li; 3584 int moveit = 0; 3585 mddb_lb_t *lbp = s->s_lbp; 3586 mddb_block_t blkcnt = lbp->lb_dtblkcnt; 3587 set_t setno = s->s_setno; 3588 uint_t set_status = md_get_setstatus(setno); 3589 3590 /* 3591 * If the data tag record is allocated (blkcnt != 0) and a bad tag was 3592 * not detected, there is nothing to do. 3593 */ 3594 if (blkcnt != 0 && ! 
(set_status & MD_SET_BADTAG)) 3595 return (0); 3596 3597 /* Bitmap not setup, checks can't be done */ 3598 if (s->s_totalblkcnt == 0) 3599 return (0); 3600 3601 /* While reading the tag(s) an invalid tag data record was seen */ 3602 if (set_status & MD_SET_BADTAG) 3603 /* See if the invalid tag needs to be moved */ 3604 for (i = 0; i < MDDB_DT_BLOCKS; i++) 3605 if (blkcheck(s, (i + lbp->lb_dtfirstblk))) { 3606 moveit = 1; 3607 break; 3608 } 3609 3610 /* Need to move or allocate the tag data record */ 3611 if (moveit || blkcnt == 0) { 3612 lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS); 3613 if (lbp->lb_dtfirstblk == 0) { 3614 cmn_err(CE_WARN, 3615 "Unable to allocate data tag record"); 3616 return (0); 3617 } 3618 lbp->lb_dtblkcnt = MDDB_DT_BLOCKS; 3619 3620 /* Mark the locators so that they get written to disk. */ 3621 for (li = 0; li < lbp->lb_loccnt; li++) { 3622 mddb_locator_t *lp = &lbp->lb_locators[li]; 3623 3624 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 3625 (lp->l_flags & MDDB_F_DELETED) || 3626 (lp->l_flags & MDDB_F_EWRITE)) 3627 continue; 3628 3629 lp->l_flags |= MDDB_F_BADTAG; 3630 } 3631 return (1); 3632 } 3633 3634 /* 3635 * Make sure the blocks are owned, since the calculation in 3636 * computefreeblks() is bypassed when MD_SET_BADTAG is set. 3637 */ 3638 for (i = 0; i < MDDB_DT_BLOCKS; i++) 3639 blkbusy(s, (i + lbp->lb_dtfirstblk)); 3640 3641 return (1); 3642 } 3643 3644 /* 3645 * Writestart writes the incore mddb out to all of the replicas. 3646 * This is called when a diskset is started and when an error has 3647 * been enountered during the write to a mddb. 3648 * 3649 * flag can be 2 values: 3650 * MDDB_WRITECOPY_ALL - write all records to all mddbs. This is 3651 * always used for traditional and local disksets. 3652 * This is the normal path for MN disksets since the slave 3653 * nodes aren't actually allowed to write to disk. 3654 * MDDB_WRITECOPY_SYNC - special case for MN diskset. 
When a new 3655 * master has been chosen, the new master may need to 3656 * write its incore mddb to disk (this is the case where the 3657 * old master had executed a message but hadn't relayed it 3658 * to this slave yet). New master should not write the 3659 * change log records since new master would be overwriting 3660 * valuable data. Only used during a reconfig cycle. 3661 */ 3662 static int 3663 writestart( 3664 mddb_set_t *s, 3665 int flag 3666 ) 3667 { 3668 int li; 3669 mddb_locator_t *lp; 3670 mddb_lb_t *lbp; 3671 mddb_ln_t *lnp; 3672 int err = 0; 3673 uint_t set_status; 3674 3675 lbp = s->s_lbp; 3676 3677 for (li = 0; li < lbp->lb_loccnt; li++) { 3678 lp = &lbp->lb_locators[li]; 3679 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3680 continue; 3681 if (! (lp->l_flags & MDDB_F_SUSPECT)) 3682 continue; 3683 if (writecopy(s, li, flag)) 3684 return (1); 3685 lp->l_flags |= MDDB_F_UP2DATE; 3686 } 3687 3688 for (li = 0; li < lbp->lb_loccnt; li++) { 3689 lp = &lbp->lb_locators[li]; 3690 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3691 continue; 3692 if ((lp->l_flags & MDDB_F_UP2DATE)) 3693 continue; 3694 if (checkcopy(s, li)) 3695 if (err = writecopy(s, li, flag)) 3696 return (1); 3697 lp->l_flags |= MDDB_F_UP2DATE; 3698 } 3699 3700 /* 3701 * Call fixoptrecord even during a reconfig cycle since a replica 3702 * failure may force the master to re-assign the optimized 3703 * resync record to another replica. 
3704 */ 3705 if (fixoptrecords(s)) 3706 return (1); 3707 3708 set_status = md_get_setstatus(s->s_setno); 3709 3710 /* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */ 3711 for (li = 0; li < lbp->lb_loccnt; li++) { 3712 lp = &lbp->lb_locators[li]; 3713 3714 if (lp->l_flags & MDDB_F_DELETED) 3715 continue; 3716 3717 if (((lp->l_flags & MDDB_F_ACTIVE) != 0 && 3718 (lp->l_flags & MDDB_F_OLDACT) == 0) || 3719 ((lp->l_flags & MDDB_F_ACTIVE) == 0 && 3720 (lp->l_flags & MDDB_F_OLDACT) != 0)) 3721 break; 3722 3723 if ((set_status & MD_SET_TAGDATA) || 3724 (set_status & MD_SET_CLRTAG)) 3725 if ((lp->l_flags & MDDB_F_TAGDATA) || 3726 (lp->l_flags & MDDB_F_BADTAG)) 3727 break; 3728 } 3729 3730 /* 3731 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) 3732 * the lbp identifier and the set identifier doesn't match. 3733 */ 3734 if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) { 3735 3736 /* Only call for traditional and local sets */ 3737 if (!(lbp->lb_flags & MDDB_MNSET)) 3738 (void) dt_write(s); 3739 3740 setidentifier(s, &lbp->lb_ident); 3741 3742 if (err = push_lb(s)) 3743 return (err); 3744 3745 (void) upd_med(s, "writestart(0)"); 3746 3747 if (err = push_lb(s)) 3748 return (err); 3749 3750 (void) upd_med(s, "writestart(1)"); 3751 3752 lnp = s->s_lnp; 3753 uniqtime32(&lnp->ln_timestamp); 3754 if (lbp->lb_flags & MDDB_MNSET) 3755 lnp->ln_revision = MDDB_REV_MNLN; 3756 else 3757 lnp->ln_revision = MDDB_REV_LN; 3758 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 3759 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 3760 lbp->lb_lnblkcnt, 0); 3761 /* 3762 * If a MN diskset and this is the master, set the PARSE_LOCNM 3763 * flag in the mddb_set structure to show that the locator 3764 * names have changed. 3765 * Don't set parseflags as a result of a new master sync 3766 * during reconfig cycle since slaves nodes are already 3767 * in-sync with the new master. 
3768 */ 3769 3770 if ((lbp->lb_flags & MDDB_MNSET) && 3771 (md_set[s->s_setno].s_am_i_master) && 3772 (flag != MDDB_WRITECOPY_SYNC)) { 3773 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 3774 } 3775 3776 if (err) 3777 return (err); 3778 } 3779 3780 for (li = 0; li < lbp->lb_loccnt; li++) { 3781 lp = &lbp->lb_locators[li]; 3782 if (lp->l_flags & MDDB_F_DELETED) 3783 continue; 3784 if (lp->l_flags & MDDB_F_ACTIVE) { 3785 lp->l_flags |= MDDB_F_OLDACT; 3786 } else { 3787 lp->l_flags &= ~MDDB_F_OLDACT; 3788 } 3789 } 3790 3791 md_clr_setstatus(s->s_setno, MD_SET_STALE); 3792 3793 return (0); 3794 } 3795 3796 /* 3797 * selectreplicas selects the working replicas and may write the incore 3798 * version of the mddb out to the replicas ondisk. 3799 * 3800 * flag can be 3 values: 3801 * MDDB_RETRYSCAN - quick scan to see if there is an error. 3802 * If no new error, returns without writing mddb 3803 * to disks. If a new error is seen, writes out 3804 * mddb to disks. 3805 * MDDB_SCANALL - lengthy scan to check out mddbs and always writes 3806 * out mddb to the replica ondisk. Calls writecopy 3807 * with MDDB_WRITECOPY_ALL flag which writes out 3808 * all records to the replicas ondisk. 3809 * MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore 3810 * and ondisk mddbs by writing incore values to disk. 3811 * Calls writecopy with MDDB_WRITECOPY_SYNC flag so 3812 * that change log records are not written out. 3813 * Only used by MN disksets. 3814 * 3815 * Returns: 3816 * 0 - Successful 3817 * 1 - Unable to write incore mddb data to disk since < 50% replicas. 
3818 */ 3819 int 3820 selectreplicas( 3821 mddb_set_t *s, 3822 int flag 3823 ) 3824 { 3825 int li; 3826 int alc; 3827 int lc; 3828 mddb_locator_t *lp; 3829 mddb_lb_t *lbp = s->s_lbp; 3830 set_t setno = s->s_setno; 3831 int wc_flag; 3832 3833 /* 3834 * can never transition from stale to not stale 3835 */ 3836 if (md_get_setstatus(setno) & MD_SET_STALE) { 3837 for (li = 0; li < lbp->lb_loccnt; li++) { 3838 lp = &lbp->lb_locators[li]; 3839 if (lp->l_flags & MDDB_F_DELETED) 3840 continue; 3841 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3842 lp->l_flags |= MDDB_F_ACTIVE; 3843 } else { 3844 lp->l_flags &= ~MDDB_F_ACTIVE; 3845 } 3846 } 3847 return (1); 3848 } 3849 3850 if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) { 3851 for (li = 0; li < lbp->lb_loccnt; li++) { 3852 lp = &lbp->lb_locators[li]; 3853 if (lp->l_flags & MDDB_F_DELETED) 3854 continue; 3855 if (lp->l_flags & MDDB_F_ACTIVE) { 3856 lp->l_flags |= MDDB_F_OLDACT; 3857 lp->l_flags &= ~MDDB_F_SUSPECT; 3858 } else { 3859 lp->l_flags |= MDDB_F_SUSPECT; 3860 lp->l_flags &= ~MDDB_F_OLDACT; 3861 } 3862 3863 if (! (lp->l_flags & MDDB_F_EMASTER)) { 3864 lp->l_flags |= MDDB_F_ACTIVE; 3865 lp->l_flags &= ~MDDB_F_EWRITE; 3866 lp->l_flags &= ~MDDB_F_TOOSMALL; 3867 } else { 3868 lp->l_flags &= ~MDDB_F_ACTIVE; 3869 } 3870 } 3871 computefreeblks(s); /* set up free block bits */ 3872 } else { 3873 for (li = 0; li < lbp->lb_loccnt; li++) { 3874 lp = &lbp->lb_locators[li]; 3875 if (! 
(lp->l_flags & MDDB_F_ACTIVE)) 3876 continue; 3877 if (lp->l_flags & MDDB_F_EWRITE) 3878 break; 3879 } 3880 3881 /* 3882 * if there are no errors this is error has already 3883 * been processed return current state 3884 */ 3885 if (li == lbp->lb_loccnt) 3886 return (md_get_setstatus(setno) & MD_SET_TOOFEW); 3887 3888 lp->l_flags &= ~MDDB_F_ACTIVE; 3889 do { 3890 lp = &lbp->lb_locators[li]; 3891 lp->l_flags &= ~MDDB_F_UP2DATE; 3892 } while (++li < lbp->lb_loccnt); 3893 } 3894 3895 alc = 0; 3896 lc = 0; 3897 for (li = 0; li < lbp->lb_loccnt; li++) { 3898 lp = &lbp->lb_locators[li]; 3899 if (lp->l_flags & MDDB_F_DELETED) 3900 continue; 3901 lc++; 3902 if (! (lp->l_flags & MDDB_F_ACTIVE)) 3903 continue; 3904 alc++; 3905 } 3906 3907 if (alc < ((lc + 1) / 2)) { 3908 md_set_setstatus(setno, MD_SET_TOOFEW); 3909 return (1); 3910 } 3911 3912 /* Set wc_flag based on flag passed in. */ 3913 if (flag == MDDB_SCANALLSYNC) 3914 wc_flag = MDDB_WRITECOPY_SYNC; 3915 else 3916 wc_flag = MDDB_WRITECOPY_ALL; 3917 3918 do { 3919 if (! writestart(s, wc_flag)) { 3920 md_clr_setstatus(setno, MD_SET_TOOFEW); 3921 return (0); 3922 } 3923 alc = 0; 3924 for (li = 0; li < lbp->lb_loccnt; li++) { 3925 lp = &lbp->lb_locators[li]; 3926 if ((lp->l_flags & MDDB_F_DELETED) || 3927 (lp->l_flags & MDDB_F_EMASTER)) 3928 continue; 3929 3930 if (lp->l_flags & MDDB_F_EWRITE) { 3931 lp->l_flags &= ~MDDB_F_ACTIVE; 3932 lp->l_flags &= ~MDDB_F_UP2DATE; 3933 continue; 3934 } 3935 alc++; 3936 } 3937 } while (alc >= ((lc + 1) / 2)); 3938 md_set_setstatus(setno, MD_SET_TOOFEW); 3939 return (1); 3940 } 3941 3942 static int 3943 checkstate( 3944 mddb_set_t *s, 3945 int probe 3946 ) 3947 { 3948 int error; 3949 uint_t set_status = md_get_setstatus(s->s_setno); 3950 3951 ASSERT(s != NULL); 3952 3953 if (! (set_status & MD_SET_STALE) && ! 
(set_status & MD_SET_TOOFEW)) 3954 return (0); 3955 3956 if (probe == MDDB_NOPROBE) 3957 return (1); 3958 3959 single_thread_start(s); 3960 error = selectreplicas(s, MDDB_SCANALL); 3961 single_thread_end(s); 3962 3963 if (error == 0 && s->s_zombie != 0) { 3964 mutex_exit(SETMUTEX(s->s_setno)); 3965 error = mddb_deleterec(s->s_zombie); 3966 mutex_enter(SETMUTEX(s->s_setno)); 3967 if (error == 0) 3968 s->s_zombie = 0; 3969 } 3970 return (error); 3971 } 3972 3973 static int 3974 writeretry( 3975 mddb_set_t *s 3976 ) 3977 { 3978 if (selectreplicas(s, MDDB_RETRYSCAN)) 3979 if (selectreplicas(s, MDDB_SCANALL)) 3980 return (1); 3981 return (0); 3982 } 3983 3984 static void 3985 free_mbipp(mddb_mb_ic_t **mbipp) 3986 { 3987 mddb_mb_ic_t *mbip1, *mbip2; 3988 3989 for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) { 3990 mbip2 = mbip1->mbi_next; 3991 kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE); 3992 } 3993 *mbipp = (mddb_mb_ic_t *)NULL; 3994 } 3995 3996 static mddb_ri_t * 3997 save_rip(mddb_set_t *s) 3998 { 3999 mddb_ri_t *trip = s->s_rip; 4000 mddb_ri_t *nrip = NULL; 4001 mddb_ri_t **nripp = &nrip; 4002 mddb_ri_t *rip; 4003 4004 while (trip) { 4005 /* Run to the end of the list */ 4006 for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next) 4007 /* void */; 4008 4009 /* Add the new member */ 4010 *nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP); 4011 4012 ASSERT(*nripp != NULL); 4013 4014 /* shorthand */ 4015 rip = *nripp; 4016 4017 *rip = *trip; /* structure assignment */ 4018 4019 /* Clear the stuff that is not needed for hints */ 4020 rip->ri_flags = 0; 4021 rip->ri_commitcnt = 0; 4022 rip->ri_transplant = 0; 4023 rip->ri_mbip = (mddb_mb_ic_t *)NULL; 4024 rip->ri_dtp = (mddb_dt_t *)NULL; 4025 rip->ri_lbp = (mddb_lb_t *)NULL; 4026 rip->ri_did_icp = (mddb_did_ic_t *)NULL; 4027 rip->ri_devid = (ddi_devid_t)NULL; 4028 rip->ri_old_devid = (ddi_devid_t)NULL; 4029 rip->ri_next = (mddb_ri_t *)NULL; 4030 4031 trip = trip->ri_next; 4032 } 4033 return (nrip); 4034 } 4035 4036 
static void 4037 free_rip(mddb_ri_t **ripp) 4038 { 4039 mddb_ri_t *rip; 4040 mddb_ri_t *arip; 4041 4042 for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) { 4043 arip = rip->ri_next; 4044 if (rip->ri_devid != (ddi_devid_t)NULL) { 4045 ddi_devid_free(rip->ri_devid); 4046 rip->ri_devid = (ddi_devid_t)NULL; 4047 } 4048 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 4049 ddi_devid_free(rip->ri_old_devid); 4050 rip->ri_old_devid = (ddi_devid_t)NULL; 4051 } 4052 kmem_free((caddr_t)rip, sizeof (*rip)); 4053 } 4054 *ripp = (mddb_ri_t *)NULL; 4055 } 4056 4057 /* 4058 * this routine selects the correct replica to use 4059 * the rules are as follows 4060 * 1. if all replica has same init time select highest commit count 4061 * 2. if some but not all replicas are from another hostid discard 4062 * them. 4063 * 3. find which init time is present is most replicas 4064 * 4. discard all replicas which do not match most init times 4065 * 5. select replica with highest commit count 4066 */ 4067 4068 static mddb_lb_t * 4069 selectlocator( 4070 mddb_set_t *s 4071 ) 4072 { 4073 mddb_ri_t *rip = s->s_rip; 4074 mddb_ri_t *r, *r1; 4075 mddb_lb_t *lbp; 4076 struct timeval32 *tp = (struct timeval32 *)NULL; 4077 int different; 4078 int same; 4079 int count; 4080 int maxcount; 4081 set_t setno = s->s_setno; 4082 size_t sz; 4083 int mn_set = 0; 4084 4085 /* Clear the ri_transplant flag on all the rip entries. */ 4086 /* Set ri_commitcnt to locator's commitcnt - if available */ 4087 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4088 r->ri_transplant = 0; 4089 if (r->ri_lbp != (mddb_lb_t *)NULL) { 4090 r->ri_commitcnt = r->ri_lbp->lb_commitcnt; 4091 /* If any locators have MN bit set, set flag */ 4092 if (r->ri_lbp->lb_flags & MDDB_MNSET) 4093 mn_set = 1; 4094 } 4095 } 4096 4097 /* 4098 * A data tag is being used, so use it to limit the selection first. 4099 * Data tags not used in MN diskset. 
4100 */ 4101 if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) { 4102 mddb_dt_t *dtp = (mddb_dt_t *)md_set[setno].s_dtp; 4103 4104 /* 4105 * now toss any locators that have a different data tag 4106 */ 4107 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4108 if (r->ri_lbp == (mddb_lb_t *)NULL) 4109 continue; 4110 4111 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4112 /* If same tag, keep it */ 4113 if (dtl_cmp(&dtp->dt_dtag, 4114 &r->ri_dtp->dt_dtag) == 0) 4115 continue; 4116 } 4117 4118 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4119 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4120 r->ri_dtp = (mddb_dt_t *)NULL; 4121 } 4122 4123 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4124 if (!(md_get_setstatus(setno) & 4125 MD_SET_REPLICATED_IMPORT)) { 4126 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4127 sz = ddi_devid_sizeof(r->ri_old_devid); 4128 kmem_free((caddr_t)r->ri_old_devid, sz); 4129 r->ri_old_devid = (ddi_devid_t)NULL; 4130 } 4131 } 4132 4133 kmem_free((caddr_t)r->ri_lbp, 4134 dbtob(r->ri_lbp->lb_blkcnt)); 4135 r->ri_lbp = (mddb_lb_t *)NULL; 4136 4137 r->ri_transplant = 1; 4138 } 4139 4140 /* Tag used, clear the bit */ 4141 md_clr_setstatus(s->s_setno, MD_SET_USETAG); 4142 4143 if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) { 4144 /* 4145 * Get rid of the list of tags. 4146 */ 4147 dtl_freel(&s->s_dtlp); 4148 4149 /* 4150 * Re-create the list with the tag used. 4151 */ 4152 (void) dtl_addl(s, &dtp->dt_dtag); 4153 } 4154 } 4155 4156 /* 4157 * scan to see if all replicas have same time 4158 */ 4159 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4160 if (r->ri_lbp == (mddb_lb_t *)NULL) 4161 continue; 4162 if (tp == NULL) { 4163 tp = &r->ri_lbp->lb_inittime; 4164 continue; 4165 } 4166 /* CSTYLED */ 4167 if (timercmp(tp, &r->ri_lbp->lb_inittime, !=)) 4168 break; 4169 } 4170 4171 /* 4172 * if r == NULL then they were all them same. 
Choose highest 4173 * commit count 4174 */ 4175 if (r == (mddb_ri_t *)NULL) 4176 goto out; 4177 4178 /* 4179 * If here, a bogus replica is present and at least 1 lb_inittime 4180 * did not match. 4181 */ 4182 4183 /* 4184 * look and see if any but not all are from different id 4185 */ 4186 4187 different = 0; 4188 same = 0; 4189 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4190 if (r->ri_lbp == (mddb_lb_t *)NULL) 4191 continue; 4192 if (cmpidentifier(s, &r->ri_lbp->lb_ident)) 4193 different = 1; 4194 else 4195 same = 1; 4196 } 4197 4198 /* 4199 * now go through and throw out different if there are some 4200 * that are the same 4201 */ 4202 if (different != 0 && same != 0) { 4203 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4204 if (r->ri_lbp == (mddb_lb_t *)NULL) 4205 continue; 4206 4207 if (cmpidentifier(s, &r->ri_lbp->lb_ident)) 4208 continue; 4209 4210 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4211 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4212 r->ri_dtp = (mddb_dt_t *)NULL; 4213 } 4214 4215 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4216 if (!(md_get_setstatus(setno) & 4217 MD_SET_REPLICATED_IMPORT)) { 4218 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4219 sz = ddi_devid_sizeof(r->ri_old_devid); 4220 kmem_free((caddr_t)r->ri_old_devid, sz); 4221 r->ri_old_devid = (ddi_devid_t)NULL; 4222 } 4223 } 4224 4225 kmem_free((caddr_t)r->ri_lbp, 4226 dbtob(r->ri_lbp->lb_blkcnt)); 4227 r->ri_lbp = (mddb_lb_t *)NULL; 4228 4229 r->ri_transplant = 1; 4230 } 4231 } 4232 4233 /* 4234 * go through and pick highest. 
Use n square because it is 4235 * simple and 40 some is max possible 4236 */ 4237 maxcount = 0; 4238 lbp = (mddb_lb_t *)NULL; 4239 for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) { 4240 if (r1->ri_lbp == (mddb_lb_t *)NULL) 4241 continue; 4242 count = 0; 4243 for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4244 if (r->ri_lbp == (mddb_lb_t *)NULL) 4245 continue; 4246 if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */ 4247 &r->ri_lbp->lb_inittime, ==)) 4248 count++; 4249 } 4250 if (count > maxcount) { 4251 maxcount = count; 4252 lbp = r1->ri_lbp; 4253 } 4254 } 4255 4256 /* 4257 * now go though and toss any that are of a different time stamp 4258 */ 4259 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4260 if (r->ri_lbp == (mddb_lb_t *)NULL) 4261 continue; 4262 if (timercmp(&lbp->lb_inittime, /* CSTYLED */ 4263 &r->ri_lbp->lb_inittime, ==)) 4264 continue; 4265 4266 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4267 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4268 r->ri_dtp = (mddb_dt_t *)NULL; 4269 } 4270 4271 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4272 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4273 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4274 sz = ddi_devid_sizeof(r->ri_old_devid); 4275 kmem_free((caddr_t)r->ri_old_devid, sz); 4276 r->ri_old_devid = (ddi_devid_t)NULL; 4277 } 4278 } 4279 4280 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4281 r->ri_lbp = (mddb_lb_t *)NULL; 4282 4283 r->ri_transplant = 1; 4284 } 4285 4286 out: 4287 /* 4288 * Find the locator with the highest commit count, and make it the 4289 * "chosen" one. 4290 */ 4291 lbp = (mddb_lb_t *)NULL; 4292 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4293 if (r->ri_lbp == (mddb_lb_t *)NULL) 4294 continue; 4295 4296 if (lbp == NULL) { 4297 lbp = r->ri_lbp; 4298 continue; 4299 } 4300 4301 if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt) 4302 lbp = r->ri_lbp; 4303 } 4304 4305 /* Toss all locator blocks, except the "chosen" one. 
*/ 4306 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) { 4307 if (r->ri_lbp == (mddb_lb_t *)NULL) 4308 continue; 4309 4310 /* Get rid of all dtp's */ 4311 if (r->ri_dtp != (mddb_dt_t *)NULL) { 4312 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES); 4313 r->ri_dtp = (mddb_dt_t *)NULL; 4314 } 4315 4316 if (r->ri_lbp == lbp) 4317 continue; 4318 4319 /* Get rid of extra locator devid block info */ 4320 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp); 4321 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 4322 if (r->ri_old_devid != (ddi_devid_t)NULL) { 4323 sz = ddi_devid_sizeof(r->ri_old_devid); 4324 kmem_free((caddr_t)r->ri_old_devid, sz); 4325 r->ri_old_devid = (ddi_devid_t)NULL; 4326 } 4327 } 4328 4329 /* Get rid of extra locators */ 4330 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt)); 4331 r->ri_lbp = (mddb_lb_t *)NULL; 4332 } 4333 return (lbp); 4334 } 4335 4336 static void 4337 locator2cfgloc( 4338 mddb_lb_t *lbp, 4339 mddb_cfg_loc_t *clp, 4340 int li, 4341 side_t sideno, 4342 mddb_did_ic_t *did_icp 4343 ) 4344 { 4345 mddb_drvnm_t *dn; 4346 mddb_locator_t *lp = &lbp->lb_locators[li]; 4347 mddb_sidelocator_t *slp; 4348 mddb_mnsidelocator_t *mnslp; 4349 mddb_did_info_t *did_info; 4350 int i, sz, szalloc; 4351 int mn_set = 0; 4352 mddb_mnlb_t *mnlbp; 4353 4354 if (lbp->lb_flags & MDDB_MNSET) { 4355 mn_set = 1; 4356 mnlbp = (mddb_mnlb_t *)lbp; 4357 for (i = 0; i < MD_MNMAXSIDES; i++) { 4358 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4359 if (mnslp->mnl_sideno == sideno) 4360 break; 4361 } 4362 if (i == MD_MNMAXSIDES) 4363 return; 4364 } else { 4365 slp = &lbp->lb_sidelocators[sideno][li]; 4366 } 4367 4368 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4369 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 4370 if (did_info->info_flags & MDDB_DID_EXISTS) { 4371 sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]); 4372 if (clp->l_devid_flags & MDDB_DEVID_SPACE) { 4373 /* copy device id from mddb to cfg_loc structure */ 4374 szalloc = clp->l_devid_sz; 
4375 if (sz <= szalloc) { 4376 for (i = 0; i < sz; i++) { 4377 ((char *)(uintptr_t)clp->l_devid)[i] = 4378 ((char *)did_icp->did_ic_devid[li])[i]; 4379 } 4380 clp->l_devid_flags |= MDDB_DEVID_VALID; 4381 (void) strcpy(clp->l_minor_name, 4382 did_info->info_minor_name); 4383 } else { 4384 clp->l_devid_flags |= MDDB_DEVID_NOSPACE; 4385 } 4386 } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) { 4387 clp->l_devid_flags = MDDB_DEVID_SZ; 4388 clp->l_devid_sz = sz; 4389 } 4390 } 4391 } 4392 4393 /* 4394 * Even if a devid exists, use the dev, drvnm and mnum in the locators 4395 * and sidelocators. During startup, the dev, drvnm and mnum in 4396 * these structures may not match the devid (the locators and 4397 * sidelocators will be updated to match the devid by the routine 4398 * load_old_replicas). Using out-of-sync values won't cause any 4399 * problems since ridev will re-derive these from the devid and mnum. 4400 * After startup, the dev, drvnm and mnum in these structures have 4401 * been updated and can be used. 4402 */ 4403 4404 clp->l_blkno = lp->l_blkno; 4405 clp->l_flags = lp->l_flags; 4406 clp->l_dev = lp->l_dev; 4407 4408 if (mn_set) { 4409 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 4410 clp->l_mnum = mnslp->mnl_mnum; 4411 } else { 4412 dn = &lbp->lb_drvnm[slp->l_drvnm_index]; 4413 clp->l_mnum = slp->l_mnum; 4414 } 4415 (void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM); 4416 } 4417 4418 /* 4419 * Find the index into the mnsidelocator where entry will go. 4420 * Then index can be fed into both splitname2locatorblocks and 4421 * cfgloc2locator so that those entries can be kept in sync. 
4422 * 4423 * Returns: 4424 * -1 if failed to find unused slot or if a traditional diskset 4425 * index, if successful (0 <= index <= MD_MNMAXSIDES) 4426 */ 4427 static int 4428 checklocator( 4429 mddb_lb_t *lbp, 4430 int li, 4431 side_t sideno 4432 ) 4433 { 4434 uchar_t i; 4435 mddb_mnsidelocator_t *mnslp; 4436 mddb_mnlb_t *mnlbp; 4437 int index = -1; 4438 4439 if (lbp->lb_flags & MDDB_MNSET) { 4440 /* 4441 * Checking side locator structure. First, check if 4442 * there is already an entry for this side. If so, 4443 * then use that entry. Otherwise, find an entry 4444 * that has a sideno of 0. 4445 */ 4446 mnlbp = (mddb_mnlb_t *)lbp; 4447 for (i = 0; i < MD_MNMAXSIDES; i++) { 4448 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 4449 if (mnslp->mnl_sideno == sideno) { 4450 /* Found a match - stop looking */ 4451 index = i; 4452 break; 4453 } else if ((mnslp->mnl_sideno == 0) && (index == -1)) { 4454 /* Set first empty slot, but keep looking */ 4455 index = i; 4456 } 4457 } 4458 /* Didn't find empty slot or previously used slot */ 4459 if ((i == MD_MNMAXSIDES) && (index == -1)) { 4460 return (-1); 4461 } 4462 return (index); 4463 } else 4464 return (0); 4465 } 4466 4467 /* 4468 * Takes locator information (driver name, minor number, sideno) and 4469 * stores it in the locator block. 4470 * For traditional diskset, the sideno is the index into the sidelocator 4471 * array in the locator block. 4472 * For the MN diskset, the sideno is the nodeid which can be any number, 4473 * so the index passed in is the index into the mnsidelocator array 4474 * in the locator block. 
4475 */ 4476 static int 4477 cfgloc2locator( 4478 mddb_lb_t *lbp, 4479 mddb_cfg_loc_t *clp, 4480 int li, 4481 side_t sideno, 4482 int index /* Only useful in MNsets when > 1 */ 4483 ) 4484 { 4485 uchar_t i; 4486 mddb_sidelocator_t *slp; 4487 mddb_mnsidelocator_t *mnslp; 4488 mddb_set_t *s; 4489 int mn_set = 0; 4490 mddb_mnlb_t *mnlbp; 4491 4492 if (lbp->lb_flags & MDDB_MNSET) { 4493 mnlbp = (mddb_mnlb_t *)lbp; 4494 mn_set = 1; 4495 /* 4496 * Index will be the slot that has the given sideno or 4497 * the first empty slot if no match is found. 4498 * This was pre-checked out in check locator. 4499 */ 4500 mnslp = &mnlbp->lb_mnsidelocators[index][li]; 4501 } else { 4502 slp = &lbp->lb_sidelocators[sideno][li]; 4503 } 4504 4505 /* 4506 * Look for the driver name 4507 */ 4508 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4509 if (lbp->lb_drvnm[i].dn_len == 0) 4510 continue; 4511 if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4512 MD_MAXDRVNM) == 0) 4513 break; 4514 } 4515 4516 /* 4517 * Didn't find one, add a new one 4518 */ 4519 if (i == MDDB_DRVNMCNT) { 4520 for (i = 0; i < MDDB_DRVNMCNT; i++) { 4521 if (lbp->lb_drvnm[i].dn_len == 0) 4522 break; 4523 } 4524 if (i == MDDB_DRVNMCNT) 4525 return (1); 4526 (void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver, 4527 MD_MAXDRVNM); 4528 lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver); 4529 } 4530 4531 /* Fill in the drvnm index */ 4532 if (mn_set) { 4533 mnslp->mnl_drvnm_index = i; 4534 mnslp->mnl_mnum = clp->l_mnum; 4535 mnslp->mnl_sideno = sideno; 4536 } else { 4537 slp->l_drvnm_index = i; 4538 slp->l_mnum = clp->l_mnum; 4539 } 4540 4541 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4542 /* 4543 * This device id could already be associated with this index 4544 * if this is not the first side added to the set. 4545 * If device id is 0, there is no device id for this device. 
4546 */ 4547 if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0) 4548 return (0); 4549 s = (mddb_set_t *)md_set[lbp->lb_setno].s_db; 4550 if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid, 4551 clp->l_minor_name)) { 4552 return (1); 4553 } 4554 } 4555 4556 return (0); 4557 } 4558 4559 /* 4560 * See if there are mediator hosts and try to use the data. 4561 */ 4562 static int 4563 mediate( 4564 mddb_set_t *s 4565 ) 4566 { 4567 mddb_lb_t *lbp = s->s_lbp; 4568 med_data_lst_t *meddlp = NULL; 4569 med_data_lst_t *tmeddlp = NULL; 4570 med_data_t *meddp; 4571 int medok = 0; 4572 int medacc = 0; 4573 uint_t maxcc; 4574 int golden = 0; 4575 int err = 1; 4576 set_t setno = s->s_setno; 4577 4578 /* Do not have a mediator, then the state is stale */ 4579 if (s->s_med.n_cnt == 0) 4580 return (err); 4581 4582 /* Contact the mediator hosts for the data */ 4583 meddlp = get_med_host_data(&s->s_med, s->s_setname, setno); 4584 4585 /* No mediator data, stale */ 4586 if (meddlp == NULL) 4587 return (err); 4588 4589 /* Mark all the mediator data that is not for this set as errored */ 4590 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4591 struct timeval32 tmptime; 4592 meddp = tmeddlp->mdl_med; 4593 4594 /* Count the number of mediators contacted */ 4595 medacc++; 4596 4597 /* Paranoid check */ 4598 if (meddp->med_dat_sn != setno) 4599 meddp->med_dat_fl |= MED_DFL_ERROR; 4600 4601 TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id); 4602 4603 /*CSTYLED*/ 4604 if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=)) 4605 meddp->med_dat_fl |= MED_DFL_ERROR; 4606 } 4607 4608 /* Get the max commitcount */ 4609 maxcc = 0; 4610 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4611 meddp = tmeddlp->mdl_med; 4612 if (meddp->med_dat_fl & MED_DFL_ERROR) 4613 continue; 4614 if (meddp->med_dat_cc > maxcc) 4615 maxcc = meddp->med_dat_cc; 4616 } 4617 4618 /* Now mark the records that don't have the highest cc as errored */ 4619 for (tmeddlp = meddlp; 
tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4620 meddp = tmeddlp->mdl_med; 4621 if (meddp->med_dat_fl & MED_DFL_ERROR) 4622 continue; 4623 if (meddp->med_dat_cc != maxcc) 4624 meddp->med_dat_fl |= MED_DFL_ERROR; 4625 } 4626 4627 /* Now mark the records that don't match the lb commitcnt as errored */ 4628 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4629 meddp = tmeddlp->mdl_med; 4630 if (meddp->med_dat_fl & MED_DFL_ERROR) 4631 continue; 4632 if (meddp->med_dat_cc != lbp->lb_commitcnt) 4633 meddp->med_dat_fl |= MED_DFL_ERROR; 4634 } 4635 4636 /* Is there a "golden" copy and how many valid mediators */ 4637 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) { 4638 meddp = tmeddlp->mdl_med; 4639 if (meddp->med_dat_fl & MED_DFL_ERROR) 4640 continue; 4641 4642 if (meddp->med_dat_fl & MED_DFL_GOLDEN) 4643 golden++; 4644 4645 medok++; 4646 } 4647 4648 /* No survivors, stale */ 4649 if (medok == 0) 4650 goto out; 4651 4652 /* No mediator quorum and no golden copies, stale */ 4653 if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) { 4654 /* Skip odd numbers, no exact 50% */ 4655 if (s->s_med.n_cnt & 1) 4656 goto out; 4657 /* Have 50%, allow an accept */ 4658 if (medacc == (s->s_med.n_cnt / 2)) 4659 md_set_setstatus(setno, MD_SET_ACCOK); 4660 goto out; 4661 } 4662 4663 /* We either have a quorum or a golden copy, or both */ 4664 err = 0; 4665 4666 out: 4667 if (meddlp) { 4668 for (/* void */; meddlp != NULL; meddlp = tmeddlp) { 4669 tmeddlp = meddlp->mdl_nx; 4670 kmem_free(meddlp->mdl_med, sizeof (med_data_t)); 4671 kmem_free(meddlp, sizeof (med_data_lst_t)); 4672 } 4673 } 4674 4675 return (err); 4676 } 4677 4678 /* 4679 * 1. read masterblks and locator blocks for all know database locations 4680 * a. keep track of which have good master blks 4681 * b. 
keep track of which have good locators 4682 * 4683 */ 4684 static int 4685 get_mbs_n_lbs( 4686 mddb_set_t *s, 4687 int *write_lb 4688 ) 4689 { 4690 mddb_lb_t *lbp = NULL; /* pointer to locator block */ 4691 /* May be cast to mddb_mnlb_t */ 4692 /* if accessing sidenames in */ 4693 /* MN set */ 4694 mddb_did_ic_t *did_icp = NULL; /* ptr to Device ID incore */ 4695 mddb_did_blk_t *did_blkp = 0; 4696 int did_blkp_sz = 0; 4697 mddb_did_db_t *did_dbp; 4698 mddb_did_info_t *did_info; 4699 caddr_t did_block; 4700 mddb_ri_t *rip; 4701 mddb_dtag_lst_t *dtlp; 4702 mddb_locator_t *lp; 4703 daddr_t physblk; 4704 int li; 4705 uint_t blk; 4706 md_dev64_t dev; 4707 caddr_t buffer; 4708 uint_t lb_blkcnt; 4709 int retval = 0; 4710 int err = 0; 4711 int lb_ok = 0; 4712 int lb_total = 0; 4713 int lb_tagged = 0; 4714 int lb_tags; 4715 set_t setno = s->s_setno; 4716 int cont_flag, i; 4717 mddb_did_db_t *did_dbp1, *did_dbp2; 4718 int mn_set = 0; 4719 mddb_cfg_loc_t *cl; 4720 4721 /* 4722 * read in master blocks and locator block for all known locators. 4723 * lb_blkcnt will be set correctly for MN set later once getmasters 4724 * has determined that the set is a MN set. 4725 */ 4726 lb_blkcnt = ((setno == MD_LOCAL_SET) ? 4727 MDDB_LOCAL_LBCNT : MDDB_LBCNT); 4728 4729 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 4730 rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL | 4731 MDDB_F_EMASTER); 4732 rip->ri_lbp = (mddb_lb_t *)NULL; 4733 rip->ri_did_icp = (mddb_did_ic_t *)NULL; 4734 4735 /* 4736 * Translated dev is only used in calls to getmasters and 4737 * getblks which expect a translated (aka miniroot) dev. 
4738 */ 4739 dev = md_xlate_targ_2_mini(rip->ri_dev); 4740 if (dev == NODEV64) { 4741 /* Set error flag that getmasters would have set */ 4742 /* if getmasters had been allowed to fail */ 4743 rip->ri_flags |= MDDB_F_EMASTER; 4744 } 4745 4746 /* 4747 * Invalid device id on system (due to failed or 4748 * removed device) or invalid devt during upgrade 4749 * (due to powered off device) will cause this 4750 * replica to be marked in error and not used. 4751 */ 4752 if (rip->ri_flags & MDDB_F_EMASTER) 4753 continue; 4754 4755 /* get all master blocks, does mddb_devopen() */ 4756 rip->ri_mbip = getmasters(s, dev, rip->ri_blkno, 4757 &rip->ri_flags, &mn_set); 4758 4759 /* if invalid master block - try next replica */ 4760 if (! rip->ri_mbip) 4761 continue; 4762 4763 /* 4764 * If lbp alloc'd to wrong size - reset it. 4765 * If MN set, lb_blkcnt must be MDDB_MNLBCNT. 4766 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT. 4767 */ 4768 if (lbp) { 4769 if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) || 4770 ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) { 4771 kmem_free((caddr_t)lbp, dbtob(lb_blkcnt)); 4772 lbp = (mddb_lb_t *)NULL; 4773 } 4774 } 4775 4776 if (lbp == (mddb_lb_t *)NULL) { 4777 /* If a MN set, set lb_blkcnt for MN loc blk size */ 4778 if (mn_set) 4779 lb_blkcnt = MDDB_MNLBCNT; 4780 lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt), 4781 KM_SLEEP); 4782 } 4783 4784 /* 4785 * Read in all the sectors for the locator block 4786 * NOTE: Need to use getblks, rather than readblklst. 
4787 * because it is too early and things are 4788 * NOT set up yet for read*()'s 4789 */ 4790 buffer = (caddr_t)lbp; 4791 for (blk = 0; blk < lb_blkcnt; blk++) { 4792 physblk = getphysblk(blk, rip->ri_mbip); 4793 err = getblks(s, buffer, dev, physblk, 4794 btodb(MDDB_BSIZE)); 4795 if (err) { 4796 rip->ri_flags |= err; 4797 break; 4798 } 4799 buffer += MDDB_BSIZE; 4800 } 4801 4802 if (err) 4803 continue; 4804 4805 /* Verify the locator block */ 4806 if (blk != lb_blkcnt) 4807 continue; 4808 if (lbp->lb_magic != MDDB_MAGIC_LB) 4809 continue; 4810 if (lbp->lb_blkcnt != lb_blkcnt) 4811 continue; 4812 if (mn_set) { 4813 /* If a MN set, check for MNLB revision in lb. */ 4814 if (revchk(MDDB_REV_MNLB, lbp->lb_revision)) 4815 continue; 4816 } else { 4817 /* If not a MN set, check for LB revision in lb. */ 4818 if (revchk(MDDB_REV_LB, lbp->lb_revision)) 4819 continue; 4820 } 4821 if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL)) 4822 continue; 4823 4824 /* 4825 * With the addition of MultiNode Disksets, we must make sure 4826 * to verify that this is the correct set. A node could 4827 * have been out of the config for awhile and this disk could 4828 * have been moved to a different diskset and we don't want 4829 * to accidentally start the wrong set. 4830 * 4831 * We don't do this check if we're in the middle of 4832 * importing a set. 4833 */ 4834 if (!(md_get_setstatus(s->s_setno) & MD_SET_IMPORT) && 4835 (lbp->lb_setno != s->s_setno)) 4836 continue; 4837 4838 rip->ri_flags |= MDDB_F_LOCACC; 4839 4840 /* 4841 * a commit count of zero means this locator has been deleted 4842 */ 4843 if (lbp->lb_commitcnt == 0) 4844 continue; 4845 4846 /* 4847 * If replica is in the device ID style and md_devid_destroy 4848 * flag is set, turn off device id style. This is only to be 4849 * used in a catastrophic failure case. 
Examples would be 4850 * where the device id of all drives in the system 4851 * (especially the mirror'd root drives) had been changed 4852 * by firmware upgrade or by a patch to an existing disk 4853 * driver. Another example would be in the case of non-unique 4854 * device ids due to a bug. The device id would be valid on 4855 * the system, but would return the wrong dev_t. 4856 */ 4857 if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) { 4858 lbp->lb_flags &= ~MDDB_DEVID_STYLE; 4859 lbp->lb_didfirstblk = 0; 4860 lbp->lb_didblkcnt = 0; 4861 *write_lb = 1; 4862 } 4863 4864 4865 /* 4866 * If replica is in device ID style, read in device ID 4867 * block and verify device ID block information. 4868 */ 4869 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4870 4871 /* Read in device ID block */ 4872 if (did_icp == NULL) { 4873 did_icp = (mddb_did_ic_t *) 4874 kmem_zalloc(sizeof (mddb_did_ic_t), 4875 KM_SLEEP); 4876 } else { 4877 /* Reuse did_icp, but clear out data */ 4878 if (did_icp->did_ic_blkp != 4879 (mddb_did_blk_t *)NULL) { 4880 kmem_free((caddr_t)did_icp->did_ic_blkp, 4881 did_blkp_sz); 4882 did_blkp = (mddb_did_blk_t *)NULL; 4883 did_icp->did_ic_blkp = 4884 (mddb_did_blk_t *)NULL; 4885 } 4886 if (did_icp->did_ic_dbp != 4887 (mddb_did_db_t *)NULL) { 4888 did_dbp1 = did_icp->did_ic_dbp; 4889 while (did_dbp1) { 4890 did_dbp2 = did_dbp1->db_next; 4891 kmem_free((caddr_t)did_dbp1->db_ptr, 4892 dbtob(did_dbp1->db_blkcnt)); 4893 kmem_free((caddr_t)did_dbp1, 4894 sizeof (mddb_did_db_t)); 4895 did_dbp1 = did_dbp2; 4896 } 4897 did_icp->did_ic_dbp = 4898 (mddb_did_db_t *)NULL; 4899 } 4900 for (i = 0; i < MDDB_NLB; i++) { 4901 did_icp->did_ic_devid[i] = 4902 (ddi_devid_t)NULL; 4903 } 4904 } 4905 4906 /* Can't reuse blkp since size could be different */ 4907 if (did_blkp != (mddb_did_blk_t *)NULL) { 4908 kmem_free(did_blkp, did_blkp_sz); 4909 } 4910 did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt); 4911 did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz, 4912 KM_SLEEP); 4913 
did_icp->did_ic_blkp = did_blkp; 4914 buffer = (caddr_t)did_blkp; 4915 for (blk = lbp->lb_didfirstblk; 4916 blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk); 4917 blk++) { 4918 physblk = getphysblk(blk, rip->ri_mbip); 4919 err = getblks(s, buffer, dev, physblk, 4920 btodb(MDDB_BSIZE)); 4921 if (err) { 4922 rip->ri_flags |= err; 4923 break; 4924 } 4925 buffer += MDDB_BSIZE; 4926 } 4927 if (err) 4928 continue; 4929 4930 /* Verify the Device ID block */ 4931 if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk)) 4932 continue; 4933 if (did_blkp->blk_magic != MDDB_MAGIC_DI) 4934 continue; 4935 if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS) 4936 continue; 4937 if (revchk(MDDB_REV_DI, did_blkp->blk_revision)) 4938 continue; 4939 if (crcchk(did_blkp, &did_blkp->blk_checksum, 4940 dbtob(lbp->lb_didblkcnt), NULL)) 4941 continue; 4942 4943 /* 4944 * Check if device ID block is out of sync with the 4945 * Locator Block by checking if the locator block 4946 * commitcnt does not match the device id block 4947 * commitcnt. If an 'out of sync' condition 4948 * exists, discard this replica since it has 4949 * inconsistent data and can't be used in 4950 * determining the best replica. 4951 * 4952 * An 'out of sync' condition could happen if old 4953 * SDS code was running with new devid style replicas 4954 * or if a failure occurred between the writing of 4955 * the locator block's commitcnt and the device 4956 * id block's commitcnt. 4957 * 4958 * If old SDS code had been running, the upgrade 4959 * process should detect this situation and 4960 * have removed all of the device id information 4961 * via the md_devid_destroy flag in md.conf. 4962 */ 4963 if (did_blkp->blk_commitcnt != 4964 lbp->lb_commitcnt) { 4965 continue; 4966 } 4967 } 4968 4969 4970 /* 4971 * If replica is still in device ID style, read in all 4972 * of the device IDs, verify the checksum of the device IDs. 
4973 */ 4974 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 4975 /* 4976 * Reset valid bit in device id info block flags. This 4977 * flag is stored on disk, but the valid bit is reset 4978 * when reading in the replica. If the corresponding 4979 * device id is valid (aka meaning that the system 4980 * knows about this device id), the valid bit will 4981 * be set at a later time. The valid bit for this 4982 * replica's device ID will be set in this routine. 4983 * The valid bits for the rest of the device id's 4984 * will be set after the 'best' replica has 4985 * been selected in routine load_old_replicas. 4986 * Reset updated bit in device id info block flags. 4987 * This flag is also stored on disk, reset when read 4988 * in and set when the locators and side locators 4989 * have been updated to match this valid device 4990 * id information. 4991 */ 4992 for (li = 0; li < lbp->lb_loccnt; li++) { 4993 did_info = &did_blkp->blk_info[li]; 4994 if (did_info->info_flags & MDDB_DID_EXISTS) 4995 did_info->info_flags &= 4996 ~(MDDB_DID_VALID | MDDB_DID_UPDATED); 4997 } 4998 4999 cont_flag = 0; 5000 for (li = 0; li < lbp->lb_loccnt; li++) { 5001 did_info = &did_blkp->blk_info[li]; 5002 did_block = (caddr_t)NULL; 5003 if (did_info->info_flags & MDDB_DID_EXISTS) { 5004 /* Check if block has already been read in */ 5005 did_dbp = did_icp->did_ic_dbp; 5006 while (did_dbp != 0) { 5007 if (did_dbp->db_firstblk == 5008 did_info->info_firstblk) 5009 break; 5010 else 5011 did_dbp = did_dbp->db_next; 5012 } 5013 /* if block not found, read it in */ 5014 if (did_dbp == NULL) { 5015 did_block = (caddr_t)(kmem_zalloc(dbtob 5016 (did_info->info_blkcnt), KM_SLEEP)); 5017 buffer = (caddr_t)did_block; 5018 for (blk = did_info->info_firstblk; 5019 blk < (did_info->info_firstblk + 5020 did_info->info_blkcnt); blk++) { 5021 physblk = getphysblk(blk, rip->ri_mbip); 5022 err = getblks(s, buffer, dev, physblk, 5023 btodb(MDDB_BSIZE)); 5024 if (err) { 5025 rip->ri_flags |= err; 5026 break; 5027 } 5028 
buffer += MDDB_BSIZE; 5029 } 5030 if (err) { 5031 kmem_free(did_block, 5032 dbtob(did_info->info_blkcnt)); 5033 did_block = (caddr_t)NULL; 5034 cont_flag = 1; 5035 break; 5036 } 5037 5038 /* 5039 * Block read in - alloc Disk Block area 5040 */ 5041 did_dbp = (mddb_did_db_t *)kmem_zalloc( 5042 sizeof (mddb_did_db_t), KM_SLEEP); 5043 did_dbp->db_ptr = did_block; 5044 did_dbp->db_firstblk = did_info->info_firstblk; 5045 did_dbp->db_blkcnt = did_info->info_blkcnt; 5046 5047 /* Add to front of dbp list */ 5048 did_dbp->db_next = did_icp->did_ic_dbp; 5049 did_icp->did_ic_dbp = did_dbp; 5050 } 5051 /* Check validity of devid in block */ 5052 if (crcchk(((char *)did_dbp->db_ptr + 5053 did_info->info_offset), 5054 &did_info->info_checksum, 5055 did_info->info_length, NULL)) { 5056 cont_flag = 1; 5057 break; 5058 } 5059 5060 /* Block now pointed to by did_dbp */ 5061 did_icp->did_ic_devid[li] = (ddi_devid_t) 5062 ((char *)did_dbp->db_ptr + 5063 did_info->info_offset); 5064 } 5065 } 5066 if (cont_flag) 5067 continue; 5068 } 5069 5070 /* 5071 * All blocks containing devids are now in core. 5072 */ 5073 5074 /* 5075 * If we're doing a replicated import (also known as 5076 * remote copy import), the device id in the locator 5077 * block is incorrect and we need to fix it up here 5078 * alongwith the l_dev otherwise we run into lots of 5079 * trouble later on. 
5080 */ 5081 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5082 mddb_ri_t *trip; 5083 for (li = 0; li < lbp->lb_loccnt; li++) { 5084 did_info = &did_blkp->blk_info[li]; 5085 lp = &lbp->lb_locators[li]; 5086 5087 if (lp->l_flags & MDDB_F_DELETED) 5088 continue; 5089 5090 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5091 continue; 5092 5093 if (rip->ri_old_devid == NULL) 5094 continue; 5095 5096 if (did_icp->did_ic_devid[li] == NULL) 5097 continue; 5098 5099 for (trip = s->s_rip; trip != NULL; 5100 trip = trip->ri_next) { 5101 if (ddi_devid_compare( 5102 trip->ri_old_devid, 5103 did_icp->did_ic_devid[li]) != 0) { 5104 continue; 5105 } 5106 5107 /* update l_dev */ 5108 lp->l_dev = md_cmpldev(trip->ri_dev); 5109 } 5110 } 5111 } 5112 5113 5114 /* 5115 * If there is a valid devid, verify that this locator 5116 * block has information about itself by checking the 5117 * device ID, minor_name and block 5118 * number from this replica's incore data structure 5119 * against the locator block information that has just 5120 * been read in from disk. 5121 * 5122 * If not a valid devid, verify that this locator block 5123 * has information about itself by checking the minor 5124 * number, block number and driver name from this 5125 * replica's incore data structure against the locator 5126 * block information that has just been read in from disk. 5127 */ 5128 if ((rip->ri_devid != NULL) && 5129 (lbp->lb_flags & MDDB_DEVID_STYLE)) { 5130 /* 5131 * This locator block MUST have locator (replica) 5132 * information about itself. Check against devid, 5133 * slice part of minor number, and block number. 
5134 */ 5135 for (li = 0; li < lbp->lb_loccnt; li++) { 5136 did_info = &did_blkp->blk_info[li]; 5137 lp = &lbp->lb_locators[li]; 5138 if (lp->l_flags & MDDB_F_DELETED) 5139 continue; 5140 5141 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5142 continue; 5143 5144 if ((md_get_setstatus(setno) & 5145 MD_SET_REPLICATED_IMPORT)) { 5146 if (ddi_devid_compare(rip->ri_old_devid, 5147 did_icp->did_ic_devid[li]) != 0) 5148 continue; 5149 } else { 5150 if (ddi_devid_compare(rip->ri_devid, 5151 did_icp->did_ic_devid[li]) != 0) 5152 continue; 5153 } 5154 5155 if (strcmp(rip->ri_minor_name, 5156 did_info->info_minor_name) != 0) 5157 continue; 5158 5159 if (lp->l_blkno == rip->ri_blkno) 5160 break; 5161 } 5162 } else { 5163 /* 5164 * This locator block MUST have locator (replica) 5165 * information about itself. 5166 */ 5167 if (!mn_set) { 5168 for (li = 0; li < lbp->lb_loccnt; li++) { 5169 mddb_drvnm_t *dn; 5170 mddb_sidelocator_t *slp; 5171 5172 lp = &lbp->lb_locators[li]; 5173 slp = &lbp->lb_sidelocators[s->s_sideno][li]; 5174 if (lp->l_flags & MDDB_F_DELETED) 5175 continue; 5176 if (slp->l_mnum != md_getminor(rip->ri_dev)) 5177 continue; 5178 if (lp->l_blkno != rip->ri_blkno) 5179 continue; 5180 dn = &lbp->lb_drvnm[slp->l_drvnm_index]; 5181 if (strncmp(dn->dn_data, rip->ri_driver, 5182 MD_MAXDRVNM) == 0) 5183 break; 5184 } 5185 } else { 5186 for (li = 0; li < lbp->lb_loccnt; li++) { 5187 mddb_drvnm_t *dn; 5188 mddb_mnsidelocator_t *mnslp; 5189 mddb_mnlb_t *mnlbp; 5190 int i; 5191 5192 /* 5193 * Check all possible locators locking for 5194 * match to the currently read-in locator, 5195 * must match on: 5196 * - blkno 5197 * - side locator for this node's side 5198 * - side locator minor number 5199 * - side locator driver name 5200 */ 5201 5202 /* Looking at sidelocs - cast lbp -> mnlbp */ 5203 mnlbp = (mddb_mnlb_t *)lbp; 5204 lp = &mnlbp->lb_locators[li]; 5205 if (lp->l_flags & MDDB_F_DELETED) 5206 continue; 5207 if (lp->l_blkno != rip->ri_blkno) 5208 continue; 5209 5210 
for (i = 0; i < MD_MNMAXSIDES; i++) { 5211 mnslp = &mnlbp->lb_mnsidelocators[i][li]; 5212 if (mnslp->mnl_sideno == s->s_sideno) { 5213 break; 5214 } 5215 } 5216 /* No matching side found */ 5217 if (i == MD_MNMAXSIDES) 5218 continue; 5219 if (mnslp->mnl_mnum != md_getminor(rip->ri_dev)) 5220 continue; 5221 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 5222 if (strncmp(dn->dn_data, rip->ri_driver, 5223 MD_MAXDRVNM) == 0) 5224 break; 5225 } 5226 } 5227 } 5228 5229 /* 5230 * Didn't find ourself in this locator block it means 5231 * the locator block is a stale transplant. Probably from 5232 * a user doing a dd. 5233 */ 5234 if (li == lbp->lb_loccnt) 5235 continue; 5236 5237 /* 5238 * Keep track of the number of accessed and valid 5239 * locator blocks. 5240 */ 5241 lb_ok++; 5242 5243 /* 5244 * Read the tag in, skips invalid or blank tags. 5245 * Only valid tags allocate storage 5246 * Data tags are not used in MN disksets. 5247 */ 5248 if ((!mn_set) && (! dt_read(s, lbp, rip))) { 5249 /* 5250 * Keep track of the number of tagged 5251 * locator blocks. 5252 */ 5253 lb_tagged++; 5254 5255 /* Keep a list of unique tags. */ 5256 (void) dtl_addl(s, &rip->ri_dtp->dt_dtag); 5257 } 5258 5259 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5260 /* 5261 * go through locator block and add any other 5262 * locations of the data base. 
5263 * For the replicated import case, this was done earlier 5264 * and we really don't need or want to do so again 5265 */ 5266 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); 5267 for (li = 0; li < lbp->lb_loccnt; li++) { 5268 lp = &lbp->lb_locators[li]; 5269 if (lp->l_flags & MDDB_F_DELETED) 5270 continue; 5271 5272 cl->l_devid_flags = MDDB_DEVID_GETSZ; 5273 cl->l_devid = (uint64_t)0; 5274 cl->l_devid_sz = 0; 5275 cl->l_old_devid = (uint64_t)0; 5276 cl->l_old_devid_sz = 0; 5277 cl->l_minor_name[0] = '\0'; 5278 locator2cfgloc(lbp, cl, li, s->s_sideno, 5279 did_icp); 5280 5281 if (cl->l_devid_flags & MDDB_DEVID_SZ) { 5282 if ((cl->l_devid = (uintptr_t)kmem_alloc 5283 (cl->l_devid_sz, KM_SLEEP)) 5284 == NULL) { 5285 continue; 5286 } else { 5287 cl->l_devid_flags = 5288 MDDB_DEVID_SPACE; 5289 } 5290 } 5291 locator2cfgloc(lbp, cl, li, s->s_sideno, 5292 did_icp); 5293 5294 (void) ridev(&s->s_rip, cl, &lp->l_dev, 0); 5295 5296 if (cl->l_devid_flags & MDDB_DEVID_SPACE) 5297 kmem_free((caddr_t)(uintptr_t) 5298 cl->l_devid, cl->l_devid_sz); 5299 } 5300 kmem_free(cl, sizeof (mddb_cfg_loc_t)); 5301 } 5302 5303 /* Save LB for later */ 5304 rip->ri_lbp = lbp; 5305 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5306 rip->ri_did_icp = did_icp; 5307 did_icp = (mddb_did_ic_t *)NULL; 5308 did_blkp = (mddb_did_blk_t *)NULL; 5309 } else 5310 rip->ri_did_icp = NULL; 5311 lbp = (mddb_lb_t *)NULL; 5312 } 5313 5314 if (lbp != (mddb_lb_t *)NULL) 5315 kmem_free((caddr_t)lbp, dbtob(lb_blkcnt)); 5316 5317 if (did_icp != (mddb_did_ic_t *)NULL) { 5318 if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) { 5319 kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz); 5320 did_blkp = (mddb_did_blk_t *)NULL; 5321 } 5322 if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) { 5323 mddb_did_db_t *did_dbp1, *did_dbp2; 5324 5325 did_dbp1 = did_icp->did_ic_dbp; 5326 while (did_dbp1) { 5327 did_dbp2 = did_dbp1->db_next; 5328 kmem_free((caddr_t)did_dbp1->db_ptr, 5329 dbtob(did_dbp1->db_blkcnt)); 5330 
kmem_free((caddr_t)did_dbp1, 5331 sizeof (mddb_did_db_t)); 5332 did_dbp1 = did_dbp2; 5333 } 5334 } 5335 kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t)); 5336 } 5337 5338 if (did_blkp != (mddb_did_blk_t *)NULL) { 5339 kmem_free((caddr_t)did_blkp, did_blkp_sz); 5340 } 5341 5342 /* No locator blocks were ok */ 5343 if (lb_ok == 0) 5344 goto out; 5345 5346 /* No tagged data was found - will be 0 for MN diskset */ 5347 if (lb_tagged == 0) 5348 goto out; 5349 5350 /* Find the highest non-deleted replica count */ 5351 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5352 int lb_tot = 0; 5353 5354 if (rip->ri_mbip == (mddb_mb_ic_t *)NULL) 5355 continue; 5356 5357 if (rip->ri_lbp == (mddb_lb_t *)NULL) 5358 continue; 5359 5360 for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) { 5361 lp = &rip->ri_lbp->lb_locators[li]; 5362 if (lp->l_flags & MDDB_F_DELETED) 5363 continue; 5364 lb_tot++; 5365 } 5366 5367 if (lb_tot > lb_total) 5368 lb_total = lb_tot; 5369 } 5370 5371 /* Count the number of unique tags */ 5372 for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx) 5373 lb_tags++; 5374 5375 /* Should have at least one tag at this point */ 5376 ASSERT(lb_tags > 0); 5377 5378 5379 /* 5380 * If the number of tagged locators is not the same as the number of 5381 * OK locators OR more than one tag exists, then make sure the 5382 * selected tag will be written out later. 5383 */ 5384 if ((lb_tagged - lb_ok) != 0 || lb_tags > 1) 5385 md_set_setstatus(setno, MD_SET_TAGDATA); 5386 5387 /* Only a single tag, take the tagged data */ 5388 if (lb_tags == 1) { 5389 dt_setup(s, &s->s_dtlp->dtl_dt); 5390 md_set_setstatus(setno, MD_SET_USETAG); 5391 goto out; 5392 } 5393 5394 /* Multiple tags, not selecting a tag, tag mode is on */ 5395 if (! (md_get_setstatus(setno) & MD_SET_USETAG)) 5396 retval = MDDB_E_TAGDATA; 5397 5398 out: 5399 5400 return (retval); 5401 } 5402 5403 /* 5404 * 1. Select a locator. 5405 * 2. check if enough locators now have current copies 5406 * 3. 
read in database from one of latest 5407 * 4. if known to have latest make all database the same 5408 * 5. if configuration has changed rewrite locators 5409 * 5410 * Parameters: 5411 * s - pointer to mddb_set structure 5412 * flag - used in MN disksets to tell if this node is being joined to 5413 * a diskset that is in the STALE state. If the flag is 5414 * MDDB_MN_STALE, then this node should be marked in the STALE 5415 * state even if > 50% mddbs are available. (The diskset can 5416 * only change from STALE->OK if all nodes withdraw from the 5417 * MN diskset and then rejoin). 5418 */ 5419 static int 5420 load_old_replicas( 5421 mddb_set_t *s, 5422 int flag 5423 ) 5424 { 5425 mddb_lb_t *lbp = NULL; 5426 mddb_mnlb_t *mnlbp = NULL; 5427 mddb_ri_t *rip; 5428 mddb_locator_t *lp; 5429 mddb_db_t *dbp; 5430 mddb_de_ic_t *dep; 5431 int li; 5432 int alc; 5433 int lc; 5434 int tlc; 5435 int retval = 0; 5436 caddr_t p; 5437 size_t maxrecsize; 5438 set_t setno = s->s_setno; 5439 mddb_did_db_t *did_dbp1; 5440 mddb_did_info_t *did_info; 5441 mddb_did_ic_t *did_icp = NULL; 5442 md_dev64_t *newdev; 5443 mddb_sidelocator_t *slp = 0; 5444 mddb_mnsidelocator_t *mnslp = 0; 5445 uchar_t i; 5446 char *name; 5447 ddi_devid_t ret_devid; 5448 md_dev64_t dev; 5449 uint_t len, sz; 5450 char *minor_name; 5451 int write_lb = 0; 5452 5453 /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */ 5454 if (retval = get_mbs_n_lbs(s, &write_lb)) 5455 goto errout; 5456 5457 if ((lbp = s->s_lbp = selectlocator(s)) == NULL) { 5458 retval = MDDB_E_NOLOCBLK; 5459 goto errout; 5460 } 5461 5462 /* If a multi-node set, then set md_set.s_status flag */ 5463 if (lbp->lb_flags & MDDB_MNSET) { 5464 md_set_setstatus(setno, MD_SET_MNSET); 5465 /* 5466 * If data tag area had been allocated before set type was 5467 * known - free it now. 
5468 */ 5469 if (md_set[setno].s_dtp) { 5470 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 5471 md_set[setno].s_dtp = NULL; 5472 } 5473 } 5474 5475 /* 5476 * If the replica is in devid format, setup the devid incore ptr. 5477 */ 5478 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5479 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5480 if (rip->ri_lbp == s->s_lbp) { 5481 did_icp = s->s_did_icp = rip->ri_did_icp; 5482 break; 5483 } 5484 } 5485 /* 5486 * If no devid incore info found - something has gone 5487 * wrong so errout. 5488 */ 5489 if (rip == NULL) { 5490 retval = MDDB_E_NODEVID; 5491 goto errout; 5492 } 5493 5494 /* 5495 * Add all blocks containing devids to free list. 5496 * Then remove addresses that actually contain devids. 5497 */ 5498 did_dbp1 = did_icp->did_ic_dbp; 5499 while (did_dbp1) { 5500 if (mddb_devid_free_add(s, did_dbp1->db_firstblk, 5501 0, dbtob(did_dbp1->db_blkcnt))) { 5502 retval = MDDB_E_NOSPACE; 5503 goto errout; 5504 } 5505 5506 did_dbp1 = did_dbp1->db_next; 5507 } 5508 for (li = 0; li < lbp->lb_loccnt; li++) { 5509 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5510 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 5511 continue; 5512 5513 if (mddb_devid_free_delete(s, did_info->info_firstblk, 5514 did_info->info_offset, did_info->info_length)) { 5515 /* unable to find disk block */ 5516 retval = MDDB_E_NODEVID; 5517 goto errout; 5518 } 5519 } 5520 } 5521 5522 /* 5523 * create mddb_mbaray, count all locators and active locators. 5524 */ 5525 alc = 0; 5526 lc = 0; 5527 for (li = 0; li < lbp->lb_loccnt; li++) { 5528 ddi_devid_t li_devid; 5529 5530 lp = &lbp->lb_locators[li]; 5531 5532 if (lp->l_flags & MDDB_F_DELETED) 5533 continue; 5534 5535 /* Count non-deleted replicas */ 5536 lc++; 5537 5538 /* 5539 * Use the devid of this locator to compare with the rip 5540 * list. 
The scenario to watch out for here is that this 5541 * locator could be on a disk that is dead and there could 5542 * be a valid entry in the rip list for a different disk 5543 * that has been moved to the dead disks dev_t. We don't 5544 * want to match with the moved disk. 5545 */ 5546 li_devid = NULL; 5547 (void) mddb_devid_get(s, li, &li_devid, &minor_name); 5548 5549 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5550 if (match_mddb(rip, li_devid, minor_name, 5551 md_expldev(lp->l_dev), lp->l_blkno)) { 5552 break; 5553 } 5554 } 5555 if (rip == NULL) { 5556 /* 5557 * If rip not found, then mark error in master block 5558 * so that no writes are later attempted to this 5559 * replica. rip may not be setup if ridev 5560 * failed due to un-found driver name. 5561 */ 5562 lp->l_flags |= MDDB_F_EMASTER; 5563 continue; 5564 } 5565 5566 s->s_mbiarray[li] = rip->ri_mbip; 5567 5568 lp->l_flags &= MDDB_F_ACTIVE; 5569 lp->l_flags |= (int)rip->ri_flags; 5570 5571 if (rip->ri_transplant) 5572 lp->l_flags &= ~MDDB_F_ACTIVE; 5573 5574 if (lp->l_flags & MDDB_F_LOCACC) 5575 alc++; 5576 } 5577 5578 /* Save on a divide - calculate 50% + 1 up front */ 5579 tlc = ((lc + 1) / 2); 5580 5581 if (alc > tlc) { /* alc > tlc - OK */ 5582 md_clr_setstatus(setno, MD_SET_STALE); 5583 } else if (alc < tlc) { /* alc < tlc - stale */ 5584 md_set_setstatus(setno, MD_SET_STALE); 5585 } else if (lc & 1) { /* alc == tlc && odd - OK */ 5586 md_clr_setstatus(setno, MD_SET_STALE); 5587 } else { /* alc == tlc && even - ? 
*/ 5588 /* Can do an accept, and are */ 5589 if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) { 5590 md_clr_setstatus(setno, MD_SET_STALE); 5591 } else { /* possibly has a mediator */ 5592 if (mediate(s)) { 5593 md_set_setstatus(setno, MD_SET_STALE); 5594 } else { 5595 md_clr_setstatus(setno, MD_SET_STALE); 5596 } 5597 } 5598 5599 /* 5600 * The mirrored_root_flag allows the sysadmin to decide to 5601 * start the local set in a read/write (non-stale) mode 5602 * when there are only 50% available mddbs on the system and 5603 * when the root file system is on a mirror. This is useful 5604 * in a 2 disk system where 1 disk failure would cause an mddb 5605 * quorum failure and subsequent boot failures since the root 5606 * filesystem would be in a read-only state. 5607 */ 5608 if (mirrored_root_flag == 1 && setno == 0 && 5609 svm_bootpath[0] != 0) { 5610 md_clr_setstatus(setno, MD_SET_STALE); 5611 } else { 5612 if (md_get_setstatus(setno) & MD_SET_STALE) { 5613 /* Allow half mode - CAREFUL! */ 5614 if (mddb_allow_half) 5615 md_clr_setstatus(setno, MD_SET_STALE); 5616 } 5617 } 5618 5619 /* 5620 * In a MN diskset, 5621 * - if 50% mddbs are unavailable and this 5622 * has been marked STALE above 5623 * - master node isn't in the STALE state 5624 * - this node isn't the master node (this node 5625 * isn't the first node to join the set) 5626 * then clear the STALE state and set TOOFEW. 5627 * 5628 * If this node is the master node and set was marked STALE, 5629 * then the set stays STALE. 5630 * 5631 * If this node is not the master and this node's state is 5632 * STALE and the master node is not marked STALE, 5633 * then master node must be in the TOOFEW state or the 5634 * master is panic'ing. A MN diskset can only be placed into 5635 * the STALE state by having the first node join the set 5636 * with <= 50% mddbs. 
There's no way for a MN diskset to 5637 * transition between STALE and not-STALE states unless all 5638 * nodes are withdrawn from the diskset or all nodes in the 5639 * diskset are rebooted at the same time. 5640 * 5641 * So, mark this node's state as TOOFEW instead of STALE. 5642 */ 5643 if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE)) 5644 == (MD_SET_MNSET | MD_SET_STALE)) && 5645 ((flag & MDDB_MN_STALE) == 0) && 5646 (!(md_set[setno].s_am_i_master))) { 5647 md_clr_setstatus(setno, MD_SET_STALE); 5648 md_set_setstatus(setno, MD_SET_TOOFEW); 5649 } 5650 } 5651 5652 /* 5653 * If a MN set is marked STALE on the other nodes, 5654 * mark it stale here. Override all other considerations 5655 * such as a mediator or > 50% mddbs available. 5656 */ 5657 if (md_get_setstatus(setno) & MD_SET_MNSET) { 5658 if (flag & MDDB_MN_STALE) 5659 md_set_setstatus(setno, MD_SET_STALE); 5660 } 5661 5662 /* 5663 * read a good copy of the locator names 5664 * if an error occurs reading what is suppose 5665 * to be a good copy continue looking for another 5666 * good copy 5667 */ 5668 s->s_lnp = NULL; 5669 for (li = 0; li < lbp->lb_loccnt; li++) { 5670 lp = &lbp->lb_locators[li]; 5671 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5672 (lp->l_flags & MDDB_F_EMASTER)) 5673 continue; 5674 5675 /* Find rip entry for this locator if one exists */ 5676 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5677 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5678 lp->l_blkno)) 5679 break; 5680 } 5681 5682 if (rip == NULL) { 5683 continue; 5684 } 5685 if (rip->ri_lbp == (mddb_lb_t *)NULL) { 5686 continue; 5687 } 5688 if (rip->ri_lbp->lb_commitcnt != lbp->lb_commitcnt) { 5689 continue; 5690 } 5691 5692 /* 5693 * Now have a copy of the database that is equivalent 5694 * to the chosen locator block with respect to 5695 * inittime, identifier and commitcnt. Trying the 5696 * equivalent databases in the order that they were 5697 * written will provide the most up to date data. 
5698 */ 5699 lp->l_flags |= readlocnames(s, li); 5700 if (s->s_lnp) 5701 break; 5702 } 5703 5704 if (s->s_lnp == NULL) { 5705 retval = MDDB_E_NOLOCNMS; 5706 goto errout; 5707 } 5708 5709 /* 5710 * read a good copy of the data base 5711 * if an error occurs reading what is suppose 5712 * to be a good copy continue looking for another 5713 * good copy 5714 */ 5715 5716 s->s_dbp = NULL; 5717 for (li = 0; li < lbp->lb_loccnt; li++) { 5718 lp = &lbp->lb_locators[li]; 5719 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 5720 (lp->l_flags & MDDB_F_EMASTER)) 5721 continue; 5722 5723 /* Find rip entry for this locator if one exists */ 5724 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 5725 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev), 5726 lp->l_blkno)) 5727 break; 5728 } 5729 5730 if (rip == NULL) { 5731 continue; 5732 } 5733 if (rip->ri_lbp == (mddb_lb_t *)NULL) { 5734 continue; 5735 } 5736 if (rip->ri_lbp->lb_commitcnt != lbp->lb_commitcnt) { 5737 continue; 5738 } 5739 5740 /* 5741 * Now have a copy of the database that is equivalent 5742 * to the chosen locator block with respect to 5743 * inittime, identifier and commitcnt. Trying the 5744 * equivalent databases in the order that they were 5745 * written will provide the most up to date data. 
5746 */ 5747 lp->l_flags |= readcopy(s, li); 5748 5749 if (s->s_dbp) 5750 break; 5751 } 5752 5753 if (s->s_dbp == NULL) { 5754 retval = MDDB_E_NODIRBLK; 5755 goto errout; 5756 } 5757 5758 lp->l_flags |= MDDB_F_MASTER; 5759 lp->l_flags |= MDDB_F_UP2DATE; 5760 5761 /* 5762 * go through and find largest record; 5763 * Also fixup the user data area's 5764 */ 5765 maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size); 5766 5767 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) 5768 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) 5769 if (dep->de_flags & MDDB_F_OPT) 5770 getoptrecord(s, dep); 5771 else { 5772 allocuserdata(dep); 5773 maxrecsize = MAX(dep->de_recsize, maxrecsize); 5774 } 5775 5776 if (maxrecsize > s->s_databuffer_size) { 5777 p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP); 5778 if (s->s_databuffer_size) 5779 kmem_free(s->s_databuffer, s->s_databuffer_size); 5780 s->s_databuffer = p; 5781 s->s_databuffer_size = maxrecsize; 5782 } 5783 5784 /* If we can clear the tag data record, do it now. 
*/ 5785 /* Data tags not supported on MN sets */ 5786 if ((md_get_setstatus(setno) & MD_SET_CLRTAG) && 5787 (!(md_get_setstatus(setno) & MD_SET_MNSET))) 5788 dt_setup(s, NULL); 5789 5790 /* This will return non-zero if STALE or TOOFEW */ 5791 /* This will write out chosen replica image to all replicas */ 5792 if (selectreplicas(s, MDDB_SCANALL)) 5793 goto errout; 5794 5795 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) { 5796 ddi_devid_t devidptr; 5797 5798 lbp = s->s_lbp; 5799 for (li = 0; li < lbp->lb_loccnt; li++) { 5800 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5801 5802 if (did_info->info_flags & MDDB_DID_EXISTS) { 5803 devidptr = s->s_did_icp->did_ic_devid[li]; 5804 lp = &lbp->lb_locators[li]; 5805 for (rip = s->s_rip; rip != NULL; 5806 rip = rip->ri_next) { 5807 if (rip->ri_old_devid == 0) 5808 continue; 5809 if (ddi_devid_compare(rip->ri_old_devid, 5810 devidptr) != 0) { 5811 continue; 5812 } 5813 if (update_locatorblock(s, 5814 md_expldev(lp->l_dev), 5815 rip->ri_devid)) { 5816 goto errout; 5817 } 5818 } 5819 } 5820 } 5821 } 5822 /* 5823 * If the replica is in device id style - validate the device id's, 5824 * if present, in the locator block devid area. 
5825 */ 5826 newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP); 5827 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 5828 for (li = 0; li < lbp->lb_loccnt; li++) { 5829 newdev[li] = 0; 5830 lp = &lbp->lb_locators[li]; 5831 if (lp->l_flags & MDDB_F_DELETED) 5832 continue; 5833 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5834 dev = md_expldev(lp->l_dev); 5835 if (did_info->info_flags & MDDB_DID_EXISTS) { 5836 /* Validate device id on current system */ 5837 newdev[li] = dev; 5838 if (mddb_devid_validate( 5839 did_icp->did_ic_devid[li], 5840 &(newdev[li]), 5841 did_info->info_minor_name) == 0) { 5842 /* Set valid flag */ 5843 did_info->info_flags |= MDDB_DID_VALID; 5844 } else { 5845 lp->l_flags |= MDDB_F_EMASTER; 5846 } 5847 } else if (!(MD_UPGRADE)) { 5848 /* 5849 * If a device doesn't have a device id, 5850 * check if there is now a device ID 5851 * associated with device. If one exists, 5852 * add it to the locator block devid area. 5853 * If there's not enough space to add it, 5854 * print a warning. 5855 * Don't do this during upgrade. 
5856 */ 5857 dev_t ddi_dev = md_dev64_to_dev(dev); 5858 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == 5859 DDI_SUCCESS) { 5860 if (ddi_lyr_get_minor_name(ddi_dev, 5861 S_IFBLK, &minor_name) 5862 == DDI_SUCCESS) { 5863 if (mddb_devid_add(s, li, 5864 ret_devid, minor_name)) { 5865 cmn_err(CE_WARN, 5866 "Not enough space in" 5867 " metadevice state" 5868 " database\n"); 5869 cmn_err(CE_WARN, 5870 "to add relocation" 5871 " information for" 5872 " device:\n"); 5873 cmn_err(CE_WARN, 5874 " major = %d, " 5875 " minor = %d\n", 5876 getmajor(ddi_dev), 5877 getminor(ddi_dev)); 5878 } else { 5879 write_lb = 1; 5880 } 5881 kmem_free(minor_name, 5882 strlen(minor_name) + 1); 5883 } 5884 ddi_devid_free(ret_devid); 5885 } 5886 } 5887 } 5888 5889 /* 5890 * If a device has a valid device id and if the dev_t 5891 * associated with the device id has changed, update the 5892 * driver name, minor num and dev_t in the local and side 5893 * locators to match the dev_t that the system currently 5894 * associates with the device id. 5895 * 5896 * Don't do this during upgrade. 
5897 */ 5898 if (!(MD_UPGRADE)) { 5899 for (li = 0; li < lbp->lb_loccnt; li++) { 5900 lp = &lbp->lb_locators[li]; 5901 if (lp->l_flags & MDDB_F_DELETED) 5902 continue; 5903 did_info = &(did_icp->did_ic_blkp->blk_info[li]); 5904 if ((did_info->info_flags & MDDB_DID_VALID) && 5905 !(did_info->info_flags & MDDB_DID_UPDATED)) { 5906 if (lbp->lb_flags & MDDB_MNSET) { 5907 int j; 5908 int index = -1; 5909 mnlbp = (mddb_mnlb_t *)lbp; 5910 for (j = 0; j < MD_MNMAXSIDES; j++) { 5911 mnslp = &mnlbp-> 5912 lb_mnsidelocators[j][li]; 5913 if (mnslp->mnl_sideno == 5914 s->s_sideno) 5915 break; 5916 if (mnslp->mnl_sideno == 0) 5917 index = j; 5918 } 5919 if (j == MD_MNMAXSIDES) { 5920 /* No match found; take empty */ 5921 mnslp = &mnlbp-> 5922 lb_mnsidelocators[index][li]; 5923 write_lb = 1; 5924 mnslp->mnl_mnum = 5925 md_getminor(newdev[li]); 5926 } else if (mnslp->mnl_mnum != 5927 md_getminor(newdev[li])) { 5928 write_lb = 1; 5929 mnslp->mnl_mnum = 5930 md_getminor(newdev[li]); 5931 } 5932 } else { 5933 slp = &lbp-> 5934 lb_sidelocators[s->s_sideno][li]; 5935 if (slp->l_mnum != 5936 md_getminor(newdev[li])) { 5937 write_lb = 1; 5938 slp->l_mnum = 5939 md_getminor(newdev[li]); 5940 } 5941 } 5942 name = ddi_major_to_name( 5943 md_getmajor(newdev[li])); 5944 if (lbp->lb_flags & MDDB_MNSET) { 5945 i = mnslp->mnl_drvnm_index; 5946 } else { 5947 i = slp->l_drvnm_index; 5948 } 5949 if (strncmp(lbp->lb_drvnm[i].dn_data, name, 5950 lbp->lb_drvnm[i].dn_len) != 0) { 5951 /* Driver name has changed */ 5952 len = strlen(name); 5953 /* Look for the driver name */ 5954 for (i = 0; i < MDDB_DRVNMCNT; i++) { 5955 if (lbp->lb_drvnm[i].dn_len 5956 != len) 5957 continue; 5958 if (strncmp( 5959 lbp->lb_drvnm[i].dn_data, 5960 name, len) == 0) 5961 break; 5962 } 5963 /* Didn't find one, add it */ 5964 if (i == MDDB_DRVNMCNT) { 5965 for (i = 0; i < MDDB_DRVNMCNT; 5966 i++) { 5967 if (lbp->lb_drvnm[i].dn_len 5968 == 0) 5969 break; 5970 } 5971 if (i == MDDB_DRVNMCNT) { 5972 cmn_err(CE_WARN, 5973 "Unable 
to update driver" 5974 " name for dev: " 5975 "major = %d, " 5976 "minor = %d\n", 5977 md_getmajor(newdev[li]), 5978 md_getminor(newdev[li])); 5979 continue; 5980 } 5981 (void) strncpy( 5982 lbp->lb_drvnm[i].dn_data, 5983 name, MD_MAXDRVNM); 5984 lbp->lb_drvnm[i].dn_len = 5985 (uchar_t)strlen(name); 5986 } 5987 /* Fill in the drvnm index */ 5988 if (lbp->lb_flags & MDDB_MNSET) { 5989 mnslp->mnl_drvnm_index = i; 5990 } else { 5991 slp->l_drvnm_index = i; 5992 } 5993 write_lb = 1; 5994 } 5995 did_info->info_flags |= MDDB_DID_UPDATED; 5996 } 5997 } 5998 } 5999 } 6000 kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB); 6001 6002 /* 6003 * If locator block has been changed by get_mbs_n_lbs, 6004 * by addition of new device id, by updated minor name or 6005 * by updated driver name - write out locator block. 6006 */ 6007 if (write_lb) { 6008 if (push_lb(s)) 6009 goto errout; 6010 } 6011 6012 /* 6013 * If the tag was moved, allocated, or a BADTAG was seen for some other 6014 * reason, then make sure tags are written to all the replicas. 6015 * Data tags not supported on MN sets. 6016 */ 6017 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) { 6018 if (! (lc = dt_alloc_if_needed(s))) { 6019 for (li = 0; li < lbp->lb_loccnt; li++) { 6020 lp = &lbp->lb_locators[li]; 6021 6022 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 6023 (lp->l_flags & MDDB_F_EMASTER)) 6024 continue; 6025 6026 if (lp->l_flags & MDDB_F_BADTAG) { 6027 lc = 1; 6028 break; 6029 } 6030 } 6031 } 6032 6033 if (lc) { 6034 md_set_setstatus(setno, MD_SET_TAGDATA); 6035 md_clr_setstatus(setno, MD_SET_BADTAG); 6036 (void) selectreplicas(s, MDDB_SCANALL); 6037 } 6038 } 6039 6040 errout: 6041 6042 /* Free extraneous rip components. 
*/ 6043 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 6044 /* Get rid of lbp's and dtp's */ 6045 6046 if (rip->ri_lbp != lbp) { 6047 if (rip->ri_dtp != (mddb_dt_t *)NULL) { 6048 kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES); 6049 rip->ri_dtp = (mddb_dt_t *)NULL; 6050 } 6051 6052 if (rip->ri_devid != (ddi_devid_t)NULL) { 6053 sz = (int)ddi_devid_sizeof(rip->ri_devid); 6054 kmem_free((caddr_t)rip->ri_devid, sz); 6055 rip->ri_devid = (ddi_devid_t)NULL; 6056 } 6057 if (rip->ri_old_devid != (ddi_devid_t)NULL) { 6058 sz = (int)ddi_devid_sizeof(rip->ri_old_devid); 6059 kmem_free((caddr_t)rip->ri_old_devid, sz); 6060 rip->ri_old_devid = (ddi_devid_t)NULL; 6061 } 6062 6063 if (rip->ri_lbp != (mddb_lb_t *)NULL) { 6064 mddb_devid_icp_free(&rip->ri_did_icp, 6065 rip->ri_lbp); 6066 6067 kmem_free((caddr_t)rip->ri_lbp, 6068 dbtob(rip->ri_lbp->lb_blkcnt)); 6069 rip->ri_lbp = (mddb_lb_t *)NULL; 6070 } 6071 } 6072 6073 if (lbp != NULL) { 6074 for (li = 0; li < lbp->lb_loccnt; li++) { 6075 lp = &lbp->lb_locators[li]; 6076 if (lp->l_flags & MDDB_F_DELETED) 6077 continue; 6078 if (rip->ri_dev == md_expldev(lp->l_dev) && 6079 rip->ri_blkno == lp->l_blkno) 6080 break; 6081 } 6082 if (li < lbp->lb_loccnt) 6083 continue; 6084 } 6085 6086 /* 6087 * Get rid of mbp's: 6088 * if lbp, those out of lb_loccnt bounds 6089 * if !lbp, all of them. 6090 */ 6091 if (rip->ri_mbip) { 6092 md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev); 6093 if (dev64 != NODEV64) { 6094 mddb_devclose(dev64); 6095 free_mbipp(&rip->ri_mbip); 6096 } 6097 } 6098 /* 6099 * Turn off MDDB_F_EMASTER flag in a diskset since diskset 6100 * code always ends up calling ridev for all replicas 6101 * before calling load_old_replicas. ridev will reset 6102 * MDDB_F_EMASTER flag if flag was due to unresolved devid. 6103 */ 6104 if (setno != MD_LOCAL_SET) 6105 rip->ri_flags &= ~MDDB_F_EMASTER; 6106 } 6107 return (retval); 6108 } 6109 6110 /* 6111 * Given the devt from the md.conf info, get the devid for the device. 
6112 */ 6113 static void 6114 lookup_db_devid(mddb_cfg_loc_t *cl) 6115 { 6116 dev_t ldev; 6117 ddi_devid_t devid; 6118 char *minor; 6119 6120 if (ddi_name_to_major(cl->l_driver) == (major_t)-1) { 6121 cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver); 6122 return; 6123 } 6124 6125 ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum); 6126 if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) { 6127 cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x", 6128 cl->l_driver, cl->l_mnum); 6129 return; 6130 } 6131 6132 if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) { 6133 cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x", 6134 cl->l_mnum); 6135 return; 6136 } 6137 6138 cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ; 6139 cl->l_devid_sz = (int)ddi_devid_sizeof(devid); 6140 cl->l_devid = (uint64_t)(uintptr_t)devid; 6141 (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX); 6142 6143 kmem_free(minor, strlen(minor) + 1); 6144 } 6145 6146 /* 6147 * grab driver name, minor, block and devid out of 6148 * strings like "driver:minor:block:devid" 6149 */ 6150 static int 6151 parse_db_loc( 6152 char *str, 6153 mddb_cfg_loc_t *clp 6154 ) 6155 { 6156 char *p, *e; 6157 char *minor_name; 6158 ddi_devid_t ret_devid; 6159 6160 clp->l_dev = 0; 6161 p = clp->l_driver; 6162 e = p + sizeof (clp->l_driver) - 1; 6163 while ((*str != ':') && (*str != '\0') && (p < e)) 6164 *p++ = *str++; 6165 *p = '\0'; 6166 if (*str++ != ':') 6167 return (-1); 6168 clp->l_mnum = 0; 6169 while (ISNUM(*str)) { 6170 clp->l_mnum *= 10; 6171 clp->l_mnum += *str++ - '0'; 6172 } 6173 if (*str++ != ':') 6174 return (-1); 6175 clp->l_blkno = 0; 6176 while (ISNUM(*str)) { 6177 clp->l_blkno *= 10; 6178 clp->l_blkno += *str++ - '0'; 6179 } 6180 if (*str++ != ':') 6181 return (-1); 6182 6183 /* 6184 * If the md_devid_destroy flag is set, ignore the device ids. 6185 * This is only to used in a catastrophic failure case. 
Examples 6186 * would be where the device id of all drives in the system 6187 * (especially the mirror'd root drives) had been changed 6188 * by firmware upgrade or by a patch to an existing disk 6189 * driver. Another example would be in the case of non-unique 6190 * device ids due to a bug. The device id would be valid on 6191 * the system, but would return the wrong dev_t. 6192 */ 6193 if (md_devid_destroy) { 6194 clp->l_devid_flags = 0; 6195 clp->l_devid = (uint64_t)NULL; 6196 clp->l_devid_sz = 0; 6197 clp->l_old_devid = (uint64_t)NULL; 6198 clp->l_old_devid_sz = 0; 6199 clp->l_minor_name[0] = '\0'; 6200 return (0); 6201 } 6202 6203 if (ddi_devid_str_decode(str, 6204 (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE) 6205 return (-1); 6206 6207 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 6208 clp->l_devid_flags = 0; 6209 clp->l_old_devid = (uint64_t)NULL; 6210 clp->l_old_devid_sz = 0; 6211 6212 /* If no device id associated with device, just return */ 6213 if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) { 6214 clp->l_devid_sz = 0; 6215 clp->l_minor_name[0] = '\0'; 6216 if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 && 6217 md_keep_repl_state == 0) { 6218 /* 6219 * No devid in md.conf; we're in recovery mode so 6220 * lookup the devid for the device as specified by 6221 * the devt in md.conf. 6222 */ 6223 lookup_db_devid(clp); 6224 } 6225 return (0); 6226 } 6227 6228 clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | 6229 MDDB_DEVID_SZ; 6230 clp->l_devid_sz = (int)ddi_devid_sizeof( 6231 (ddi_devid_t)(uintptr_t)clp->l_devid); 6232 (void) strcpy(clp->l_minor_name, minor_name); 6233 kmem_free(minor_name, strlen(minor_name) + 1); 6234 6235 return (0); 6236 } 6237 6238 /* 6239 * grab driver name, minor, and block out of 6240 * strings like "driver:minor:block:devid driver:minor:block:devid ..." 
6241 */ 6242 static void 6243 parse_db_string( 6244 char *str 6245 ) 6246 { 6247 char *p, *e; 6248 mddb_cfg_loc_t *cl; 6249 char restore_space; 6250 6251 /* CSTYLED */ 6252 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP); 6253 for (p = str; (*p != '\0'); ) { 6254 for (; ((*p != '\0') && (ISWHITE(*p))); ++p) 6255 ; 6256 if (*p == '\0') 6257 break; 6258 for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e) 6259 ; 6260 /* 6261 * Only give parse_db_loc 1 entry, so stuff a null into 6262 * the string if we're not at the end. We need to save this 6263 * char and restore it after call. 6264 */ 6265 restore_space = '\0'; 6266 if (*e != '\0') { 6267 restore_space = *e; 6268 *e = '\0'; 6269 } 6270 if (parse_db_loc(p, cl) != 0) { 6271 cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p); 6272 } else { 6273 (void) ridev( 6274 &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip, 6275 cl, NULL, MDDB_F_PTCHED); 6276 if (cl->l_devid_flags & MDDB_DEVID_SPACE) { 6277 kmem_free((caddr_t)(uintptr_t)cl->l_devid, 6278 cl->l_devid_sz); 6279 } 6280 } 6281 if (restore_space != '\0') { 6282 *e = restore_space; 6283 } 6284 p = e; 6285 } 6286 kmem_free(cl, sizeof (mddb_cfg_loc_t)); 6287 } 6288 6289 /* 6290 * grab database locations supplied by md.conf as properties 6291 */ 6292 static void 6293 parse_db_strings(void) 6294 { 6295 int bootlist_id; 6296 int proplen; 6297 /* 6298 * size of _bootlist_name should match uses of line and entry in 6299 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c) 6300 */ 6301 char _bootlist_name[MDDB_BOOTLIST_MAX_LEN]; 6302 char *bootlist_name; 6303 caddr_t prop; 6304 6305 /* 6306 * Step through the bootlist properties one at a time by forming the 6307 * correct name, fetching the property, parsing the property and 6308 * then freeing the memory. If a property does not exist or returns 6309 * some form of error just ignore it. 
There is no guarantee that 6310 * the properties will always exist in sequence, for example 6311 * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with 6312 * mddb_bootlist3 existing. 6313 */ 6314 bootlist_name = &_bootlist_name[0]; 6315 for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) { 6316 6317 proplen = 0; 6318 (void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id); 6319 6320 if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo, 6321 DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop, 6322 &proplen) != DDI_PROP_SUCCESS) 6323 continue; 6324 6325 if (proplen <= 0) 6326 continue; 6327 6328 if (md_init_debug) 6329 cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop); 6330 6331 parse_db_string(prop); 6332 kmem_free(prop, proplen); 6333 } 6334 } 6335 6336 static int 6337 initit( 6338 set_t setno, 6339 int flag 6340 ) 6341 { 6342 int i; 6343 mddb_set_t *s; 6344 mddb_lb_t *lbp; /* pointer to locator block */ 6345 mddb_ln_t *lnp; /* pointer to locator names */ 6346 mddb_db_t *dbp; /* pointer to directory block */ 6347 mddb_did_blk_t *did_blkp; /* pointer to Device ID block */ 6348 mddb_did_ic_t *did_icp; /* pointer to Device ID incore area */ 6349 mddb_bf_t *bfp; 6350 side_t sideno; 6351 side_t maxsides; 6352 mddb_block_t lb_blkcnt; 6353 int retval = 0; 6354 md_dev64_t dev; 6355 mddb_mnlb_t *mnlbp; 6356 int devid_flag; 6357 6358 /* single thread's all loads/unloads of set's */ 6359 mutex_enter(&mddb_lock); 6360 mutex_enter(SETMUTEX(setno)); 6361 6362 if (((mddb_set_t *)md_set[setno].s_db) == NULL) { 6363 mutex_exit(SETMUTEX(setno)); 6364 mutex_exit(&mddb_lock); 6365 return (MDDB_E_NOTNOW); 6366 } 6367 6368 s = (mddb_set_t *)md_set[setno].s_db; 6369 6370 single_thread_start(s); 6371 6372 /* 6373 * init is already underway, block. Return success. 
6374 */ 6375 if (s->s_lbp) { 6376 single_thread_end(s); 6377 mutex_exit(SETMUTEX(setno)); 6378 mutex_exit(&mddb_lock); 6379 return (0); 6380 } 6381 6382 uniqtime32(&s->s_inittime); 6383 6384 /* grab database locations patched by /etc/system */ 6385 if (setno == MD_LOCAL_SET) 6386 parse_db_strings(); 6387 6388 s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc( 6389 sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP); 6390 6391 s->s_zombie = 0; 6392 s->s_staledeletes = 0; 6393 s->s_optcmtcnt = 0; 6394 s->s_opthavelck = 0; 6395 s->s_optwantlck = 0; 6396 s->s_optwaiterr = 0; 6397 s->s_opthungerr = 0; 6398 6399 /* 6400 * KEEPTAG can never be set for a MN diskset since no tags are 6401 * allowed to be stored in a MN diskset. No way to check 6402 * if this is a MN diskset or not at this point since the mddb 6403 * hasn't been read in from disk yet. (flag will only have 6404 * MUTLINODE bit set if a new set is being created.) 6405 */ 6406 if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG)) 6407 dt_setup(s, NULL); 6408 6409 md_clr_setstatus(s->s_setno, MD_SET_TOOFEW); 6410 6411 for (i = 0; i < mddb_maxbufheaders; i++) { 6412 bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP); 6413 sema_init(&bfp->bf_buf.b_io, 0, NULL, 6414 SEMA_DEFAULT, NULL); 6415 sema_init(&bfp->bf_buf.b_sem, 0, NULL, 6416 SEMA_DEFAULT, NULL); 6417 bfp->bf_buf.b_offset = -1; 6418 freebuffer(s, bfp); 6419 } 6420 6421 retval = load_old_replicas(s, flag); 6422 /* If 0 return value - success */ 6423 if (! retval) { 6424 single_thread_end(s); 6425 mutex_exit(SETMUTEX(setno)); 6426 mutex_exit(&mddb_lock); 6427 return (0); 6428 } 6429 6430 /* 6431 * If here, then the load_old_replicas() failed 6432 */ 6433 6434 6435 /* If the database was supposed to exist. */ 6436 if (flag & MDDB_MUSTEXIST) { 6437 if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) { 6438 for (i = 0; i < mddb_maxcopies; i++) { 6439 if (! 
s->s_mbiarray[i]) 6440 continue; 6441 dev = md_expldev( 6442 s->s_lbp->lb_locators[i].l_dev); 6443 dev = md_xlate_targ_2_mini(dev); 6444 if (dev != NODEV64) { 6445 mddb_devclose(dev); 6446 free_mbipp(&s->s_mbiarray[i]); 6447 } 6448 } 6449 6450 kmem_free((caddr_t)s->s_mbiarray, 6451 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 6452 s->s_mbiarray = NULL; 6453 } 6454 6455 if (s->s_lnp != (mddb_ln_t *)NULL) { 6456 kmem_free((caddr_t)s->s_lnp, 6457 dbtob(s->s_lbp->lb_lnblkcnt)); 6458 s->s_lnp = (mddb_ln_t *)NULL; 6459 } 6460 6461 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 6462 6463 if (s->s_lbp != (mddb_lb_t *)NULL) { 6464 kmem_free((caddr_t)s->s_lbp, 6465 dbtob(s->s_lbp->lb_blkcnt)); 6466 s->s_lbp = (mddb_lb_t *)NULL; 6467 } 6468 6469 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 6470 kmem_free((caddr_t)bfp, sizeof (*bfp)); 6471 6472 single_thread_end(s); 6473 mutex_exit(SETMUTEX(setno)); 6474 mutex_exit(&mddb_lock); 6475 6476 if (retval == MDDB_E_TAGDATA) 6477 return (retval); 6478 6479 /* Want a bit more detailed error messages */ 6480 if (mddb_db_err_detail) 6481 return (retval); 6482 6483 return (MDDB_E_NODB); 6484 } 6485 6486 6487 /* 6488 * MDDB_NOOLDOK set - Creating a new database, so do 6489 * more initialization. 6490 */ 6491 6492 lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6493 MDDB_LOCAL_LBCNT : MDDB_LBCNT); 6494 if (flag & MDDB_MULTINODE) { 6495 lb_blkcnt = MDDB_MNLBCNT; 6496 } 6497 6498 if (s->s_lbp == NULL) 6499 s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP); 6500 lbp = s->s_lbp; 6501 6502 bzero((caddr_t)lbp, dbtob(lb_blkcnt)); 6503 lbp->lb_setno = setno; 6504 lbp->lb_magic = MDDB_MAGIC_LB; 6505 if (flag & MDDB_MULTINODE) { 6506 lbp->lb_revision = MDDB_REV_MNLB; 6507 } else { 6508 lbp->lb_revision = MDDB_REV_LB; 6509 } 6510 lbp->lb_inittime = s->s_inittime; 6511 if (flag & MDDB_MULTINODE) { 6512 mnlbp = (mddb_mnlb_t *)lbp; 6513 for (i = 0; i < MDDB_NLB; i++) { 6514 for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) { 6515 mddb_mnsidelocator_t *mnslp; 6516 mnslp = &mnlbp->lb_mnsidelocators[sideno][i]; 6517 mnslp->mnl_mnum = NODEV32; 6518 mnslp->mnl_sideno = 0; 6519 mnslp->mnl_drvnm_index = 0; 6520 } 6521 } 6522 } else { 6523 maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES); 6524 for (i = 0; i < MDDB_NLB; i++) { 6525 for (sideno = 0; sideno < maxsides; sideno++) { 6526 mddb_sidelocator_t *slp; 6527 slp = &lbp->lb_sidelocators[sideno][i]; 6528 slp->l_mnum = NODEV32; 6529 } 6530 } 6531 } 6532 lbp->lb_blkcnt = lb_blkcnt; 6533 6534 /* lb starts on block 0 */ 6535 /* locator names starts after locator block */ 6536 lbp->lb_lnfirstblk = lb_blkcnt; 6537 if (flag & MDDB_MULTINODE) { 6538 lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT; 6539 } else { 6540 lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ? 
6541 MDDB_LOCAL_LNCNT : MDDB_LNCNT); 6542 } 6543 6544 if (flag & MDDB_MULTINODE) { 6545 /* Creating a multinode diskset */ 6546 md_set_setstatus(setno, MD_SET_MNSET); 6547 lbp->lb_flags |= MDDB_MNSET; 6548 } 6549 6550 /* Data portion of mddb located after locator names */ 6551 lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt; 6552 6553 /* the btodb that follows is converting the directory block size */ 6554 /* Data tag part of mddb located after first block of mddb data */ 6555 lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk + 6556 btodb(MDDB_BSIZE)); 6557 /* Data tags are not used in MN diskset - so set count to 0 */ 6558 if (flag & MDDB_MULTINODE) 6559 lbp->lb_dtblkcnt = (mddb_block_t)0; 6560 else 6561 lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS; 6562 6563 6564 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP); 6565 lnp->ln_magic = MDDB_MAGIC_LN; 6566 if (flag & MDDB_MULTINODE) { 6567 lnp->ln_revision = MDDB_REV_MNLN; 6568 } else { 6569 lnp->ln_revision = MDDB_REV_LN; 6570 } 6571 s->s_lnp = lnp; 6572 6573 /* 6574 * Set up Device ID portion of Locator Block. 6575 * Do not set locator to device id style if 6576 * md_devid_destroy is 1 and md_keep_repl_state is 1 6577 * (destroy all device id data and keep replica in 6578 * non device id mode). 6579 * 6580 * This is logically equivalent to set locator to 6581 * device id style if md_devid_destroy is 0 or 6582 * md_keep_repl_state is 0. 6583 * 6584 * In SunCluster environment, device id mode is disabled 6585 * which means diskset will be run in non-devid mode. For 6586 * localset, the behavior will remain intact and run in 6587 * device id mode. 6588 * 6589 * In multinode diskset devids are turned off. 
6590 */ 6591 devid_flag = 1; 6592 if (cluster_bootflags & CLUSTER_CONFIGURED) 6593 if (setno != MD_LOCAL_SET) 6594 devid_flag = 0; 6595 if (flag & MDDB_MULTINODE) 6596 devid_flag = 0; 6597 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1)) 6598 devid_flag = 0; 6599 /* 6600 * if we weren't devid style before and md_keep_repl_state=1 6601 * we need to stay non-devid 6602 */ 6603 if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) && 6604 (md_keep_repl_state == 1)) 6605 devid_flag = 0; 6606 if (devid_flag) { 6607 lbp->lb_didfirstblk = lbp->lb_dtfirstblk + 6608 lbp->lb_dtblkcnt; 6609 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 6610 lbp->lb_flags |= MDDB_DEVID_STYLE; 6611 6612 did_icp = (mddb_did_ic_t *)kmem_zalloc 6613 (sizeof (mddb_did_ic_t), KM_SLEEP); 6614 did_blkp = (mddb_did_blk_t *) 6615 kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP); 6616 did_blkp->blk_magic = MDDB_MAGIC_DI; 6617 did_blkp->blk_revision = MDDB_REV_DI; 6618 did_icp->did_ic_blkp = did_blkp; 6619 s->s_did_icp = did_icp; 6620 } 6621 6622 setidentifier(s, &lbp->lb_ident); 6623 uniqtime32(&lbp->lb_timestamp); 6624 dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP); 6625 dbp->db_magic = MDDB_MAGIC_DB; 6626 dbp->db_revision = MDDB_REV_DB; 6627 uniqtime32(&dbp->db_timestamp); 6628 dbp->db_nextblk = 0; 6629 dbp->db_firstentry = NULL; 6630 dbp->db_blknum = lbp->lb_dbfirstblk; 6631 dbp->db_recsum = MDDB_GLOBAL_XOR; 6632 s->s_dbp = dbp; 6633 single_thread_end(s); 6634 mutex_exit(SETMUTEX(setno)); 6635 mutex_exit(&mddb_lock); 6636 return (0); 6637 } 6638 6639 mddb_set_t * 6640 mddb_setenter( 6641 set_t setno, 6642 int flag, 6643 int *errorcodep 6644 ) 6645 { 6646 mddb_set_t *s; 6647 int err = 0; 6648 size_t sz = sizeof (void *) * MD_MAXUNITS; 6649 6650 mutex_enter(SETMUTEX(setno)); 6651 if (! 
md_set[setno].s_db) { 6652 mutex_exit(SETMUTEX(setno)); 6653 if (errorcodep != NULL) 6654 *errorcodep = MDDB_E_NOTOWNER; 6655 return (NULL); 6656 } 6657 6658 /* Allocate s_un and s_ui arrays if not already present. */ 6659 if (md_set[setno].s_un == NULL) { 6660 md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP); 6661 if (md_set[setno].s_un == NULL) { 6662 mutex_exit(SETMUTEX(setno)); 6663 if (errorcodep != NULL) 6664 *errorcodep = MDDB_E_NOTOWNER; 6665 return (NULL); 6666 } 6667 } 6668 if (md_set[setno].s_ui == NULL) { 6669 md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP); 6670 if (md_set[setno].s_ui == NULL) { 6671 mutex_exit(&md_set[setno].s_dbmx); 6672 kmem_free(md_set[setno].s_un, sz); 6673 md_set[setno].s_un = NULL; 6674 if (errorcodep != NULL) 6675 *errorcodep = MDDB_E_NOTOWNER; 6676 return (NULL); 6677 } 6678 } 6679 s = (mddb_set_t *)md_set[setno].s_db; 6680 if (s->s_lbp) 6681 return (s); 6682 6683 if (flag & MDDB_NOINIT) 6684 return (s); 6685 6686 /* 6687 * Release the set mutex - it will be acquired and released in 6688 * initit after acquiring the mddb_lock. This is done to assure 6689 * that mutexes are always acquired in the same order to prevent 6690 * possible deadlock 6691 */ 6692 mutex_exit(SETMUTEX(setno)); 6693 6694 if ((err = initit(setno, flag)) != 0) { 6695 if (errorcodep != NULL) 6696 *errorcodep = err; 6697 return (NULL); 6698 } 6699 6700 mutex_enter(SETMUTEX(setno)); 6701 return ((mddb_set_t *)md_set[setno].s_db); 6702 } 6703 6704 /* 6705 * Release the set lock for a given set. 6706 * 6707 * In a MN diskset, this routine may send messages to the rpc.mdcommd 6708 * in order to have the slave nodes re-parse parts of the mddb. 6709 * Messages are only sent if the global ioctl lock is not held. 6710 * 6711 * With the introduction of multi-threaded ioctls, there is no way 6712 * to determine which thread(s) are holding the ioctl lock. 
So, if 6713 * the ioctl lock is held (by process X) process X will send the 6714 * messages to the slave nodes when process X releases the ioctl lock. 6715 */ 6716 void 6717 mddb_setexit( 6718 mddb_set_t *s 6719 ) 6720 { 6721 md_mn_msg_mddb_parse_t *mddb_parse_msg; 6722 md_mn_kresult_t *kresult; 6723 mddb_lb_t *lbp = s->s_lbp; 6724 int i; 6725 int rval = 1; 6726 6727 /* 6728 * If not a MN diskset OR 6729 * a MN diskset but this node isn't master, 6730 * then release the mutex. 6731 */ 6732 if (!(MD_MNSET_SETNO(s->s_setno)) || 6733 ((MD_MNSET_SETNO(s->s_setno)) && 6734 (!md_set[s->s_setno].s_am_i_master))) { 6735 mutex_exit(SETMUTEX(s->s_setno)); 6736 return; 6737 } 6738 6739 /* 6740 * If global ioctl lock is held, then send no messages, 6741 * just release mutex and return. 6742 * 6743 */ 6744 if (md_status & MD_GBL_IOCTL_LOCK) { 6745 mutex_exit(SETMUTEX(s->s_setno)); 6746 return; 6747 } 6748 6749 /* 6750 * This thread is not holding the ioctl lock, so drop the set 6751 * lock, send messages to slave nodes to reparse portions 6752 * of the mddb and return. 6753 * 6754 * If the block parse flag is set, do not send parse messages. 6755 * This flag is set when master is adding a new mddb that would 6756 * cause parse messages to be sent to the slaves, but the slaves 6757 * don't have knowledge of the new mddb yet since the mddb add 6758 * operation hasn't been run on the slave nodes yet. When the 6759 * master unblocks the parse flag, the parse messages will be 6760 * generated. 6761 * 6762 * If s_mn_parseflags_sending is non-zero, then another thread 6763 * is already currently sending a parse message, so just release 6764 * the mutex and return. If an mddb change occurred that results 6765 * in a parse message to be generated, the thread that is currently 6766 * sending a parse message would generate the additional parse message. 
6767 * 6768 * If s_mn_parseflags_sending is zero and parsing is not blocked, 6769 * then loop until s_mn_parseflags is 0 (until there are no more 6770 * messages to send). 6771 * While s_mn_parseflags is non-zero, 6772 * put snapshot of parse_flags in s_mn_parseflags_sending 6773 * set s_mn_parseflags to zero 6774 * release mutex 6775 * send message 6776 * re-grab mutex 6777 * set s_mn_parseflags_sending to zero 6778 */ 6779 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), 6780 KM_SLEEP); 6781 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) && 6782 (s->s_mn_parseflags & MDDB_PARSE_MASK) && 6783 (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) { 6784 /* Grab snapshot of parse flags */ 6785 s->s_mn_parseflags_sending = s->s_mn_parseflags; 6786 s->s_mn_parseflags = 0; 6787 6788 mutex_exit(SETMUTEX(s->s_setno)); 6789 6790 /* 6791 * Send the message to the slaves to re-parse 6792 * the indicated portions of the mddb. Send the status 6793 * of the 50 mddbs in this set so that slaves know which 6794 * mddbs that the master node thinks are 'good'. 6795 * Otherwise, slave may reparse, but from wrong replica. 6796 */ 6797 mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending; 6798 for (i = 0; i < MDDB_NLB; i++) { 6799 mddb_parse_msg->msg_lb_flags[i] = 6800 lbp->lb_locators[i].l_flags; 6801 } 6802 kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); 6803 while (rval != 0) { 6804 rval = mdmn_ksend_message(s->s_setno, 6805 MD_MN_MSG_MDDB_PARSE, 0, 6806 (char *)mddb_parse_msg, 6807 sizeof (mddb_parse_msg), kresult); 6808 if (rval != 0) 6809 cmn_err(CE_WARN, "mddb_setexit: Unable to send " 6810 "mddb update message to other nodes in " 6811 "diskset %s\n", s->s_setname); 6812 } 6813 kmem_free(kresult, sizeof (md_mn_kresult_t)); 6814 6815 /* 6816 * Re-grab mutex to clear sending field and to 6817 * see if another parse message needs to be generated. 
6818 */ 6819 mutex_enter(SETMUTEX(s->s_setno)); 6820 s->s_mn_parseflags_sending = 0; 6821 } 6822 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t)); 6823 mutex_exit(SETMUTEX(s->s_setno)); 6824 } 6825 6826 static void 6827 mddb_setexit_no_parse( 6828 mddb_set_t *s 6829 ) 6830 { 6831 mutex_exit(SETMUTEX(s->s_setno)); 6832 } 6833 6834 uint_t 6835 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt) 6836 { 6837 uint_t li; 6838 mddb_lb_t *lbp = s->s_lbp; 6839 mddb_locator_t *lp; 6840 ddi_devid_t ret_devid; 6841 uint_t devid_len; 6842 dev_t ddi_dev; 6843 mddb_did_ic_t *did_icp; 6844 mddb_did_blk_t *did_blkp; 6845 char *minor_name; 6846 size_t sz; 6847 int retval; 6848 int err; 6849 md_dev64_t dev64; /* tmp var to make code look better */ 6850 6851 6852 /* Need disk block(s) to hold mddb_did_blk_t */ 6853 *blk_cnt = MDDB_DID_BLOCKS; 6854 6855 if (doit) { 6856 /* 6857 * Alloc mddb_did_blk_t disk block and fill in header area. 6858 * Don't fill in did magic number until end of routine so 6859 * if machine panics in the middle of conversion, the 6860 * device id information will be thrown away at the 6861 * next snarfing of this set. 6862 * Need to set DEVID_STYLE so that mddb_devid_add will 6863 * function properly. 
6864 */ 6865 /* grab the mutex */ 6866 if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) { 6867 return (1); 6868 } 6869 single_thread_start(s); 6870 lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS); 6871 if (lbp->lb_didfirstblk == 0) { 6872 single_thread_end(s); 6873 mddb_setexit(s); 6874 return (1); 6875 } 6876 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS; 6877 did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t), 6878 KM_SLEEP); 6879 did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES, 6880 KM_SLEEP); 6881 6882 did_blkp->blk_revision = MDDB_REV_DI; 6883 did_icp->did_ic_blkp = did_blkp; 6884 s->s_did_icp = did_icp; 6885 lbp->lb_flags |= MDDB_DEVID_STYLE; 6886 } 6887 6888 /* Fill in information in mddb_did_info_t array */ 6889 for (li = 0; li < lbp->lb_loccnt; li++) { 6890 lp = &lbp->lb_locators[li]; 6891 if (lp->l_flags & MDDB_F_DELETED) 6892 continue; 6893 6894 dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev)); 6895 ddi_dev = md_dev64_to_dev(dev64); 6896 if (ddi_dev == NODEV) { 6897 /* 6898 * No translation available for replica. 6899 * Could fail conversion to device id replica, 6900 * but instead will just continue with next 6901 * replica in list. 6902 */ 6903 continue; 6904 } 6905 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) { 6906 /* 6907 * Just count each devid as at least 1 block. This 6908 * is conservative since several device id's may fit 6909 * into 1 disk block, but it's better to overestimate 6910 * the number of blocks needed than to underestimate. 
6911 */ 6912 devid_len = (int)ddi_devid_sizeof(ret_devid); 6913 *blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1)); 6914 if (doit) { 6915 if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, 6916 &minor_name) == DDI_SUCCESS) { 6917 if (mddb_devid_add(s, li, ret_devid, 6918 minor_name)) { 6919 cmn_err(CE_WARN, 6920 "Not enough space in metadb" 6921 " to add device id for" 6922 " dev: major = %d, " 6923 "minor = %d\n", 6924 getmajor(ddi_dev), 6925 getminor(ddi_dev)); 6926 } 6927 sz = strlen(minor_name) + 1; 6928 kmem_free(minor_name, sz); 6929 } 6930 } 6931 ddi_devid_free(ret_devid); 6932 } 6933 } 6934 6935 if (doit) { 6936 did_blkp->blk_magic = MDDB_MAGIC_DI; 6937 retval = push_lb(s); 6938 single_thread_end(s); 6939 mddb_setexit(s); 6940 if (retval != 0) 6941 return (1); 6942 } 6943 6944 return (0); 6945 } 6946 6947 static mddb_set_t * 6948 init_set( 6949 mddb_config_t *cp, 6950 int flag, 6951 int *errp 6952 ) 6953 { 6954 mddb_set_t *s; 6955 char *setname = NULL; 6956 set_t setno = MD_LOCAL_SET; 6957 side_t sideno = 0; 6958 struct timeval32 *created = NULL; 6959 6960 if (cp != NULL) { 6961 setname = cp->c_setname; 6962 setno = cp->c_setno; 6963 sideno = cp->c_sideno; 6964 created = &cp->c_timestamp; 6965 } 6966 6967 if (setno >= MD_MAXSETS) 6968 return ((mddb_set_t *)NULL); 6969 6970 if (md_set[setno].s_db) 6971 return (mddb_setenter(setno, flag, errp)); 6972 6973 s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP); 6974 6975 cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL); 6976 cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL); 6977 cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL); 6978 cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL); 6979 cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL); 6980 6981 s->s_setno = setno; 6982 s->s_sideno = sideno; 6983 if (setno == MD_LOCAL_SET) { 6984 (void) strcpy(s->s_ident.serial, hw_serial); 6985 } else { 6986 s->s_ident.createtime = *created; 6987 s->s_setname = (char *)kmem_alloc(strlen(setname) + 1, 6988 KM_SLEEP); 6989 
(void) strcpy(s->s_setname, setname); 6990 } 6991 6992 /* have a config struct, copy mediator information */ 6993 if (cp != NULL) 6994 s->s_med = cp->c_med; /* structure assignment */ 6995 6996 md_set[setno].s_db = (void *) s; 6997 6998 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64); 6999 7000 return (mddb_setenter(setno, flag, errp)); 7001 } 7002 7003 void 7004 mddb_unload_set( 7005 set_t setno 7006 ) 7007 { 7008 7009 mddb_set_t *s; 7010 mddb_db_t *dbp, *adbp = NULL; 7011 mddb_de_ic_t *dep, *dep2; 7012 mddb_bf_t *bfp; 7013 int i; 7014 md_dev64_t dev; 7015 7016 if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL) 7017 return; 7018 7019 single_thread_start(s); 7020 7021 s->s_opthavequeuinglck = 0; 7022 s->s_optwantqueuinglck = 0; 7023 7024 for (dbp = s->s_dbp; dbp != 0; dbp = adbp) { 7025 for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) { 7026 if (dep->de_rb_userdata != NULL) { 7027 if (dep->de_icreqsize) 7028 kmem_free(dep->de_rb_userdata_ic, 7029 dep->de_icreqsize); 7030 else 7031 kmem_free(dep->de_rb_userdata, 7032 dep->de_reqsize); 7033 } 7034 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 7035 dep2 = dep->de_next; 7036 kmem_free((caddr_t)dep, sizeofde(dep)); 7037 } 7038 adbp = dbp->db_next; 7039 kmem_free((caddr_t)dbp, sizeof (mddb_db_t)); 7040 } 7041 s->s_dbp = (mddb_db_t *)NULL; 7042 7043 free_rip(&s->s_rip); 7044 7045 for (i = 0; i < mddb_maxcopies; i++) { 7046 if (! s->s_mbiarray) 7047 break; 7048 7049 if (! 
s->s_mbiarray[i]) 7050 continue; 7051 7052 dev = md_expldev(s->s_lbp->lb_locators[i].l_dev); 7053 dev = md_xlate_targ_2_mini(dev); 7054 if (dev != NODEV64) { 7055 mddb_devclose(dev); 7056 free_mbipp(&s->s_mbiarray[i]); 7057 } 7058 } 7059 7060 if (s->s_mbiarray) { 7061 kmem_free((caddr_t)s->s_mbiarray, 7062 sizeof (mddb_mb_ic_t *) * mddb_maxcopies); 7063 s->s_mbiarray = (mddb_mb_ic_t **)NULL; 7064 } 7065 7066 if (s->s_lnp) { 7067 kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt)); 7068 s->s_lnp = (mddb_ln_t *)NULL; 7069 } 7070 7071 if (s->s_lbp) { 7072 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp); 7073 kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt)); 7074 s->s_lbp = (mddb_lb_t *)NULL; 7075 } 7076 7077 if (s->s_freebitmap) { 7078 kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize); 7079 s->s_freebitmap = NULL; 7080 s->s_freebitmapsize = 0; 7081 } 7082 7083 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL) 7084 kmem_free((caddr_t)bfp, sizeof (*bfp)); 7085 7086 if (s->s_databuffer_size) { 7087 kmem_free(s->s_databuffer, s->s_databuffer_size); 7088 s->s_databuffer_size = 0; 7089 } 7090 7091 if (s->s_setname != NULL) 7092 kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1); 7093 7094 /* Data tags not supported on MN sets. */ 7095 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) 7096 dtl_freel(&s->s_dtlp); 7097 7098 md_set[setno].s_db = NULL; 7099 ASSERT(s->s_singlelockwanted == 0); 7100 kmem_free(s, sizeof (mddb_set_t)); 7101 7102 /* Take care of things setup in the md_set array */ 7103 if (! 
(md_get_setstatus(setno) & MD_SET_KEEPTAG)) { 7104 if (md_set[setno].s_dtp) { 7105 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES); 7106 md_set[setno].s_dtp = NULL; 7107 } 7108 } 7109 7110 md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT | 7111 MD_SET_TAGDATA | MD_SET_USETAG | 7112 MD_SET_TOOFEW | MD_SET_STALE | 7113 MD_SET_OWNERSHIP | MD_SET_BADTAG | 7114 MD_SET_CLRTAG | MD_SET_MNSET | 7115 MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | 7116 MD_SET_MN_MIR_STATE_RC); 7117 7118 mutex_exit(SETMUTEX(setno)); 7119 } 7120 7121 /* 7122 * returns 0 if name can be put into locator block 7123 * returns 1 if locator block prefixes are all used 7124 * 7125 * Takes splitname (suffix, prefix, sideno) and 7126 * stores it in the locator name structure. 7127 * For traditional diskset, the sideno is the index into the suffixes 7128 * array in the locator name structure. 7129 * For the MN diskset, the sideno is the nodeid which can be any number, 7130 * so the index passed in is the index into the mnsuffixes array 7131 * in the locator structure. This index was computed by the 7132 * routine checklocator which basically checked the locator block 7133 * mnside locator structure. 
7134 */ 7135 static int 7136 splitname2locatorblock( 7137 md_splitname *spn, 7138 mddb_ln_t *lnp, 7139 int li, 7140 side_t sideno, 7141 int index 7142 ) 7143 { 7144 uchar_t i; 7145 md_name_suffix *sn; 7146 md_mnname_suffix_t *mnsn; 7147 mddb_mnln_t *mnlnp; 7148 7149 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7150 if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len) 7151 continue; 7152 if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data, 7153 SPN_PREFIX(spn).pre_len) == 0) 7154 break; 7155 } 7156 if (i == MDDB_PREFIXCNT) { 7157 for (i = 0; i < MDDB_PREFIXCNT; i++) { 7158 if (lnp->ln_prefixes[i].pre_len == 0) 7159 break; 7160 } 7161 if (i == MDDB_PREFIXCNT) 7162 return (1); 7163 bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data, 7164 SPN_PREFIX(spn).pre_len); 7165 lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len; 7166 } 7167 7168 if (lnp->ln_revision == MDDB_REV_MNLN) { 7169 /* If a MN diskset, use index */ 7170 mnlnp = (mddb_mnln_t *)lnp; 7171 mnsn = &mnlnp->ln_mnsuffixes[index][li]; 7172 mnsn->mn_ln_sideno = sideno; 7173 mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len; 7174 mnsn->mn_ln_suffix.suf_prefix = i; 7175 bcopy(SPN_SUFFIX(spn).suf_data, 7176 mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len); 7177 } else { 7178 sn = &lnp->ln_suffixes[sideno][li]; 7179 sn->suf_len = SPN_SUFFIX(spn).suf_len; 7180 sn->suf_prefix = i; 7181 bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data, 7182 SPN_SUFFIX(spn).suf_len); 7183 } 7184 return (0); 7185 } 7186 7187 /* 7188 * Find the locator name for the given sideno and convert the locator name 7189 * information into a splitname structure. 
7190 */ 7191 void 7192 mddb_locatorblock2splitname( 7193 mddb_ln_t *lnp, 7194 int li, 7195 side_t sideno, 7196 md_splitname *spn 7197 ) 7198 { 7199 int iprefix; 7200 md_name_suffix *sn; 7201 md_mnname_suffix_t *mnsn; 7202 int i; 7203 mddb_mnln_t *mnlnp; 7204 7205 if (lnp->ln_revision == MDDB_REV_MNLN) { 7206 mnlnp = (mddb_mnln_t *)lnp; 7207 for (i = 0; i < MD_MNMAXSIDES; i++) { 7208 mnsn = &mnlnp->ln_mnsuffixes[i][li]; 7209 if (mnsn->mn_ln_sideno == sideno) 7210 break; 7211 } 7212 if (i == MD_MNMAXSIDES) 7213 return; 7214 7215 SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len; 7216 bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data, 7217 SPN_SUFFIX(spn).suf_len); 7218 iprefix = mnsn->mn_ln_suffix.suf_prefix; 7219 } else { 7220 sn = &lnp->ln_suffixes[sideno][li]; 7221 SPN_SUFFIX(spn).suf_len = sn->suf_len; 7222 bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data, 7223 SPN_SUFFIX(spn).suf_len); 7224 iprefix = sn->suf_prefix; 7225 } 7226 SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len; 7227 bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data, 7228 SPN_PREFIX(spn).pre_len); 7229 } 7230 7231 static int 7232 getdeldev( 7233 mddb_config_t *cp, 7234 int command, 7235 md_error_t *ep 7236 ) 7237 { 7238 mddb_set_t *s; 7239 mddb_lb_t *lbp; 7240 mddb_locator_t *locators; 7241 uint_t loccnt; 7242 mddb_mb_ic_t *mbip; 7243 mddb_block_t blk; 7244 int err = 0; 7245 int i, j; 7246 int li; 7247 uint_t commitcnt; 7248 set_t setno = cp->c_setno; 7249 uint_t set_status; 7250 md_dev64_t dev; 7251 int flags = MDDB_MUSTEXIST; 7252 7253 cp->c_dbmax = MDDB_NLB; 7254 7255 /* 7256 * Data checking 7257 */ 7258 if (setno >= md_nsets || cp->c_id < 0 || 7259 cp->c_id > cp->c_dbmax) { 7260 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 7261 } 7262 7263 if (cp->c_flags & MDDB_C_STALE) 7264 flags |= MDDB_MN_STALE; 7265 7266 if ((s = mddb_setenter(setno, flags, &err)) == NULL) 7267 return (mddbstatus2error(ep, err, NODEV32, setno)); 7268 7269 cp->c_flags = 0; 7270 
7271 lbp = s->s_lbp; 7272 loccnt = lbp->lb_loccnt; 7273 locators = lbp->lb_locators; 7274 7275 /* shorthand */ 7276 set_status = md_get_setstatus(setno); 7277 7278 if (set_status & MD_SET_STALE) 7279 cp->c_flags |= MDDB_C_STALE; 7280 7281 if (set_status & MD_SET_TOOFEW) 7282 cp->c_flags |= MDDB_C_TOOFEW; 7283 7284 cp->c_sideno = s->s_sideno; 7285 7286 cp->c_dbcnt = 0; 7287 /* 7288 * go through and count active entries 7289 */ 7290 for (i = 0; i < loccnt; i++) { 7291 if (locators[i].l_flags & MDDB_F_DELETED) 7292 continue; 7293 cp->c_dbcnt++; 7294 } 7295 7296 /* 7297 * add the ability to accept a locator block index 7298 * which is not relative to previously deleted replicas. This 7299 * is for support of MD_DEBUG=STAT in metastat since it asks for 7300 * replica information specifically for each of the mirror resync 7301 * records. MDDB_CONFIG_SUBCMD uses one of the pad spares in 7302 * the mddb_config_t type. 7303 */ 7304 if (cp->c_subcmd == MDDB_CONFIG_ABS) { 7305 if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) { 7306 mddb_setexit(s); 7307 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7308 setno)); 7309 } 7310 li = cp->c_id; 7311 } else { 7312 if (cp->c_id >= cp->c_dbcnt) { 7313 mddb_setexit(s); 7314 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, 7315 setno)); 7316 } 7317 7318 /* CSTYLED */ 7319 for (li = 0, j = 0; /* void */; li++) { 7320 if (locators[li].l_flags & MDDB_F_DELETED) 7321 continue; 7322 j++; 7323 if (j > cp->c_id) 7324 break; 7325 } 7326 } 7327 7328 if (command == MDDB_ENDDEV) { 7329 daddr_t ib = 0, jb; 7330 7331 blk = 0; 7332 if ((s != NULL) && s->s_mbiarray[li]) { 7333 mbip = s->s_mbiarray[li]; 7334 while ((jb = getphysblk(blk++, mbip)) > 0) { 7335 if (jb > ib) 7336 ib = jb; 7337 } 7338 cp->c_dbend = (int)ib; 7339 } else { 7340 cp->c_dbend = 0; 7341 } 7342 } 7343 7344 locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp); 7345 mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname); 7346 7347 if (command != 
MDDB_DELDEV) { 7348 mddb_setexit(s); 7349 return (0); 7350 } 7351 7352 /* Currently don't allow addition/deletion of sides during upgrade */ 7353 if (MD_UPGRADE) { 7354 cmn_err(CE_WARN, 7355 "Deletion of replica not allowed during upgrade.\n"); 7356 mddb_setexit(s); 7357 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 7358 } 7359 7360 /* 7361 * If here, replica delete in progress. 7362 */ 7363 single_thread_start(s); 7364 7365 if ((! (locators[li].l_flags & MDDB_F_EMASTER)) && 7366 (locators[li].l_flags & MDDB_F_ACTIVE)) { 7367 commitcnt = lbp->lb_commitcnt; 7368 lbp->lb_commitcnt = 0; 7369 setidentifier(s, &lbp->lb_ident); 7370 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL); 7371 /* 7372 * Don't need to write out device id area, since locator 7373 * block on this replica is being deleted by setting the 7374 * commitcnt to 0. 7375 */ 7376 (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li, 7377 MDDB_WR_ONLY_MASTER); 7378 lbp->lb_commitcnt = commitcnt; 7379 } 7380 7381 if (s->s_mbiarray[li]) 7382 free_mbipp(&s->s_mbiarray[li]); 7383 7384 if (! 
(locators[li].l_flags & MDDB_F_EMASTER)) { 7385 dev = md_expldev(locators[li].l_dev); 7386 dev = md_xlate_targ_2_mini(dev); 7387 if (dev != NODEV64) 7388 mddb_devclose(dev); 7389 } 7390 7391 s->s_mbiarray[li] = 0; 7392 lbp->lb_locators[li].l_flags = MDDB_F_DELETED; 7393 7394 /* Only support data tags for traditional and local sets */ 7395 if ((md_get_setstatus(setno) & MD_SET_STALE) && 7396 (!(lbp->lb_flags & MDDB_MNSET)) && 7397 setno != MD_LOCAL_SET) 7398 if (set_dtag(s, ep)) 7399 mdclrerror(ep); 7400 7401 /* Write data tags to all accessible devices */ 7402 /* Only support data tags for traditional and local sets */ 7403 if (!(lbp->lb_flags & MDDB_MNSET)) { 7404 (void) dt_write(s); 7405 } 7406 7407 /* Delete device id of deleted replica */ 7408 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7409 (void) mddb_devid_delete(s, li); 7410 } 7411 /* write new locator to all devices */ 7412 err = writelocall(s); 7413 7414 (void) upd_med(s, "getdeldev(0)"); 7415 7416 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno, 7417 md_expldev(locators[li].l_dev)); 7418 7419 computefreeblks(s); /* recompute always it may be larger */ 7420 cp->c_dbcnt--; 7421 err |= fixoptrecords(s); 7422 if (err) { 7423 if (writeretry(s)) { 7424 single_thread_end(s); 7425 mddb_setexit(s); 7426 return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno)); 7427 } 7428 } 7429 7430 single_thread_end(s); 7431 mddb_setexit(s); 7432 return (0); 7433 } 7434 7435 static int 7436 getdriver( 7437 mddb_cfg_loc_t *clp 7438 ) 7439 { 7440 major_t majordev; 7441 7442 /* 7443 * Data checking 7444 */ 7445 if (clp->l_dev <= 0) 7446 return (EINVAL); 7447 7448 majordev = getmajor(expldev(clp->l_dev)); 7449 7450 if (ddi_major_to_name(majordev) == (char *)NULL) 7451 return (EINVAL); 7452 7453 if (MD_UPGRADE) 7454 (void) strcpy(clp->l_driver, md_targ_major_to_name(majordev)); 7455 else 7456 (void) strcpy(clp->l_driver, ddi_major_to_name(majordev)); 7457 return (0); 7458 } 7459 7460 /* 7461 * update_valid_replica - 
updates the locator block namespace (prefix
 *		and/or suffix) with new pathname and devname.
 *
 *		Nothing is done, and success is returned, when the devt
 *		stored in the locator differs from devt.  A single
 *		trailing '/' on pathname is removed in place.  The call
 *		fails when pathname or devname is longer than
 *		MD_MAXPREFIX / MD_MAXSUFFIX, or when a new prefix is
 *		needed and the prefix table has no unused slot.
 *	RETURN
 *		1	Error
 *		0	Success
 */
static int
update_valid_replica(
	side_t		side,
	mddb_locator_t	*lp,
	mddb_set_t	*s,
	int		li,
	char		*devname,
	char		*pathname,
	md_dev64_t	devt
)
{
	size_t		path_len, dev_len;
	uchar_t		pre_len, suf_len;
	md_name_suffix	*sn;
	mddb_ln_t	*lnp;
	uchar_t		pre_index;
	uchar_t		i;

	if (md_expldev(lp->l_dev) != devt) {
		return (0);
	}

	/* an empty pathname has no last character to inspect */
	path_len = strlen(pathname);
	if (path_len > 0 && pathname[path_len - 1] == '/') {
		path_len--;
		pathname[path_len] = '\0';
	}
	dev_len = strlen(devname);

	/*
	 * Check the limits on the full-width lengths; narrowing to
	 * uchar_t first would let an over-long name wrap around and
	 * slip past the test.
	 */
	if ((path_len > MD_MAXPREFIX) || (dev_len > MD_MAXSUFFIX))
		return (1);

	pre_len = (uchar_t)path_len;
	suf_len = (uchar_t)dev_len;

	lnp = s->s_lnp;

	/*
	 * Future note: Need to do something here for the MN diskset case
	 * when device ids are supported in disksets.
	 * Can't add until merging devids_in_diskset code into code base
	 * Currently only called with side of 0.
	 */

	sn = &lnp->ln_suffixes[side][li];

	/*
	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
	 * If new prefix is the same as the previous prefix - no change.
	 *
	 * If new prefix is not the same, check if new prefix
	 * matches an existing one.  If so, use that one.
	 *
	 * If new prefix doesn't exist, add a new prefix.  If not enough
	 * space, return failure.
	 */
	pre_index = sn->suf_prefix;
	/* Check if new prefix is the same as the old prefix. */
	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
	    pre_len) != 0)) {
		/* Check if new prefix is an already known prefix. */
		for (i = 0; i < MDDB_PREFIXCNT; i++) {
			if (lnp->ln_prefixes[i].pre_len != pre_len) {
				continue;
			}
			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
			    pre_len) == 0) {
				break;
			}
		}
		/* If no match found for new prefix - add the new prefix */
		if (i == MDDB_PREFIXCNT) {
			for (i = 0; i < MDDB_PREFIXCNT; i++) {
				if (lnp->ln_prefixes[i].pre_len == 0)
					break;
			}
			/* No space to add new prefix - return failure */
			if (i == MDDB_PREFIXCNT) {
				return (1);
			}
			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
			lnp->ln_prefixes[i].pre_len = pre_len;
		}
		sn->suf_prefix = i;
	}

	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
	if ((sn->suf_len != suf_len) ||
	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
		bcopy(devname, sn->suf_data, suf_len);
		sn->suf_len = suf_len;
	}
	return (0);
}


/*
 * md_update_locator_namespace - If in devid style and active and the devid's
 *		exist and are valid update the locator namespace pathname
 *		and devname.
7563 * RETURN 7564 * 1 Error 7565 * 0 Success 7566 */ 7567 int 7568 md_update_locator_namespace( 7569 set_t setno, /* which set to get name from */ 7570 side_t side, 7571 char *dname, 7572 char *pname, 7573 md_dev64_t devt 7574 ) 7575 { 7576 mddb_set_t *s; 7577 mddb_lb_t *lbp; 7578 int li; 7579 uint_t flg; 7580 int err = 0; 7581 mddb_ln_t *lnp; 7582 7583 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 7584 return (1); 7585 single_thread_start(s); 7586 lbp = s->s_lbp; 7587 /* must be DEVID_STYLE */ 7588 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7589 for (li = 0; li < lbp->lb_loccnt; li++) { 7590 mddb_locator_t *lp = &lbp->lb_locators[li]; 7591 7592 if (lp->l_flags & MDDB_F_DELETED) { 7593 continue; 7594 } 7595 7596 /* replica also must be active */ 7597 if (lp->l_flags & MDDB_F_ACTIVE) { 7598 flg = s->s_did_icp->did_ic_blkp-> 7599 blk_info[li].info_flags; 7600 /* only update if did exists and is valid */ 7601 if ((flg & MDDB_DID_EXISTS) && 7602 (flg & MDDB_DID_VALID)) { 7603 if (update_valid_replica(side, lp, s, 7604 li, dname, pname, devt)) { 7605 err = 1; 7606 goto out; 7607 } 7608 } 7609 } 7610 } 7611 } 7612 lnp = s->s_lnp; 7613 uniqtime32(&lnp->ln_timestamp); 7614 if (lbp->lb_flags & MDDB_MNSET) 7615 lnp->ln_revision = MDDB_REV_MNLN; 7616 else 7617 lnp->ln_revision = MDDB_REV_LN; 7618 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 7619 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 7620 lbp->lb_lnblkcnt, 0); 7621 /* 7622 * If a MN diskset and this is the master, set the PARSE_LOCNM 7623 * flag in the mddb_set structure to show that the locator 7624 * names have changed. 
7625 */ 7626 7627 if ((lbp->lb_flags & MDDB_MNSET) && 7628 (md_set[s->s_setno].s_am_i_master)) { 7629 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 7630 } 7631 out: 7632 single_thread_end(s); 7633 mddb_setexit(s); 7634 if (err) 7635 return (1); 7636 return (0); 7637 } 7638 7639 /* 7640 * update_locatorblock - for active entries in the locator block, check 7641 * the devt to see if it matches the given devt. If so, and 7642 * there is an associated device id which is not the same 7643 * as the passed in devid, delete old devid and add a new one. 7644 * RETURN 7645 * MDDB_E_NODEVID 7646 * MDDB_E_NOLOCBLK 7647 * 1 Error 7648 * 0 Success 7649 */ 7650 static int 7651 update_locatorblock(mddb_set_t *s, md_dev64_t dev, ddi_devid_t didptr) 7652 { 7653 mddb_lb_t *lbp = NULL; 7654 mddb_locator_t *lp; 7655 int li; 7656 uint_t flg; 7657 ddi_devid_t devid_ptr; 7658 int retval = 0; 7659 char *minor_name; 7660 7661 lbp = s->s_lbp; 7662 /* find replicas that haven't been deleted */ 7663 for (li = 0; li < lbp->lb_loccnt; li++) { 7664 lp = &lbp->lb_locators[li]; 7665 7666 if ((lp->l_flags & MDDB_F_DELETED)) { 7667 continue; 7668 } 7669 /* 7670 * check to see if locator devt matches given dev 7671 * and if there is a device ID associated with it 7672 */ 7673 flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags; 7674 if ((md_expldev(lp->l_dev) == dev) && 7675 (flg & MDDB_DID_EXISTS)) { 7676 if (flg & MDDB_DID_VALID) { 7677 continue; /* cont to nxt active entry */ 7678 } 7679 devid_ptr = s->s_did_icp->did_ic_devid[li]; 7680 if (devid_ptr == NULL) { 7681 return (MDDB_E_NODEVID); 7682 } 7683 if (ddi_devid_compare(devid_ptr, didptr) != 0) { 7684 /* 7685 * devid's not equal so 7686 * delete and add 7687 */ 7688 if (ddi_lyr_get_minor_name( 7689 md_dev64_to_dev(dev), 7690 S_IFBLK, &minor_name) == DDI_SUCCESS) { 7691 (void) mddb_devid_delete(s, li); 7692 (void) mddb_devid_add(s, li, didptr, 7693 minor_name); 7694 kmem_free(minor_name, 7695 strlen(minor_name)+1); 7696 break; 7697 } else { 7698 
retval = 1; 7699 goto err_out; 7700 } 7701 } 7702 } 7703 } /* end for */ 7704 retval = push_lb(s); 7705 err_out: 7706 return (retval); 7707 } 7708 7709 static int 7710 update_mb_devid( 7711 mddb_set_t *s, 7712 mddb_ri_t *rip, 7713 ddi_devid_t devidptr 7714 ) 7715 { 7716 mddb_mb_ic_t *mbip; 7717 mddb_mb_t *mb = NULL; 7718 daddr_t blkno; 7719 md_dev64_t device; 7720 uint_t sz; 7721 int mb2free = 0; 7722 int err = 0; 7723 7724 7725 /* 7726 * There is case where a disk may not have mddb, 7727 * and only has dummy mddb which contains 7728 * a valid devid we like to update and in this 7729 * case, the rip_lbp will be NULL but we still 7730 * like to update the devid embedded in the 7731 * dummy mb block. 7732 * 7733 */ 7734 if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) { 7735 mbip = rip->ri_mbip; 7736 mb = &mbip->mbi_mddb_mb; 7737 } else { 7738 /* 7739 * Done if it is non-replicated set 7740 */ 7741 if (devidptr != (ddi_devid_t)NULL) { 7742 mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE, 7743 KM_SLEEP); 7744 mb->mb_magic = MDDB_MAGIC_DU; 7745 mb->mb_revision = MDDB_REV_MB; 7746 mb2free = 1; 7747 } else { 7748 goto out; 7749 } 7750 } 7751 7752 blkno = rip->ri_blkno; 7753 device = rip->ri_dev; 7754 /* 7755 * Replace the mb_devid with the new/valid one 7756 */ 7757 if (devidptr != (ddi_devid_t)NULL) { 7758 /* 7759 * Zero out what we have previously 7760 */ 7761 if (mb->mb_devid_len) 7762 bzero(mb->mb_devid, mb->mb_devid_len); 7763 sz = ddi_devid_sizeof(devidptr); 7764 bcopy((char *)devidptr, (char *)mb->mb_devid, sz); 7765 mb->mb_devid_len = sz; 7766 } 7767 7768 mb->mb_setno = s->s_setno; 7769 uniqtime32(&mb->mb_timestamp); 7770 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL); 7771 /* 7772 * putblks will 7773 * 7774 * - drop the s_dbmx lock 7775 * - biowait 7776 * - regain the s_dbmx lock 7777 * 7778 * Need to update this if we wants to handle 7779 * mb_next != NULL which it is unlikely will happen 7780 */ 7781 err = putblks(s, (caddr_t)mb, blkno, 1, device, 0); 7782 7783 if (mb2free) 
{ 7784 kmem_free(mb, MDDB_BSIZE); 7785 } 7786 out: 7787 return (err); 7788 } 7789 7790 static int 7791 setdid( 7792 mddb_config_t *cp 7793 ) 7794 { 7795 ddi_devid_t devidp; 7796 dev_t ddi_dev; 7797 mddb_set_t *s; 7798 int err = 0; 7799 mddb_ri_t *rip; 7800 7801 /* 7802 * Data integrity check 7803 */ 7804 if (cp->c_setno >= md_nsets || cp->c_devt <= 0) 7805 return (EINVAL); 7806 7807 if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE)) 7808 return (0); 7809 7810 ddi_dev = md_dev64_to_dev(cp->c_devt); 7811 if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) { 7812 return (-1); 7813 } 7814 if (devidp == NULL) { 7815 return (-1); 7816 } 7817 7818 if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) 7819 return (-1); 7820 single_thread_start(s); 7821 7822 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 7823 if (rip->ri_lbp == (mddb_lb_t *)NULL) 7824 continue; 7825 /* 7826 * We only update what is asked 7827 */ 7828 if (rip->ri_dev == cp->c_devt) { 7829 if (update_mb_devid(s, rip, devidp) != 0) { 7830 err = -1; 7831 goto out; 7832 } 7833 } 7834 } 7835 7836 if (update_locatorblock(s, cp->c_devt, devidp)) { 7837 err = -1; 7838 goto out; 7839 } 7840 7841 out: 7842 single_thread_end(s); 7843 mddb_setexit(s); 7844 ddi_devid_free(devidp); 7845 return (err); 7846 } 7847 7848 static int 7849 delnewside( 7850 mddb_config_t *cp, 7851 int command, 7852 md_error_t *ep 7853 ) 7854 { 7855 mddb_set_t *s; 7856 int li; 7857 mddb_lb_t *lbp; /* pointer to locator block */ 7858 mddb_ln_t *lnp; /* pointer to locator names */ 7859 mddb_mnln_t *mnlnp; /* pointer to locator names */ 7860 mddb_locator_t *lp; 7861 mddb_sidelocator_t *slp; 7862 mddb_cfg_loc_t *clp; 7863 int err = 0; 7864 set_t setno = cp->c_setno; 7865 ddi_devid_t devid; 7866 ddi_devid_t ret_devid = NULL; 7867 char *minor_name; 7868 uint_t use_devid = 0; 7869 dev_t ddi_dev; 7870 md_mnname_suffix_t *mnsn; 7871 mddb_mnlb_t *mnlbp; 7872 mddb_mnsidelocator_t *mnslp; 7873 7874 /* Currently don't allow 
addition/deletion of sides during upgrade */ 7875 if (MD_UPGRADE) { 7876 cmn_err(CE_WARN, 7877 "Addition and deletion of sides not allowed" 7878 " during upgrade. \n"); 7879 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 7880 } 7881 7882 /* 7883 * Data integrity check 7884 */ 7885 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 7886 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 7887 7888 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 7889 return (mddbstatus2error(ep, err, NODEV32, setno)); 7890 7891 single_thread_start(s); 7892 clp = &cp->c_locator; 7893 7894 lbp = s->s_lbp; 7895 7896 if (lbp->lb_setno != setno) { 7897 single_thread_end(s); 7898 mddb_setexit(s); 7899 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 7900 } 7901 7902 /* 7903 * Find this device/blkno pair 7904 */ 7905 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 7906 ddi_dev = md_dev64_to_dev(clp->l_dev); 7907 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 7908 (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name) 7909 == DDI_SUCCESS)) { 7910 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 7911 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 7912 use_devid = 1; 7913 (void) strcpy(clp->l_minor_name, minor_name); 7914 } 7915 kmem_free(minor_name, strlen(minor_name)+1); 7916 } 7917 if (use_devid != 1 && ret_devid != NULL) 7918 ddi_devid_free(ret_devid); 7919 } 7920 for (li = 0; li < lbp->lb_loccnt; li++) { 7921 lp = &lbp->lb_locators[li]; 7922 if (lp->l_flags & MDDB_F_DELETED) 7923 continue; 7924 if (use_devid) { 7925 if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0) 7926 continue; 7927 if ((ddi_devid_compare(devid, 7928 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 7929 (strcmp(clp->l_minor_name, minor_name) == 0) && 7930 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 7931 break; 7932 } 7933 } else { 7934 if (lp->l_dev == clp->l_dev && 7935 (daddr_t)lp->l_blkno == clp->l_blkno) { 7936 break; 7937 } 7938 } 7939 } 7940 7941 if (li == lbp->lb_loccnt) 
{ 7942 if (use_devid) 7943 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 7944 single_thread_end(s); 7945 mddb_setexit(s); 7946 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 7947 } 7948 7949 lnp = s->s_lnp; 7950 if (command == MDDB_NEWSIDE) { 7951 int index = 0; 7952 /* 7953 * If a MN diskset, need to find the index where the new 7954 * locator information is to be stored in the mnsidelocator 7955 * field of the locator block so that the locator name can 7956 * be stored at the same array index in the mnsuffixes 7957 * field of the locator names structure. 7958 */ 7959 if (lbp->lb_flags & MDDB_MNSET) { 7960 if ((index = checklocator(lbp, li, 7961 cp->c_sideno)) == -1) { 7962 if (use_devid) { 7963 ddi_devid_free((ddi_devid_t) 7964 (uintptr_t)clp->l_devid); 7965 } 7966 single_thread_end(s); 7967 mddb_setexit(s); 7968 return (mdmddberror(ep, MDE_DB_TOOSMALL, 7969 NODEV32, setno)); 7970 } 7971 } 7972 7973 /* 7974 * Store the locator name before the sidelocator information 7975 * in case a panic occurs between these 2 steps. Must have 7976 * the locator name information in order to print reasonable 7977 * error information. 
7978 */ 7979 if (splitname2locatorblock(&cp->c_devname, lnp, li, 7980 cp->c_sideno, index)) { 7981 if (use_devid) 7982 ddi_devid_free( 7983 (ddi_devid_t)(uintptr_t)clp->l_devid); 7984 single_thread_end(s); 7985 mddb_setexit(s); 7986 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 7987 setno)); 7988 } 7989 7990 if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) { 7991 if (use_devid) 7992 ddi_devid_free( 7993 (ddi_devid_t)(uintptr_t)clp->l_devid); 7994 single_thread_end(s); 7995 mddb_setexit(s); 7996 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 7997 setno)); 7998 } 7999 } 8000 8001 if (use_devid) 8002 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8003 8004 if (command == MDDB_DELSIDE) { 8005 int i; 8006 for (i = 0; i < lbp->lb_loccnt; i++) { 8007 if (lbp->lb_flags & MDDB_MNSET) { 8008 int j; 8009 mnlbp = (mddb_mnlb_t *)lbp; 8010 for (j = 0; j < MD_MNMAXSIDES; j++) { 8011 mnslp = &mnlbp->lb_mnsidelocators[j][i]; 8012 if (mnslp->mnl_sideno == cp->c_sideno) 8013 break; 8014 } 8015 if (j < MD_MNMAXSIDES) { 8016 mnslp->mnl_mnum = NODEV32; 8017 mnslp->mnl_sideno = 0; 8018 mnlnp = (mddb_mnln_t *)lnp; 8019 mnsn = &(mnlnp->ln_mnsuffixes[j][i]); 8020 bzero((caddr_t)mnsn, 8021 sizeof (md_mnname_suffix_t)); 8022 } 8023 } else { 8024 slp = &lbp->lb_sidelocators[cp->c_sideno][i]; 8025 bzero((caddr_t)&lnp->ln_suffixes 8026 [cp->c_sideno][i], sizeof (md_name_suffix)); 8027 slp->l_mnum = NODEV32; 8028 } 8029 } 8030 } 8031 8032 /* write new locator names to all devices */ 8033 uniqtime32(&lnp->ln_timestamp); 8034 if (lbp->lb_flags & MDDB_MNSET) 8035 lnp->ln_revision = MDDB_REV_MNLN; 8036 else 8037 lnp->ln_revision = MDDB_REV_LN; 8038 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8039 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8040 lbp->lb_lnblkcnt, 0); 8041 /* 8042 * If a MN diskset and this is the master, set the PARSE_LOCNM 8043 * flag in the mddb_set structure to show that the locator 8044 * names have changed. 
8045 */ 8046 8047 if ((lbp->lb_flags & MDDB_MNSET) && 8048 (md_set[s->s_setno].s_am_i_master)) { 8049 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8050 } 8051 if (err) { 8052 if (writeretry(s)) { 8053 single_thread_end(s); 8054 mddb_setexit(s); 8055 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8056 } 8057 } 8058 8059 uniqtime32(&lbp->lb_timestamp); 8060 /* write new locator to all devices */ 8061 err = writelocall(s); 8062 computefreeblks(s); /* recompute always it may be larger */ 8063 if (err) { 8064 if (writeretry(s)) { 8065 single_thread_end(s); 8066 mddb_setexit(s); 8067 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8068 } 8069 } 8070 8071 single_thread_end(s); 8072 mddb_setexit(s); 8073 8074 return (0); 8075 } 8076 8077 static int 8078 newdev( 8079 mddb_config_t *cp, 8080 int command, 8081 md_error_t *ep 8082 ) 8083 { 8084 mddb_set_t *s; 8085 mddb_mb_ic_t *mbip, *mbip1; 8086 int i, j; 8087 int li; 8088 mddb_lb_t *lbp; /* pointer to locator block */ 8089 mddb_ln_t *lnp; /* pointer to locator names */ 8090 mddb_locator_t *lp; 8091 mddb_cfg_loc_t *clp; 8092 int err = 0; 8093 set_t setno = cp->c_setno; 8094 ddi_devid_t devid2; 8095 ddi_devid_t ret_devid = NULL; 8096 char *minor_name; 8097 uint_t use_devid = 0; 8098 dev_t ddi_dev; 8099 int old_flags; 8100 int flags; 8101 int mn_set = 0; 8102 int index; 8103 8104 8105 /* Currently don't allow addition of new replica during upgrade */ 8106 if (MD_UPGRADE) { 8107 cmn_err(CE_WARN, 8108 "Addition of new replica not allowed during upgrade.\n"); 8109 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8110 } 8111 8112 /* 8113 * Data integrity check 8114 */ 8115 if (setno >= md_nsets || cp->c_locator.l_dev <= 0) 8116 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 8117 8118 /* Determine the flag settings for multinode sets */ 8119 flags = MDDB_NOOLDOK; 8120 if (cp->c_multi_node) 8121 flags |= MDDB_MULTINODE; 8122 8123 if ((s = mddb_setenter(setno, flags, &err)) == NULL) { 8124 if (err != 
MDDB_E_NOTOWNER) 8125 return (mddbstatus2error(ep, err, NODEV32, setno)); 8126 s = init_set(cp, flags, &err); 8127 if (s == NULL) 8128 return (mddbstatus2error(ep, err, NODEV32, setno)); 8129 } 8130 8131 single_thread_start(s); 8132 8133 /* shorthand */ 8134 clp = &cp->c_locator; 8135 8136 /* shorthand */ 8137 lbp = s->s_lbp; 8138 8139 if (lbp->lb_setno != setno) { 8140 single_thread_end(s); 8141 mddb_setexit(s); 8142 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno)); 8143 } 8144 8145 /* 8146 * See if this device/blkno pair is already a replica 8147 */ 8148 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 8149 ddi_dev = expldev(clp->l_dev); 8150 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) && 8151 (ddi_lyr_get_minor_name(ddi_dev, 8152 S_IFBLK, &minor_name) == DDI_SUCCESS)) { 8153 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) { 8154 clp->l_devid = (uint64_t)(uintptr_t)ret_devid; 8155 use_devid = 1; 8156 (void) strcpy(clp->l_minor_name, minor_name); 8157 } 8158 kmem_free(minor_name, strlen(minor_name)+1); 8159 } 8160 if (use_devid != 1 && ret_devid != NULL) 8161 ddi_devid_free(ret_devid); 8162 } 8163 8164 for (i = 0; i < lbp->lb_loccnt; i++) { 8165 lp = &lbp->lb_locators[i]; 8166 if (lp->l_flags & MDDB_F_DELETED) 8167 continue; 8168 if (use_devid) { 8169 if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0) 8170 continue; 8171 if ((ddi_devid_compare(devid2, 8172 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) && 8173 (strcmp(clp->l_minor_name, minor_name) == 0) && 8174 ((daddr_t)lp->l_blkno == clp->l_blkno)) { 8175 if (command == MDDB_NEWDEV) { 8176 ddi_devid_free((ddi_devid_t)(uintptr_t) 8177 clp->l_devid); 8178 single_thread_end(s); 8179 mddb_setexit(s); 8180 return (mdmddberror(ep, 8181 MDE_DB_EXISTS, NODEV32, setno)); 8182 } 8183 } 8184 } else { 8185 if (lp->l_dev == clp->l_dev && 8186 (daddr_t)lp->l_blkno == clp->l_blkno) { 8187 if (command == MDDB_NEWDEV) { 8188 single_thread_end(s); 8189 mddb_setexit(s); 8190 return (mdmddberror(ep, 8191 
MDE_DB_EXISTS, NODEV32, setno)); 8192 } 8193 } 8194 } 8195 } 8196 8197 /* 8198 * Really is a new replica, go get the master blocks 8199 */ 8200 mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno, 8201 (uint_t *)0, &mn_set); 8202 if (! mbip) { 8203 if (use_devid) 8204 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8205 single_thread_end(s); 8206 mddb_setexit(s); 8207 return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno)); 8208 } 8209 8210 /* 8211 * Compute free blocks in replica. 8212 */ 8213 computefreeblks(s); 8214 8215 /* 8216 * Check if this is large enough 8217 */ 8218 for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next) 8219 i += mbip1->mbi_mddb_mb.mb_blkcnt; 8220 for (j = i; j < s->s_totalblkcnt; j++) { 8221 if (blkcheck(s, j)) { 8222 while (mbip) { 8223 mbip1 = mbip->mbi_next; 8224 kmem_free((caddr_t)mbip, MDDB_IC_BSIZE); 8225 mbip = mbip1; 8226 } 8227 if (use_devid) 8228 ddi_devid_free( 8229 (ddi_devid_t)(uintptr_t)clp->l_devid); 8230 mddb_devclose(md_expldev(clp->l_dev)); 8231 single_thread_end(s); 8232 mddb_setexit(s); 8233 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, 8234 setno)); 8235 } 8236 } 8237 8238 /* Look for a deleted slot */ 8239 for (li = 0; li < lbp->lb_loccnt; li++) { 8240 lp = &lbp->lb_locators[li]; 8241 if (lp->l_flags & MDDB_F_DELETED) 8242 break; 8243 } 8244 8245 /* If no deleted slots, add a new one */ 8246 if (li == lbp->lb_loccnt) { 8247 /* Already have the max replicas, bail */ 8248 if (lbp->lb_loccnt == MDDB_NLB) { 8249 if (use_devid) 8250 ddi_devid_free((ddi_devid_t)(uintptr_t) 8251 clp->l_devid); 8252 mddb_devclose(md_expldev(clp->l_dev)); 8253 single_thread_end(s); 8254 mddb_setexit(s); 8255 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 8256 setno)); 8257 } 8258 lbp->lb_loccnt++; 8259 lp = &lbp->lb_locators[li]; 8260 } 8261 8262 /* Initialize the new or deleted slot */ 8263 old_flags = lp->l_flags; 8264 lp->l_dev = clp->l_dev; 8265 lp->l_blkno = (daddr32_t)clp->l_blkno; 8266 lp->l_flags = 
clp->l_flags; 8267 8268 /* shorthand */ 8269 lnp = s->s_lnp; 8270 8271 index = 0; 8272 if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) { 8273 /* 8274 * If a MN diskset, need to find the index where the new 8275 * locator information is to be stored in the mnsidelocator 8276 * field of the locator block so that the locator name can 8277 * be stored at the same array index in the mnsuffixes 8278 * field of the locator names structure. 8279 */ 8280 lbp->lb_flags |= MDDB_MNSET; 8281 if ((index = checklocator(lbp, li, s->s_sideno)) == -1) { 8282 if (use_devid) 8283 ddi_devid_free((ddi_devid_t)(uintptr_t)clp-> 8284 l_devid); 8285 lp->l_flags = old_flags; 8286 lbp->lb_loccnt--; 8287 mddb_devclose(md_expldev(clp->l_dev)); 8288 single_thread_end(s); 8289 mddb_setexit(s); 8290 return (mdmddberror(ep, MDE_DB_TOOSMALL, 8291 NODEV32, setno)); 8292 } 8293 } 8294 /* 8295 * Store the locator name before the sidelocator information 8296 * in case a panic occurs between these 2 steps. Must have 8297 * the locator name information in order to print reasonable 8298 * error information. 8299 */ 8300 if (splitname2locatorblock(&cp->c_devname, lnp, li, 8301 s->s_sideno, index)) { 8302 if (use_devid) 8303 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8304 lp->l_flags = old_flags; 8305 lbp->lb_loccnt--; 8306 mddb_devclose(md_expldev(clp->l_dev)); 8307 single_thread_end(s); 8308 mddb_setexit(s); 8309 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); 8310 } 8311 8312 /* 8313 * Compute free blocks in replica before calling cfgloc2locator 8314 * since cfgloc2locator may attempt to alloc an unused block 8315 * to store the device id. 8316 * mbiarray needs to be setup before calling computefreeblks. 
8317 */ 8318 s->s_mbiarray[li] = mbip; 8319 computefreeblks(s); 8320 8321 if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) { 8322 if (use_devid) 8323 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8324 lp->l_flags = old_flags; 8325 lbp->lb_loccnt--; 8326 s->s_mbiarray[li] = 0; 8327 mddb_devclose(md_expldev(clp->l_dev)); 8328 single_thread_end(s); 8329 mddb_setexit(s); 8330 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno)); 8331 } 8332 8333 if (use_devid) 8334 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid); 8335 8336 uniqtime32(&lbp->lb_timestamp); 8337 lp->l_flags = MDDB_F_ACTIVE; 8338 8339 /* write db copy to new device */ 8340 err = writecopy(s, li, MDDB_WRITECOPY_ALL); 8341 lp->l_flags |= MDDB_F_UP2DATE; 8342 8343 /* write new locator names to all devices */ 8344 uniqtime32(&lnp->ln_timestamp); 8345 if (lbp->lb_flags & MDDB_MNSET) 8346 lnp->ln_revision = MDDB_REV_MNLN; 8347 else 8348 lnp->ln_revision = MDDB_REV_LN; 8349 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 8350 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 8351 lbp->lb_lnblkcnt, 0); 8352 /* 8353 * If a MN diskset and this is the master, set the PARSE_LOCNM 8354 * flag in the mddb_set structure to show that the locator 8355 * names have changed. 
8356 */ 8357 8358 if ((lbp->lb_flags & MDDB_MNSET) && 8359 (md_set[s->s_setno].s_am_i_master)) { 8360 s->s_mn_parseflags |= MDDB_PARSE_LOCNM; 8361 } 8362 if (err) { 8363 if (writeretry(s)) { 8364 single_thread_end(s); 8365 mddb_setexit(s); 8366 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8367 } 8368 } 8369 8370 /* Data tags not supported on MN sets */ 8371 if ((md_get_setstatus(setno) & MD_SET_STALE) && 8372 (!(lbp->lb_flags & MDDB_MNSET)) && 8373 setno != MD_LOCAL_SET) 8374 if (set_dtag(s, ep)) 8375 mdclrerror(ep); 8376 8377 /* Write data tags to all accessible devices */ 8378 /* Data tags not supported on MN sets */ 8379 if (!(lbp->lb_flags & MDDB_MNSET)) { 8380 (void) dt_write(s); 8381 } 8382 8383 /* write new locator to all devices */ 8384 err = writelocall(s); 8385 8386 (void) upd_med(s, "newdev(0)"); 8387 8388 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno, 8389 md_expldev(clp->l_dev)); 8390 8391 computefreeblks(s); /* recompute always it may be smaller */ 8392 if (err) { 8393 if (writeretry(s)) { 8394 single_thread_end(s); 8395 mddb_setexit(s); 8396 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno)); 8397 } 8398 } 8399 8400 single_thread_end(s); 8401 mddb_setexit(s); 8402 8403 return (0); 8404 }

#ifdef DEBUG
/*
 * DEBUG-only consistency check of the in-core directory of one set.
 *
 * Walks every directory block and every directory entry hanging off
 * md_set[setno].s_db and asserts that
 *   - the entry's record block carries MDDB_MAGIC_RB, and
 *   - a non-NULL de_rb_userdata is numerically greater than 2000
 *     (presumably a cheap "this looks like a real kernel address" test).
 *
 * Returns silently when the set has no database attached.
 */
static void
mddb_check_set(
	set_t		setno
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;

	if (! md_set[setno].s_db)
		return;

	s = (mddb_set_t *)md_set[setno].s_db;

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry;
		    dep != NULL; dep = dep->de_next) {
			rbp = dep->de_rb;
			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
			if (dep->de_rb_userdata)
				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
		}
	}
}
#endif	/* DEBUG */

/*
 * Exported Entry Points
 */
#ifdef DEBUG
/*
 * DEBUG-only: run mddb_check_set() over every set slot below md_nsets.
 *
 * A slot without a database is skipped rather than ending the scan, so
 * that sets loaded at higher indexes are still checked.
 */
void
mddb_check(void)
{
	int	i;

	for (i = 0; i < md_nsets; i++) {
		if (! md_set[i].s_db)
			continue;

		mddb_check_set(i);
	}
}
#endif	/* DEBUG */

/*
 * Dispatch one replica-database configuration command.
 *
 *   command	which operation to perform (MDDB_NEWDEV, MDDB_USEDEV, ...)
 *   cp		request block; cp->c_setno selects the set, cp->c_mde
 *		receives the error detail and is cleared on entry
 *
 * Returns 0 on success or the error produced by the handler that was
 * called.  An unknown command yields MDE_DB_INVALID.
 */
int
mddb_configure(
	mddb_cfgcmd_t	command,
	mddb_config_t	*cp
)
{
	mddb_set_t	*s;
	md_error_t	*ep = &cp->c_mde;
	int		flag = 0;
	int		err = 0;
	set_t		setno = cp->c_setno;

	mdclrerror(ep);

	switch (command) {
	case MDDB_NEWDEV:
		err = newdev(cp, command, ep);
		break;

	case MDDB_NEWSIDE:
	case MDDB_DELSIDE:
		err = delnewside(cp, command, ep);
		break;

	case MDDB_GETDEV:
	case MDDB_DELDEV:
	case MDDB_ENDDEV:
		err = getdeldev(cp, command, ep);
		break;

	case MDDB_GETDRVRNAME:
		err = getdriver(&cp->c_locator);
		break;

	case MDDB_USEDEV:
		/*
		 * Note: must allow USEDEV ioctl during upgrade to support
		 * auto-take disksets.
		 *
		 * Also during the set import if the md_devid_destroy
		 * flag is set then error out
		 */

		if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

		if (setno >= md_nsets)
			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

		/* enter the set, initializing it first if it is not there */
		if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
			if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
				err = mddbstatus2error(ep, err, NODEV32, setno);
				break;
			}
		}
		if (setno == MD_LOCAL_SET)
			flag = MDDB_F_IOCTL;
		if (cp->c_locator.l_old_devid) {
			md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);
		}
		if ((err = ridev(&s->s_rip, &cp->c_locator, NULL, flag)) != 0)
			err = mddbstatus2error(ep, err, NODEV32, setno);
		mddb_setexit(s);
		break;

	case MDDB_RELEASESET:
		/*
		 * The set number comes straight from the request block;
		 * reject out-of-range values the same way MDDB_USEDEV does
		 * instead of handing them to mddb_unload_set().
		 */
		if (setno >= md_nsets)
			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

		mutex_enter(&mddb_lock);
		mddb_unload_set(setno);
		mutex_exit(&mddb_lock);
		break;

	case MDDB_SETDID:
		err = setdid(cp);
		break;

	default:
		err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno);
	}

	return (err);
}

/*
 * Report the two optimized-record locator indexes of a record.
 *
 * ol->recid names the record.  ol->li[0] and ol->li[1] are preset to -1
 * and are overwritten with de_optinfo[0].o_li and de_optinfo[1].o_li of
 * the matching directory entry.  They stay -1 when the set cannot be
 * entered or no entry matches.
 *
 * Returns EINVAL when the set number encoded in the record id is not
 * below md_nsets, otherwise 0 (also when nothing was found).
 */
int
mddb_getoptloc(
	mddb_optloc_t	*ol
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	mddb_recid_t	id;
	set_t		setno;

	ol->li[0] = -1;
	ol->li[1] = -1;

	id = ol->recid;
	setno = DBSET(id);
	if (setno >= md_nsets)
		return (EINVAL);

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
		return (0);

	id = DBID(id);
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry;
		    dep != NULL; dep = dep->de_next) {
			if (dep->de_recid != id)
				continue;
			ol->li[0] = dep->de_optinfo[0].o_li;
			ol->li[1] = dep->de_optinfo[1].o_li;
			mddb_setexit(s);
			return (0);
		}
	}
	mddb_setexit(s);
	return (0);
}

void
mddb_init(void) 8575 { 8576 mddb_set_t *s; 8577 8578 mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL); 8579 if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL) 8580 mddb_setexit(s); 8581 } 8582 8583 8584 void 8585 mddb_unload(void) 8586 { 8587 int i; 8588 8589 mutex_enter(&mddb_lock); 8590 8591 for (i = 0; i < md_nsets; i++) { 8592 md_clr_setstatus(i, MD_SET_KEEPTAG); 8593 mddb_unload_set(i); 8594 } 8595 8596 crcfreetab(); 8597 8598 mutex_exit(&mddb_lock); 8599 } 8600 8601 mddb_recid_t 8602 mddb_createrec( 8603 size_t usersize, /* size of db record */ 8604 mddb_type_t type, /* type1 of db record */ 8605 uint_t type2, /* type2 of db record */ 8606 md_create_rec_option_t options, /* options for this creation */ 8607 set_t setno /* set number to create record in */ 8608 ) 8609 { 8610 mddb_set_t *s; 8611 mddb_db_t *dbp, *prevdbp, *newdbp; 8612 mddb_db32_t *db32p; 8613 mddb_de_ic_t *dep; 8614 /* LINTED variable unused - used for sizeof calculations */ 8615 mddb_de32_t *de32p; 8616 mddb_rb32_t *rbp; 8617 size_t recsize; 8618 ulong_t blkcnt; 8619 ulong_t maxblocks; 8620 size_t desize, desize_ic; 8621 size_t used; 8622 mddb_recid_t newid; 8623 caddr_t tmppnt; 8624 int i, err = 0; 8625 void *userdata; 8626 uint_t flag_type; 8627 8628 #if defined(_ILP32) && !defined(lint) 8629 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t)); 8630 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 8631 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 8632 #endif 8633 8634 /* 8635 * everyone is supposed to sepcify if it's a 8636 * 32 bit or a 64 bit record 8637 */ 8638 if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) { 8639 return (MDDB_E_INVALID); 8640 } 8641 8642 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 8643 return (err); 8644 8645 if (checkstate(s, MDDB_PROBE)) { 8646 mddb_setexit(s); 8647 return (MDDB_E_NOTNOW); 8648 } 8649 8650 recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) + 8651 usersize, MDDB_BSIZE); 8652 blkcnt = btodb(recsize); 8653 8654 if 
(mddb_maxblocks) 8655 maxblocks = mddb_maxblocks; 8656 else 8657 maxblocks = (MDDB_BSIZE - 8658 (sizeof (*db32p) + sizeof (*de32p) - 8659 sizeof (de32p->de32_blks))) / sizeof (mddb_block_t); 8660 8661 if (blkcnt > maxblocks) { 8662 mddb_setexit(s); 8663 return (MDDB_E_INVALID); 8664 } 8665 /* 8666 * allocate record block 8667 * and new directory block so to avoid sleeping 8668 * after starting single_thread 8669 */ 8670 rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 8671 if ((options & MD_CRO_OPTIMIZE) == 0) 8672 userdata = kmem_zalloc(usersize, KM_SLEEP); 8673 newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP); 8674 8675 /* 8676 * if this is the largest record allocate new buffer for 8677 * checkcopy(); 8678 */ 8679 if (recsize > s->s_databuffer_size) { 8680 tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP); 8681 /* 8682 * this test is incase when to sleep during kmem_alloc 8683 * and some other task bumped max record size 8684 */ 8685 if (recsize > s->s_databuffer_size) { 8686 if (s->s_databuffer_size) 8687 kmem_free(s->s_databuffer, 8688 s->s_databuffer_size); 8689 s->s_databuffer = tmppnt; 8690 s->s_databuffer_size = recsize; 8691 } else { 8692 kmem_free(tmppnt, recsize); 8693 } 8694 } 8695 8696 single_thread_start(s); 8697 8698 newid = 0; 8699 do { 8700 newid++; 8701 if (DBID(newid) == 0) { 8702 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8703 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8704 if ((options & MD_CRO_OPTIMIZE) == 0) 8705 kmem_free(userdata, usersize); 8706 single_thread_end(s); 8707 mddb_setexit(s); 8708 return (MDDB_E_NOTNOW); 8709 } 8710 8711 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8712 for (dep = dbp->db_firstentry; dep; 8713 dep = dep->de_next) { 8714 if (dep->de_recid == newid) 8715 break; 8716 } 8717 if (dep != NULL) 8718 break; 8719 } 8720 } while (dbp); 8721 8722 desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) + 8723 (sizeof (mddb_block_t) * blkcnt); 8724 8725 /* 8726 * see if a directory block 
exists which will hold this entry 8727 */ 8728 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8729 used = sizeof (*db32p); 8730 for (dep = dbp->db_firstentry; 8731 dep != NULL; dep = dep->de_next) { 8732 used += sizeof (*de32p) - sizeof (de32p->de32_blks); 8733 used += sizeof (mddb_block_t) * dep->de_blkcount; 8734 } 8735 if ((used + desize) < MDDB_BSIZE) 8736 break; 8737 } 8738 if (dbp) { 8739 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8740 if (blkcnt > s->s_freeblkcnt) { 8741 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8742 if ((options & MD_CRO_OPTIMIZE) == 0) 8743 kmem_free(userdata, usersize); 8744 single_thread_end(s); 8745 mddb_setexit(s); 8746 return (MDDB_E_NOSPACE); 8747 } 8748 prevdbp = NULL; 8749 } else { 8750 /* 8751 * need to add directory block 8752 */ 8753 if ((blkcnt + 1) > s->s_freeblkcnt) { 8754 kmem_free((caddr_t)newdbp, sizeof (*newdbp)); 8755 kmem_free((caddr_t)rbp, ((size_t)recsize)); 8756 if ((options & MD_CRO_OPTIMIZE) == 0) 8757 kmem_free(userdata, usersize); 8758 single_thread_end(s); 8759 mddb_setexit(s); 8760 return (MDDB_E_NOSPACE); 8761 } 8762 for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next); 8763 dbp->db_next = newdbp; 8764 bzero((caddr_t)dbp->db_next, sizeof (*newdbp)); 8765 dbp->db_nextblk = getfreeblks(s, 1); 8766 dbp->db_next->db_blknum = dbp->db_nextblk; 8767 prevdbp = dbp; 8768 dbp = dbp->db_next; 8769 dbp->db_nextblk = 0; 8770 dbp->db_firstentry = NULL; 8771 dbp->db_recsum = 0; 8772 dbp->db_magic = MDDB_MAGIC_DB; 8773 } 8774 /* 8775 * ready to add record 8776 */ 8777 desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) + 8778 (sizeof (mddb_block_t) * blkcnt); 8779 if (dbp->db_firstentry) { 8780 for (dep = dbp->db_firstentry; dep->de_next; 8781 dep = dep->de_next); 8782 dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); 8783 dep = dep->de_next; 8784 } else { 8785 dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP); 8786 dbp->db_firstentry = dep; 8787 } 8788 bzero((caddr_t)dep, desize_ic); 8789 
dep->de_recid = newid; 8790 /* 8791 * Optimized records have an owner node associated with them in 8792 * a MN diskset. The owner is only set on a node that is actively 8793 * writing to that record. The other nodes will show that record 8794 * as having an invalid owner. The owner for an optimized record 8795 * is used during fixoptrecord to determine which node should 8796 * write out the record when the replicas associated with that 8797 * optimized record have been changed. 8798 */ 8799 if (MD_MNSET_SETNO(s->s_setno)) { 8800 dep->de_owner_nodeid = MD_MN_INVALID_NID; 8801 } 8802 dep->de_type1 = type; 8803 dep->de_type2 = type2; 8804 dep->de_reqsize = usersize; 8805 dep->de_recsize = recsize; 8806 dep->de_blkcount = blkcnt; 8807 flag_type = options & 8808 (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID | 8809 MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG | 8810 MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG); 8811 switch (flag_type) { 8812 case MD_CRO_OPTIMIZE: 8813 dep->de_flags = MDDB_F_OPT; 8814 getoptdev(s, dep, 0); 8815 getoptdev(s, dep, 1); 8816 break; 8817 case MD_CRO_STRIPE: 8818 dep->de_flags = MDDB_F_STRIPE; 8819 break; 8820 case MD_CRO_MIRROR: 8821 dep->de_flags = MDDB_F_MIRROR; 8822 break; 8823 case MD_CRO_RAID: 8824 dep->de_flags = MDDB_F_RAID; 8825 break; 8826 case MD_CRO_SOFTPART: 8827 dep->de_flags = MDDB_F_SOFTPART; 8828 break; 8829 case MD_CRO_TRANS_MASTER: 8830 dep->de_flags = MDDB_F_TRANS_MASTER; 8831 break; 8832 case MD_CRO_TRANS_LOG: 8833 dep->de_flags = MDDB_F_TRANS_LOG; 8834 break; 8835 case MD_CRO_HOTSPARE: 8836 dep->de_flags = MDDB_F_HOTSPARE; 8837 break; 8838 case MD_CRO_HOTSPARE_POOL: 8839 dep->de_flags = MDDB_F_HOTSPARE_POOL; 8840 break; 8841 case MD_CRO_CHANGELOG: 8842 dep->de_flags = MDDB_F_CHANGELOG; 8843 break; 8844 } 8845 /* 8846 * try to get all blocks consecutive. 
If not possible 8847 * just get them one at a time 8848 */ 8849 dep->de_blks[0] = getfreeblks(s, blkcnt); 8850 if (dep->de_blks[0]) { 8851 for (i = 1; i < blkcnt; i++) 8852 dep->de_blks[i] = dep->de_blks[0] + i; 8853 } else { 8854 for (i = 0; i < blkcnt; i++) 8855 dep->de_blks[i] = getfreeblks(s, 1); 8856 } 8857 dep->de_rb = rbp; 8858 bzero((caddr_t)rbp, recsize); 8859 rbp->rb_magic = MDDB_MAGIC_RB; 8860 8861 /* Do we have to create an old style (32 bit) record? */ 8862 if (options & MD_CRO_32BIT) { 8863 rbp->rb_revision = MDDB_REV_RB; 8864 } else { 8865 rbp->rb_revision = MDDB_REV_RB64; 8866 } 8867 8868 /* set de_rb_userdata for non optimization records */ 8869 if ((options & MD_CRO_OPTIMIZE) == 0) { 8870 dep->de_rb_userdata = userdata; 8871 } 8872 8873 uniqtime32(&rbp->rb_timestamp); 8874 /* Generate the crc for this record */ 8875 rec_crcgen(s, dep, rbp); 8876 tmppnt = (caddr_t)rbp; 8877 /* 8878 * the following code writes new records to all instances of 8879 * the data base. Writing one block at a time to each instance 8880 * is safe because they are not yet in a directory entry which 8881 * has been written to the data base 8882 */ 8883 err = 0; 8884 if ((options & MD_CRO_OPTIMIZE) == 0) { 8885 for (i = 0; i < blkcnt; i++) { 8886 err |= writeall(s, (caddr_t)tmppnt, 8887 dep->de_blks[i], 1, 0); 8888 tmppnt += MDDB_BSIZE; 8889 } 8890 } else { 8891 if ((MD_MNSET_SETNO(s->s_setno)) && 8892 md_set[s->s_setno].s_am_i_master) { 8893 /* 8894 * If a MN diskset then only master writes out newly 8895 * created optimized record. 
8896 */ 8897 err |= writeoptrecord(s, dep); 8898 } 8899 } 8900 uniqtime32(&dbp->db_timestamp); 8901 dbp->db_revision = MDDB_REV_DB; 8902 /* Don't include opt resync and change log records in global XOR */ 8903 if (!(dep->de_flags & MDDB_F_OPT) && 8904 !(dep->de_flags & MDDB_F_CHANGELOG)) 8905 dbp->db_recsum ^= rbp->rb_checksum; 8906 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 8907 create_db32rec(db32p, dbp); 8908 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 8909 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); 8910 if (prevdbp) { 8911 dbp = prevdbp; 8912 uniqtime32(&dbp->db_timestamp); 8913 dbp->db_revision = MDDB_REV_DB; 8914 create_db32rec(db32p, dbp); 8915 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 8916 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0); 8917 } 8918 kmem_free((caddr_t)db32p, MDDB_BSIZE); 8919 if (err) { 8920 if (writeretry(s)) { 8921 s->s_zombie = newid; 8922 single_thread_end(s); 8923 mddb_setexit(s); 8924 return (MDDB_E_NOTNOW); 8925 } 8926 } 8927 single_thread_end(s); 8928 mddb_setexit(s); 8929 8930 ASSERT((newid & MDDB_SETMASK) == 0); 8931 return (MAKERECID(setno, newid)); 8932 } 8933 8934 int 8935 mddb_deleterec( 8936 mddb_recid_t id 8937 ) 8938 { 8939 mddb_set_t *s; 8940 mddb_db_t *dbp; 8941 mddb_db32_t *db32p; 8942 mddb_de_ic_t *dep, *dep1; 8943 int i; 8944 8945 #if defined(_ILP32) && !defined(lint) 8946 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t)); 8947 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 8948 #endif 8949 8950 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); 8951 ASSERT(s != NULL); 8952 8953 id = DBID(id); 8954 if (checkstate(s, MDDB_PROBE)) { 8955 mddb_setexit(s); 8956 return (MDDB_E_NOTNOW); 8957 } 8958 8959 ASSERT(s->s_lbp != NULL); 8960 single_thread_start(s); 8961 8962 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 8963 dep1 = NULL; 8964 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 8965 if (dep->de_recid == id) 8966 break; 8967 dep1 = 
dep; 8968 } 8969 if (dep != NULL) 8970 break; 8971 } 8972 /* 8973 * no such record 8974 */ 8975 if (dep == NULL) { 8976 single_thread_end(s); 8977 ASSERT(s->s_staledeletes != 0); 8978 s->s_staledeletes--; 8979 mddb_setexit(s); 8980 return (0); 8981 } 8982 8983 if (!(dep->de_flags & MDDB_F_OPT) && 8984 !(dep->de_flags & MDDB_F_CHANGELOG)) { 8985 dbp->db_recsum ^= dep->de_rb->rb_checksum; 8986 dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle; 8987 } 8988 8989 if (dep->de_rb_userdata != NULL) { 8990 if (dep->de_icreqsize) 8991 kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize); 8992 else 8993 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 8994 } 8995 8996 kmem_free((caddr_t)dep->de_rb, dep->de_recsize); 8997 8998 for (i = 0; i < dep->de_blkcount; i++) 8999 blkfree(s, dep->de_blks[i]); 9000 if (dep1) 9001 dep1->de_next = dep->de_next; 9002 else 9003 dbp->db_firstentry = dep->de_next; 9004 9005 kmem_free(dep, sizeofde(dep)); 9006 9007 uniqtime32(&dbp->db_timestamp); 9008 dbp->db_revision = MDDB_REV_DB; 9009 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP); 9010 create_db32rec(db32p, dbp); 9011 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL); 9012 if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) { 9013 if (writeretry(s)) { 9014 /* 9015 * staledelete is used to mark deletes which failed. 
9016 * its only use is to not panic when the user retries 9017 * the delete once the database is active again 9018 */ 9019 single_thread_end(s); 9020 s->s_staledeletes++; 9021 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9022 mddb_setexit(s); 9023 return (MDDB_E_NOTNOW); 9024 } 9025 } 9026 single_thread_end(s); 9027 kmem_free((caddr_t)db32p, MDDB_BSIZE); 9028 mddb_setexit(s); 9029 return (0); 9030 } 9031 9032 mddb_recid_t 9033 mddb_getnextrec( 9034 mddb_recid_t id, 9035 mddb_type_t typ, 9036 uint_t type2 9037 ) 9038 { 9039 mddb_set_t *s; 9040 mddb_db_t *dbp; 9041 mddb_de_ic_t *dep; 9042 int searching, err; 9043 set_t setno; 9044 9045 setno = DBSET(id); 9046 id = DBID(id); 9047 searching = id; 9048 9049 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 9050 return (err); 9051 9052 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9053 for (dep = dbp->db_firstentry; 9054 dep != NULL; dep = dep->de_next) { 9055 if (searching) { 9056 if (dep->de_recid == id) 9057 searching = 0; 9058 } else { 9059 if ((typ == MDDB_ALL || dep->de_type1 == typ) && 9060 (type2 == 0 || dep->de_type2 == type2)) { 9061 id = dep->de_recid; 9062 mddb_setexit(s); 9063 ASSERT((id & MDDB_SETMASK) == 0); 9064 return (MAKERECID(setno, id)); 9065 } 9066 } 9067 } 9068 } 9069 9070 mddb_setexit(s); 9071 9072 if (searching) 9073 return (MDDB_E_NORECORD); 9074 return (0); 9075 } 9076 9077 void * 9078 mddb_getrecaddr( 9079 mddb_recid_t id 9080 ) 9081 { 9082 mddb_set_t *s; 9083 mddb_db_t *dbp; 9084 mddb_de_ic_t *dep; 9085 void *rval; 9086 9087 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9088 return (NULL); 9089 9090 id = DBID(id); 9091 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9092 for (dep = dbp->db_firstentry; 9093 dep != NULL; dep = dep->de_next) { 9094 if (dep->de_recid != id) 9095 continue; 9096 if (dep->de_rb_userdata) 9097 rval = (void *)dep->de_rb_userdata; 9098 else 9099 rval = (void *)dep->de_rb->rb_data; 9100 mddb_setexit(s); 9101 return (rval); 
9102 } 9103 } 9104 9105 mddb_setexit(s); 9106 return (NULL); 9107 } 9108 9109 9110 mddb_de_ic_t * 9111 mddb_getrecdep( 9112 mddb_recid_t id 9113 ) 9114 { 9115 mddb_set_t *s; 9116 mddb_db_t *dbp; 9117 mddb_de_ic_t *dep; 9118 9119 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9120 return (NULL); 9121 9122 id = DBID(id); 9123 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9124 for (dep = dbp->db_firstentry; 9125 dep != NULL; dep = dep->de_next) { 9126 if (dep->de_recid != id) 9127 continue; 9128 mddb_setexit(s); 9129 return (dep); 9130 } 9131 } 9132 9133 mddb_setexit(s); 9134 return (NULL); 9135 } 9136 9137 void * 9138 mddb_getrecaddr_resize( 9139 mddb_recid_t id, 9140 size_t icsize, 9141 off_t off 9142 ) 9143 { 9144 mddb_set_t *s; 9145 mddb_db_t *dbp; 9146 mddb_de_ic_t *dep; 9147 void *rval = NULL; 9148 9149 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 9150 return (NULL); 9151 9152 id = DBID(id); 9153 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9154 for (dep = dbp->db_firstentry; 9155 dep != NULL; dep = dep->de_next) { 9156 if (dep->de_recid != id) 9157 continue; 9158 if (dep->de_rb_userdata) 9159 rval = (void *)dep->de_rb_userdata; 9160 else 9161 rval = (void *)dep->de_rb->rb_data; 9162 break; 9163 } 9164 if (rval != NULL) 9165 break; 9166 } 9167 9168 if (rval == NULL) { 9169 mddb_setexit(s); 9170 return (NULL); 9171 } 9172 9173 if (dep->de_rb_userdata) { 9174 caddr_t nud; 9175 9176 if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) { 9177 mddb_setexit(s); 9178 return (rval); 9179 } 9180 ASSERT((dep->de_reqsize + off) <= icsize); 9181 nud = kmem_zalloc(icsize, KM_SLEEP); 9182 bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize); 9183 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 9184 dep->de_rb_userdata = nud + off; 9185 dep->de_rb_userdata_ic = nud; 9186 dep->de_icreqsize = icsize; 9187 rval = nud; 9188 } else { 9189 size_t recsize; 9190 /* LINTED variable unused - used for sizeof calculations */ 
9191 mddb_rb32_t *nrbp; 9192 9193 recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) + 9194 icsize, MDDB_BSIZE); 9195 if (dep->de_recsize < recsize) 9196 cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only " 9197 "nonoptimized records can be resized\n"); 9198 } 9199 9200 mddb_setexit(s); 9201 return (rval); 9202 } 9203 9204 int 9205 mddb_getrecprivate( 9206 mddb_recid_t id 9207 ) 9208 { 9209 mddb_set_t *s; 9210 mddb_db_t *dbp; 9211 mddb_de_ic_t *dep; 9212 int err = 0; 9213 int private; 9214 9215 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9216 return (err); 9217 9218 id = DBID(id); 9219 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9220 for (dep = dbp->db_firstentry; 9221 dep != NULL; dep = dep->de_next) { 9222 if (dep->de_recid != id) 9223 continue; 9224 private = (int)dep->de_rb->rb_private; 9225 mddb_setexit(s); 9226 return (private); 9227 } 9228 } 9229 9230 mddb_setexit(s); 9231 return (MDDB_E_NORECORD); 9232 } 9233 9234 void 9235 mddb_setrecprivate( 9236 mddb_recid_t id, 9237 uint_t private 9238 ) 9239 { 9240 mddb_set_t *s; 9241 mddb_db_t *dbp; 9242 mddb_de_ic_t *dep; 9243 9244 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) { 9245 ASSERT(0); 9246 return; 9247 } 9248 9249 id = DBID(id); 9250 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9251 for (dep = dbp->db_firstentry; 9252 dep != NULL; dep = dep->de_next) { 9253 if (dep->de_recid != id) 9254 continue; 9255 dep->de_rb->rb_private = private; 9256 mddb_setexit(s); 9257 return; 9258 } 9259 } 9260 9261 mddb_setexit(s); 9262 ASSERT(0); 9263 } 9264 9265 mddb_type_t 9266 mddb_getrectype1( 9267 mddb_recid_t id 9268 ) 9269 { 9270 mddb_set_t *s; 9271 mddb_db_t *dbp; 9272 mddb_de_ic_t *dep; 9273 int err = 0; 9274 mddb_type_t rval; 9275 9276 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9277 return (err); 9278 9279 id = DBID(id); 9280 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9281 for (dep = dbp->db_firstentry; 9282 dep 
!= NULL; dep = dep->de_next) { 9283 if (dep->de_recid != id) 9284 continue; 9285 rval = dep->de_type1; 9286 mddb_setexit(s); 9287 return (rval); 9288 } 9289 } 9290 9291 mddb_setexit(s); 9292 return (MDDB_E_NORECORD); 9293 } 9294 9295 int 9296 mddb_getrectype2( 9297 mddb_recid_t id 9298 ) 9299 { 9300 mddb_set_t *s; 9301 mddb_db_t *dbp; 9302 mddb_de_ic_t *dep; 9303 int err = 0; 9304 int rval; 9305 9306 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9307 return (err); 9308 9309 id = DBID(id); 9310 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9311 for (dep = dbp->db_firstentry; 9312 dep != NULL; dep = dep->de_next) { 9313 if (dep->de_recid != id) 9314 continue; 9315 rval = (int)dep->de_type2; 9316 mddb_setexit(s); 9317 return (rval); 9318 } 9319 } 9320 9321 mddb_setexit(s); 9322 return (MDDB_E_NORECORD); 9323 } 9324 9325 int 9326 mddb_getrecsize( 9327 mddb_recid_t id 9328 ) 9329 { 9330 mddb_set_t *s; 9331 mddb_db_t *dbp; 9332 mddb_de_ic_t *dep; 9333 int err = 0; 9334 int rval; 9335 9336 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9337 return (err); 9338 9339 id = DBID(id); 9340 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9341 for (dep = dbp->db_firstentry; 9342 dep != NULL; dep = dep->de_next) { 9343 if (dep->de_recid != id) 9344 continue; 9345 rval = (int)dep->de_reqsize; 9346 mddb_setexit(s); 9347 return (rval); 9348 } 9349 } 9350 9351 mddb_setexit(s); 9352 return (MDDB_E_NORECORD); 9353 } 9354 9355 9356 mddb_recstatus_t 9357 mddb_getrecstatus( 9358 mddb_recid_t id 9359 ) 9360 { 9361 mddb_set_t *s; 9362 mddb_db_t *dbp; 9363 mddb_de_ic_t *dep; 9364 int err = 0; 9365 mddb_recstatus_t e_err; 9366 9367 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL) 9368 return ((mddb_recstatus_t)err); 9369 9370 id = DBID(id); 9371 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9372 for (dep = dbp->db_firstentry; 9373 dep != NULL; dep = dep->de_next) { 9374 if (dep->de_recid == id) 9375 break; 9376 
} 9377 if (dep) 9378 break; 9379 } 9380 9381 e_err = MDDB_OK; 9382 9383 if (! dep) 9384 e_err = MDDB_NORECORD; 9385 else if (! dep->de_rb->rb_commitcnt) 9386 e_err = MDDB_NODATA; 9387 else if (md_get_setstatus(s->s_setno) & MD_SET_STALE) 9388 e_err = MDDB_STALE; 9389 9390 mddb_setexit(s); 9391 return (e_err); 9392 } 9393 9394 /* 9395 * Commit given record to disk. 9396 * If committing an optimized record, do not call 9397 * with md ioctl lock held. 9398 */ 9399 int 9400 mddb_commitrec( 9401 mddb_recid_t id 9402 ) 9403 { 9404 mddb_set_t *s; 9405 mddb_db_t *dbp; 9406 mddb_de_ic_t *dep; 9407 mddb_recid_t ids[2]; 9408 mddb_rb32_t *rbp; 9409 static int err = 0; 9410 md_mn_msg_mddb_optrecerr_t *msg_recerr; 9411 md_mn_kresult_t *kres; 9412 mddb_lb_t *lbp; 9413 mddb_mnlb_t *mnlbp; 9414 mddb_locator_t *lp; 9415 mddb_mnsidelocator_t *mnslp; 9416 mddb_drvnm_t *dn; 9417 int li; 9418 md_replica_recerr_t *recerr; 9419 int i, j; 9420 int rval; 9421 int hit_err = 0; 9422 9423 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL); 9424 ASSERT(s != NULL); 9425 9426 if (checkstate(s, MDDB_PROBE)) { 9427 mddb_setexit(s); 9428 return (MDDB_E_NOTNOW); 9429 } 9430 9431 if (DBID(id) == 0) { 9432 mddb_setexit(s); 9433 return (0); 9434 } 9435 9436 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9437 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9438 if (dep->de_recid == DBID(id)) 9439 break; 9440 } 9441 if (dep) 9442 break; 9443 } 9444 9445 if (dep == NULL) { 9446 mddb_setexit(s); 9447 return (MDDB_E_NORECORD); 9448 } 9449 9450 if (! (dep->de_flags & MDDB_F_OPT)) { 9451 ids[0] = id; 9452 ids[1] = 0; 9453 mddb_setexit(s); 9454 return (mddb_commitrecs(ids)); 9455 } 9456 9457 /* 9458 * following code allows multiple processes to be doing 9459 * optimization commits in parallel. 
9460 * NOTE: if lots of optimization commits then the lock 9461 * will not get released until it winds down 9462 */ 9463 if (s->s_optwaiterr) { 9464 while (s->s_optwaiterr) { 9465 s->s_opthungerr = 1; 9466 cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno)); 9467 } 9468 if (checkstate(s, MDDB_PROBE)) { 9469 mddb_setexit(s); 9470 return (MDDB_E_NOTNOW); 9471 } 9472 } 9473 if (s->s_optcmtcnt++ == 0) { 9474 single_thread_start(s); 9475 s->s_opthavelck = 1; 9476 if (s->s_optwantlck) { 9477 cv_broadcast(&s->s_optwantlck_cv); 9478 s->s_optwantlck = 0; 9479 } 9480 } else { 9481 while (! s->s_opthavelck) { 9482 s->s_optwantlck = 1; 9483 cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno)); 9484 } 9485 } 9486 9487 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9488 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 9489 if (dep->de_recid == DBID(id)) 9490 break; 9491 } 9492 if (dep) 9493 break; 9494 } 9495 9496 if (dep == NULL) { 9497 if (! (--s->s_optcmtcnt)) { 9498 single_thread_end(s); 9499 s->s_opthavelck = 0; 9500 } 9501 mddb_setexit(s); 9502 return (MDDB_E_NORECORD); 9503 } 9504 9505 rbp = dep->de_rb; 9506 rbp->rb_commitcnt++; 9507 uniqtime32(&rbp->rb_timestamp); 9508 /* Generate the crc for this record */ 9509 rec_crcgen(s, dep, rbp); 9510 9511 if (writeoptrecord(s, dep)) { 9512 if (MD_MNSET_SETNO(s->s_setno)) { 9513 hit_err = 1; 9514 } 9515 s->s_optwaiterr++; 9516 } 9517 if (MD_MNSET_SETNO(s->s_setno)) { 9518 /* If last thread out, release single_thread_start */ 9519 if (! (--s->s_optcmtcnt)) { 9520 single_thread_end(s); 9521 s->s_opthavelck = 0; 9522 } 9523 /* 9524 * If this thread had a writeoptrecords failure, then 9525 * need to send message to master. 9526 * But, multiple threads could all be running on the 9527 * same single_thread_start, so serialize the threads 9528 * by making each thread grab single_thread_start. 
9529 * 9530 * After return from sending message to master message, 9531 * replicas associated with optimized record will havei 9532 * been changed (via a callback from the master to all 9533 * nodes), so retry call to writeoptrecord. 9534 * This code is replacing the call to writeretry that 9535 * occurs for the local and traditional disksets. 9536 */ 9537 if (hit_err) { 9538 single_thread_start(s); 9539 /* 9540 * If > 50% of replicas are alive then continue 9541 * to send message to master until writeoptrecord 9542 * succeeds. For now, assume that minor name, 9543 * major number on this node is the same as on 9544 * the master node. Once devids are turned on 9545 * for MN disksets, can send devid. 9546 */ 9547 kres = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP); 9548 msg_recerr = kmem_zalloc( 9549 sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP); 9550 while (!(md_get_setstatus(s->s_setno) & 9551 MD_SET_TOOFEW)) { 9552 bzero((caddr_t)msg_recerr, 9553 sizeof (md_mn_msg_mddb_optrecerr_t)); 9554 lbp = s->s_lbp; 9555 mnlbp = (mddb_mnlb_t *)lbp; 9556 for (i = 0; i < 2; i++) { 9557 li = dep->de_optinfo[i].o_li; 9558 lp = &lbp->lb_locators[li]; 9559 for (j = 0; j < MD_MNMAXSIDES; j++) { 9560 mnslp = 9561 &mnlbp->lb_mnsidelocators[j][li]; 9562 if (mnslp->mnl_sideno == s->s_sideno) 9563 break; 9564 } 9565 if (j == MD_MNMAXSIDES) 9566 continue; 9567 9568 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 9569 recerr = &msg_recerr->msg_recerr[i]; 9570 recerr->r_li = li; 9571 recerr->r_flags = 9572 dep->de_optinfo[i].o_flags; 9573 recerr->r_blkno = lp->l_blkno; 9574 recerr->r_mnum = md_getminor(lp->l_dev); 9575 (void) strncpy(recerr->r_driver_name, 9576 dn->dn_data, MD_MAXDRVNM); 9577 } 9578 9579 /* Release locks */ 9580 single_thread_end(s); 9581 mutex_exit(SETMUTEX(s->s_setno)); 9582 9583 /* 9584 * Send message to master about optimized 9585 * record failure. 
After return, master 9586 * should have marked failed replicas 9587 * and sent parse message to slaves causing 9588 * slaves to have fixed up the optimized 9589 * record. 9590 * On return from ksend_message, retry 9591 * the write since this node should have fixed 9592 * the optimized resync records it owns. 9593 */ 9594 rval = mdmn_ksend_message(s->s_setno, 9595 MD_MN_MSG_MDDB_OPTRECERR, 9596 MD_MSGF_NO_BCAST, 9597 (char *)msg_recerr, 9598 sizeof (md_mn_msg_mddb_optrecerr_t), 9599 kres); 9600 if (!MDMN_KSEND_MSG_OK(rval, kres)) { 9601 cmn_err(CE_WARN, "mddb_commitrec: " 9602 "Unable to send optimized " 9603 "resync record failure " 9604 "message to other nodes in " 9605 "diskset %s\n", s->s_setname); 9606 mdmn_ksend_show_error(rval, kres, 9607 "MD_MN_MSG_MDDB_OPTRECERR"); 9608 } 9609 9610 /* Regrab locks */ 9611 mutex_enter(SETMUTEX(s->s_setno)); 9612 single_thread_start(s); 9613 9614 /* Start over in case mddb changed */ 9615 for (dbp = s->s_dbp; dbp != NULL; 9616 dbp = dbp->db_next) { 9617 for (dep = dbp->db_firstentry; dep; 9618 dep = dep->de_next) { 9619 if (dep->de_recid == DBID(id)) 9620 break; 9621 } 9622 if (dep) 9623 break; 9624 } 9625 if (dep) { 9626 rbp = dep->de_rb; 9627 rbp->rb_commitcnt++; 9628 uniqtime32(&rbp->rb_timestamp); 9629 /* Generate the crc for this record */ 9630 rec_crcgen(s, dep, rbp); 9631 9632 /* 9633 * If writeoptrecord succeeds, then 9634 * break out. 
9635 */ 9636 if (!(writeoptrecord(s, dep))) 9637 break; 9638 } 9639 } 9640 kmem_free(kres, sizeof (md_mn_kresult_t)); 9641 kmem_free(msg_recerr, 9642 sizeof (md_mn_msg_mddb_optrecerr_t)); 9643 9644 /* Resync record should be fixed - if possible */ 9645 s->s_optwaiterr--; 9646 if (s->s_optwaiterr == 0) { 9647 /* All errors have been handled */ 9648 if (s->s_opthungerr) { 9649 s->s_opthungerr = 0; 9650 cv_broadcast(&s->s_opthungerr_cv); 9651 } 9652 } 9653 single_thread_end(s); 9654 mddb_setexit(s); 9655 if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) { 9656 return (MDDB_E_NOTNOW); 9657 } else { 9658 return (0); 9659 } 9660 } 9661 } else { 9662 /* If set is a traditional or local set */ 9663 if (! (--s->s_optcmtcnt)) { 9664 err = 0; 9665 if (s->s_optwaiterr) { 9666 err = writeretry(s); 9667 s->s_optwaiterr = 0; 9668 if (s->s_opthungerr) { 9669 s->s_opthungerr = 0; 9670 cv_broadcast(&s->s_opthungerr_cv); 9671 } 9672 } 9673 single_thread_end(s); 9674 s->s_opthavelck = 0; 9675 mddb_setexit(s); 9676 if (err) 9677 return (MDDB_E_NOTNOW); 9678 return (0); 9679 } 9680 if (s->s_optwaiterr) { 9681 while (s->s_optwaiterr) { 9682 s->s_opthungerr = 1; 9683 cv_wait(&s->s_opthungerr_cv, 9684 SETMUTEX(s->s_setno)); 9685 } 9686 if (checkstate(s, MDDB_NOPROBE)) { 9687 mddb_setexit(s); 9688 return (MDDB_E_NOTNOW); 9689 } 9690 } 9691 } 9692 9693 mddb_setexit(s); 9694 return (0); 9695 } 9696 9697 int 9698 mddb_commitrecs( 9699 mddb_recid_t ids[] 9700 ) 9701 { 9702 mddb_set_t *s; 9703 mddb_db_t *dbp; 9704 mddb_de_ic_t *dep; 9705 mddb_rb32_t *rbp; 9706 mddb_rb32_t *saverbp; 9707 mddb_lb_t *lbp; 9708 int li; 9709 uint_t checksum; 9710 mddb_recid_t *idp; 9711 int err = 0; 9712 set_t setno; 9713 9714 if (panicstr) 9715 cmn_err(CE_PANIC, "md: mddb: commit not allowed"); 9716 9717 /* 9718 * scan through and make sure ids are from the same set 9719 */ 9720 setno = DBSET(ids[0]); 9721 for (idp = ids; *idp != NULL; idp++) 9722 ASSERT(DBSET(*idp) == setno); 9723 9724 s = mddb_setenter(setno, 
MDDB_MUSTEXIST, NULL); 9725 9726 if (checkstate(s, MDDB_PROBE)) { 9727 mddb_setexit(s); 9728 return (MDDB_E_NOTNOW); 9729 } 9730 9731 ASSERT(s->s_lbp != NULL); 9732 err = 0; 9733 9734 if (! ids[0]) { 9735 mddb_setexit(s); 9736 return (0); 9737 } 9738 9739 single_thread_start(s); 9740 /* 9741 * scan through and make sure ids all exist 9742 */ 9743 for (idp = ids; *idp != NULL; idp++) { 9744 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9745 for (dep = dbp->db_firstentry; dep; 9746 dep = dep->de_next) { 9747 if (dep->de_recid == DBID(*idp)) 9748 break; 9749 } 9750 if (dep != NULL) 9751 break; 9752 } 9753 if (dep == NULL) { 9754 single_thread_end(s); 9755 mddb_setexit(s); 9756 return (MDDB_E_NORECORD); 9757 } 9758 } 9759 9760 /* 9761 * scan through records fix commit counts and 9762 * zero fiddles and update time stamp and rechecksum record 9763 */ 9764 checksum = 0; 9765 idp = ids; 9766 saverbp = NULL; 9767 while (*idp) { 9768 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9769 for (dep = dbp->db_firstentry; dep; 9770 dep = dep->de_next) { 9771 if (dep->de_recid == DBID(*idp)) 9772 break; 9773 } 9774 if (dep != NULL) 9775 break; 9776 } 9777 rbp = dep->de_rb; 9778 ASSERT(! 
(dep->de_flags & MDDB_F_OPT)); 9779 9780 getuserdata(setno, dep); 9781 /* Don't do fiddles for CHANGE LOG records */ 9782 if (!(dep->de_flags & MDDB_F_CHANGELOG)) { 9783 checksum ^= rbp->rb_checksum_fiddle; 9784 rbp->rb_checksum_fiddle = 0; 9785 checksum ^= rbp->rb_checksum; 9786 saverbp = rbp; 9787 } 9788 rbp->rb_commitcnt++; 9789 uniqtime32(&rbp->rb_timestamp); 9790 /* Generate the crc for this record */ 9791 rec_crcgen(s, dep, rbp); 9792 9793 /* Don't do fiddles for CHANGE LOG records */ 9794 if (!(dep->de_flags & MDDB_F_CHANGELOG)) { 9795 checksum ^= rbp->rb_checksum; 9796 } 9797 idp++; 9798 } 9799 9800 if (saverbp) 9801 saverbp->rb_checksum_fiddle = checksum; 9802 9803 /* 9804 * If this is a MN set but we are not the master, then we are not 9805 * supposed to update the mddb on disk. So we finish at this point. 9806 */ 9807 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) && 9808 (md_set[setno].s_am_i_master == 0)) { 9809 single_thread_end(s); 9810 mddb_setexit(s); 9811 return (0); 9812 } 9813 9814 lbp = s->s_lbp; 9815 for (li = 0; li < lbp->lb_loccnt; li++) { 9816 if (! 
(lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE)) 9817 continue; 9818 9819 idp = ids; 9820 while (*idp) { 9821 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 9822 dep = dbp->db_firstentry; 9823 while (dep && (dep->de_recid != DBID(*idp))) 9824 dep = dep->de_next; 9825 if (dep != NULL) 9826 break; 9827 } 9828 rbp = dep->de_rb; 9829 err = wrtblklst(s, (caddr_t)rbp, dep->de_blks, 9830 dep->de_blkcount, li, (mddb_bf_t **)0, 9831 MDDB_WR_ONLY_MASTER); 9832 if (err) 9833 break; 9834 idp++; 9835 } 9836 if (err) 9837 break; 9838 } 9839 if (err) { 9840 if (writeretry(s)) { 9841 single_thread_end(s); 9842 mddb_setexit(s); 9843 return (MDDB_E_NOTNOW); 9844 } 9845 } 9846 single_thread_end(s); 9847 mddb_setexit(s); 9848 return (0); 9849 } 9850 9851 mddb_recid_t 9852 mddb_makerecid( 9853 set_t setno, 9854 mddb_recid_t id 9855 ) 9856 { 9857 return (MAKERECID(setno, id)); 9858 } 9859 9860 set_t 9861 mddb_getsetnum( 9862 mddb_recid_t id 9863 ) 9864 { 9865 return (DBSET(id)); 9866 } 9867 9868 char * 9869 mddb_getsetname( 9870 set_t setno 9871 ) 9872 { 9873 return (((mddb_set_t *)md_set[setno].s_db)->s_setname); 9874 } 9875 9876 side_t 9877 mddb_getsidenum( 9878 set_t setno 9879 ) 9880 { 9881 if (md_set[setno].s_db) 9882 return (((mddb_set_t *)md_set[setno].s_db)->s_sideno); 9883 return (0); 9884 } 9885 9886 int 9887 mddb_ownset( 9888 set_t setno 9889 ) 9890 { 9891 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db) 9892 return (1); 9893 9894 if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp) 9895 return (1); 9896 9897 return (0); 9898 } 9899 9900 /*ARGSUSED*/ 9901 int 9902 getmed_ioctl(mddb_med_parm_t *medpp, int mode) 9903 { 9904 mddb_set_t *s; 9905 int err = 0; 9906 set_t setno = medpp->med_setno; 9907 md_error_t *ep = &medpp->med_mde; 9908 9909 mdclrerror(ep); 9910 9911 if (setno >= md_nsets) 9912 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 9913 9914 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 9915 return (0); 9916 9917 if 
((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 9918 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 9919 9920 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 9921 return (mddbstatus2error(ep, err, NODEV32, setno)); 9922 9923 medpp->med = s->s_med; /* structure assignment */ 9924 9925 mddb_setexit(s); 9926 9927 return (0); 9928 } 9929 9930 int 9931 setmed_ioctl(mddb_med_parm_t *medpp, int mode) 9932 { 9933 9934 mddb_set_t *s; 9935 int err = 0; 9936 set_t setno = medpp->med_setno; 9937 md_error_t *ep = &medpp->med_mde; 9938 9939 mdclrerror(ep); 9940 9941 if ((mode & FWRITE) == 0) 9942 return (mdsyserror(ep, EACCES)); 9943 9944 /* 9945 * This should be the only thing that prevents LOCAL sets from having 9946 * mediators, at least in the kernel, userland needs to have some code 9947 * written. 9948 */ 9949 if (setno == MD_LOCAL_SET) 9950 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 9951 9952 if (setno >= md_nsets) 9953 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 9954 9955 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 9956 return (0); 9957 9958 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 9959 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 9960 9961 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 9962 return (mddbstatus2error(ep, err, NODEV32, setno)); 9963 9964 s->s_med = medpp->med; /* structure assignment */ 9965 9966 mddb_setexit(s); 9967 9968 return (0); 9969 } 9970 9971 int 9972 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode) 9973 { 9974 9975 mddb_set_t *s; 9976 int err = 0; 9977 set_t setno = medpp->med_setno; 9978 md_error_t *ep = &medpp->med_mde; 9979 9980 mdclrerror(ep); 9981 9982 if ((mode & FWRITE) == 0) 9983 return (mdsyserror(ep, EACCES)); 9984 9985 if (setno >= md_nsets) 9986 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 9987 9988 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 9989 return (0); 9990 9991 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0) 9992 return 
(mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno)); 9993 9994 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 9995 return (mddbstatus2error(ep, err, NODEV32, setno)); 9996 9997 single_thread_start(s); 9998 (void) upd_med(s, "updmed_ioctl()"); 9999 single_thread_end(s); 10000 10001 mddb_setexit(s); 10002 10003 return (0); 10004 } 10005 10006 int 10007 take_set(mddb_config_t *cp, int mode) 10008 { 10009 int err = 0; 10010 mddb_med_upd_parm_t medup; 10011 set_t setno = cp->c_setno; 10012 md_error_t *ep = &cp->c_mde; 10013 int snarf_ok = 0; 10014 10015 if (md_get_setstatus(setno) & MD_SET_SNARFED) 10016 return (0); 10017 10018 err = mddb_configure(MDDB_GETDEV, cp); 10019 if (! err && mdisok(ep)) { 10020 if (md_snarf_db_set(setno, ep) != 0) 10021 goto out; 10022 snarf_ok = 1; 10023 } 10024 10025 if (! err && mdisok(ep)) { 10026 if (! cp->c_flags) { 10027 medup.med_setno = setno; 10028 mdclrerror(&medup.med_mde); 10029 10030 err = updmed_ioctl(&medup, mode); 10031 if (! mdisok(&medup.med_mde)) 10032 (void) mdstealerror(ep, &medup.med_mde); 10033 } 10034 } 10035 10036 out: 10037 /* 10038 * In the case that the snarf failed, the diskset is 10039 * left with s_db set, but s_lbp not set. The node is not 10040 * an owner of the set and won't be allowed to release the 10041 * diskset in order to cleanup. With s_db set, any call to the 10042 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist) 10043 * will cause the diskset to be loaded. So, cleanup the diskset so 10044 * that an inadvertent start of the diskset doesn't happen later. 
10045 */ 10046 if ((snarf_ok == 0) && md_set[setno].s_db && 10047 (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) { 10048 mutex_enter(&mddb_lock); 10049 mddb_unload_set(setno); 10050 mutex_exit(&mddb_lock); 10051 } 10052 return (err); 10053 } 10054 10055 /*ARGSUSED*/ 10056 int 10057 release_set(mddb_config_t *cp, int mode) 10058 { 10059 int err = 0; 10060 set_t setno = cp->c_setno; 10061 md_error_t *ep = &cp->c_mde; 10062 10063 /* 10064 * Data integrity check 10065 */ 10066 if (setno >= md_nsets) 10067 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10068 10069 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 10070 md_haltsnarf_enter(setno); 10071 /* 10072 * Attempt to mark set as HOLD. If it is marked as HOLD, this means 10073 * that the mirror code is currently searching all mirrors for a 10074 * errored component that needs a hotspare. While this search is in 10075 * progress, we cannot release the set and thgerefore we return EBUSY. 10076 * Once we have set HOLD, the mirror function (check_4_hotspares) will 10077 * block before the search until the set is released. 10078 */ 10079 if (md_holdset_testandenter(setno) != 0) { 10080 md_haltsnarf_exit(setno); 10081 rw_exit(&md_unit_array_rw.lock); 10082 return (EBUSY); 10083 } 10084 10085 if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0) 10086 err = mddb_configure(MDDB_RELEASESET, cp); 10087 10088 md_holdset_exit(setno); 10089 md_haltsnarf_exit(setno); 10090 rw_exit(&md_unit_array_rw.lock); 10091 10092 if (! 
err && mdisok(ep)) { 10093 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno, 10094 NODEV64); 10095 } 10096 10097 return (err); 10098 } 10099 10100 int 10101 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode) 10102 { 10103 mddb_set_t *s; 10104 int err = 0; 10105 mddb_dtag_lst_t *dtlp; 10106 set_t setno = dtgpp->dtgp_setno; 10107 md_error_t *ep = &dtgpp->dtgp_mde; 10108 10109 mdclrerror(ep); 10110 10111 if ((mode & FREAD) == 0) 10112 return (mdsyserror(ep, EACCES)); 10113 10114 if (setno >= md_nsets) 10115 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10116 10117 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10118 return (0); 10119 10120 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) 10121 return (mddbstatus2error(ep, err, NODEV32, setno)); 10122 10123 /* 10124 * Data tags not supported on MN sets so return invalid operation. 10125 * This ioctl could be called before the mddb has been read in so 10126 * the set status may not yet be set to MNSET, so code following 10127 * this check must handle a MN diskset properly. 
10128 */ 10129 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10130 mddb_setexit(s); 10131 return (mderror(ep, MDE_INVAL_MNOP)); 10132 } 10133 10134 /* s_dtlp is NULL for MN diskset */ 10135 dtlp = s->s_dtlp; 10136 while (dtlp != NULL) { 10137 if (dtgpp->dtgp_dt.dt_id == 0 || 10138 dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) { 10139 bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt, 10140 sizeof (mddb_dtag_t)); 10141 break; 10142 } 10143 dtlp = dtlp->dtl_nx; 10144 } 10145 10146 /* Walked the whole list and id not found, return error */ 10147 if (dtlp == (mddb_dtag_lst_t *)NULL) { 10148 mddb_setexit(s); 10149 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10150 } 10151 10152 mddb_setexit(s); 10153 10154 return (0); 10155 } 10156 10157 int 10158 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode) 10159 { 10160 mddb_set_t *s; 10161 int err = 0; 10162 mddb_config_t *cp; 10163 mddb_ri_t *trip = NULL; 10164 mddb_dtag_t *dtagp = NULL; 10165 set_t setno = dtupp->dtup_setno; 10166 md_error_t *ep = &dtupp->dtup_mde; 10167 10168 mdclrerror(ep); 10169 10170 if ((mode & FWRITE) == 0) 10171 return (mdsyserror(ep, EACCES)); 10172 10173 if (setno >= md_nsets) 10174 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10175 10176 if (dtupp->dtup_id < 0) 10177 return (mdsyserror(ep, EINVAL)); 10178 else if (dtupp->dtup_id == 0) 10179 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10180 10181 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10182 return (0); 10183 10184 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0) 10185 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno)); 10186 10187 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) 10188 return (mddbstatus2error(ep, err, NODEV32, setno)); 10189 10190 /* 10191 * Data tags not supported on MN sets so return invalid operation. 
10192 * This ioctl could be called before the mddb has been read in so 10193 * the set status may not yet be set to MNSET, so code following 10194 * this check must handle a MN diskset properly. 10195 */ 10196 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10197 mddb_setexit(s); 10198 return (mderror(ep, MDE_INVAL_MNOP)); 10199 } 10200 10201 /* Validate and find the id requested - nothing found if MN diskset */ 10202 if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) { 10203 mddb_setexit(s); 10204 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno)); 10205 } 10206 10207 /* Usetag is only valid when more than one tag exists */ 10208 if (dtl_cntl(s) < 2) { 10209 mddb_setexit(s); 10210 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno)); 10211 } 10212 10213 /* Put the selected tag in place */ 10214 dt_setup(s, dtagp); 10215 10216 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); 10217 10218 /* Save the hint information */ 10219 trip = save_rip(s); 10220 10221 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ 10222 cp->c_setno = setno; 10223 cp->c_sideno = s->s_sideno; 10224 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); 10225 cp->c_setname[MD_MAX_SETNAME] = '\0'; 10226 cp->c_med = s->s_med; /* struct assignment */ 10227 10228 mddb_setexit(s); 10229 10230 s = NULL; 10231 10232 /* shorthand */ 10233 setno = cp->c_setno; 10234 10235 /* Let unload know not to free the tag */ 10236 md_set_setstatus(setno, MD_SET_KEEPTAG); 10237 10238 /* Release the set */ 10239 if (err = release_set(cp, mode)) 10240 goto out; 10241 10242 if (! 
mdisok(&cp->c_mde)) { 10243 (void) mdstealerror(ep, &cp->c_mde); 10244 err = 1; 10245 goto out; 10246 } 10247 10248 /* Re-init set using the saved mddb_config_t structure */ 10249 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 10250 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 10251 err = mddbstatus2error(ep, err, NODEV32, setno); 10252 goto out; 10253 } 10254 } 10255 10256 ASSERT(s->s_rip == (mddb_ri_t *)NULL); 10257 10258 /* use the saved rip structure */ 10259 s->s_rip = trip; 10260 trip = (mddb_ri_t *)NULL; 10261 10262 /* Let the take code know a tag is being used */ 10263 md_set_setstatus(setno, MD_SET_USETAG); 10264 10265 mddb_setexit(s); 10266 10267 s = NULL; 10268 10269 /* Take the set */ 10270 if (err = take_set(cp, mode)) 10271 goto out; 10272 10273 if (! mdisok(&cp->c_mde)) 10274 (void) mdstealerror(ep, &cp->c_mde); 10275 10276 out: 10277 md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG)); 10278 10279 kmem_free(cp, sizeof (mddb_config_t)); 10280 10281 if (trip) 10282 free_rip(&trip); 10283 10284 if (s) 10285 mddb_setexit(s); 10286 10287 return (err); 10288 } 10289 10290 int 10291 accept_ioctl(mddb_accept_parm_t *accpp, int mode) 10292 { 10293 mddb_set_t *s; 10294 int err = 0; 10295 mddb_config_t *cp; 10296 mddb_ri_t *trip = NULL; 10297 set_t setno = accpp->accp_setno; 10298 md_error_t *ep = &accpp->accp_mde; 10299 10300 mdclrerror(ep); 10301 10302 if ((mode & FWRITE) == 0) 10303 return (mdsyserror(ep, EACCES)); 10304 10305 if (setno >= md_nsets) 10306 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR)); 10307 10308 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10309 return (0); 10310 10311 if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0) 10312 return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno)); 10313 10314 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10315 return (mddbstatus2error(ep, err, NODEV32, setno)); 10316 10317 /* 10318 * Data tags not supported on MN sets so return invalid operation. 
10319 * mddb is guaranteed to be incore at this point, so this 10320 * check will catch all MN disksets. 10321 */ 10322 if (md_get_setstatus(setno) & MD_SET_MNSET) { 10323 mddb_setexit(s); 10324 return (mderror(ep, MDE_INVAL_MNOP)); 10325 } 10326 10327 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP); 10328 10329 trip = save_rip(s); 10330 10331 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */ 10332 cp->c_setno = setno; 10333 cp->c_sideno = s->s_sideno; 10334 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME); 10335 cp->c_setname[MD_MAX_SETNAME] = '\0'; 10336 cp->c_med = s->s_med; /* struct assignment */ 10337 10338 /* Tag the data */ 10339 if (err = set_dtag(s, ep)) { 10340 err = mdsyserror(ep, err); 10341 goto out; 10342 } 10343 10344 /* If we had a BADTAG, it will be re-written, so clear the bit. */ 10345 if (md_get_setstatus(setno) & MD_SET_BADTAG) 10346 md_clr_setstatus(setno, MD_SET_BADTAG); 10347 10348 if (err = dt_write(s)) { 10349 err = mdsyserror(ep, err); 10350 goto out; 10351 } 10352 10353 mddb_setexit(s); 10354 10355 s = NULL; 10356 10357 /* shorthand */ 10358 setno = cp->c_setno; 10359 10360 /* Clear the keeptag */ 10361 md_clr_setstatus(setno, MD_SET_KEEPTAG); 10362 10363 /* Release the set */ 10364 if (err = release_set(cp, mode)) 10365 goto out; 10366 10367 if (! 
mdisok(&cp->c_mde)) { 10368 (void) mdstealerror(ep, &cp->c_mde); 10369 goto out; 10370 } 10371 10372 /* Re-init set using the saved mddb_config_t structure */ 10373 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) { 10374 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) { 10375 err = mddbstatus2error(ep, err, NODEV32, setno); 10376 goto out; 10377 } 10378 } 10379 10380 ASSERT(s->s_rip == (mddb_ri_t *)NULL); 10381 10382 /* Free the allocated rip structure */ 10383 if (s->s_rip != (mddb_ri_t *)NULL) 10384 free_rip(&s->s_rip); 10385 10386 /* use the saved rip structure */ 10387 s->s_rip = trip; 10388 trip = (mddb_ri_t *)NULL; 10389 10390 /* Let the set init code know an accept is in progress */ 10391 md_set_setstatus(setno, MD_SET_ACCEPT); 10392 10393 mddb_setexit(s); 10394 10395 s = NULL; 10396 10397 /* Take the set */ 10398 if (err = take_set(cp, mode)) 10399 goto out; 10400 10401 if (! mdisok(&cp->c_mde)) 10402 (void) mdstealerror(ep, &cp->c_mde); 10403 10404 out: 10405 md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT)); 10406 10407 kmem_free(cp, sizeof (mddb_config_t)); 10408 10409 if (trip) 10410 free_rip(&trip); 10411 10412 if (s) 10413 mddb_setexit(s); 10414 10415 return (err); 10416 } 10417 10418 /* 10419 * mddb_getinvlb_devid - cycles through the locator block and determines 10420 * if the device id's for any of the replica disks are invalid. 10421 * If so, it returns the diskname in the ctdptr. 
10422 * RETURN 10423 * -1 Error 10424 * cnt number of invalid device id's 10425 */ 10426 int 10427 mddb_getinvlb_devid( 10428 set_t setno, 10429 int count, 10430 int size, 10431 char **ctdptr 10432 ) 10433 { 10434 mddb_set_t *s; 10435 int err = 0; 10436 mddb_lb_t *lbp; 10437 int li; 10438 mddb_did_blk_t *did_blk; 10439 mddb_did_info_t *did_info; 10440 int len; 10441 int cnt = 0; 10442 char *cptr; 10443 md_name_suffix *sn; 10444 int i, dont_add_it; 10445 char *tmpctd, *diskname; 10446 char *tmpname; 10447 10448 cptr = *ctdptr; 10449 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 10450 return (-1); 10451 } 10452 10453 single_thread_start(s); 10454 lbp = s->s_lbp; 10455 10456 if (lbp->lb_setno != setno) { 10457 single_thread_end(s); 10458 mddb_setexit(s); 10459 return (-1); 10460 } 10461 10462 /* check for lb being devid style */ 10463 if (lbp->lb_flags & MDDB_DEVID_STYLE) { 10464 did_blk = s->s_did_icp->did_ic_blkp; 10465 for (li = 0; li < lbp->lb_loccnt; li++) { 10466 did_info = &(did_blk->blk_info[li]); 10467 /* Only if devid exists and isn't valid */ 10468 if ((did_info->info_flags & MDDB_DID_EXISTS) && 10469 !(did_info->info_flags & MDDB_DID_VALID)) { 10470 /* 10471 * if we count more invalid did's than 10472 * was passed in there's an error somewhere 10473 */ 10474 if (cnt++ > count) { 10475 single_thread_end(s); 10476 mddb_setexit(s); 10477 return (-1); 10478 } 10479 10480 /* 10481 * Future note: Need to do something here 10482 * for the MN diskset case when device ids 10483 * are supported in disksets. 10484 * Can't add until merging devids_in_diskset 10485 * code into code base. 
10486 */ 10487 10488 sn = &s->s_lnp->ln_suffixes[0][li]; 10489 /* 10490 * check to make sure length of device name is 10491 * not greater than computed first time through 10492 */ 10493 len = sn->suf_len; 10494 if (len > size) { 10495 single_thread_end(s); 10496 mddb_setexit(s); 10497 return (-1); 10498 } 10499 tmpctd = *ctdptr; 10500 /* strip off slice part */ 10501 diskname = md_strdup(sn->suf_data); 10502 tmpname = strrchr(diskname, 's'); 10503 *tmpname = '\0'; 10504 dont_add_it = 0; 10505 /* look to see if diskname is already in list */ 10506 for (i = 0; i < (cnt-1); i++) { 10507 if (strcmp(diskname, tmpctd) == 0) { 10508 /* already there, don't add */ 10509 dont_add_it = 1; 10510 break; 10511 } 10512 /* point to next diskname in list */ 10513 tmpctd += size; 10514 } 10515 if (dont_add_it == 0) { 10516 /* add diskname to list */ 10517 (void) strcpy(cptr, diskname); 10518 cptr += size; 10519 } 10520 kmem_free(diskname, strlen(sn->suf_data) + 1); 10521 } 10522 } 10523 } 10524 /* null terminate the list */ 10525 *cptr = '\0'; 10526 /* 10527 * need to save the new pointer so that calling routine can continue 10528 * to add information onto the end. 10529 */ 10530 *ctdptr = cptr; 10531 single_thread_end(s); 10532 mddb_setexit(s); 10533 return (cnt); 10534 } 10535 10536 /* 10537 * mddb_validate_lb - count the number of lb's with invalid device id's. Keep 10538 * track of length of longest devicename. 
10539 * RETURN 10540 * -1 error 10541 * cnt number of lb's with invalid devid's 10542 */ 10543 int 10544 mddb_validate_lb( 10545 set_t setno, 10546 int *rmaxsz 10547 ) 10548 { 10549 mddb_set_t *s; 10550 int err = 0; 10551 mddb_lb_t *lbp; 10552 int li; 10553 mddb_did_blk_t *did_blk; 10554 mddb_did_info_t *did_info; 10555 int len; 10556 int cnt = 0; 10557 10558 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10559 return (-1); 10560 10561 single_thread_start(s); 10562 lbp = s->s_lbp; 10563 10564 if (lbp->lb_setno != setno) { 10565 single_thread_end(s); 10566 mddb_setexit(s); 10567 return (-1); 10568 } 10569 10570 /* lb must be in devid style */ 10571 if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) 10572 goto mvl_out; 10573 10574 did_blk = s->s_did_icp->did_ic_blkp; 10575 for (li = 0; li < lbp->lb_loccnt; li++) { 10576 char *minor_name; 10577 mddb_locator_t *lp; 10578 dev_t ddi_dev; 10579 ddi_devid_t devid; 10580 ddi_devid_t rtn_devid = NULL; 10581 int get_rval; 10582 10583 did_info = &(did_blk->blk_info[li]); 10584 if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) || 10585 (did_info->info_flags & MDDB_DID_VALID)) 10586 continue; 10587 10588 /* Here we know, did exists but isn't valid */ 10589 10590 lp = &lbp->lb_locators[li]; 10591 ddi_dev = expldev(lp->l_dev); 10592 get_rval = mddb_devid_get(s, li, &devid, &minor_name); 10593 ASSERT(get_rval == 1); 10594 if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) && 10595 (ddi_devid_compare(rtn_devid, devid) == 0)) { 10596 did_info->info_flags = MDDB_DID_VALID | 10597 MDDB_DID_EXISTS | 10598 MDDB_DID_UPDATED; 10599 } else { 10600 cnt++; 10601 /* 10602 * Future note: Need to do something here 10603 * for the MN diskset case when device ids 10604 * are supported in disksets. 10605 * Can't add until merging devids_in_diskset 10606 * code into code base. 
10607 */ 10608 len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len; 10609 if (*rmaxsz < len) 10610 *rmaxsz = len; 10611 } 10612 if (rtn_devid != NULL) 10613 ddi_devid_free(rtn_devid); 10614 } 10615 10616 mvl_out: 10617 10618 if (push_lb(s) != 0) 10619 cnt = -1; 10620 single_thread_end(s); 10621 mddb_setexit(s); 10622 return (cnt); 10623 } 10624 10625 int 10626 check_active_locators() 10627 { 10628 mddb_set_t *s; 10629 mddb_lb_t *lbp; 10630 int li; 10631 int active = 0; 10632 10633 mutex_enter(&mddb_lock); 10634 /* there is nothing here..so we can unload */ 10635 if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) { 10636 mutex_exit(&mddb_lock); 10637 return (0); 10638 } 10639 s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db; 10640 lbp = s->s_lbp; 10641 if (lbp == NULL) { 10642 mutex_exit(&mddb_lock); 10643 return (0); 10644 } 10645 10646 for (li = 0; li < lbp->lb_loccnt; li++) { 10647 mddb_locator_t *lp = &lbp->lb_locators[li]; 10648 if (lp->l_flags & MDDB_F_ACTIVE) { 10649 active = 1; 10650 break; 10651 } 10652 } 10653 mutex_exit(&mddb_lock); 10654 return (active); 10655 } 10656 10657 /* 10658 * regetoptrecord: 10659 * -------------- 10660 * Update the in-core optimized resync record contents by re-reading the 10661 * record from the on-disk metadb. 10662 * The contents of the resync record will be overwritten by calling this 10663 * routine. This means that callers that require the previous contents to 10664 * be preserved must save the data before calling this routine. 
10665 */ 10666 static void 10667 regetoptrecord( 10668 mddb_set_t *s, 10669 mddb_de_ic_t *dep 10670 ) 10671 { 10672 mddb_lb_t *lbp; 10673 mddb_locator_t *lp; 10674 mddb_rb32_t *rbp, *crbp; 10675 int li; 10676 int i; 10677 int err = 0; 10678 size_t recsize; 10679 10680 #if defined(_ILP32) && !defined(lint) 10681 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t)); 10682 #endif 10683 10684 recsize = dep->de_recsize; 10685 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP); 10686 10687 single_thread_start(s); 10688 rbp = dep->de_rb; 10689 10690 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA; 10691 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 10692 10693 lbp = s->s_lbp; 10694 10695 for (i = 0; i < 2; i++) { 10696 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE)) 10697 continue; 10698 li = dep->de_optinfo[i].o_li; 10699 lp = &lbp->lb_locators[li]; 10700 10701 if (! (lp->l_flags & MDDB_F_ACTIVE) || 10702 (lp->l_flags & MDDB_F_EMASTER)) 10703 continue; 10704 10705 err = readblklst(s, (caddr_t)rbp, dep->de_blks, 10706 dep->de_blkcount, li); 10707 10708 if (err) 10709 continue; 10710 10711 if (rbp->rb_magic != MDDB_MAGIC_RB) 10712 continue; 10713 10714 if (revchk(MDDB_REV_RB, rbp->rb_revision)) 10715 continue; 10716 10717 /* Check the crc for this record */ 10718 if (rec_crcchk(s, dep, rbp)) { 10719 continue; 10720 } 10721 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE; 10722 10723 if (rbp == crbp) { 10724 if (rbp->rb_checksum != crbp->rb_checksum) 10725 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA; 10726 break; 10727 } 10728 rbp = crbp; 10729 } 10730 10731 single_thread_end(s); 10732 10733 if (rbp == crbp) { 10734 rbp->rb_private = 0; 10735 kmem_free((caddr_t)crbp, recsize); 10736 return; 10737 } 10738 uniqtime32(&rbp->rb_timestamp); 10739 /* Generate the crc for this record */ 10740 rec_crcgen(s, dep, rbp); 10741 kmem_free((caddr_t)crbp, recsize); 10742 } 10743 10744 /* 10745 * mddb_reread_rr: 10746 * Re-read the resync record from the on-disk copy. 
This is required for 10747 * multi-node support so that a new mirror-owner can determine if a resync 10748 * operation is required to guarantee data integrity. 10749 * 10750 * Arguments: 10751 * setno Associated set 10752 * id Resync record ID 10753 * 10754 * Return Value: 10755 * 0 successful reread 10756 * -1 invalid set (not multi-node or non-existant) 10757 * >0 metadb state invalid 10758 */ 10759 int 10760 mddb_reread_rr( 10761 set_t setno, 10762 mddb_recid_t id 10763 ) 10764 { 10765 mddb_set_t *s; 10766 int err = 0; 10767 mddb_db_t *dbp; 10768 mddb_de_ic_t *dep; 10769 10770 if (setno >= md_nsets) 10771 return (-1); 10772 10773 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 10774 return (-1); 10775 10776 if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) { 10777 mddb_setexit(s); 10778 return (-1); 10779 } 10780 10781 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 10782 dep = dbp->db_firstentry; 10783 while (dep && (dep->de_recid != DBID(id))) 10784 dep = dep->de_next; 10785 if (dep != NULL) 10786 break; 10787 } 10788 10789 if (dep != NULL) { 10790 regetoptrecord(s, dep); 10791 err = 0; 10792 } else { 10793 err = -1; 10794 } 10795 mddb_setexit(s); 10796 return (err); 10797 } 10798 10799 /* 10800 * Set owner associated with MN optimized resync record. 10801 * 10802 * Optimized records have an owner node associated with them in 10803 * a MN diskset. The owner is only set on a node that is actively 10804 * writing to that record. The other nodes will show that record 10805 * as having an invalid owner. The owner for an optimized record 10806 * is used during fixoptrecord to determine which node should 10807 * write out the record when the replicas associated with that 10808 * optimized record have been changed. 10809 * 10810 * Called directly from mirror driver and not from an ioctl. 10811 * 10812 * Returns 10813 * NULL if successful. 10814 * MDDB_E_NORECORD if record not found. 
10815 */ 10816 int 10817 mddb_setowner( 10818 mddb_recid_t id, 10819 md_mn_nodeid_t owner 10820 ) 10821 { 10822 mddb_set_t *s; 10823 mddb_db_t *dbp; 10824 mddb_de_ic_t *dep; 10825 int found = 0; 10826 10827 10828 if (DBSET(id) >= md_nsets) 10829 return (MDDB_E_NORECORD); 10830 10831 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) 10832 return (MDDB_E_NORECORD); 10833 10834 id = DBID(id); 10835 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 10836 for (dep = dbp->db_firstentry; 10837 dep != NULL; dep = dep->de_next) { 10838 if (dep->de_recid != id) 10839 continue; 10840 dep->de_owner_nodeid = owner; 10841 found = 1; 10842 break; 10843 } 10844 if (found) 10845 break; 10846 } 10847 10848 mddb_setexit(s); 10849 10850 if (!found) { 10851 return (MDDB_E_NORECORD); 10852 } 10853 10854 return (NULL); 10855 } 10856 10857 /* 10858 * mddb_parse re-reads portions of the mddb from disk given a list 10859 * of good replicas to read from and flags describing 10860 * which portion of the mddb to read in. 10861 * 10862 * Used in a MN diskset when the master has made a change to some part 10863 * of the mddb and wants to relay this information to the slaves. 
10864 */ 10865 int 10866 mddb_parse(mddb_parse_parm_t *mpp) 10867 { 10868 mddb_set_t *s; 10869 int err = 0; 10870 mddb_locator_t *lp, *old_lp; 10871 mddb_lb_t *lbp, *old_lbp; 10872 int rval = 0; 10873 int i, li; 10874 int found_good_one = 0; 10875 mddb_ln_t *lnp; 10876 mddb_block_t ln_blkcnt; 10877 md_error_t *ep = &mpp->c_mde; 10878 10879 if (mpp->c_setno >= md_nsets) 10880 return (EINVAL); 10881 10882 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 10883 return (0); 10884 10885 if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 10886 return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno)); 10887 } 10888 10889 if (!(MD_MNSET_SETNO(mpp->c_setno))) { 10890 mddb_setexit_no_parse(s); 10891 return (EINVAL); 10892 } 10893 10894 /* 10895 * Master node initiated this request, so there's no work for 10896 * the master node to do. 10897 */ 10898 if (md_set[mpp->c_setno].s_am_i_master) { 10899 mddb_setexit_no_parse(s); 10900 return (rval); 10901 } 10902 10903 single_thread_start(s); 10904 10905 if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) { 10906 lbp = 0; 10907 for (i = 0; i < MDDB_NLB; i++) { 10908 /* Walk through master's active list */ 10909 if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE)) 10910 continue; 10911 if (s->s_mbiarray[i] == NULL) 10912 continue; 10913 10914 /* Assumes master blocks are already setup */ 10915 if (lbp == (mddb_lb_t *)NULL) { 10916 lbp = (mddb_lb_t *)kmem_zalloc( 10917 dbtob(MDDB_MNLBCNT), KM_SLEEP); 10918 } 10919 err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i); 10920 10921 if (err) 10922 continue; 10923 10924 if (lbp->lb_magic != MDDB_MAGIC_LB) 10925 continue; 10926 if (lbp->lb_blkcnt != MDDB_MNLBCNT) 10927 continue; 10928 if (revchk(MDDB_REV_MNLB, lbp->lb_revision)) 10929 continue; 10930 if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT), 10931 NULL)) 10932 continue; 10933 if (lbp->lb_setno != s->s_setno) 10934 continue; 10935 /* 10936 * a commit count of zero means this locator has 10937 * been deleted 10938 */ 10939 if 
(lbp->lb_commitcnt == 0) { 10940 continue; 10941 } 10942 /* Found a good locator - keep it */ 10943 found_good_one = 1; 10944 break; 10945 } 10946 10947 /* 10948 * If found a good copy of the mddb, then read it into 10949 * this node's locator block. Fix up the set's s_mbiarray 10950 * pointer (master block incore array pointer) to be 10951 * in sync with the newly read in locator block. If a 10952 * new mddb was added, read in the master blocks associated 10953 * with the new mddb. If an mddb was deleted, free the 10954 * master blocks associated with deleted mddb. 10955 */ 10956 if (found_good_one) { 10957 /* Compare old and new view of mddb locator blocks */ 10958 old_lbp = s->s_lbp; 10959 for (li = 0; li < lbp->lb_loccnt; li++) { 10960 int mn_set; 10961 10962 lp = &lbp->lb_locators[li]; 10963 old_lp = &old_lbp->lb_locators[li]; 10964 10965 /* If old and new views match, continue */ 10966 if ((lp->l_flags & MDDB_F_ACTIVE) == 10967 (old_lp->l_flags & MDDB_F_ACTIVE)) 10968 continue; 10969 10970 if (lp->l_flags & MDDB_F_ACTIVE) { 10971 /* 10972 * If new mddb has been added - delete 10973 * old mbiarray and get new one. 10974 * 10975 * When devids are supported, will 10976 * need to get dev from devid. 10977 */ 10978 if (s->s_mbiarray[li]) { 10979 free_mbipp(&s->s_mbiarray[li]); 10980 } 10981 /* 10982 * If getmasters fails, getmasters 10983 * will set appropriate error flags. 10984 */ 10985 s->s_mbiarray[li] = getmasters(s, 10986 md_expldev(lp->l_dev), lp->l_blkno, 10987 (uint_t *)&(lp->l_flags), &mn_set); 10988 } else if (lp->l_flags & MDDB_F_DELETED) { 10989 /* 10990 * If old one has been deleted - 10991 * delete old mbiarray. 
10992 */ 10993 if (s->s_mbiarray[li]) { 10994 free_mbipp(&s->s_mbiarray[li]); 10995 } 10996 } 10997 } 10998 10999 /* Free this node's old view of mddb locator blocks */ 11000 kmem_free((caddr_t)s->s_lbp, 11001 dbtob(s->s_lbp->lb_blkcnt)); 11002 s->s_lbp = lbp; 11003 } else { 11004 if (lbp) 11005 kmem_free(lbp, dbtob(MDDB_MNLBCNT)); 11006 } 11007 } 11008 11009 if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) { 11010 lnp = s->s_lnp; 11011 lbp = s->s_lbp; 11012 ln_blkcnt = lbp->lb_lnblkcnt; 11013 s->s_lnp = NULL; /* readlocnames does this anyway */ 11014 for (li = 0; li < lbp->lb_loccnt; li++) { 11015 lp = &lbp->lb_locators[li]; 11016 11017 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 11018 (lp->l_flags & MDDB_F_EMASTER)) 11019 continue; 11020 11021 /* Successfully read the locator names */ 11022 if (readlocnames(s, li) == 0) 11023 break; 11024 } 11025 11026 if (li == lbp->lb_loccnt) { 11027 /* Did not successfully read locnames; restore lnp */ 11028 s->s_lnp = lnp; 11029 } else { 11030 /* readlocnames successful, free old struct */ 11031 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt)); 11032 } 11033 } 11034 11035 if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) { 11036 mddb_de_ic_t *dep, *tdep, *first_dep, *dep2; 11037 mddb_db_t *dbp; 11038 mddb_db32_t *db32p; 11039 mddb_de32_t *de32p, *de32p2; 11040 int writeout; 11041 11042 lbp = s->s_lbp; 11043 /* 11044 * Walk through directory block and directory entry incore 11045 * linked list looking for optimized resync records. 11046 * For each opt record found, re-read in directory block. 11047 * The directoy block consists of a number of directory 11048 * entries. The directory entry for this opt record will 11049 * describe which 2 mddbs actually contain the resync record 11050 * since it could have been relocated by the master node 11051 * due to mddb failure or mddb deletion. 
If this node 11052 * is the record owner for this opt record, then write out 11053 * the record to the 2 mddbs listed in the directory entry 11054 * if the mddbs locations are different than previously known. 11055 */ 11056 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11057 for (dep = dbp->db_firstentry; dep; 11058 dep = dep->de_next) { 11059 /* Found an opt record */ 11060 if (dep->de_flags & MDDB_F_OPT) 11061 break; 11062 } 11063 /* If no opt records found, go to next dbp */ 11064 if (dep == NULL) 11065 continue; 11066 11067 /* 11068 * Reread directory block from disk since 11069 * master could have rewritten in during fixoptrecord. 11070 */ 11071 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, 11072 KM_SLEEP); 11073 create_db32rec(db32p, dbp); 11074 for (li = 0; li < lbp->lb_loccnt; li++) { 11075 lp = &lbp->lb_locators[li]; 11076 11077 if ((! (lp->l_flags & MDDB_F_ACTIVE)) || 11078 (lp->l_flags & MDDB_F_EMASTER)) 11079 continue; 11080 11081 err = readblks(s, (caddr_t)db32p, 11082 db32p->db32_blknum, 1, li); 11083 if (err) 11084 continue; 11085 11086 /* Reverify db; go to next mddb if bad */ 11087 if ((db32p->db32_magic != MDDB_MAGIC_DB) || 11088 (revchk(MDDB_REV_DB, 11089 db32p->db32_revision)) || 11090 (crcchk(db32p, &db32p->db32_checksum, 11091 MDDB_BSIZE, NULL))) { 11092 continue; 11093 } else { 11094 break; 11095 } 11096 } 11097 /* 11098 * If all mddbs are unavailable then panic since 11099 * this slave cannot be allowed to continue out-of-sync 11100 * with the master node. Since the optimized resync 11101 * records are written by all nodes, all nodes must 11102 * stay in sync with the master. 11103 * 11104 * This also handles the case when all storage 11105 * connectivity to a slave node has failed. The 11106 * slave node will send an MDDB_OPTRECERR message to 11107 * the master node when the slave node has been unable 11108 * to write an optimized resync record to both 11109 * designated mddbs. 
After the master has fixed the 11110 * optimized records to be on available mddbs, the 11111 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS) 11112 * is sent to all slave nodes. If a slave node is 11113 * unable to access any mddb in order to read in the 11114 * relocated optimized resync record, then the slave 11115 * node must panic. 11116 */ 11117 if (li == lbp->lb_loccnt) { 11118 kmem_free((caddr_t)db32p, MDDB_BSIZE); 11119 cmn_err(CE_PANIC, "md: mddb: Node unable to " 11120 "access any SVM state database " 11121 "replicas for diskset %s\n", 11122 s->s_setname); 11123 } 11124 /* 11125 * Setup temp copy of linked list of de's. 11126 * Already have an incore copy, but need to walk 11127 * the directory entry list contained in the 11128 * new directory block that was just read in above. 11129 * After finding the directory entry of an opt record 11130 * by walking the incore list, find the corresponding 11131 * entry in the temporary list and then update 11132 * the incore directory entry record with 11133 * the (possibly changed) mddb location stored 11134 * for the optimized resync records. 11135 */ 11136 de32p = (mddb_de32_t *) 11137 ((void *) ((caddr_t) 11138 (&db32p->db32_firstentry) 11139 + sizeof (db32p->db32_firstentry))); 11140 tdep = (mddb_de_ic_t *) 11141 kmem_zalloc(sizeof (mddb_de_ic_t) - 11142 sizeof (mddb_block_t) + 11143 sizeof (mddb_block_t) * 11144 de32p->de32_blkcount, KM_SLEEP); 11145 de32tode(de32p, tdep); 11146 first_dep = tdep; 11147 while (de32p && de32p->de32_next) { 11148 de32p2 = nextentry(de32p); 11149 dep2 = (mddb_de_ic_t *)kmem_zalloc( 11150 sizeof (mddb_de_ic_t) - 11151 sizeof (mddb_block_t) + 11152 sizeof (mddb_block_t) * 11153 de32p2->de32_blkcount, KM_SLEEP); 11154 de32tode(de32p2, dep2); 11155 tdep->de_next = dep2; 11156 tdep = dep2; 11157 de32p = de32p2; 11158 } 11159 11160 /* Now, walk the incore directory entry list */ 11161 for (dep = dbp->db_firstentry; dep; 11162 dep = dep->de_next) { 11163 if (! 
(dep->de_flags & MDDB_F_OPT)) 11164 continue; 11165 /* 11166 * Found an opt record in the incore copy. 11167 * Find the corresponding entry in the temp 11168 * list. If anything has changed in the 11169 * opt record info between the incore copy 11170 * and the temp copy, update the incore copy 11171 * and set a flag to writeout the opt record 11172 * to the new mddb locations. 11173 */ 11174 for (tdep = first_dep; tdep; 11175 tdep = tdep->de_next) { 11176 if (dep->de_recid == tdep->de_recid) { 11177 writeout = 0; 11178 /* Check first mddb location */ 11179 if ((dep->de_optinfo[0].o_li != 11180 tdep->de_optinfo[0].o_li) || 11181 (dep->de_optinfo[0].o_flags != 11182 tdep->de_optinfo[0].o_flags)) { 11183 dep->de_optinfo[0] = 11184 tdep->de_optinfo[0]; 11185 writeout = 1; 11186 } 11187 /* Check second mddb location */ 11188 if ((dep->de_optinfo[1].o_li != 11189 tdep->de_optinfo[1].o_li) || 11190 (dep->de_optinfo[1].o_flags != 11191 tdep->de_optinfo[1].o_flags)) { 11192 dep->de_optinfo[1] = 11193 tdep->de_optinfo[1]; 11194 writeout = 1; 11195 } 11196 /* Record owner should rewrite it */ 11197 if ((writeout) && 11198 (dep->de_owner_nodeid == 11199 md_set[mpp->c_setno]. 11200 s_nodeid)) { 11201 (void) writeoptrecord(s, 11202 dep); 11203 } 11204 break; 11205 } 11206 } 11207 } 11208 /* 11209 * Update the incore checksum information for this 11210 * directory block to match the newly read in checksum. 11211 * This should have only changed if the incore and 11212 * temp directory entries differed, but it takes 11213 * more code to do the check than to just update 11214 * the information everytime. 
11215 */ 11216 dbp->db_checksum = db32p->db32_checksum; 11217 11218 /* Now free everything */ 11219 tdep = first_dep; 11220 while (tdep) { 11221 dep2 = tdep->de_next; 11222 kmem_free((caddr_t)tdep, 11223 sizeofde(tdep)); 11224 tdep = dep2; 11225 } 11226 kmem_free((caddr_t)db32p, MDDB_BSIZE); 11227 } 11228 rval = 0; 11229 } 11230 out: 11231 single_thread_end(s); 11232 mddb_setexit_no_parse(s); 11233 return (rval); 11234 } 11235 11236 int 11237 mddb_block(mddb_block_parm_t *mbp) 11238 { 11239 mddb_set_t *s; 11240 int err = 0; 11241 md_error_t *ep = &mbp->c_mde; 11242 11243 if (mbp->c_setno >= md_nsets) 11244 return (EINVAL); 11245 11246 /* 11247 * If the new_master flag is set for this setno we are in the middle 11248 * of a reconfig cycle, and blocking or unblocking is not needed. 11249 * Hence we can return success immediately 11250 */ 11251 if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) { 11252 return (0); 11253 } 11254 11255 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11256 return (0); 11257 11258 if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11259 return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno)); 11260 } 11261 11262 if (!(MD_MNSET_SETNO(mbp->c_setno))) { 11263 mddb_setexit_no_parse(s); 11264 return (EINVAL); 11265 } 11266 11267 single_thread_start(s); 11268 11269 if (mbp->c_blk_flags & MDDB_BLOCK_PARSE) 11270 md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); 11271 11272 if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE) 11273 md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK); 11274 11275 single_thread_end(s); 11276 mddb_setexit_no_parse(s); 11277 return (err); 11278 } 11279 11280 /* 11281 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords 11282 * to relocate any optimized resync records to available mddbs. 11283 * This routine is only called on the master node. 11284 * 11285 * Used in a MN diskset when a slave node has failed to write an optimized 11286 * resync record. 
The failed mddb information is sent to the master node 11287 * so the master can relocate the optimized records, if possible. If the 11288 * failed mddb information has a mddb marked as failed that was previously 11289 * marked active on the master, the master sets its incore mddb state to 11290 * EWRITE and sets the PARSE_LOCBLK flag. The master node then attempts 11291 * to relocate any optimized records on the newly failed mddbs by calling 11292 * fixoptrecords. (fixoptrecords will set the PARSE_OPTRECS flag if any 11293 * optimized records are relocated.) 11294 * 11295 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE 11296 * flags and will send a PARSE message to the slave nodes. The PARSE_LOCBLK 11297 * flag causes the slave node to re-read in the locator block from disk. 11298 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory 11299 * blocks and write out any optimized resync records that have been 11300 * relocated to a different mddb. 
11301 */ 11302 int 11303 mddb_optrecfix(mddb_optrec_parm_t *mop) 11304 { 11305 mddb_set_t *s; 11306 int err = 0; 11307 mddb_lb_t *lbp; 11308 mddb_mnlb_t *mnlbp; 11309 mddb_locator_t *lp; 11310 int li; 11311 mddb_mnsidelocator_t *mnslp; 11312 mddb_drvnm_t *dn; 11313 int i, j; 11314 md_replica_recerr_t *recerr; 11315 md_error_t *ep = &mop->c_mde; 11316 int something_changed = 0; 11317 int alc, lc; 11318 int setno; 11319 11320 setno = mop->c_setno; 11321 if (mop->c_setno >= md_nsets) 11322 return (EINVAL); 11323 11324 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11325 return (0); 11326 11327 if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) { 11328 return (mddbstatus2error(ep, err, NODEV32, mop->c_setno)); 11329 } 11330 11331 if (!(MD_MNSET_SETNO(mop->c_setno))) { 11332 mddb_setexit(s); 11333 return (EINVAL); 11334 } 11335 11336 single_thread_start(s); 11337 lbp = s->s_lbp; 11338 mnlbp = (mddb_mnlb_t *)lbp; 11339 11340 /* 11341 * If slave node has seen an mddb failure, but the master node 11342 * hasn't encountered this failure, mark the mddb as failed on 11343 * the master node and set the something_changed flag to 1. 
11344 */ 11345 for (i = 0; i < 2; i++) { 11346 recerr = &mop->c_recerr[i]; 11347 if (recerr->r_flags & MDDB_F_EWRITE) { 11348 li = recerr->r_li; 11349 lp = &lbp->lb_locators[li]; 11350 for (j = 0; j < MD_MNMAXSIDES; j++) { 11351 mnslp = &mnlbp->lb_mnsidelocators[j][li]; 11352 if (mnslp->mnl_sideno == s->s_sideno) 11353 break; 11354 } 11355 /* Do quick check using li */ 11356 if (j != MD_MNMAXSIDES) 11357 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 11358 11359 if ((j != MD_MNMAXSIDES) && 11360 (strncmp(dn->dn_data, recerr->r_driver_name, 11361 MD_MAXDRVNM) == 0) && 11362 (recerr->r_blkno == lp->l_blkno) && 11363 (recerr->r_mnum == mnslp->mnl_mnum)) { 11364 if ((lp->l_flags & MDDB_F_ACTIVE) || 11365 ((lp->l_flags & MDDB_F_EWRITE) == 0)) { 11366 something_changed = 1; 11367 lp->l_flags |= MDDB_F_EWRITE; 11368 lp->l_flags &= ~MDDB_F_ACTIVE; 11369 } 11370 } else { 11371 /* 11372 * Passed in li from slave does not match 11373 * the replica in the master's structures. 11374 * This could have occurred if a delete 11375 * mddb command was running when the 11376 * optimized resync record had a failure. 11377 * Search all replicas for this entry. 11378 * If no match, just ignore. 11379 * If a match, set replica in error. 
11380 */ 11381 for (li = 0; li < lbp->lb_loccnt; li++) { 11382 lp = &lbp->lb_locators[li]; 11383 if (lp->l_flags & MDDB_F_DELETED) 11384 continue; 11385 11386 for (j = 0; j < MD_MNMAXSIDES; j++) { 11387 mnslp = 11388 &mnlbp->lb_mnsidelocators[j][li]; 11389 if (mnslp->mnl_sideno == s->s_sideno) 11390 break; 11391 } 11392 if (j == MD_MNMAXSIDES) 11393 continue; 11394 11395 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index]; 11396 if ((strncmp(dn->dn_data, recerr->r_driver_name, 11397 MD_MAXDRVNM) == 0) && 11398 (recerr->r_blkno == lp->l_blkno) && 11399 (recerr->r_mnum == mnslp->mnl_mnum)) { 11400 if ((lp->l_flags & MDDB_F_ACTIVE) || 11401 ((lp->l_flags & MDDB_F_EWRITE) 11402 == 0)) { 11403 something_changed = 1; 11404 lp->l_flags |= MDDB_F_EWRITE; 11405 lp->l_flags &= ~MDDB_F_ACTIVE; 11406 } 11407 break; 11408 } 11409 } 11410 } 11411 } 11412 } 11413 11414 /* 11415 * If this message changed nothing, then we're done since this 11416 * failure has already been handled. 11417 * If some mddb state has been changed, send a parse message to 11418 * the slave nodes so that the slaves will re-read the locator 11419 * block from disk. 11420 */ 11421 if (something_changed == 0) { 11422 single_thread_end(s); 11423 mddb_setexit(s); 11424 return (0); 11425 } else { 11426 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 11427 } 11428 11429 /* 11430 * Scan replicas setting MD_SET_TOOFEW if 11431 * 50% or more of the mddbs have seen errors. 11432 * Note: Don't call selectreplicas or writeretry 11433 * since these routines may end up setting the ACTIVE flag 11434 * on a failed mddb if the master is able to access the mddb 11435 * but the slave node couldn't. Need to have the ACTIVE flag 11436 * turned off in order to relocate the optimized records to 11437 * mddbs that are (hopefully) available on all nodes. 
11438 */ 11439 alc = 0; 11440 lc = 0; 11441 for (li = 0; li < lbp->lb_loccnt; li++) { 11442 lp = &lbp->lb_locators[li]; 11443 if (lp->l_flags & MDDB_F_DELETED) 11444 continue; 11445 lc++; 11446 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11447 continue; 11448 alc++; 11449 } 11450 11451 /* 11452 * If more than 50% mddbs have failed, then don't relocate opt recs. 11453 * The node sending the mddb failure information will detect TOOFEW 11454 * and will panic when it attempts to re-write the optimized record. 11455 */ 11456 if (alc < ((lc + 1) / 2)) { 11457 md_set_setstatus(setno, MD_SET_TOOFEW); 11458 (void) push_lb(s); 11459 single_thread_end(s); 11460 mddb_setexit(s); 11461 return (0); 11462 } 11463 11464 /* Attempt to relocate optimized records that are on failed mddbs */ 11465 (void) fixoptrecords(s); 11466 11467 /* Push changed locator block out to disk */ 11468 (void) push_lb(s); 11469 11470 /* Recheck for TOOFEW after writing out locator blocks */ 11471 alc = 0; 11472 lc = 0; 11473 for (li = 0; li < lbp->lb_loccnt; li++) { 11474 lp = &lbp->lb_locators[li]; 11475 if (lp->l_flags & MDDB_F_DELETED) 11476 continue; 11477 lc++; 11478 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11479 continue; 11480 alc++; 11481 } 11482 11483 /* If more than 50% mddbs have failed, then don't relocate opt recs */ 11484 if (alc < ((lc + 1) / 2)) { 11485 md_set_setstatus(setno, MD_SET_TOOFEW); 11486 single_thread_end(s); 11487 mddb_setexit(s); 11488 return (0); 11489 } 11490 11491 single_thread_end(s); 11492 mddb_setexit(s); 11493 return (0); 11494 } 11495 11496 /* 11497 * Check if incore mddb on master node matches ondisk mddb. 11498 * If not, master writes out incore view to all mddbs. 11499 * Have previously verified that master is an owner of the 11500 * diskset (master has snarfed diskset) and that diskset is 11501 * not stale. 11502 * 11503 * Meant to be called during reconfig cycle during change of master. 
11504 * Previous master in diskset may have changed the mddb and 11505 * panic'd before relaying information to slave nodes. New 11506 * master node just writes out its incore view of the mddb and 11507 * the replay of the change log will resync all the nodes. 11508 * 11509 * Only supported for MN disksets. 11510 * 11511 * Return values: 11512 * 0 - success 11513 * non-zero - failure 11514 */ 11515 int 11516 mddb_check_write_ioctl(mddb_config_t *info) 11517 { 11518 int err = 0; 11519 set_t setno = info->c_setno; 11520 mddb_set_t *s; 11521 int li; 11522 mddb_locator_t *lp; 11523 mddb_lb_t *lbp; 11524 mddb_mnlb_t *mnlbp_od; 11525 mddb_ln_t *lnp; 11526 mddb_mnln_t *mnlnp_od; 11527 mddb_db_t *dbp; 11528 mddb_de_ic_t *dep; 11529 int write_out_mddb; 11530 md_error_t *ep = &info->c_mde; 11531 int mddb_err = 0; 11532 int prev_li = 0; 11533 int rval = 0; 11534 int alc, lc; 11535 int mddbs_present = 0; 11536 11537 /* Verify that setno is in valid range */ 11538 if (setno >= md_nsets) 11539 return (EINVAL); 11540 11541 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0) 11542 return (0); 11543 11544 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 11545 return (mddbstatus2error(ep, err, NODEV32, setno)); 11546 } 11547 11548 /* Calling diskset must be a MN diskset */ 11549 if (!(MD_MNSET_SETNO(setno))) { 11550 mddb_setexit(s); 11551 return (EINVAL); 11552 } 11553 11554 /* Re-verify that set is not stale */ 11555 if (md_get_setstatus(setno) & MD_SET_STALE) { 11556 mddb_setexit(s); 11557 return (mdmddberror(ep, MDE_DB_STALE, 11558 NODEV32, setno)); 11559 } 11560 11561 lbp = s->s_lbp; 11562 lnp = s->s_lnp; 11563 11564 /* 11565 * Previous master could have died during the write of data to 11566 * the mddbs so that the ondisk mddbs may not be consistent. 11567 * So, need to check the contents of the first and last active mddb 11568 * to see if the mddbs need to be rewritten. 
11569 */ 11570 for (li = 0; li < lbp->lb_loccnt; li++) { 11571 int checkcopy_err; 11572 11573 lp = &lbp->lb_locators[li]; 11574 /* Find replica that is active */ 11575 if (lp->l_flags & MDDB_F_DELETED) 11576 continue; 11577 mddbs_present = 1; 11578 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11579 continue; 11580 if (s->s_mbiarray[li] == NULL) 11581 continue; 11582 /* Check locator block */ 11583 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), 11584 KM_SLEEP); 11585 /* read in on-disk locator block */ 11586 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); 11587 11588 /* If err, try next mddb */ 11589 if (err) { 11590 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11591 continue; 11592 } 11593 11594 /* 11595 * We resnarf all changelog entries for this set. 11596 * They may have been altered by the previous master 11597 */ 11598 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 11599 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) { 11600 if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) { 11601 continue; 11602 } 11603 /* This has been alloc'ed while joining the set */ 11604 if (dep->de_rb) { 11605 kmem_free(dep->de_rb, dep->de_recsize); 11606 dep->de_rb = (mddb_rb32_t *)NULL; 11607 } 11608 if (dep->de_rb_userdata) { 11609 kmem_free(dep->de_rb_userdata, dep->de_reqsize); 11610 dep->de_rb_userdata = (caddr_t)NULL; 11611 } 11612 11613 err = getrecord(s, dep, li); 11614 if (err) { 11615 /* 11616 * When we see on error while reading the 11617 * changelog entries, we move on to the next 11618 * mddb 11619 */ 11620 err = 1; 11621 break; /* out of inner for-loop */ 11622 } 11623 allocuserdata(dep); 11624 } 11625 if (err) 11626 break; /* out of outer for-loop */ 11627 } 11628 11629 /* If err, try next mddb */ 11630 if (err) { 11631 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11632 continue; 11633 } 11634 11635 /* Is incore locator block same as ondisk? 
*/ 11636 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) 11637 == 1) { 11638 write_out_mddb = 1; 11639 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11640 break; 11641 } 11642 11643 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11644 11645 /* If lb ok, check locator names */ 11646 mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT), 11647 KM_SLEEP); 11648 /* read in on-disk locator names */ 11649 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, 11650 lbp->lb_lnblkcnt, li); 11651 11652 /* If err, try next mddb */ 11653 if (err) { 11654 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); 11655 continue; 11656 } 11657 11658 /* Are incore locator names same as ondisk? */ 11659 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) 11660 == 1) { 11661 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11662 write_out_mddb = 1; 11663 break; 11664 } 11665 11666 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11667 11668 /* 11669 * Check records in mddb. 11670 * If a read error is encountered, set the error flag and 11671 * continue to the next mddb. Otherwise, if incore data is 11672 * different from ondisk, then set the flag to write out 11673 * the mddb and break out. 11674 */ 11675 checkcopy_err = checkcopy(s, li); 11676 if (checkcopy_err == MDDB_F_EREAD) { 11677 lp->l_flags |= MDDB_F_EREAD; 11678 mddb_err = 1; 11679 continue; 11680 } else if (checkcopy_err == 1) { 11681 write_out_mddb = 1; 11682 break; 11683 } 11684 /* 11685 * Have found first active mddb and the data is the same as 11686 * incore - break out of loop 11687 */ 11688 write_out_mddb = 0; 11689 break; 11690 } 11691 11692 /* 11693 * Skip checking for last active mddb if: 11694 * - already found a mismatch in the first active mddb 11695 * (write_out_mddb is 1) OR 11696 * - didn't find a readable mddb when looking for first 11697 * active mddb (there are mddbs present but all failed 11698 * when read was attempted). 
11699 * 11700 * In either case, go to write_out_mddb label in order to attempt 11701 * to write out the data. If < 50% mddbs are available, panic. 11702 */ 11703 if ((write_out_mddb == 1) || 11704 ((li == lbp->lb_loccnt) && mddbs_present)) { 11705 write_out_mddb = 1; 11706 goto write_out_mddb; 11707 } 11708 11709 /* 11710 * Save which index was checked for the first active mddb. If only 1 11711 * active mddb, don't want to recheck the same mddb when looking for 11712 * last active mddb. 11713 */ 11714 prev_li = li; 11715 11716 /* 11717 * Now, checking for last active mddb. If found same index as before 11718 * (only 1 active mddb), then skip. 11719 */ 11720 for (li = (lbp->lb_loccnt - 1); li >= 0; li--) { 11721 int checkcopy_err; 11722 11723 lp = &lbp->lb_locators[li]; 11724 /* Find replica that is active */ 11725 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11726 continue; 11727 if (lp->l_flags & MDDB_F_DELETED) 11728 continue; 11729 if (s->s_mbiarray[li] == NULL) 11730 continue; 11731 /* If already checked mddb, bail out */ 11732 if (li == prev_li) 11733 break; 11734 /* Check locator block */ 11735 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT), 11736 KM_SLEEP); 11737 /* read in on-disk locator block */ 11738 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li); 11739 11740 /* If err, try next mddb */ 11741 if (err) { 11742 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT)); 11743 continue; 11744 } 11745 11746 11747 /* Is incore locator block same as ondisk? 
*/ 11748 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT)) 11749 == 1) { 11750 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11751 write_out_mddb = 1; 11752 break; 11753 } 11754 11755 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT)); 11756 11757 /* If lb ok, check locator names */ 11758 mnlnp_od = (mddb_mnln_t *) 11759 kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP); 11760 11761 /* read in on-disk locator names */ 11762 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk, 11763 lbp->lb_lnblkcnt, li); 11764 11765 /* If err, try next mddb */ 11766 if (err) { 11767 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT)); 11768 continue; 11769 } 11770 11771 /* Are incore locator names same as ondisk? */ 11772 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT)) 11773 == 1) { 11774 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11775 write_out_mddb = 1; 11776 break; 11777 } 11778 11779 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT)); 11780 11781 /* 11782 * Check records in mddb. 11783 * If a read error is encountered, set the error flag and 11784 * continue to the next mddb. Otherwise, if incore data is 11785 * different from ondisk, then set the flag to write out 11786 * the mddb and break out. 11787 */ 11788 checkcopy_err = checkcopy(s, li); 11789 if (checkcopy_err == MDDB_F_EREAD) { 11790 lp->l_flags |= MDDB_F_EREAD; 11791 mddb_err = 1; 11792 continue; 11793 } else if (checkcopy_err == 1) { 11794 write_out_mddb = 1; 11795 break; 11796 } 11797 /* 11798 * Have found last active mddb and the data is the same as 11799 * incore - break out of loop 11800 */ 11801 write_out_mddb = 0; 11802 break; 11803 } 11804 11805 /* 11806 * If ondisk and incore versions of the mddb don't match, then 11807 * write out this node's incore version to disk. 11808 * Or, if unable to read a copy of the mddb, attempt to write 11809 * out a new one. 
11810 */ 11811 write_out_mddb: 11812 if (write_out_mddb) { 11813 /* Recompute free blocks based on incore information */ 11814 computefreeblks(s); /* set up free block bits */ 11815 11816 /* 11817 * Write directory entries and record blocks. 11818 * Use flag MDDB_WRITECOPY_SYNC so that writecopy 11819 * routine won't write out change log records. 11820 */ 11821 for (li = 0; li < lbp->lb_loccnt; li++) { 11822 lp = &lbp->lb_locators[li]; 11823 /* Don't write to inactive or deleted mddbs */ 11824 if (! (lp->l_flags & MDDB_F_ACTIVE)) 11825 continue; 11826 if (lp->l_flags & MDDB_F_DELETED) 11827 continue; 11828 if (s->s_mbiarray[li] == NULL) 11829 continue; 11830 /* If encounter a write error, save it for later */ 11831 if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) { 11832 lp->l_flags |= MDDB_F_EWRITE; 11833 mddb_err = 1; 11834 } 11835 } 11836 11837 /* 11838 * Write out locator blocks to all replicas. 11839 * push_lb will set MDDB_F_EWRITE on replicas that fail. 11840 */ 11841 if (push_lb(s)) 11842 mddb_err = 1; 11843 11844 /* Write out locator names to all replicas */ 11845 lnp = s->s_lnp; 11846 uniqtime32(&lnp->ln_timestamp); 11847 lnp->ln_revision = MDDB_REV_MNLN; 11848 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL); 11849 11850 /* writeall sets MDDB_F_EWRITE if writes fails to replica */ 11851 if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk, 11852 lbp->lb_lnblkcnt, 0)) 11853 mddb_err = 1; 11854 11855 /* 11856 * The writes to the replicas above would have set 11857 * the MDDB_F_EWRITE flags if any write error was 11858 * encountered. 11859 * If < 50% of the mddbs are available, panic. 
11860 */ 11861 lc = alc = 0; 11862 for (li = 0; li < lbp->lb_loccnt; li++) { 11863 lp = &lbp->lb_locators[li]; 11864 if (lp->l_flags & MDDB_F_DELETED) 11865 continue; 11866 lc++; 11867 /* 11868 * If mddb: 11869 * - is not active (previously had an error) 11870 * - had an error reading the master blocks or 11871 * - had an error in writing to the mddb 11872 * then don't count this mddb in the active count. 11873 */ 11874 if (! (lp->l_flags & MDDB_F_ACTIVE) || 11875 (lp->l_flags & MDDB_F_EMASTER) || 11876 (lp->l_flags & MDDB_F_EWRITE)) 11877 continue; 11878 alc++; 11879 } 11880 if (alc < ((lc + 1) / 2)) { 11881 cmn_err(CE_PANIC, 11882 "md: Panic due to lack of DiskSuite state\n" 11883 " database replicas. Fewer than 50%% of " 11884 "the total were available,\n so panic to " 11885 "ensure data integrity."); 11886 } 11887 } 11888 11889 /* 11890 * If encountered an error during checking or writing of 11891 * mddbs, call selectreplicas so that replica error can 11892 * be properly handled. This will involve another attempt 11893 * to write the mddb out to any mddb marked MDDB_F_EWRITE. 11894 * If mddb still fails, it will have the MDDB_F_ACTIVE bit 11895 * turned off. Set the MDDB_SCANALLSYNC flag so that 11896 * selectreplicas doesn't overwrite the change log entries. 11897 * 11898 * Set the PARSE_LOCBLK flag in the mddb_set structure to show 11899 * that the locator block has been changed. 11900 */ 11901 if (mddb_err) { 11902 (void) selectreplicas(s, MDDB_SCANALLSYNC); 11903 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK; 11904 } 11905 11906 write_out_end: 11907 mddb_setexit(s); 11908 return (rval); 11909 } 11910 11911 /* 11912 * Set/reset/get set flags in set structure. 11913 * Used during reconfig cycle 11914 * Only supported for MN disksets. 
11915 * 11916 * Return values: 11917 * 0 - success 11918 * non-zero - failure 11919 */ 11920 int 11921 mddb_setflags_ioctl(mddb_setflags_config_t *info) 11922 { 11923 set_t setno = info->sf_setno; 11924 11925 /* Verify that setno is in valid range */ 11926 if (setno >= md_nsets) 11927 return (EINVAL); 11928 11929 /* 11930 * When setting the flags, the set may not 11931 * be snarfed yet. So, don't check for SNARFED or MNset 11932 * and don't call mddb_setenter. 11933 * In order to discourage bad ioctl calls, 11934 * verify that magic field in structure is set correctly. 11935 */ 11936 if (info->sf_magic != MDDB_SETFLAGS_MAGIC) 11937 return (EINVAL); 11938 11939 switch (info->sf_flags) { 11940 case MDDB_NM_SET: 11941 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) 11942 md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC); 11943 if (info->sf_setflags & MD_SET_MN_START_RC) 11944 md_set_setstatus(setno, MD_SET_MN_START_RC); 11945 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) 11946 md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC); 11947 break; 11948 11949 case MDDB_NM_RESET: 11950 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) 11951 md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC); 11952 if (info->sf_setflags & MD_SET_MN_START_RC) 11953 md_clr_setstatus(setno, MD_SET_MN_START_RC); 11954 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) 11955 md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC); 11956 break; 11957 11958 case MDDB_NM_GET: 11959 info->sf_setflags = md_get_setstatus(setno) & 11960 (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC| 11961 MD_SET_MN_MIR_STATE_RC); 11962 break; 11963 } 11964 11965 return (0); 11966 } 11967 11968 int 11969 md_update_minor( 11970 set_t setno, 11971 side_t side, 11972 mdkey_t key 11973 ) 11974 { 11975 struct nm_next_hdr *nh; 11976 struct nm_name *n; 11977 char *shn; 11978 int retval = 1; 11979 11980 /* 11981 * Load the devid name space if it exists 11982 */ 11983 (void) md_load_namespace(setno, NULL, NM_DEVID); 11984 if (! 
md_load_namespace(setno, NULL, 0L)) { 11985 /* 11986 * Unload the devid namespace 11987 */ 11988 (void) md_unload_namespace(setno, NM_DEVID); 11989 return (0); 11990 } 11991 11992 rw_enter(&nm_lock.lock, RW_READER); 11993 11994 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) { 11995 retval = 0; 11996 goto out; 11997 } 11998 11999 /* 12000 * Look up the key 12001 */ 12002 if ((n = lookup_entry(nh, setno, side, key, NODEV64, 0L)) != NULL) { 12003 /* 12004 * Find the entry, update its n_minor if metadevice 12005 */ 12006 if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L)) 12007 == NULL) { 12008 retval = 0; 12009 goto out; 12010 } 12011 12012 if (strcmp(shn, "md") == 0) { 12013 n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor)); 12014 } 12015 } 12016 12017 out: 12018 rw_exit(&nm_lock.lock); 12019 return (retval); 12020 } 12021 12022 static void 12023 md_imp_nm( 12024 mddb_set_t *s 12025 ) 12026 { 12027 mddb_db_t *dbp; 12028 mddb_de_ic_t *dep; 12029 struct nm_rec_hdr *hdr; 12030 struct nm_header *hhdr; 12031 set_t setno = s->s_setno; 12032 12033 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 12034 for (dep = dbp->db_firstentry; dep != NULL; 12035 dep = dep->de_next) { 12036 switch (dep->de_type1) { 12037 12038 case MDDB_NM_HDR: 12039 case MDDB_DID_NM_HDR: 12040 12041 hhdr = (struct nm_header *) 12042 dep->de_rb_userdata; 12043 12044 hdr = &hhdr->h_names; 12045 if (hdr->r_next_recid > 0) { 12046 hdr->r_next_recid = MAKERECID(setno, 12047 DBID(hdr->r_next_recid)); 12048 } 12049 12050 hdr = &hhdr->h_shared; 12051 if (hdr->r_next_recid > 0) { 12052 hdr->r_next_recid = MAKERECID(setno, 12053 DBID(hdr->r_next_recid)); 12054 } 12055 break; 12056 12057 case MDDB_NM: 12058 case MDDB_DID_NM: 12059 case MDDB_SHR_NM: 12060 case MDDB_DID_SHR_NM: 12061 12062 hdr = (struct nm_rec_hdr *) 12063 dep->de_rb_userdata; 12064 12065 if (hdr->r_next_recid > 0) { 12066 hdr->r_next_recid = MAKERECID 12067 (setno, DBID(hdr->r_next_recid)); 12068 } 12069 break; 
12070 12071 default: 12072 break; 12073 } 12074 } 12075 } 12076 } 12077 12078 static int 12079 update_db_rec( 12080 mddb_set_t *s 12081 ) 12082 { 12083 mddb_db_t *dbp; 12084 mddb_de_ic_t *dep; 12085 mddb_recid_t ids[2]; 12086 12087 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) { 12088 for (dep = dbp->db_firstentry; dep != NULL; 12089 dep = dep->de_next) { 12090 if (! (dep->de_flags & MDDB_F_OPT)) { 12091 ids[0] = MAKERECID(s->s_setno, dep->de_recid); 12092 ids[1] = 0; 12093 if (mddb_commitrecs(ids)) { 12094 return (MDDB_E_NORECORD); 12095 } 12096 } 12097 } 12098 } 12099 return (0); 12100 } 12101 12102 static int 12103 update_mb( 12104 mddb_set_t *s 12105 ) 12106 { 12107 mddb_ri_t *rip; 12108 int err = 0; 12109 12110 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 12111 12112 if (md_get_setstatus(s->s_setno) & 12113 MD_SET_REPLICATED_IMPORT) { 12114 /* 12115 * It is a replicated set 12116 */ 12117 if (rip->ri_devid == (ddi_devid_t)NULL) { 12118 return (-1); 12119 } 12120 err = update_mb_devid(s, rip, rip->ri_devid); 12121 } else { 12122 /* 12123 * It is a non-replicated set 12124 * and there is no need to update 12125 * devid 12126 */ 12127 err = update_mb_devid(s, rip, NULL); 12128 } 12129 12130 if (err) 12131 return (err); 12132 } 12133 12134 return (0); 12135 } 12136 12137 static int 12138 update_setname( 12139 set_t setno 12140 ) 12141 { 12142 struct nm_next_hdr *nh; 12143 struct nm_shared_name *shn, *new_shn; 12144 char *prefix = "/dev/md/"; 12145 char *shrname; 12146 int len; 12147 mdkey_t o_key; 12148 uint32_t o_count, o_data; 12149 mddb_recid_t recid, ids[3]; 12150 int err = 0; 12151 mddb_set_t *dbp; 12152 12153 /* Import setname */ 12154 dbp = (mddb_set_t *)md_set[setno].s_db; 12155 len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1; 12156 shrname = kmem_zalloc(len, KM_SLEEP); 12157 (void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/"); 12158 12159 rw_enter(&nm_lock.lock, RW_WRITER); 12160 if ((nh = 
get_first_record(setno, 0, NM_SHARED)) == NULL) { 12161 err = MD_KEYBAD; 12162 goto out; 12163 } 12164 12165 if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh, 12166 0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) { 12167 /* 12168 * No metadevice is okay 12169 */ 12170 err = 0; 12171 goto out; 12172 } 12173 12174 /* 12175 * We have it, go ahead and update the namespace. 12176 */ 12177 o_key = shn->sn_key; 12178 o_count = shn->sn_count; 12179 o_data = shn->sn_data; 12180 12181 if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED | 12182 NM_NOCOMMIT)) { 12183 err = MD_KEYBAD; 12184 goto out; 12185 } 12186 if ((new_shn = (struct nm_shared_name *)alloc_entry( 12187 nh, md_set[setno].s_nmid, len, NM_SHARED | 12188 NM_NOCOMMIT, &recid)) == NULL) { 12189 err = MD_KEYBAD; 12190 goto out; 12191 } 12192 12193 new_shn->sn_key = o_key; 12194 new_shn->sn_count = o_count; 12195 new_shn->sn_data = o_data; 12196 new_shn->sn_namlen = (ushort_t)len; 12197 (void) strcpy(new_shn->sn_name, shrname); 12198 12199 ids[0] = recid; 12200 ids[1] = md_set[setno].s_nmid; 12201 ids[2] = 0; 12202 err = mddb_commitrecs(ids); 12203 12204 out: 12205 if (shrname) 12206 kmem_free(shrname, len); 12207 rw_exit(&nm_lock.lock); 12208 return (err); 12209 } 12210 12211 static int 12212 md_imp_db( 12213 set_t setno 12214 ) 12215 { 12216 mddb_set_t *s; 12217 int err = 0; 12218 mddb_dt_t *dtp; 12219 12220 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 12221 return (err); 12222 } 12223 12224 /* Update dt */ 12225 if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) { 12226 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL); 12227 } 12228 12229 if ((err = dt_write(s)) != 0) { 12230 mddb_setexit(s); 12231 return (err); 12232 } 12233 12234 /* Update lb */ 12235 if ((err = writelocall(s)) != 0) { 12236 mddb_setexit(s); 12237 return (err); 12238 } 12239 12240 12241 /* Update mb */ 12242 if ((err = update_mb(s)) != 0) { 12243 mddb_setexit(s); 12244 return (err); 12245 } 12246 
12247 mddb_setexit(s); 12248 12249 /* Update db records */ 12250 if ((err = update_db_rec(s)) != 0) 12251 return (err); 12252 12253 /* Update setname embedded in the namespace */ 12254 err = update_setname(setno); 12255 12256 return (err); 12257 } 12258 12259 static void 12260 md_dr_add( 12261 md_set_record *sr, 12262 md_drive_record *dr 12263 ) 12264 { 12265 md_drive_record *drv; 12266 12267 if (sr->sr_driverec == 0) { 12268 sr->sr_driverec = dr->dr_selfid; 12269 return; 12270 } 12271 12272 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); 12273 drv->dr_nextrec != 0; 12274 drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec)) 12275 ; 12276 drv->dr_nextrec = dr->dr_selfid; 12277 } 12278 12279 static void 12280 md_setup_recids( 12281 md_set_record *sr, 12282 mddb_recid_t **ids, 12283 size_t size 12284 ) 12285 { 12286 md_drive_record *drv; 12287 int cnt; 12288 mddb_recid_t *recids; 12289 12290 recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t) 12291 * size, KM_SLEEP); 12292 recids[0] = sr->sr_selfid; 12293 cnt = 1; 12294 12295 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec); 12296 /* CSTYLED */ 12297 drv != NULL;) { 12298 recids[cnt++] = drv->dr_selfid; 12299 if (drv->dr_nextrec != 0) 12300 drv = (md_drive_record *)mddb_getrecaddr 12301 (drv->dr_nextrec); 12302 else 12303 drv = NULL; 12304 } 12305 recids[cnt] = 0; 12306 *ids = &recids[0]; 12307 } 12308 12309 static int 12310 md_imp_create_set( 12311 set_t setno 12312 ) 12313 { 12314 mddb_set_t *s; 12315 int drc = 0, err = 0; 12316 size_t sr_size = sizeof (md_set_record); 12317 md_set_record *sr; 12318 mddb_recid_t sr_recid, dr_recid, *ids = NULL; 12319 mddb_ri_t *rip, *trip; 12320 md_drive_record *dr; 12321 size_t dr_size = sizeof (md_drive_record); 12322 mdkey_t dr_key; 12323 12324 12325 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) 12326 return (err); 12327 12328 /* Create and fill in set record */ 12329 if ((sr_recid = mddb_createrec(sr_size, MDDB_USER, 
MDDB_UR_SR, 12330 MD_CRO_32BIT, MD_LOCAL_SET)) < 0) { 12331 mddb_setexit(s); 12332 return (MDDB_E_INVALID); 12333 } 12334 12335 sr = (md_set_record *)mddb_getrecaddr(sr_recid); 12336 sr->sr_selfid = sr_recid; 12337 sr->sr_setno = s->s_setno; 12338 (void) strcpy(sr->sr_setname, s->s_setname); 12339 uniqtime32(&sr->sr_ctime); 12340 sr->sr_genid = 0; 12341 sr->sr_revision = MD_SET_RECORD_REVISION; 12342 sr->sr_flags |= MD_SR_OK; 12343 sr->sr_mhiargs = defmhiargs; 12344 (void) strcpy(sr->sr_nodes[0], utsname.nodename); 12345 12346 /* Create and fillin drive records */ 12347 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 12348 /* 12349 * Add entry and create the record 12350 */ 12351 if ((dr_key = md_setdevname(MD_LOCAL_SET, 1, MD_KEYWILD, 12352 rip->ri_driver, md_getminor(rip->ri_dev), 12353 rip->ri_devname, setno)) == 0) 12354 continue; 12355 12356 if (dr_key < 0) { 12357 mddb_setexit(s); 12358 return (MD_KEYBAD); 12359 } 12360 12361 if ((dr_recid = mddb_createrec(dr_size, MDDB_USER, 12362 MDDB_UR_DR, MD_CRO_32BIT, MD_LOCAL_SET)) < 0) { 12363 mddb_setexit(s); 12364 return (MDDB_E_INVALID); 12365 } 12366 12367 dr = (md_drive_record *)mddb_getrecaddr(dr_recid); 12368 dr->dr_selfid = dr_recid; 12369 12370 /* 12371 * We need to check to see if the drive on 12372 * the rip has a replica. If it doesn't have 12373 * a replica, then we need to set the dr_dbcnt 12374 * and dr_dbsize to 0 to reflect that. 
12375 */ 12376 if (rip->ri_mbip == NULL) { 12377 dr->dr_dbcnt = 0; 12378 dr->dr_dbsize = 0; 12379 } else { 12380 dr->dr_dbcnt = 1; 12381 12382 for (trip = s->s_rip; trip != NULL; 12383 trip = trip->ri_next) { 12384 12385 if (trip == rip) 12386 continue; 12387 12388 if ((trip->ri_dev == rip->ri_dev) && 12389 (strcmp(trip->ri_devname, rip->ri_devname) 12390 == 0)) 12391 dr->dr_dbcnt++; 12392 } 12393 12394 dr->dr_dbsize = rip->ri_mbip->mbi_mddb_mb.mb_blkcnt + 1; 12395 } 12396 dr->dr_key = dr_key; 12397 uniqtime32(&dr->dr_ctime); 12398 dr->dr_genid = 1; 12399 dr->dr_revision = MD_DRIVE_RECORD_REVISION; 12400 dr->dr_flags = MD_SR_OK; 12401 drc++; 12402 12403 /* Add on the linked list */ 12404 (void) md_dr_add(sr, dr); 12405 } 12406 12407 /* 12408 * Alloc and setup recids which include set record 12409 */ 12410 (void) md_setup_recids(sr, &ids, drc + 2); 12411 12412 /* 12413 * Commit all the records 12414 */ 12415 err = mddb_commitrecs(ids); 12416 12417 if (ids) 12418 kmem_free(ids, sizeof (mddb_recid_t) * (drc + 2)); 12419 mddb_setexit(s); 12420 return (err); 12421 } 12422 12423 /* 12424 * namespace is loaded before this is called. 12425 * The purpose of this function is to update the device ids in the entire 12426 * namespace using the data in the ri structure. Compare the devid found in 12427 * the namespace with ri_old_devid and if they are the same, update with the 12428 * devid in ri_devid. 
12429 */ 12430 static int 12431 md_imp_update_namespace_did(mddb_set_t *s) 12432 { 12433 set_t setno = s->s_lbp->lb_setno; 12434 struct nm_next_hdr *nh; 12435 mdkey_t key = MD_KEYWILD; 12436 side_t side = MD_SIDEWILD; 12437 mddb_ri_t *rip = NULL; 12438 mddb_recid_t recids[3]; 12439 struct did_min_name *n; 12440 struct nm_next_hdr *did_shr_nh; 12441 struct did_shr_name *shr_n; 12442 mdkey_t ent_did_key; 12443 uint32_t ent_did_count; 12444 uint32_t ent_did_data; 12445 size_t ent_size, size; 12446 ddi_devid_t devid = NULL; 12447 struct did_shr_name *shn; 12448 size_t offset; 12449 struct nm_next_hdr *this_did_shr_nh; 12450 12451 /* 12452 * It is okay if we dont have any configuration 12453 */ 12454 offset = (sizeof (struct devid_shr_rec) - sizeof (struct did_shr_name)); 12455 if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED)) 12456 == NULL) { 12457 return (0); 12458 } 12459 while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) { 12460 /* check out every entry in the namespace */ 12461 if ((n = (struct did_min_name *)lookup_entry(nh, setno, 12462 side, key, NODEV64, NM_DEVID)) == NULL) { 12463 break; 12464 } else { 12465 did_shr_nh = get_first_record(setno, 0, NM_DEVID | 12466 NM_SHARED); 12467 if (did_shr_nh == NULL) { 12468 return (ENOENT); 12469 } 12470 this_did_shr_nh = did_shr_nh->nmn_nextp; 12471 shr_n = (struct did_shr_name *)lookup_shared_entry( 12472 did_shr_nh, n->min_devid_key, (char *)0, 12473 &recids[0], NM_DEVID); 12474 if (shr_n == NULL) { 12475 return (ENOENT); 12476 } 12477 rw_enter(&nm_lock.lock, RW_WRITER); 12478 devid = (ddi_devid_t)shr_n->did_devid; 12479 /* find this devid in the incore replica */ 12480 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) { 12481 if (ddi_devid_compare(devid, rip->ri_old_devid) 12482 == 0) { 12483 /* 12484 * found the corresponding entry 12485 * update with new devid 12486 */ 12487 /* first remove old devid info */ 12488 ent_did_key = shr_n ->did_key; 12489 ent_did_count = 
shr_n->did_count; 12490 ent_did_data = shr_n->did_data; 12491 ent_size = DID_SHR_NAMSIZ(shr_n); 12492 size = ((struct nm_rec_hdr *) 12493 this_did_shr_nh->nmn_record)-> 12494 r_used_size - offset - ent_size; 12495 if (size == 0) { 12496 (void) bzero(shr_n, ent_size); 12497 } else { 12498 (void) ovbcopy((caddr_t)shr_n + 12499 ent_size, shr_n, size); 12500 (void) bzero((caddr_t)shr_n + 12501 size, ent_size); 12502 } 12503 ((struct nm_rec_hdr *)this_did_shr_nh-> 12504 nmn_record)->r_used_size -= 12505 ent_size; 12506 /* add in new devid info */ 12507 if ((shn = (struct did_shr_name *) 12508 alloc_entry(did_shr_nh, 12509 md_set[setno].s_did_nmid, 12510 ddi_devid_sizeof(rip->ri_devid), 12511 NM_DEVID | NM_SHARED | NM_NOCOMMIT, 12512 &recids[0])) == NULL) { 12513 rw_exit(&nm_lock.lock); 12514 return (ENOMEM); 12515 } 12516 shn->did_key = ent_did_key; 12517 shn->did_count = ent_did_count; 12518 ent_did_data |= NM_DEVID_VALID; 12519 shn->did_data = ent_did_data; 12520 shn->did_size = ddi_devid_sizeof( 12521 rip->ri_devid); 12522 bcopy((void *)rip->ri_devid, (void *) 12523 shn->did_devid, shn->did_size); 12524 recids[1] = md_set[setno].s_nmid; 12525 recids[2] = 0; 12526 mddb_commitrecs_wrapper(recids); 12527 } 12528 } 12529 rw_exit(&nm_lock.lock); 12530 } 12531 } 12532 return (0); 12533 } 12534 12535 /*ARGSUSED*/ 12536 int 12537 md_imp_snarf_set( 12538 set_t *setnum, 12539 int mode 12540 ) 12541 { 12542 set_t setno = *setnum; /* import setno */ 12543 mddb_set_t *s; 12544 int i, err = 0; 12545 md_ops_t *ops; 12546 12547 if (setno >= md_nsets) { 12548 return (EINVAL); 12549 } 12550 12551 md_haltsnarf_enter(setno); 12552 if (md_get_setstatus(setno) & MD_SET_IMPORT) { 12553 goto out; 12554 } 12555 12556 /* Set the bit first otherwise load_old_replicas can fail */ 12557 md_set_setstatus(setno, MD_SET_IMPORT); 12558 12559 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) { 12560 goto out; 12561 } 12562 12563 /* 12564 * Upon completion of load_old_replicas, the old 
setno is 12565 * restored from the disk so we need to reset 12566 */ 12567 s->s_lbp->lb_setno = setno; 12568 12569 /* 12570 * Fixup the NM records before loading namespace 12571 */ 12572 (void) md_imp_nm(s); 12573 mddb_setexit(s); 12574 12575 /* 12576 * Load the devid name space if it exists 12577 * and ask each module to fixup unit records 12578 */ 12579 if (!md_load_namespace(setno, NULL, NM_DEVID)) { 12580 err = ENOENT; 12581 goto cleanup; 12582 } 12583 if (!md_load_namespace(setno, NULL, 0L)) { 12584 (void) md_unload_namespace(setno, NM_DEVID); 12585 err = ENOENT; 12586 goto cleanup; 12587 } 12588 12589 do { 12590 i = 0; 12591 for (ops = md_opslist; ops != NULL; ops = ops->md_next) 12592 if (ops->md_imp_set != NULL) 12593 i += ops->md_imp_set(setno); 12594 } while (i); 12595 12596 /* 12597 * Fixup 12598 * (1) locator block 12599 * (2) locator name block if necessary 12600 * (3) master block 12601 * (4) directory block 12602 * calls appropriate writes to push changes out 12603 */ 12604 if ((err = md_imp_db(setno)) != 0) 12605 goto cleanup; 12606 12607 /* 12608 * Create set in MD_LOCAL_SET 12609 */ 12610 if ((err = md_imp_create_set(setno)) != 0) 12611 goto cleanup; 12612 12613 /* 12614 * update the namespace device ids if necessary (ie. block copy disk) 12615 */ 12616 if ((md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT)) { 12617 if ((err = md_imp_update_namespace_did(s)) != 0) { 12618 goto cleanup; 12619 } 12620 } 12621 12622 cleanup: 12623 /* 12624 * Halt the set 12625 */ 12626 rw_enter(&md_unit_array_rw.lock, RW_WRITER); 12627 (void) md_halt_set(setno, MD_HALT_ALL); 12628 rw_exit(&md_unit_array_rw.lock); 12629 12630 /* 12631 * Unload the namespace for the imported set 12632 */ 12633 mutex_enter(&mddb_lock); 12634 mddb_unload_set(setno); 12635 mutex_exit(&mddb_lock); 12636 12637 out: 12638 md_haltsnarf_exit(setno); 12639 md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT); 12640 return (err); 12641 } 12642 #endif /* MDDB_FAKE */ 12643