1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <assert.h> 30 #include <ctype.h> 31 #include <libdevinfo.h> 32 #include <mdiox.h> 33 #include <meta.h> 34 #include "meta_repartition.h" 35 #include "meta_set_prv.h" 36 #include <stdio.h> 37 #include <stdlib.h> 38 #include <string.h> 39 #include <sys/lvm/md_mddb.h> 40 #include <sys/lvm/md_names.h> 41 #include <sys/lvm/md_crc.h> 42 43 typedef struct did_list { 44 void *rdid; /* real did if replicated set */ 45 void *did; /* did stored in lb */ 46 char *devname; 47 dev_t dev; 48 uint_t did_index; 49 char *minor_name; 50 struct did_list *next; 51 } did_list_t; 52 53 typedef struct replicated_disk { 54 void *old_devid; 55 void *new_devid; 56 struct replicated_disk *next; 57 } replicated_disk_t; 58 59 /* 60 * The current implementation limits the max device id length to 256 bytes. 61 * Should the max device id length be increased, this define would have to 62 * be bumped up accordingly 63 */ 64 #define MAX_DEVID_LEN 256 65 66 /* 67 * We store a global list of all the replicated disks in the system. In 68 * order to prevent us from performing a linear search on this list, we 69 * store the disks in a two dimensional sparse array. The disks are bucketed 70 * based on the length of their device ids. 71 */ 72 static replicated_disk_t *replicated_disk_list[MAX_DEVID_LEN + 1] = {NULL}; 73 74 /* 75 * The list of replicated disks is built just once and this flag is set 76 * once it's done 77 */ 78 static int replicated_disk_list_built = 0; 79 80 /* 81 * Map logical blk to physical 82 * 83 * This is based on the routine of the same name in the md kernel module (see 84 * file md_mddb.c), with the following caveats: 85 * 86 * - The kernel routine works on in core master blocks, or mddb_mb_ic_t; this 87 * routine works instead on the mddb_mb_t read directly from the disk 88 */ 89 static daddr_t 90 getphysblk( 91 mddb_block_t blk, 92 mddb_mb_t *mbp 93 ) 94 { 95 /* 96 * Sanity check: is the block within range? If so, we then assume 97 * that the block range map in the master block is valid and 98 * consistent with the block count. Unfortunately, there is no 99 * reliable way to validate this assumption. 100 */ 101 if (blk >= mbp->mb_blkcnt || blk >= mbp->mb_blkmap.m_consecutive) 102 return ((daddr_t)-1); 103 104 return (mbp->mb_blkmap.m_firstblk + blk); 105 } 106 107 108 109 /* 110 * drive_append() 111 * 112 * Append to tail of linked list of md_im_drive_info_t. 113 * 114 * Will allocate space for new node and copy args into new space. 115 * 116 * Returns pointer to new node. 117 */ 118 static md_im_drive_info_t * 119 drive_append( 120 md_im_drive_info_t **midpp, 121 mddrivename_t *dnp, 122 void *devid, 123 void *rdevid, 124 int devid_sz, 125 char *minor_name, 126 md_timeval32_t timestamp, 127 md_im_replica_info_t *mirp 128 ) 129 { 130 md_im_drive_info_t *midp; 131 int o_devid_sz; 132 133 for (; (*midpp != NULL); midpp = &((*midpp)->mid_next)) 134 ; 135 136 midp = *midpp = Zalloc(sizeof (md_im_drive_info_t)); 137 138 midp->mid_dnp = dnp; 139 140 /* 141 * If rdevid is not NULL then we know we are dealing with 142 * replicated diskset case. 'devid_sz' will always be the 143 * size of a valid devid which can be 'devid' or 'rdevid' 144 */ 145 midp->mid_devid = (void *)Malloc(devid_sz); 146 147 if (rdevid) { 148 (void) memcpy(midp->mid_devid, rdevid, devid_sz); 149 /* 150 * Also need to store the 'other' devid 151 */ 152 o_devid_sz = devid_sizeof((ddi_devid_t)devid); 153 midp->mid_o_devid = (void *)Malloc(o_devid_sz); 154 (void) memcpy(midp->mid_o_devid, devid, o_devid_sz); 155 midp->mid_o_devid_sz = o_devid_sz; 156 } else { 157 /* 158 * In the case of regular diskset, midp->mid_o_devid 159 * will be a NULL pointer 160 */ 161 (void) memcpy(midp->mid_devid, devid, devid_sz); 162 } 163 164 midp->mid_devid_sz = devid_sz; 165 midp->mid_setcreatetimestamp = timestamp; 166 (void) strlcpy(midp->mid_minor_name, minor_name, MDDB_MINOR_NAME_MAX); 167 midp->mid_replicas = mirp; 168 169 return (midp); 170 } 171 172 173 174 /* 175 * drive_append_wrapper() 176 * 177 * Constant time append wrapper; the append function will always walk the list, 178 * this will take a tail argument and use the append function on just the tail 179 * node, doing the appropriate old-tail-next-pointer bookkeeping. 180 */ 181 static md_im_drive_info_t ** 182 drive_append_wrapper( 183 md_im_drive_info_t **tailpp, 184 mddrivename_t *dnp, 185 void *devid, 186 void *rdevid, 187 int devid_sz, 188 char *minor_name, 189 md_timeval32_t timestamp, 190 md_im_replica_info_t *mirp 191 ) 192 { 193 (void) drive_append(tailpp, dnp, devid, rdevid, devid_sz, minor_name, 194 timestamp, mirp); 195 196 if ((*tailpp)->mid_next == NULL) 197 return (tailpp); 198 199 return (&((*tailpp)->mid_next)); 200 } 201 202 203 204 /* 205 * replica_append() 206 * 207 * Append to tail of linked list of md_im_replica_info_t. 208 * 209 * Will allocate space for new node and copy args into new space. 210 * 211 * Returns pointer to new node. 212 */ 213 static md_im_replica_info_t * 214 replica_append( 215 md_im_replica_info_t **mirpp, 216 int flags, 217 daddr32_t offset, 218 daddr32_t length, 219 md_timeval32_t timestamp 220 ) 221 { 222 md_im_replica_info_t *mirp; 223 224 for (; (*mirpp != NULL); mirpp = &((*mirpp)->mir_next)) 225 ; 226 227 mirp = *mirpp = Zalloc(sizeof (md_im_replica_info_t)); 228 229 mirp->mir_flags = flags; 230 mirp->mir_offset = offset; 231 mirp->mir_length = length; 232 mirp->mir_timestamp = timestamp; 233 234 return (mirp); 235 236 } 237 238 239 240 /* 241 * replica_append_wrapper() 242 * 243 * Constant time append wrapper; the append function will always walk the list, 244 * this will take a tail argument and use the append function on just the tail 245 * node, doing the appropriate old-tail-next-pointer bookkeeping. 246 */ 247 static md_im_replica_info_t ** 248 replica_append_wrapper( 249 md_im_replica_info_t **tailpp, 250 int flags, 251 daddr32_t offset, 252 daddr32_t length, 253 md_timeval32_t timestamp 254 ) 255 { 256 (void) replica_append(tailpp, flags, offset, length, timestamp); 257 258 if ((*tailpp)->mir_next == NULL) 259 return (tailpp); 260 261 return (&(*tailpp)->mir_next); 262 } 263 264 /* 265 * map_replica_disk() 266 * 267 * Searches the device id list for a specific 268 * disk based on the locator block device id array index. 269 * 270 * Returns a pointer to the did_list node if a match was 271 * found or NULL otherwise. 272 */ 273 static did_list_t * 274 map_replica_disk( 275 did_list_t *did_listp, 276 int did_index 277 ) 278 { 279 did_list_t *tailp = did_listp; 280 281 while (tailp != NULL) { 282 if (tailp->did_index == did_index) 283 return (tailp); 284 tailp = tailp->next; 285 } 286 287 /* not found, return failure */ 288 return (NULL); 289 } 290 291 /* 292 * replicated_list_lookup() 293 * 294 * looks up a replicated disk entry in the global replicated disk list 295 * based upon the length of that disk's device id. returns the new device id 296 * for the disk. 297 * If you store the returned devid you must create a local copy. 298 */ 299 static void * 300 replicated_list_lookup( 301 uint_t devid_len, 302 void *old_devid 303 ) 304 { 305 replicated_disk_t *head = NULL; 306 307 assert(devid_len <= MAX_DEVID_LEN); 308 head = replicated_disk_list[devid_len]; 309 310 if (head == NULL) 311 return (NULL); 312 313 do { 314 if (devid_compare((ddi_devid_t)old_devid, 315 (ddi_devid_t)head->old_devid) == 0) 316 return (head->new_devid); 317 head = head->next; 318 } while (head != NULL); 319 320 return (NULL); 321 } 322 323 /* 324 * replicated_list_insert() 325 * 326 * inserts a replicated disk entry into the global replicated disk list 327 */ 328 static void 329 replicated_list_insert( 330 size_t old_devid_len, 331 void *old_devid, 332 void *new_devid 333 ) 334 { 335 replicated_disk_t *repl_disk, **first_entry; 336 void *repl_old_devid = NULL; 337 338 assert(old_devid_len <= MAX_DEVID_LEN); 339 340 repl_disk = Zalloc(sizeof (replicated_disk_t)); 341 repl_old_devid = Zalloc(old_devid_len); 342 (void) memcpy(repl_old_devid, (void *)old_devid, old_devid_len); 343 344 repl_disk->old_devid = repl_old_devid; 345 repl_disk->new_devid = new_devid; 346 347 first_entry = &replicated_disk_list[old_devid_len]; 348 349 if (*first_entry == NULL) { 350 *first_entry = repl_disk; 351 return; 352 } 353 354 repl_disk->next = *first_entry; 355 replicated_disk_list[old_devid_len] = repl_disk; 356 } 357 358 /* 359 * get_replica_disks() 360 * 361 * Will step through the locator records in the supplied locator block, and add 362 * each one with an active replica to a supplied list of md_im_drive_info_t, and 363 * add the appropriate replicas to the md_im_replica_info_t contained therein. 364 */ 365 static void 366 get_replica_disks( 367 md_im_set_desc_t *misp, 368 did_list_t *did_listp, 369 mddb_mb_t *mb, 370 mddb_lb_t *lbp, 371 md_error_t *ep, 372 int replicated 373 ) 374 { 375 mddrivename_t *dnp; 376 int indx, on_list; 377 mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); 378 int flags; 379 int devid_sz; 380 char *minor_name; 381 did_list_t *replica_disk; 382 daddr32_t offset; 383 daddr32_t length; 384 md_timeval32_t timestamp; 385 md_im_replica_info_t **mirpp = NULL; 386 md_im_drive_info_t **midpp = &misp->mis_drives; 387 md_im_drive_info_t *midp; 388 void *did; 389 390 for (indx = 0; indx < lbp->lb_loccnt; indx++) { 391 392 on_list = 0; 393 if (lbp->lb_locators[indx].l_flags & MDDB_F_ACTIVE) { 394 395 /* 396 * search the device id list for a 397 * specific ctds based on the locator 398 * block device id array index. 399 */ 400 replica_disk = map_replica_disk(did_listp, indx); 401 402 assert(replica_disk != NULL); 403 404 405 /* 406 * metadrivename() can fail for a slice name 407 * if there is not an existing mddrivename_t. 408 * So we use metadiskname() to strip the slice 409 * number. 410 */ 411 dnp = metadrivename(&sp, 412 metadiskname(replica_disk->devname), ep); 413 414 for (midp = misp->mis_drives; midp != NULL; 415 midp = midp->mid_next) { 416 if (dnp == midp->mid_dnp) { 417 on_list = 1; 418 mirpp = &midp->mid_replicas; 419 break; 420 } 421 } 422 423 /* 424 * Get the correct devid_sz 425 */ 426 if (replicated) 427 did = replica_disk->rdid; 428 else 429 did = replica_disk->did; 430 431 devid_sz = devid_sizeof((ddi_devid_t)did); 432 minor_name = replica_disk->minor_name; 433 434 /* 435 * New on the list so add it 436 */ 437 if (!on_list) { 438 mddb_mb_t *mbp; 439 uint_t sliceno; 440 mdname_t *rsp; 441 int fd = -1; 442 443 mbp = Malloc(DEV_BSIZE); 444 445 /* determine the replica slice */ 446 if (meta_replicaslice(dnp, &sliceno, 447 ep) != 0) { 448 Free(mbp); 449 continue; 450 } 451 452 /* 453 * if the replica slice size is zero, 454 * don't bother opening 455 */ 456 if (dnp->vtoc.parts[sliceno].size == 0) { 457 Free(mbp); 458 continue; 459 } 460 461 if ((rsp = metaslicename(dnp, sliceno, 462 ep)) == NULL) { 463 Free(mbp); 464 continue; 465 } 466 467 if ((fd = open(rsp->rname, 468 O_RDONLY| O_NDELAY)) < 0) { 469 Free(mbp); 470 continue; 471 } 472 473 /* 474 * a drive may not have a master block 475 */ 476 if (read_master_block(ep, fd, mbp, 477 DEV_BSIZE) <= 0) { 478 mdclrerror(ep); 479 Free(mbp); 480 (void) close(fd); 481 continue; 482 } 483 484 (void) close(fd); 485 midpp = drive_append_wrapper(midpp, dnp, 486 replica_disk->did, replica_disk->rdid, 487 devid_sz, minor_name, mbp->mb_setcreatetime, 488 NULL); 489 mirpp = &((*midpp)->mid_replicas); 490 Free(mbp); 491 } 492 493 /* 494 * For either of these assertions to fail, it implies 495 * a NULL return from metadrivename() above. Since 496 * the args came from a presumed valid locator block, 497 * that's Bad. 498 */ 499 assert(midpp != NULL); 500 assert(mirpp != NULL); 501 502 /* 503 * Extract the parameters describing this replica. 504 * 505 * The magic "1" in the length calculation accounts 506 * for the length of the master block, in addition to 507 * the block count it describes. (The master block 508 * will always take up one block on the disk, and 509 * there will always only be one master block per 510 * replica, even though much of the code is structured 511 * to handle noncontiguous replicas.) 512 */ 513 flags = lbp->lb_locators[indx].l_flags; 514 offset = lbp->lb_locators[indx].l_blkno; 515 length = mb->mb_blkcnt + 1; 516 timestamp = mb->mb_setcreatetime; 517 518 mirpp = replica_append_wrapper(mirpp, flags, 519 offset, length, timestamp); 520 521 /* 522 * If we're here it means - 523 * 524 * a) we had an active copy of the replica, and 525 * b) we've added the disk to the list of 526 * disks as well. 527 * 528 * We need to bump up the number of active 529 * replica count for each such replica so that it 530 * can be used later for replica quorum check. 531 */ 532 misp->mis_active_replicas++; 533 } 534 } 535 } 536 537 538 539 /* 540 * get_nonreplica_disks() 541 * 542 * Extracts the disks without replicas from the locator name space and adds them 543 * to the supplied list of md_im_drive_info_t. 544 */ 545 static void 546 get_nonreplica_disks( 547 md_im_set_desc_t *misp, 548 mddb_rb_t *did_nm, 549 mddb_rb_t *did_shrnm, 550 md_error_t *ep, 551 int replicated 552 ) 553 { 554 char *search_path = "/dev"; 555 devid_nmlist_t *nmlist; 556 md_im_drive_info_t *midp, **midpp = &misp->mis_drives; 557 mddrivename_t *dnp; 558 mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); 559 mddb_rb_t *rbp_did = did_nm; 560 mddb_rb_t *rbp_did_shr = did_shrnm; 561 int on_list = 0; 562 int devid_sz; 563 struct devid_min_rec *did_rec; 564 struct devid_shr_rec *did_shr_rec; 565 struct did_shr_name *did; 566 struct did_min_name *min; 567 void *r_did; /* NULL if not a replicated diskset */ 568 void *valid_did; 569 570 /* 571 * We got a pointer to an mddb record, which we expect to contain a 572 * name record; extract the pointer thereto. 573 */ 574 /* LINTED */ 575 did_rec = (struct devid_min_rec *)((caddr_t)(&rbp_did->rb_data)); 576 /* LINTED */ 577 did_shr_rec = (struct devid_shr_rec *) 578 ((caddr_t)(&rbp_did_shr->rb_data)); 579 580 /* 581 * Skip the nm_rec_hdr and iterate on the array of struct minor_name 582 * at the end of the devid_min_rec 583 */ 584 for (min = &did_rec->minor_name[0]; min->min_devid_key != 0; 585 /* LINTED */ 586 min = (struct did_min_name *)((char *)min + DID_NAMSIZ(min))) { 587 588 on_list = 0; 589 r_did = NULL; 590 591 /* 592 * For a give DID_NM key, locate the corresponding device 593 * id from DID_NM_SHR 594 */ 595 for (did = &did_shr_rec->device_id[0]; did->did_key != 0; 596 /* LINTED */ 597 did = (struct did_shr_name *) 598 ((char *)did + DID_SHR_NAMSIZ(did))) { 599 /* 600 * We got a match, this is the device id we're 601 * looking for 602 */ 603 if (min->min_devid_key == did->did_key) 604 break; 605 } 606 607 if (did->did_key == 0) { 608 /* we didn't find a match */ 609 assert(did->did_key != 0); 610 md_exit(NULL, 1); 611 } 612 613 /* 614 * If replicated diskset 615 */ 616 if (replicated) { 617 size_t new_devid_len; 618 char *temp; 619 /* 620 * In this case, did->did_devid will 621 * be invalid so lookup the real one 622 */ 623 temp = replicated_list_lookup(did->did_size, 624 did->did_devid); 625 new_devid_len = devid_sizeof((ddi_devid_t)temp); 626 r_did = Zalloc(new_devid_len); 627 (void) memcpy(r_did, temp, new_devid_len); 628 valid_did = r_did; 629 } else { 630 valid_did = did->did_devid; 631 } 632 633 /* Get the ctds mapping for that device id */ 634 if (meta_deviceid_to_nmlist(search_path, 635 (ddi_devid_t)valid_did, 636 &min->min_name[0], &nmlist) == 0) { 637 638 assert(nmlist->devname != NULL); 639 dnp = metadrivename(&sp, 640 metadiskname(nmlist->devname), ep); 641 642 assert(dnp != NULL); 643 /* Is it already on the list? */ 644 for (midp = misp->mis_drives; midp != NULL; 645 midp = midp->mid_next) { 646 if (midp->mid_dnp == dnp) { 647 on_list = 1; 648 break; 649 } 650 } 651 652 devid_sz = devid_sizeof( 653 (ddi_devid_t)valid_did); 654 655 if (!on_list) { 656 mddb_mb_t *mbp; 657 uint_t sliceno; 658 mdname_t *rsp; 659 int fd = -1; 660 661 mbp = Malloc(DEV_BSIZE); 662 663 /* determine the replica slice */ 664 if (meta_replicaslice(dnp, &sliceno, 665 ep) != 0) { 666 Free(mbp); 667 continue; 668 } 669 670 /* 671 * if the replica slice size is zero, 672 * don't bother opening 673 */ 674 if (dnp->vtoc.parts[sliceno].size 675 == 0) { 676 Free(mbp); 677 continue; 678 } 679 680 if ((rsp = metaslicename(dnp, sliceno, 681 ep)) == NULL) { 682 Free(mbp); 683 continue; 684 } 685 686 if ((fd = open(rsp->rname, 687 O_RDONLY| O_NDELAY)) < 0) { 688 Free(mbp); 689 continue; 690 } 691 692 /* 693 * a drive may not have a master block 694 */ 695 if (read_master_block(ep, fd, mbp, 696 DEV_BSIZE) <= 0) { 697 mdclrerror(ep); 698 Free(mbp); 699 (void) close(fd); 700 continue; 701 } 702 703 (void) close(fd); 704 /* 705 * If it is replicated diskset, 706 * r_did will be non-NULL and 707 * devid_sz will be its size 708 */ 709 midpp = drive_append_wrapper(midpp, 710 dnp, &did->did_devid, r_did, 711 devid_sz, &min->min_name[0], 712 mbp->mb_setcreatetime, NULL); 713 Free(mbp); 714 } 715 devid_free_nmlist(nmlist); 716 } 717 } 718 } 719 720 /* 721 * set_append() 722 * 723 * Append to tail of linked list of md_im_set_desc_t. 724 * 725 * Will allocate space for new node AND populate it by extracting disks with 726 * and without replicas from the locator blocks and locator namespace. 727 * 728 * Returns pointer to new node. 729 */ 730 static md_im_set_desc_t * 731 set_append( 732 md_im_set_desc_t **mispp, 733 did_list_t *did_listp, 734 mddb_mb_t *mb, 735 mddb_lb_t *lbp, 736 mddb_rb_t *nm, 737 mddb_rb_t *did_nm, 738 mddb_rb_t *did_shrnm, 739 md_error_t *ep, 740 int replicated 741 ) 742 { 743 md_im_set_desc_t *misp; 744 set_t setno = mb->mb_setno; 745 746 /* run to end of list */ 747 for (; (*mispp != NULL); mispp = &((*mispp)->mis_next)) 748 ; 749 750 /* allocate new list element */ 751 misp = *mispp = Zalloc(sizeof (md_im_set_desc_t)); 752 753 if (replicated) 754 misp->mis_flags = MD_IM_SET_REPLICATED; 755 756 misp->mis_oldsetno = setno; 757 758 /* Get the disks with and without replicas */ 759 get_replica_disks(misp, did_listp, mb, lbp, ep, replicated); 760 761 if (nm != NULL && did_nm != NULL && did_shrnm != NULL) { 762 get_nonreplica_disks(misp, did_nm, did_shrnm, ep, replicated); 763 } 764 765 /* 766 * An error in this struct could come from either of the above routines; 767 * in both cases, we want to pass it back on up. 768 */ 769 return (misp); 770 } 771 772 773 774 /* 775 * set_append_wrapper() 776 * 777 * Constant time append wrapper; the append function will always walk the list, 778 * this will take a tail argument and use the append function on just the tail 779 * node, doing the appropriate old-tail-next-pointer bookkeeping. 780 */ 781 static md_im_set_desc_t ** 782 set_append_wrapper( 783 md_im_set_desc_t **tailpp, 784 did_list_t *did_listp, 785 mddb_mb_t *mb, 786 mddb_lb_t *lbp, 787 mddb_rb_t *nm, 788 mddb_rb_t *did_nm, 789 mddb_rb_t *did_shrnm, 790 md_error_t *ep, 791 int replicated 792 ) 793 { 794 (void) set_append(tailpp, did_listp, mb, lbp, nm, did_nm, 795 did_shrnm, ep, replicated); 796 797 /* it's the first item in the list, return it instead of the next */ 798 return (((*tailpp)->mis_next == NULL) ? tailpp : &(*tailpp)->mis_next); 799 } 800 801 802 803 /* 804 * add_disk_names() 805 * 806 * Iterator to walk the minor node tree of the device snapshot, adding only the 807 * first non-block instance of each non-cdrom minor node to a list of disks. 808 */ 809 static int 810 add_disk_names(di_node_t node, di_minor_t minor, void *args) 811 { 812 char *search_path = "/dev"; 813 ddi_devid_t devid = di_devid(node); 814 devid_nmlist_t *nm; 815 char *min = di_minor_name(minor); 816 md_im_names_t *cnames = (md_im_names_t *)args; 817 static di_node_t save_node = NULL; 818 819 /* 820 * skip CD devices 821 * If a device does not have a device id, we can't 822 * do anything with it so just exclude it from our 823 * list. 824 * 825 * This would also encompass CD devices and floppy 826 * devices that don't have a device id. 827 */ 828 if (devid == NULL) { 829 return (DI_WALK_CONTINUE); 830 } 831 832 /* char disk devices (as opposed to block) */ 833 if (di_minor_spectype(minor) == S_IFCHR) { 834 835 /* only first occurrence (slice 0) of each instance */ 836 if (save_node == NULL || node != save_node) { 837 save_node = node; 838 if (meta_deviceid_to_nmlist(search_path, devid, 839 min, &nm) == 0) { 840 int index = cnames->min_count++; 841 842 assert(nm->devname != NULL); 843 cnames->min_names = 844 Realloc(cnames->min_names, 845 cnames->min_count * 846 sizeof (char *)); 847 848 assert(cnames->min_names != NULL); 849 cnames->min_names[index] = 850 metadiskname(nm->devname); 851 devid_free_nmlist(nm); 852 } 853 } 854 } 855 return (DI_WALK_CONTINUE); 856 } 857 858 859 860 /* 861 * meta_list_disks() 862 * 863 * Snapshots the device tree and extracts disk devices from the snapshot. 864 */ 865 int 866 meta_list_disks(md_error_t *ep, md_im_names_t *cnames) 867 { 868 di_node_t root_node; 869 870 assert(cnames != NULL); 871 cnames->min_count = 0; 872 cnames->min_names = NULL; 873 874 if ((root_node = di_init("/", DINFOCPYALL|DINFOFORCE)) 875 == DI_NODE_NIL) { 876 return (mdsyserror(ep, errno, NULL)); 877 } 878 879 (void) di_walk_minor(root_node, DDI_NT_BLOCK, 0, cnames, 880 add_disk_names); 881 882 di_fini(root_node); 883 return (0); 884 } 885 886 /* 887 * meta_imp_drvused 888 * 889 * Checks if given drive is mounted, swapped, part of disk configuration 890 * or in use by SVM. ep also has error code set up if drive is in use. 891 * 892 * Returns 1 if drive is in use. 893 * Returns 0 if drive is not in use. 894 */ 895 int 896 meta_imp_drvused( 897 mdsetname_t *sp, 898 mddrivename_t *dnp, 899 md_error_t *ep 900 ) 901 { 902 md_error_t status = mdnullerror; 903 md_error_t *db_ep = &status; 904 905 /* 906 * We pass in db_ep to meta_setup_db_locations 907 * and never ever use the error contained therein 908 * because all we're interested in is a check to 909 * see whether any local metadbs are present. 910 */ 911 if ((meta_check_drivemounted(sp, dnp, ep) != 0) || 912 (meta_check_driveswapped(sp, dnp, ep) != 0) || 913 (((meta_setup_db_locations(db_ep) == 0) && 914 ((meta_check_drive_inuse(sp, dnp, 1, ep) != 0) || 915 (meta_check_driveinset(sp, dnp, ep) != 0))))) { 916 return (1); 917 } else { 918 return (0); 919 } 920 } 921 922 /* 923 * meta_prune_cnames() 924 * 925 * Removes in-use disks from the list prior to further processing. 926 * 927 * Return value depends on err_on_prune flag: if set, and one or more disks 928 * are pruned, the return list will be the pruned disks. If not set, or if no 929 * disks are pruned, the return list will be the unpruned disks. 930 */ 931 mddrivenamelist_t * 932 meta_prune_cnames( 933 md_error_t *ep, 934 md_im_names_t *cnames, 935 int err_on_prune 936 ) 937 { 938 int d; 939 int fcount = 0; 940 mddrivenamelist_t *dnlp = NULL; 941 mddrivenamelist_t **dnlpp = &dnlp; 942 mddrivenamelist_t *fdnlp = NULL; 943 mddrivenamelist_t **fdnlpp = &fdnlp; 944 mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); 945 946 for (d = 0; d < cnames->min_count; ++d) { 947 mddrivename_t *dnp; 948 949 dnp = metadrivename(&sp, cnames->min_names[d], ep); 950 if (dnp == NULL) { 951 /* 952 * Assuming we're interested in knowing about 953 * whatever error occurred, but not in stopping. 954 */ 955 mde_perror(ep, cnames->min_names[d]); 956 mdclrerror(ep); 957 958 continue; 959 } 960 961 /* 962 * Check if the drive is inuse. 963 */ 964 if (meta_imp_drvused(sp, dnp, ep)) { 965 fdnlpp = meta_drivenamelist_append_wrapper(fdnlpp, dnp); 966 fcount++; 967 mdclrerror(ep); 968 } else { 969 dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp); 970 } 971 } 972 973 if (fcount) { 974 if (err_on_prune) { 975 (void) mddserror(ep, MDE_DS_DRIVEINUSE, 0, 976 NULL, fdnlp->drivenamep->cname, NULL); 977 metafreedrivenamelist(dnlp); 978 return (fdnlp); 979 } 980 metafreedrivenamelist(fdnlp); 981 } 982 983 return (dnlp); 984 } 985 986 /* 987 * read_master_block() 988 * 989 * Returns: 990 * < 0 for failure 991 * 0 for no valid master block 992 * 1 for valid master block 993 * 994 * The supplied buffer will be filled in for EITHER 0 or 1. 995 */ 996 int 997 read_master_block( 998 md_error_t *ep, 999 int fd, 1000 void *bp, 1001 int bsize 1002 ) 1003 { 1004 mddb_mb_t *mbp = bp; 1005 int rval = 1; 1006 1007 assert(bp != NULL); 1008 1009 if (lseek(fd, (off_t)dbtob(16), SEEK_SET) < 0) 1010 return (mdsyserror(ep, errno, NULL)); 1011 1012 if (read(fd, bp, bsize) != bsize) 1013 return (mdsyserror(ep, errno, NULL)); 1014 1015 /* 1016 * The master block magic number can either be MDDB_MAGIC_MB in 1017 * the case of a real master block, or, it can be MDDB_MAGIC_DU 1018 * in the case of a dummy master block 1019 */ 1020 if ((mbp->mb_magic != MDDB_MAGIC_MB) && 1021 (mbp->mb_magic != MDDB_MAGIC_DU)) { 1022 rval = 0; 1023 (void) mdmddberror(ep, MDE_DB_MASTER, 0, 0, 0, NULL); 1024 } 1025 1026 if (mbp->mb_revision != MDDB_REV_MB) { 1027 rval = 0; 1028 } 1029 1030 return (rval); 1031 } 1032 1033 /* 1034 * read_locator_block() 1035 * 1036 * Returns: 1037 * < 0 for failure 1038 * 0 for no valid locator block 1039 * 1 for valid locator block 1040 */ 1041 int 1042 read_locator_block( 1043 md_error_t *ep, 1044 int fd, 1045 mddb_mb_t *mbp, 1046 void *bp, 1047 int bsize 1048 ) 1049 { 1050 mddb_lb_t *lbp = bp; 1051 1052 assert(bp != NULL); 1053 1054 if (lseek(fd, (off_t)dbtob(mbp->mb_blkmap.m_firstblk), SEEK_SET) < 0) 1055 return (mdsyserror(ep, errno, NULL)); 1056 1057 if (read(fd, bp, bsize) != bsize) 1058 return (mdsyserror(ep, errno, NULL)); 1059 1060 return ((lbp->lb_magic == MDDB_MAGIC_LB) ? 1 : 0); 1061 } 1062 1063 int 1064 phys_read( 1065 md_error_t *ep, 1066 int fd, 1067 mddb_mb_t *mbp, 1068 daddr_t blk, 1069 void *bp, 1070 int bcount 1071 ) 1072 { 1073 daddr_t pblk; 1074 1075 if ((pblk = getphysblk(blk, mbp)) < 0) 1076 return (mdmddberror(ep, MDE_DB_BLKRANGE, NODEV32, 1077 MD_LOCAL_SET, blk, NULL)); 1078 1079 if (lseek(fd, (off_t)dbtob(pblk), SEEK_SET) < 0) 1080 return (mdsyserror(ep, errno, NULL)); 1081 1082 if (read(fd, bp, bcount) != bcount) 1083 return (mdsyserror(ep, errno, NULL)); 1084 1085 return (bcount); 1086 } 1087 1088 /* 1089 * read_locator_block_did() 1090 * 1091 * Returns: 1092 * < 0 for failure 1093 * 0 for no valid locator name struct 1094 * 1 for valid locator name struct 1095 */ 1096 int 1097 read_locator_block_did( 1098 md_error_t *ep, 1099 int fd, 1100 mddb_mb_t *mbp, 1101 mddb_lb_t *lbp, 1102 void *bp, 1103 int bsize 1104 ) 1105 { 1106 int lb_didfirstblk = lbp->lb_didfirstblk; 1107 mddb_did_blk_t *lbdidp = bp; 1108 int rval; 1109 1110 assert(bp != NULL); 1111 1112 if ((rval = phys_read(ep, fd, mbp, lb_didfirstblk, bp, bsize)) < 0) 1113 return (rval); 1114 1115 return ((lbdidp->blk_magic == MDDB_MAGIC_DI) ? 1 : 0); 1116 } 1117 1118 /* 1119 * read_locator_names() 1120 * 1121 * Returns: 1122 * < 0 for failure 1123 * 0 for no valid locator name struct 1124 * 1 for valid locator name struct 1125 */ 1126 int 1127 read_locator_names( 1128 md_error_t *ep, 1129 int fd, 1130 mddb_mb_t *mbp, 1131 mddb_lb_t *lbp, 1132 void *bp, 1133 int bsize 1134 ) 1135 { 1136 int lnfirstblk = lbp->lb_lnfirstblk; 1137 mddb_ln_t *lnp = bp; 1138 int rval; 1139 1140 assert(bp != NULL); 1141 1142 if ((rval = phys_read(ep, fd, mbp, lnfirstblk, bp, bsize)) < 0) 1143 return (rval); 1144 1145 return ((lnp->ln_magic == MDDB_MAGIC_LN) ? 1 : 0); 1146 } 1147 1148 1149 int 1150 read_database_block( 1151 md_error_t *ep, 1152 int fd, 1153 mddb_mb_t *mbp, 1154 int dbblk, 1155 void *bp, 1156 int bsize 1157 ) 1158 { 1159 mddb_db_t *dbp = bp; 1160 int rval; 1161 1162 assert(bp != NULL); 1163 1164 if ((rval = phys_read(ep, fd, mbp, dbblk, bp, bsize)) < 0) 1165 return (rval); 1166 1167 return ((dbp->db_magic == MDDB_MAGIC_DB) ? 1 : 0); 1168 } 1169 1170 int 1171 read_loc_didblks( 1172 md_error_t *ep, 1173 int fd, 1174 mddb_mb_t *mbp, 1175 int didblk, 1176 void *bp, 1177 int bsize 1178 ) 1179 { 1180 mddb_did_blk_t *didbp = bp; 1181 int rval; 1182 1183 assert(bp != NULL); 1184 1185 if ((rval = phys_read(ep, fd, mbp, didblk, bp, bsize)) < 0) 1186 return (rval); 1187 1188 return ((didbp->blk_magic == MDDB_MAGIC_DI) ? 1 : 0); 1189 } 1190 1191 1192 int 1193 read_loc_didinfo( 1194 md_error_t *ep, 1195 int fd, 1196 mddb_mb_t *mbp, 1197 int infoblk, 1198 void *bp, 1199 int bsize 1200 ) 1201 { 1202 int rval = 1; 1203 mddb_did_info_t *infop = bp; 1204 1205 assert(bp != NULL); 1206 1207 if ((rval = phys_read(ep, fd, mbp, infoblk, bp, bsize)) < 0) 1208 return (rval); 1209 1210 return ((infop->info_flags & MDDB_DID_EXISTS) ? 1 : 0); 1211 } 1212 1213 /* 1214 * meta_nm_rec() 1215 * 1216 * Return the DE corresponding to the requested namespace record type. 1217 * Modifies dbp to have a firstentry if one isn't there. 1218 */ 1219 static mddb_de_t * 1220 meta_nm_rec(mddb_db_t *dbp, mddb_type_t rectype) 1221 { 1222 mddb_de_t *dep; 1223 int desize; 1224 1225 if (dbp->db_firstentry != NULL) { 1226 /* LINTED */ 1227 dep = (mddb_de_t *)((caddr_t)(&dbp->db_firstentry) 1228 + sizeof (dbp->db_firstentry)); 1229 dbp->db_firstentry = dep; 1230 while (dep && dep->de_next) { 1231 desize = sizeof (*dep) - sizeof (dep->de_blks) + 1232 sizeof (daddr_t) * dep->de_blkcount; 1233 /* LINTED */ 1234 dep->de_next = (mddb_de_t *) 1235 ((caddr_t)dep + desize); 1236 dep = dep->de_next; 1237 } 1238 } 1239 1240 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { 1241 if (dep->de_type1 == rectype) 1242 break; 1243 } 1244 return (dep); 1245 } 1246 1247 /* 1248 * read_nm_rec() 1249 * 1250 * Reads the NM, NM_DID or NM_DID_SHR record in the mddb and stores the 1251 * configuration data in the buffer 'nm' 1252 * 1253 * Returns: 1254 * < 0 for failure 1255 * 0 for no valid NM/DID_NM/DID_NM_SHR record 1256 * 1 for valid NM/DID_NM/DID_NM_SHR record 1257 * 1258 */ 1259 static int 1260 read_nm_rec( 1261 md_error_t *ep, 1262 int fd, 1263 mddb_mb_t *mbp, 1264 mddb_lb_t *lbp, 1265 char **nm, 1266 mddb_type_t rectype, 1267 char *diskname 1268 ) 1269 { 1270 int cnt, dbblk, rval = 0; 1271 char db[DEV_BSIZE]; 1272 mddb_de_t *dep; 1273 /*LINTED*/ 1274 mddb_db_t *dbp = (mddb_db_t *)&db; 1275 char *tmpnm = NULL; 1276 daddr_t pblk; 1277 1278 for (dbblk = lbp->lb_dbfirstblk; 1279 dbblk != 0; 1280 dbblk = dbp->db_nextblk) { 1281 1282 if ((rval = read_database_block(ep, fd, mbp, dbblk, dbp, 1283 sizeof (db))) <= 0) 1284 return (rval); 1285 1286 /* 1287 * Locate NM/DID_NM/DID_NM_SHR record. Normally there is 1288 * only one record per mddb. There is a rare case when we 1289 * can't expand the record. If this is the case then we 1290 * will have multiple NM/DID_NM/DID_NM_SHR records linked 1291 * with r_next_recid. 1292 * 1293 * For now assume the normal case and handle the extended 1294 * namespace in Phase 2. 1295 */ 1296 if ((dep = meta_nm_rec(dbp, rectype)) != NULL) 1297 break; 1298 } 1299 1300 /* If meta_nm_rec() never succeeded, bail out */ 1301 if (dep == NULL) 1302 return (0); 1303 1304 /* Read in the appropriate record and return configurations */ 1305 tmpnm = (char *)Zalloc(dbtob(dep->de_blkcount)); 1306 *nm = tmpnm; 1307 1308 for (cnt = 0; cnt < dep->de_blkcount; cnt++) { 1309 if ((pblk = getphysblk(dep->de_blks[cnt], mbp)) < 0) { 1310 rval = mdmddberror(ep, MDE_DB_BLKRANGE, 1311 NODEV32, MD_LOCAL_SET, 1312 dep->de_blks[cnt], diskname); 1313 return (rval); 1314 } 1315 1316 if (lseek(fd, (off_t)dbtob(pblk), SEEK_SET) < 0) { 1317 rval = mdsyserror(ep, errno, diskname); 1318 return (rval); 1319 } 1320 1321 if (read(fd, tmpnm, DEV_BSIZE) != DEV_BSIZE) { 1322 rval = mdsyserror(ep, errno, diskname); 1323 return (rval); 1324 } 1325 1326 tmpnm += DEV_BSIZE; 1327 } 1328 return (1); 1329 } 1330 1331 /* 1332 * is_replicated 1333 * 1334 * Determines whether a disk has been replicated or not. It checks to see 1335 * if the device id stored in the master block is the same as the device id 1336 * registered for that disk on the current system. If the two device ids are 1337 * different, then we know that the disk has been replicated. 1338 * 1339 * If need_devid is set and the disk is replicated, fill in the new_devid. 1340 * Also, if need_devid is set, this routine allocates memory for the device 1341 * ids; the caller of this routine is responsible for free'ing up the memory. 1342 * 1343 * Returns: 1344 * 1 if it's a replicated disk 1345 * 0 if it's not a replicated disk 1346 */ 1347 static int 1348 is_replicated( 1349 int fd, 1350 mddb_mb_t *mbp, 1351 int need_devid, 1352 void **new_devid 1353 ) 1354 { 1355 ddi_devid_t current_devid; 1356 int retval = 0; 1357 size_t new_devid_len; 1358 1359 if (mbp->mb_devid_magic != MDDB_MAGIC_DE) 1360 return (retval); 1361 1362 if (devid_get(fd, ¤t_devid) != 0) 1363 return (retval); 1364 1365 if (devid_compare((ddi_devid_t)mbp->mb_devid, current_devid) != 0) 1366 retval = 1; 1367 1368 if (retval && need_devid) { 1369 new_devid_len = devid_sizeof(current_devid); 1370 *new_devid = Zalloc(new_devid_len); 1371 (void) memcpy(*new_devid, (void *)current_devid, new_devid_len); 1372 } 1373 1374 devid_free(current_devid); 1375 return (retval); 1376 } 1377 1378 /* 1379 * free_replicated_disks_list() 1380 * 1381 * this frees up all the memory allocated by build_replicated_disks_list 1382 */ 1383 static void 1384 free_replicated_disks_list() 1385 { 1386 replicated_disk_t **repl_disk, *temp; 1387 int index; 1388 1389 for (index = 0; index <= MAX_DEVID_LEN; index++) { 1390 repl_disk = &replicated_disk_list[index]; 1391 1392 while (*repl_disk != NULL) { 1393 temp = *repl_disk; 1394 *repl_disk = (*repl_disk)->next; 1395 1396 Free(temp->old_devid); 1397 Free(temp->new_devid); 1398 Free(temp); 1399 } 1400 } 1401 } 1402 1403 /* 1404 * build_replicated_disks_list() 1405 * 1406 * Builds a list of disks that have been replicated using either a 1407 * remote replication or a point-in-time replication software. The 1408 * list is stored as a two dimensional sparse array. 1409 * 1410 * Returns 1411 * 1 on success 1412 * 0 on failure 1413 */ 1414 static int 1415 build_replicated_disks_list( 1416 md_error_t *ep, 1417 mddrivenamelist_t *dnlp 1418 ) 1419 { 1420 uint_t sliceno; 1421 int fd = -1; 1422 mddrivenamelist_t *dp; 1423 mdname_t *rsp; 1424 mddb_mb_t *mbp; 1425 1426 mbp = Malloc(DEV_BSIZE); 1427 1428 for (dp = dnlp; dp != NULL; dp = dp->next) { 1429 mddrivename_t *dnp; 1430 void *new_devid; 1431 1432 dnp = dp->drivenamep; 1433 /* determine the replica slice */ 1434 if (meta_replicaslice(dnp, &sliceno, ep) != 0) 1435 continue; 1436 1437 /* 1438 * if the replica slice size is zero, don't bother opening 1439 */ 1440 if (dnp->vtoc.parts[sliceno].size == 0) 1441 continue; 1442 1443 if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL) 1444 continue; 1445 1446 if ((fd = open(rsp->rname, O_RDONLY| O_NDELAY)) < 0) 1447 return (mdsyserror(ep, errno, rsp->rname)); 1448 1449 /* a drive may not have a master block so we just continue */ 1450 if (read_master_block(ep, fd, mbp, DEV_BSIZE) <= 0) { 1451 (void) close(fd); 1452 mdclrerror(ep); 1453 continue; 1454 } 1455 1456 if (is_replicated(fd, mbp, 1, &new_devid)) { 1457 replicated_list_insert(mbp->mb_devid_len, 1458 mbp->mb_devid, new_devid); 1459 } 1460 (void) close(fd); 1461 } 1462 replicated_disk_list_built = 1; 1463 1464 Free(mbp); 1465 return (1); 1466 } 1467 1468 /* 1469 * free_did_list() 1470 * 1471 * Frees the did_list allocated as part of build_did_list 1472 */ 1473 static void 1474 free_did_list( 1475 did_list_t *did_listp 1476 ) 1477 { 1478 did_list_t *temp, *head; 1479 1480 head = did_listp; 1481 1482 while (head != NULL) { 1483 temp = head; 1484 head = head->next; 1485 if (temp->rdid) 1486 Free(temp->rdid); 1487 if (temp->did) 1488 Free(temp->did); 1489 if (temp->devname) 1490 Free(temp->devname); 1491 if (temp->minor_name) 1492 Free(temp->minor_name); 1493 Free(temp); 1494 } 1495 } 1496 1497 /* 1498 * build_did_list() 1499 * 1500 * Build a list of device ids corresponding to disks in the locator block. 1501 * Memory is allocated here for the nodes in the did_list. The callers of 1502 * this routine must also call free_did_list to free up the memory after 1503 * they're done. 1504 * 1505 * Returns: 1506 * < 0 for failure 1507 * 0 for no valid locator block device id array 1508 * 1 for valid locator block device id array 1509 * ENOTSUP partial diskset, not all disks in a diskset on the 1510 * system where import is being executed 1511 */ 1512 static int 1513 build_did_list( 1514 md_error_t *ep, 1515 int fd, 1516 mddb_mb_t *mb, 1517 mddb_did_blk_t *lbdidp, 1518 did_list_t **did_listp, 1519 int replicated 1520 ) 1521 { 1522 char *search_path = "/dev"; 1523 char *minor_name; 1524 int rval, cnt; 1525 devid_nmlist_t *nm; 1526 uint_t did_info_length = 0; 1527 uint_t did_info_firstblk = 0; 1528 did_list_t *new, *head = NULL; 1529 char *bp = NULL, *temp; 1530 mddb_did_info_t *did_info = NULL; 1531 void *did = NULL; 1532 size_t new_devid_len; 1533 1534 for (cnt = 0; cnt < MDDB_NLB; cnt++) { 1535 did_info = &lbdidp->blk_info[cnt]; 1536 1537 if (!(did_info->info_flags & MDDB_DID_EXISTS)) 1538 continue; 1539 1540 new = Zalloc(sizeof (did_list_t)); 1541 new->did = Zalloc(did_info->info_length); 1542 1543 /* 1544 * If we can re-use the buffer already has been 1545 * read in then just use it. Otherwise free 1546 * the previous one and alloc a new one 1547 */ 1548 if (dbtob(did_info->info_blkcnt) != did_info_length && 1549 did_info->info_firstblk != did_info_firstblk) { 1550 1551 did_info_length = dbtob(did_info->info_blkcnt); 1552 did_info_firstblk = did_info->info_firstblk; 1553 1554 if (bp) 1555 Free(bp); 1556 bp = temp = Zalloc(did_info_length); 1557 1558 if ((rval = phys_read(ep, fd, mb, did_info_firstblk, 1559 (void *)bp, did_info_length)) < 0) 1560 return (rval); 1561 } else { 1562 temp = bp; 1563 } 1564 1565 temp += did_info->info_offset; 1566 (void) memcpy(new->did, temp, did_info->info_length); 1567 new->did_index = cnt; 1568 minor_name = did_info->info_minor_name; 1569 1570 /* 1571 * If we are not able to find the ctd mapping corresponding 1572 * to a given device id, it probably means the device id in 1573 * question is not registered with the system. 1574 * 1575 * Highly likely that the only time this happens, we've hit 1576 * a case where not all the disks that are a part of the 1577 * diskset were moved before importing the diskset. 1578 * 1579 * If set is a replicated diskset, then the device id we get 1580 * from 'lb' will be the 'other' did and we need to lookup 1581 * the real one before we call this routine. 1582 */ 1583 if (replicated) { 1584 temp = replicated_list_lookup(did_info->info_length, 1585 new->did); 1586 new_devid_len = devid_sizeof((ddi_devid_t)temp); 1587 new->rdid = Zalloc(new_devid_len); 1588 (void) memcpy(new->rdid, temp, new_devid_len); 1589 did = new->rdid; 1590 } else { 1591 did = new->did; 1592 } 1593 1594 if (devid_valid((ddi_devid_t)(did)) == 0) { 1595 return (-1); 1596 } 1597 1598 if ((rval = meta_deviceid_to_nmlist(search_path, 1599 (ddi_devid_t)did, minor_name, &nm)) != 0) { 1600 *did_listp = head; 1601 free_did_list(*did_listp); 1602 *did_listp = NULL; 1603 (void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD, 1604 mynode(), NULL, NULL); 1605 return (ENOTSUP); 1606 } 1607 1608 assert(nm->devname != NULL); 1609 new->devname = Strdup(nm->devname); 1610 new->dev = nm->dev; 1611 new->minor_name = Strdup(minor_name); 1612 1613 devid_free_nmlist(nm); 1614 1615 new->next = head; 1616 head = new; 1617 } 1618 1619 /* Free the last bp */ 1620 if (bp) 1621 Free(bp); 1622 *did_listp = head; 1623 return (1); 1624 } 1625 /* 1626 * check_nm_disks 1627 * Checks the disks listed in the shared did namespace to see if they 1628 * are accessable on the system. If not, return ENOTSUP error to 1629 * indicate we have a partial diskset. 1630 * Returns: 1631 * < 0 for failure 1632 * 0 success 1633 * ENOTSUP partial diskset, not all disks in a diskset on the 1634 * system where import is being executed 1635 */ 1636 static int 1637 check_nm_disks( 1638 md_error_t *ep, 1639 struct devid_min_rec *did_nmp, 1640 struct devid_shr_rec *did_shrnmp 1641 ) 1642 { 1643 char *search_path = "/dev"; 1644 char *minor_name = NULL; 1645 uint_t used_size, min_used_size; 1646 ddi_devid_t did; 1647 devid_nmlist_t *nm; 1648 void *did_min_namep; 1649 void *did_shr_namep; 1650 size_t did_nsize, did_shr_nsize; 1651 1652 used_size = did_shrnmp->did_rec_hdr.r_used_size - 1653 sizeof (struct nm_rec_hdr); 1654 min_used_size = did_nmp->min_rec_hdr.r_used_size - 1655 sizeof (struct nm_rec_hdr); 1656 did_shr_namep = (void *)(&did_shrnmp->device_id[0]); 1657 while (used_size > (int)sizeof (struct did_shr_name)) { 1658 did_min_namep = (void *)(&did_nmp->minor_name[0]); 1659 /* grab device id and minor name from the shared spaces */ 1660 did = (ddi_devid_t)(((struct did_shr_name *) 1661 did_shr_namep)->did_devid); 1662 if (devid_valid(did) == 0) { 1663 return (-1); 1664 } 1665 1666 /* 1667 * We need to check that the DID_NM and DID_SHR_NM are in 1668 * sync. It is possible that we took a panic between writing 1669 * the two areas to disk. This would be cleaned up on the 1670 * next snarf but we don't know for sure that snarf has even 1671 * happened since we're reading from disk. 1672 */ 1673 while (((struct did_shr_name *)did_shr_namep)->did_key != 1674 ((struct did_min_name *)did_min_namep)->min_devid_key) { 1675 did_nsize = DID_NAMSIZ((struct did_min_name *) 1676 did_min_namep); 1677 did_min_namep = ((void *)((char *)did_min_namep + 1678 did_nsize)); 1679 min_used_size -= did_nsize; 1680 if (min_used_size < (int)sizeof (struct did_min_name)) 1681 continue; 1682 } 1683 minor_name = ((struct did_min_name *)did_min_namep)->min_name; 1684 1685 /* 1686 * Try to find disk in the system. If we can't find the 1687 * disk, we have a partial diskset. 1688 */ 1689 if ((meta_deviceid_to_nmlist(search_path, 1690 did, minor_name, &nm)) != 0) { 1691 (void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD, 1692 mynode(), NULL, NULL); 1693 return (ENOTSUP); 1694 } 1695 devid_free_nmlist(nm); 1696 used_size -= DID_SHR_NAMSIZ((struct did_shr_name *) 1697 did_shr_namep); 1698 /* increment to next item in the shared spaces */ 1699 did_shr_nsize = DID_SHR_NAMSIZ((struct did_shr_name *) 1700 did_shr_namep); 1701 did_shr_namep = ((void *)((char *)did_shr_namep + 1702 did_shr_nsize)); 1703 } 1704 return (0); 1705 } 1706 1707 /* 1708 * meta_get_set_info 1709 * 1710 * Scans a given drive for set specific information. If the given drive 1711 * has a shared metadb, scans the shared metadb for information pertaining 1712 * to the set. 1713 * 1714 * Returns: 1715 * <0 for failure 1716 * 0 success but no replicas were found 1717 * 1 success and a replica was found 1718 * ENOTSUP for partial disksets detected 1719 */ 1720 int 1721 meta_get_set_info( 1722 mddrivenamelist_t *dp, 1723 md_im_set_desc_t **mispp, 1724 int local_mb_ok, 1725 md_error_t *ep 1726 ) 1727 { 1728 uint_t s; 1729 mdname_t *rsp; 1730 int fd; 1731 char mb[DEV_BSIZE]; 1732 /*LINTED*/ 1733 mddb_mb_t *mbp = (mddb_mb_t *)mb; 1734 char lb[dbtob(MDDB_LBCNT)]; 1735 /*LINTED*/ 1736 mddb_lb_t *lbp = (mddb_lb_t *)lb; 1737 mddb_did_blk_t *lbdidp = NULL; 1738 mddb_ln_t *lnp = NULL; 1739 int lnsize, lbdid_size; 1740 int rval = 0; 1741 char db[DEV_BSIZE]; 1742 /*LINTED*/ 1743 mddb_db_t *dbp = (mddb_db_t *)db; 1744 did_list_t *did_listp = NULL; 1745 mddrivenamelist_t *dnlp; 1746 mddrivename_t *dnp; 1747 md_im_names_t cnames = { 0, NULL}; 1748 char *nm = NULL; 1749 char *did_nm = NULL, *did_shrnm = NULL; 1750 struct nm_rec *nmp; 1751 struct devid_shr_rec *did_shrnmp; 1752 struct devid_min_rec *did_nmp; 1753 int extended_namespace = 0; 1754 int replicated = 0; 1755 1756 dnp = dp->drivenamep; 1757 1758 /* 1759 * Determine and open the replica slice 1760 */ 1761 if (meta_replicaslice(dnp, &s, ep) != 0) { 1762 return (-1); 1763 } 1764 1765 /* 1766 * Test for the size of replica slice in question. If 1767 * the size is zero, we know that this is not a disk that was 1768 * part of a set and it should be silently ignored for import. 1769 */ 1770 if (dnp->vtoc.parts[s].size == 0) 1771 return (0); 1772 1773 if ((rsp = metaslicename(dnp, s, ep)) == NULL) { 1774 return (-1); 1775 } 1776 1777 if ((fd = open(rsp->rname, O_RDONLY|O_NDELAY)) < 0) 1778 return (mdsyserror(ep, errno, rsp->cname)); 1779 1780 /* 1781 * After the open() succeeds, we should return via the "out" 1782 * label to clean up after ourselves. (Up 'til now, we can 1783 * just return directly, because there are no resources to 1784 * give back.) 1785 */ 1786 1787 if ((rval = read_master_block(ep, fd, mbp, sizeof (mb))) <= 0) 1788 goto out; 1789 1790 replicated = is_replicated(fd, mbp, 0, NULL); 1791 1792 if (!local_mb_ok && mbp->mb_setno == 0) { 1793 rval = 0; 1794 goto out; 1795 } 1796 1797 if ((rval = read_locator_block(ep, fd, mbp, lbp, sizeof (lb))) <= 0) 1798 goto out; 1799 1800 /* 1801 * Once the locator block has been read, we need to 1802 * check if the locator block commit count is zero. 1803 * If it is zero, we know that the replica we're dealing 1804 * with is on a disk that was deleted from the disk set; 1805 * and, it potentially has stale data. We need to quit 1806 * in that case 1807 */ 1808 if (lbp->lb_commitcnt == 0) { 1809 rval = 0; 1810 goto out; 1811 } 1812 1813 /* 1814 * Make sure that the disk being imported has device id 1815 * namespace present for disksets. If a disk doesn't have 1816 * device id namespace, we skip reading the replica on that disk 1817 */ 1818 if (!(lbp->lb_flags & MDDB_DEVID_STYLE)) { 1819 rval = 0; 1820 goto out; 1821 } 1822 1823 /* 1824 * Grab the locator block device id array. Allocate memory for the 1825 * array first. 1826 */ 1827 lbdid_size = dbtob(lbp->lb_didblkcnt); 1828 lbdidp = Zalloc(lbdid_size); 1829 1830 if ((rval = read_locator_block_did(ep, fd, mbp, lbp, lbdidp, 1831 lbdid_size)) <= 0) 1832 goto out; 1833 1834 /* 1835 * For a disk that has not been replicated, extract the device ids 1836 * stored in the locator block device id array and store them in 1837 * a list. 1838 * 1839 * If the disk has been replicated using replication software such 1840 * as HDS Truecopy/ShadowImage or EMC SRDF/BCV, the device ids in 1841 * the locator block are invalid and we need to build a list of 1842 * replicated disks. 1843 */ 1844 if (replicated && !replicated_disk_list_built) { 1845 /* 1846 * if there's a replicated diskset involved, we need to 1847 * scan the system one more time and build a list of all 1848 * candidate disks that might be part of that replicated set 1849 */ 1850 if (meta_list_disks(ep, &cnames) != 0) { 1851 rval = 0; 1852 goto out; 1853 } 1854 dnlp = meta_prune_cnames(ep, &cnames, 0); 1855 rval = build_replicated_disks_list(ep, dnlp); 1856 if (rval == 0) 1857 goto out; 1858 } 1859 1860 rval = build_did_list(ep, fd, mbp, lbdidp, &did_listp, replicated); 1861 1862 if ((rval <= 0) || (rval == ENOTSUP)) 1863 goto out; 1864 1865 /* 1866 * Until here, we've gotten away with fixed sizes for the 1867 * master block and locator block. The locator names, 1868 * however, are sized (and therefore allocated) dynamically 1869 * according to information in the locator block. 1870 */ 1871 lnsize = dbtob(lbp->lb_lnblkcnt); 1872 lnp = Zalloc(lnsize); 1873 1874 if ((rval = read_locator_names(ep, fd, mbp, lbp, lnp, lnsize)) <= 0) 1875 goto out; 1876 1877 /* 1878 * Read in the NM record 1879 * If no NM record was found, it still is a valid configuration 1880 * but it also means that we won't find any corresponding DID_NM 1881 * or DID_SHR_NM. 1882 */ 1883 if ((rval = read_nm_rec(ep, fd, mbp, lbp, &nm, MDDB_NM, rsp->cname)) 1884 < 0) 1885 goto out; 1886 else if (rval == 0) 1887 goto append; 1888 1889 /* 1890 * At this point, we have read in all of the blocks that form 1891 * the nm_rec. We should at least detect the corner case 1892 * mentioned above, in which r_next_recid links to another 1893 * nm_rec. Extended namespace handling is left for Phase 2. 1894 * 1895 * What this should really be is a loop, each iteration of 1896 * which reads in a nm_rec and calls the set_append_wrapper(). 1897 */ 1898 /*LINTED*/ 1899 nmp = (struct nm_rec *)(nm + sizeof (mddb_rb_t)); 1900 if (nmp->r_rec_hdr.r_next_recid != (mddb_recid_t)0) { 1901 extended_namespace = 1; 1902 rval = 0; 1903 goto out; 1904 } 1905 1906 if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_nm, 1907 MDDB_DID_NM, rsp->cname)) < 0) 1908 goto out; 1909 else if (rval == 0) 1910 goto append; 1911 1912 /*LINTED*/ 1913 did_nmp = (struct devid_min_rec *)(did_nm + sizeof (mddb_rb_t) - 1914 sizeof (int)); 1915 if (did_nmp->min_rec_hdr.r_next_recid != (mddb_recid_t)0) { 1916 extended_namespace = 1; 1917 rval = 0; 1918 goto out; 1919 } 1920 1921 if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_shrnm, 1922 MDDB_DID_SHR_NM, rsp->cname)) < 0) 1923 goto out; 1924 else if (rval == 0) 1925 goto append; 1926 1927 /*LINTED*/ 1928 did_shrnmp = (struct devid_shr_rec *)(did_shrnm + sizeof (mddb_rb_t) - 1929 sizeof (int)); 1930 if (did_shrnmp->did_rec_hdr.r_next_recid != (mddb_recid_t)0) { 1931 extended_namespace = 1; 1932 rval = 0; 1933 goto out; 1934 } 1935 1936 /* 1937 * We need to check if all of the disks listed in the namespace 1938 * are actually available. If they aren't we'll return with 1939 * an ENOTSUP error which indicates a partial diskset. 1940 */ 1941 rval = check_nm_disks(ep, did_nmp, did_shrnmp); 1942 if ((rval < 0) || (rval == ENOTSUP)) 1943 goto out; 1944 1945 append: 1946 /* Finally, we've got what we need to process this replica. */ 1947 mispp = set_append_wrapper(mispp, did_listp, mbp, lbp, 1948 /*LINTED*/ 1949 (mddb_rb_t *)nm, (mddb_rb_t *)did_nm, (mddb_rb_t *)did_shrnm, 1950 ep, replicated); 1951 1952 /* Return the fact that we found at least one set */ 1953 rval = 1; 1954 1955 out: 1956 if (fd >= 0) 1957 (void) close(fd); 1958 if (did_listp != NULL) 1959 free_did_list(did_listp); 1960 if (lnp != NULL) 1961 Free(lnp); 1962 if (nm != NULL) 1963 Free(nm); 1964 if (did_nm != NULL) 1965 Free(did_nm); 1966 if (did_shrnm != NULL) 1967 Free(did_shrnm); 1968 1969 /* 1970 * If we are at the end of the list, we must free up 1971 * the replicated list too 1972 */ 1973 if (dp->next == NULL) 1974 free_replicated_disks_list(); 1975 1976 if (extended_namespace) 1977 return (mddserror(ep, MDE_DS_EXTENDEDNM, MD_SET_BAD, 1978 mynode(), NULL, NULL)); 1979 1980 return (rval); 1981 } 1982 1983 /* 1984 * Return the minor name associated with a given disk slice 1985 */ 1986 static char * 1987 meta_getminor_name( 1988 char *devname, 1989 md_error_t *ep 1990 ) 1991 { 1992 int fd = -1; 1993 char *minor_name = NULL; 1994 char *ret_minor_name = NULL; 1995 1996 if (devname == NULL) 1997 return (NULL); 1998 1999 if ((fd = open(devname, O_RDONLY|O_NDELAY, 0)) < 0) { 2000 (void) mdsyserror(ep, errno, devname); 2001 return (NULL); 2002 } 2003 2004 if (devid_get_minor_name(fd, &minor_name) == 0) { 2005 ret_minor_name = Strdup(minor_name); 2006 devid_str_free(minor_name); 2007 } 2008 2009 (void) close(fd); 2010 return (ret_minor_name); 2011 } 2012 2013 static int 2014 meta_replica_quorum( 2015 md_im_set_desc_t *misp, 2016 md_error_t *ep 2017 ) 2018 { 2019 md_im_drive_info_t *midp; 2020 mddrivename_t *dnp; 2021 md_im_replica_info_t *midr; 2022 mdname_t *np; 2023 struct stat st_buf; 2024 uint_t rep_slice; 2025 int replica_count = 0; 2026 2027 for (midp = misp->mis_drives; midp != NULL; 2028 midp = midp->mid_next) { 2029 2030 dnp = midp->mid_dnp; 2031 2032 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 2033 ((np = metaslicename(dnp, rep_slice, ep)) 2034 == NULL)) { 2035 mdclrerror(ep); 2036 continue; 2037 } 2038 2039 if (stat(np->bname, &st_buf) != 0) 2040 continue; 2041 2042 /* 2043 * The drive is okay now count its replicas 2044 */ 2045 for (midr = midp->mid_replicas; midr != NULL; 2046 midr = midr->mir_next) { 2047 replica_count++; 2048 } 2049 } 2050 2051 if (replica_count < (misp->mis_active_replicas + 1)/2) 2052 return (-1); 2053 2054 return (0); 2055 } 2056 2057 static set_t 2058 meta_imp_setno( 2059 md_error_t *ep 2060 ) 2061 { 2062 set_t max_sets, setno; 2063 int bool; 2064 2065 if ((max_sets = get_max_sets(ep)) == 0) { 2066 return (MD_SET_BAD); 2067 } 2068 2069 /* 2070 * This code needs to be expanded when we run in SunCluster 2071 * environment SunCluster obtains setno internally 2072 */ 2073 for (setno = 1; setno < max_sets; setno++) { 2074 if (clnt_setnumbusy(mynode(), setno, 2075 &bool, ep) == -1) { 2076 setno = MD_SET_BAD; 2077 break; 2078 } 2079 /* 2080 * found one available 2081 */ 2082 if (bool == FALSE) 2083 break; 2084 } 2085 2086 if (setno == max_sets) { 2087 setno = MD_SET_BAD; 2088 } 2089 2090 return (setno); 2091 } 2092 2093 int 2094 meta_imp_set( 2095 md_im_set_desc_t *misp, 2096 char *setname, 2097 int force, 2098 bool_t dry_run, 2099 md_error_t *ep 2100 ) 2101 { 2102 md_timeval32_t tp; 2103 md_im_drive_info_t *midp; 2104 uint_t rep_slice; 2105 mddrivename_t *dnp; 2106 struct mddb_config c; 2107 mdname_t *np; 2108 md_im_replica_info_t *mirp; 2109 char setnum_link[MAXPATHLEN]; 2110 char setname_link[MAXPATHLEN]; 2111 char *minor_name = NULL; 2112 2113 (void) memset(&c, 0, sizeof (c)); 2114 (void) strlcpy(c.c_setname, setname, sizeof (c.c_setname)); 2115 c.c_sideno = 0; 2116 c.c_flags = MDDB_C_IMPORT; 2117 2118 /* 2119 * Check to see if the setname that the set is being imported into, 2120 * already exists. 2121 */ 2122 if (getsetbyname(c.c_setname, ep) != NULL) { 2123 return (mddserror(ep, MDE_DS_SETNAMEBUSY, MD_SET_BAD, 2124 mynode(), NULL, c.c_setname)); 2125 } 2126 2127 /* 2128 * Find the next available set number 2129 */ 2130 if ((c.c_setno = meta_imp_setno(ep)) == MD_SET_BAD) { 2131 return (mddserror(ep, MDE_DS_SETNOTIMP, MD_SET_BAD, 2132 mynode(), NULL, c.c_setname)); 2133 } 2134 2135 if (meta_gettimeofday(&tp) == -1) { 2136 return (mdsyserror(ep, errno, NULL)); 2137 } 2138 c.c_timestamp = tp; 2139 2140 /* Check to see if replica quorum requirement is fulfilled */ 2141 if (!force && meta_replica_quorum(misp, ep) == -1) 2142 return (mddserror(ep, MDE_DS_INSUFQUORUM, MD_SET_BAD, 2143 mynode(), NULL, c.c_setname)); 2144 2145 for (midp = misp->mis_drives; midp != NULL; 2146 midp = midp->mid_next) { 2147 mdcinfo_t *cinfo; 2148 2149 /* 2150 * We pass down the list of the drives in the 2151 * set down to the kernel irrespective of 2152 * whether the drives have a replica or not. 2153 * 2154 * The kernel detects which of the drives don't 2155 * have a replica and accordingly does the 2156 * right thing. 2157 */ 2158 dnp = midp->mid_dnp; 2159 if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || 2160 ((np = metaslicename(dnp, rep_slice, ep)) 2161 == NULL)) { 2162 mdclrerror(ep); 2163 continue; 2164 } 2165 2166 (void) strcpy(c.c_locator.l_devname, np->bname); 2167 c.c_locator.l_dev = meta_cmpldev(np->dev); 2168 c.c_locator.l_mnum = meta_getminor(np->dev); 2169 c.c_locator.l_devid = (uintptr_t)Malloc(midp->mid_devid_sz); 2170 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, 2171 midp->mid_devid, midp->mid_devid_sz); 2172 c.c_locator.l_devid_sz = midp->mid_devid_sz; 2173 c.c_locator.l_devid_flags = 2174 MDDB_DEVID_VALID | MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2175 if (midp->mid_o_devid) { 2176 c.c_locator.l_old_devid = 2177 (uint64_t)(uintptr_t)Malloc(midp->mid_o_devid_sz); 2178 (void) memcpy((void *)(uintptr_t) 2179 c.c_locator.l_old_devid, 2180 midp->mid_o_devid, midp->mid_o_devid_sz); 2181 c.c_locator.l_old_devid_sz = midp->mid_o_devid_sz; 2182 } 2183 minor_name = meta_getminor_name(np->bname, ep); 2184 (void) strncpy(c.c_locator.l_minor_name, minor_name, 2185 sizeof (c.c_locator.l_minor_name)); 2186 2187 if ((cinfo = metagetcinfo(np, ep)) == NULL) { 2188 mdclrerror(ep); 2189 continue; 2190 } 2191 (void) strncpy(c.c_locator.l_driver, cinfo->dname, 2192 sizeof (c.c_locator.l_driver)); 2193 2194 mirp = midp->mid_replicas; 2195 2196 do { 2197 if (mirp) { 2198 c.c_locator.l_flags = 0; 2199 c.c_locator.l_blkno = mirp->mir_offset; 2200 mirp = mirp->mir_next; 2201 } else { 2202 /* 2203 * Default offset for dummy is 16 2204 */ 2205 c.c_locator.l_blkno = 16; 2206 } 2207 2208 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { 2209 Free((void *)(uintptr_t)c.c_locator.l_devid); 2210 if (c.c_locator.l_old_devid) 2211 Free((void *)(uintptr_t) 2212 c.c_locator.l_old_devid); 2213 return (mdstealerror(ep, &c.c_mde)); 2214 } 2215 } while (mirp != NULL); 2216 } 2217 2218 /* 2219 * If the dry run option was specified, flag success 2220 * and exit out 2221 */ 2222 if (dry_run == 1) { 2223 md_eprintf("%s\n", dgettext(TEXT_DOMAIN, 2224 "import should be successful")); 2225 Free((void *)(uintptr_t)c.c_locator.l_devid); 2226 if (c.c_locator.l_old_devid) 2227 Free((void *)(uintptr_t)c.c_locator.l_old_devid); 2228 return (0); 2229 } 2230 2231 /* 2232 * Now kernel should have all the information 2233 * regarding the import diskset replica. 2234 * Tell kernel to load them up and import the set 2235 */ 2236 if (metaioctl(MD_IOCIMP_LOAD, &c.c_setno, &c.c_mde, NULL) != 0) { 2237 Free((void *)(uintptr_t)c.c_locator.l_devid); 2238 if (c.c_locator.l_old_devid) 2239 Free((void *)(uintptr_t)c.c_locator.l_old_devid); 2240 return (mdstealerror(ep, &c.c_mde)); 2241 } 2242 2243 (void) meta_smf_enable(META_SMF_DISKSET, NULL); 2244 2245 /* The set has now been imported, create the appropriate symlink */ 2246 (void) snprintf(setname_link, MAXPATHLEN, "/dev/md/%s", setname); 2247 (void) snprintf(setnum_link, MAXPATHLEN, "shared/%d", c.c_setno); 2248 2249 /* 2250 * Since we already verified that the setname was OK, make sure to 2251 * cleanup before proceeding. 2252 */ 2253 if (unlink(setname_link) == -1) { 2254 if (errno != ENOENT) 2255 (void) mdsyserror(ep, errno, setname_link); 2256 } 2257 2258 if (symlink(setnum_link, setname_link) == -1) 2259 (void) mdsyserror(ep, errno, setname_link); 2260 2261 /* resnarf the set that has just been imported */ 2262 if (clnt_resnarf_set(mynode(), c.c_setno, ep) != 0) 2263 md_eprintf("%s\n", dgettext(TEXT_DOMAIN, "Please stop and " 2264 "restart rpc.metad")); 2265 2266 Free((void *)(uintptr_t)c.c_locator.l_devid); 2267 if (c.c_locator.l_old_devid) 2268 Free((void *)(uintptr_t)c.c_locator.l_old_devid); 2269 return (0); 2270 } 2271