1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Functions to convert between a list of vdevs and an nvlist representing the 29 * configuration. Each entry in the list can be one of: 30 * 31 * Device vdevs 32 * disk=(path=..., devid=...) 33 * file=(path=...) 34 * 35 * Group vdevs 36 * raidz[1|2]=(...) 37 * mirror=(...) 38 * 39 * Hot spares 40 * 41 * While the underlying implementation supports it, group vdevs cannot contain 42 * other group vdevs. All userland verification of devices is contained within 43 * this file. If successful, the nvlist returned can be passed directly to the 44 * kernel; we've done as much verification as possible in userland. 45 * 46 * Hot spares are a special case, and passed down as an array of disk vdevs, at 47 * the same level as the root of the vdev tree. 48 * 49 * The only function exported by this file is 'make_root_vdev'. The 50 * function performs several passes: 51 * 52 * 1. Construct the vdev specification. Performs syntax validation and 53 * makes sure each device is valid. 54 * 2. Check for devices in use. 
Using libdiskmgt, makes sure that no 55 * devices are also in use. Some can be overridden using the 'force' 56 * flag, others cannot. 57 * 3. Check for replication errors if the 'force' flag is not specified. 58 * validates that the replication level is consistent across the 59 * entire pool. 60 * 4. Call libzfs to label any whole disks with an EFI label. 61 */ 62 63 #include <assert.h> 64 #include <devid.h> 65 #include <errno.h> 66 #include <fcntl.h> 67 #include <libintl.h> 68 #include <libnvpair.h> 69 #include <limits.h> 70 #include <stdio.h> 71 #include <string.h> 72 #include <unistd.h> 73 #include <sys/efi_partition.h> 74 #include <sys/stat.h> 75 #include <sys/vtoc.h> 76 #include <sys/mntent.h> 77 78 #include "zpool_util.h" 79 80 #define DISK_ROOT "/dev/dsk" 81 #define RDISK_ROOT "/dev/rdsk" 82 #define BACKUP_SLICE "s2" 83 84 /* 85 * For any given vdev specification, we can have multiple errors. The 86 * vdev_error() function keeps track of whether we have seen an error yet, and 87 * prints out a header if its the first error we've seen. 88 */ 89 boolean_t error_seen; 90 boolean_t is_force; 91 92 /*PRINTFLIKE1*/ 93 static void 94 vdev_error(const char *fmt, ...) 95 { 96 va_list ap; 97 98 if (!error_seen) { 99 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 100 if (!is_force) 101 (void) fprintf(stderr, gettext("use '-f' to override " 102 "the following errors:\n")); 103 else 104 (void) fprintf(stderr, gettext("the following errors " 105 "must be manually repaired:\n")); 106 error_seen = B_TRUE; 107 } 108 109 va_start(ap, fmt); 110 (void) vfprintf(stderr, fmt, ap); 111 va_end(ap); 112 } 113 114 static void 115 libdiskmgt_error(int error) 116 { 117 /* 118 * ENXIO/ENODEV is a valid error message if the device doesn't live in 119 * /dev/dsk. Don't bother printing an error message in this case. 
120 */ 121 if (error == ENXIO || error == ENODEV) 122 return; 123 124 (void) fprintf(stderr, gettext("warning: device in use checking " 125 "failed: %s\n"), strerror(error)); 126 } 127 128 /* 129 * Check that a file is valid. All we can do in this case is check that it's 130 * not in use by another pool, and not in use by swap. 131 */ 132 static int 133 check_file(const char *file, boolean_t force, boolean_t isspare) 134 { 135 char *name; 136 int fd; 137 int ret = 0; 138 int err; 139 pool_state_t state; 140 boolean_t inuse; 141 142 #ifndef __NetBSD__ 143 if (dm_inuse_swap(file, &err)) { 144 if (err) 145 libdiskmgt_error(err); 146 else 147 vdev_error(gettext("%s is currently used by swap. " 148 "Please see swap(1M).\n"), file); 149 return (-1); 150 } 151 #endif 152 153 if ((fd = open(file, O_RDONLY)) < 0) 154 return (0); 155 156 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 157 const char *desc; 158 159 switch (state) { 160 case POOL_STATE_ACTIVE: 161 desc = gettext("active"); 162 break; 163 164 case POOL_STATE_EXPORTED: 165 desc = gettext("exported"); 166 break; 167 168 case POOL_STATE_POTENTIALLY_ACTIVE: 169 desc = gettext("potentially active"); 170 break; 171 172 default: 173 desc = gettext("unknown"); 174 break; 175 } 176 177 /* 178 * Allow hot spares to be shared between pools. 179 */ 180 if (state == POOL_STATE_SPARE && isspare) 181 return (0); 182 183 if (state == POOL_STATE_ACTIVE || 184 state == POOL_STATE_SPARE || !force) { 185 switch (state) { 186 case POOL_STATE_SPARE: 187 vdev_error(gettext("%s is reserved as a hot " 188 "spare for pool %s\n"), file, name); 189 break; 190 default: 191 vdev_error(gettext("%s is part of %s pool " 192 "'%s'\n"), file, desc, name); 193 break; 194 } 195 ret = -1; 196 } 197 198 free(name); 199 } 200 201 (void) close(fd); 202 return (ret); 203 } 204 205 206 /* 207 * By "whole disk" we mean an entire physical disk (something we can 208 * label, toggle the write cache on, etc.) 
as opposed to the full 209 * capacity of a pseudo-device such as lofi or did. We act as if we 210 * are labeling the disk, which should be a pretty good test of whether 211 * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if 212 * it isn't. 213 */ 214 static boolean_t 215 is_whole_disk(const char *arg) 216 { 217 struct dk_gpt *label; 218 int fd; 219 char path[MAXPATHLEN]; 220 221 (void) snprintf(path, sizeof (path), "%s%s%s", 222 RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); 223 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) 224 return (B_FALSE); 225 if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { 226 (void) close(fd); 227 return (B_FALSE); 228 } 229 efi_free(label); 230 (void) close(fd); 231 return (B_TRUE); 232 } 233 234 /* 235 * Create a leaf vdev. Determine if this is a file or a device. If it's a 236 * device, fill in the device id to make a complete nvlist. Valid forms for a 237 * leaf vdev are: 238 * 239 * /dev/dsk/xxx Complete disk path 240 * /xxx Full path to file 241 * xxx Shorthand for /dev/dsk/xxx 242 */ 243 static nvlist_t * 244 make_leaf_vdev(const char *arg, uint64_t is_log) 245 { 246 char path[MAXPATHLEN]; 247 struct stat64 statbuf; 248 nvlist_t *vdev = NULL; 249 char *type = NULL; 250 boolean_t wholedisk = B_FALSE; 251 252 /* 253 * Determine what type of vdev this is, and put the full path into 254 * 'path'. We detect whether this is a device of file afterwards by 255 * checking the st_mode of the file. 256 */ 257 if (arg[0] == '/') { 258 /* 259 * Complete device or file path. Exact type is determined by 260 * examining the file descriptor afterwards. 261 */ 262 wholedisk = is_whole_disk(arg); 263 if (!wholedisk && (stat64(arg, &statbuf) != 0)) { 264 (void) fprintf(stderr, 265 gettext("cannot open '%s': %s\n"), 266 arg, strerror(errno)); 267 return (NULL); 268 } 269 270 (void) strlcpy(path, arg, sizeof (path)); 271 } else { 272 /* 273 * This may be a short path for a device, or it could be total 274 * gibberish. 
Check to see if it's a known device in 275 * /dev/dsk/. As part of this check, see if we've been given a 276 * an entire disk (minus the slice number). 277 */ 278 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 279 arg); 280 wholedisk = is_whole_disk(path); 281 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 282 /* 283 * If we got ENOENT, then the user gave us 284 * gibberish, so try to direct them with a 285 * reasonable error message. Otherwise, 286 * regurgitate strerror() since it's the best we 287 * can do. 288 */ 289 if (errno == ENOENT) { 290 (void) fprintf(stderr, 291 gettext("cannot open '%s': no such " 292 "device in %s\n"), arg, DISK_ROOT); 293 (void) fprintf(stderr, 294 gettext("must be a full path or " 295 "shorthand device name\n")); 296 return (NULL); 297 } else { 298 (void) fprintf(stderr, 299 gettext("cannot open '%s': %s\n"), 300 path, strerror(errno)); 301 return (NULL); 302 } 303 } 304 } 305 306 /* 307 * Determine whether this is a device or a file. 308 */ 309 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 310 type = VDEV_TYPE_DISK; 311 } else if (S_ISREG(statbuf.st_mode)) { 312 type = VDEV_TYPE_FILE; 313 } else { 314 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 315 "block device or regular file\n"), path); 316 return (NULL); 317 } 318 319 /* 320 * Finally, we have the complete device or file, and we know that it is 321 * acceptable to use. Construct the nvlist to describe this vdev. All 322 * vdevs have a 'path' element, and devices also have a 'devid' element. 
323 */ 324 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 325 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 326 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 327 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); 328 if (strcmp(type, VDEV_TYPE_DISK) == 0) 329 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 330 (uint64_t)wholedisk) == 0); 331 332 /* 333 * For a whole disk, defer getting its devid until after labeling it. 334 */ 335 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 336 /* 337 * Get the devid for the device. 338 */ 339 int fd; 340 ddi_devid_t devid; 341 char *minor = NULL, *devid_str = NULL; 342 343 if ((fd = open(path, O_RDONLY)) < 0) { 344 (void) fprintf(stderr, gettext("cannot open '%s': " 345 "%s\n"), path, strerror(errno)); 346 nvlist_free(vdev); 347 return (NULL); 348 } 349 350 if (devid_get(fd, &devid) == 0) { 351 if (devid_get_minor_name(fd, &minor) == 0 && 352 (devid_str = devid_str_encode(devid, minor)) != 353 NULL) { 354 verify(nvlist_add_string(vdev, 355 ZPOOL_CONFIG_DEVID, devid_str) == 0); 356 } 357 if (devid_str != NULL) 358 devid_str_free(devid_str); 359 if (minor != NULL) 360 devid_str_free(minor); 361 devid_free(devid); 362 } 363 364 (void) close(fd); 365 } 366 367 return (vdev); 368 } 369 370 /* 371 * Go through and verify the replication level of the pool is consistent. 372 * Performs the following checks: 373 * 374 * For the new spec, verifies that devices in mirrors and raidz are the 375 * same size. 376 * 377 * If the current configuration already has inconsistent replication 378 * levels, ignore any other potential problems in the new spec. 379 * 380 * Otherwise, make sure that the current spec (if there is one) and the new 381 * spec have consistent replication levels. 
 */
typedef struct replication_level {
    char *zprl_type;	/* toplevel vdev type (disk/file/mirror/raidz) */
    uint64_t zprl_children;	/* number of children in the toplevel vdev */
    uint64_t zprl_parity;	/* raidz parity level; 0 for non-raidz */
} replication_level_t;

/* Devices within one mirror/raidz may differ in size by up to this much. */
#define	ZPOOL_FUZZ	(16 * 1024 * 1024)

/*
 * Given a list of toplevel vdevs, return the current replication level.  If
 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
 * an error message will be displayed for each self-inconsistent vdev.
 *
 * The returned structure is heap-allocated; the caller frees it.  When an
 * inconsistency is found and 'fatal' is set, 'ret' is freed and nulled but
 * the scan continues so that every error gets reported; with 'fatal' clear
 * the function returns NULL at the first inconsistency.
 */
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
{
    nvlist_t **top;
    uint_t t, toplevels;
    nvlist_t **child;
    uint_t c, children;
    nvlist_t *nv;
    char *type;
    replication_level_t lastrep, rep, *ret;
    boolean_t dontreport;

    ret = safe_malloc(sizeof (replication_level_t));

    verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
        &top, &toplevels) == 0);

    lastrep.zprl_type = NULL;
    for (t = 0; t < toplevels; t++) {
        uint64_t is_log = B_FALSE;

        nv = top[t];

        /*
         * For separate logs we ignore the top level vdev replication
         * constraints.
         */
        (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
        if (is_log)
            continue;

        verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
            &type) == 0);
        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children) != 0) {
            /*
             * This is a 'file' or 'disk' vdev.
             */
            rep.zprl_type = type;
            rep.zprl_children = 1;
            rep.zprl_parity = 0;
        } else {
            uint64_t vdev_size;

            /*
             * This is a mirror or RAID-Z vdev.  Go through and make
             * sure the contents are all the same (files vs. disks),
             * keeping track of the number of elements in the
             * process.
             *
             * We also check that the size of each vdev (if it can
             * be determined) is the same.
             */
            rep.zprl_type = type;
            rep.zprl_children = 0;

            if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
                verify(nvlist_lookup_uint64(nv,
                    ZPOOL_CONFIG_NPARITY,
                    &rep.zprl_parity) == 0);
                assert(rep.zprl_parity != 0);
            } else {
                rep.zprl_parity = 0;
            }

            /*
             * The 'dontreport' variable indicates that we've
             * already reported an error for this spec, so don't
             * bother doing it again.
             */
            type = NULL;
            dontreport = 0;
            vdev_size = -1ULL;	/* sentinel: no size seen yet */
            for (c = 0; c < children; c++) {
                nvlist_t *cnv = child[c];
                char *path;
                struct stat64 statbuf;
                uint64_t size = -1ULL;
                char *childtype;
                int fd, err;

                rep.zprl_children++;

                verify(nvlist_lookup_string(cnv,
                    ZPOOL_CONFIG_TYPE, &childtype) == 0);

                /*
                 * If this is a replacing or spare vdev, then
                 * get the real first child of the vdev.
                 */
                if (strcmp(childtype,
                    VDEV_TYPE_REPLACING) == 0 ||
                    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
                    nvlist_t **rchild;
                    uint_t rchildren;

                    verify(nvlist_lookup_nvlist_array(cnv,
                        ZPOOL_CONFIG_CHILDREN, &rchild,
                        &rchildren) == 0);
                    assert(rchildren == 2);
                    cnv = rchild[0];

                    verify(nvlist_lookup_string(cnv,
                        ZPOOL_CONFIG_TYPE,
                        &childtype) == 0);
                }

                verify(nvlist_lookup_string(cnv,
                    ZPOOL_CONFIG_PATH, &path) == 0);

                /*
                 * If we have a raidz/mirror that combines disks
                 * with files, report it as an error.
                 */
                if (!dontreport && type != NULL &&
                    strcmp(type, childtype) != 0) {
                    if (ret != NULL)
                        free(ret);
                    ret = NULL;
                    if (fatal)
                        vdev_error(gettext(
                            "mismatched replication "
                            "level: %s contains both "
                            "files and devices\n"),
                            rep.zprl_type);
                    else
                        return (NULL);
                    dontreport = B_TRUE;
                }

                /*
                 * According to stat(2), the value of 'st_size'
                 * is undefined for block devices and character
                 * devices.  But there is no effective way to
                 * determine the real size in userland.
                 *
                 * Instead, we'll take advantage of an
                 * implementation detail of spec_size().  If the
                 * device is currently open, then we (should)
                 * return a valid size.
                 *
                 * If we still don't get a valid size (indicated
                 * by a size of 0 or MAXOFFSET_T), then ignore
                 * this device altogether.
                 */
                if ((fd = open(path, O_RDONLY)) >= 0) {
                    err = fstat64(fd, &statbuf);
                    (void) close(fd);
                } else {
                    err = stat64(path, &statbuf);
                }

                if (err != 0 ||
                    statbuf.st_size == 0 ||
                    statbuf.st_size == MAXOFFSET_T)
                    continue;

                size = statbuf.st_size;

                /*
                 * Also make sure that devices and
                 * slices have a consistent size.  If
                 * they differ by a significant amount
                 * (~16MB) then report an error.
                 *
                 * NOTE(review): 'size' and 'vdev_size' are
                 * uint64_t; the labs() of their difference
                 * relies on a long being 64 bits — verify on
                 * 32-bit targets.
                 */
                if (!dontreport &&
                    (vdev_size != -1ULL &&
                    (labs(size - vdev_size) >
                    ZPOOL_FUZZ))) {
                    if (ret != NULL)
                        free(ret);
                    ret = NULL;
                    if (fatal)
                        vdev_error(gettext(
                            "%s contains devices of "
                            "different sizes\n"),
                            rep.zprl_type);
                    else
                        return (NULL);
                    dontreport = B_TRUE;
                }

                type = childtype;
                vdev_size = size;
            }
        }

        /*
         * At this point, we have the replication of the last toplevel
         * vdev in 'rep'.  Compare it to 'lastrep' to see if it's
         * different.
         */
        if (lastrep.zprl_type != NULL) {
            if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
                if (ret != NULL)
                    free(ret);
                ret = NULL;
                if (fatal)
                    vdev_error(gettext(
                        "mismatched replication level: "
                        "both %s and %s vdevs are "
                        "present\n"),
                        lastrep.zprl_type, rep.zprl_type);
                else
                    return (NULL);
            } else if (lastrep.zprl_parity != rep.zprl_parity) {
                if (ret)
                    free(ret);
                ret = NULL;
                if (fatal)
                    vdev_error(gettext(
                        "mismatched replication level: "
                        "both %llu and %llu device parity "
                        "%s vdevs are present\n"),
                        lastrep.zprl_parity,
                        rep.zprl_parity,
                        rep.zprl_type);
                else
                    return (NULL);
            } else if (lastrep.zprl_children != rep.zprl_children) {
                if (ret)
                    free(ret);
                ret = NULL;
                if (fatal)
                    vdev_error(gettext(
                        "mismatched replication level: "
                        "both %llu-way and %llu-way %s "
                        "vdevs are present\n"),
                        lastrep.zprl_children,
                        rep.zprl_children,
                        rep.zprl_type);
                else
                    return (NULL);
            }
        }
        lastrep = rep;
    }

    /* 'ret' is NULL here iff an inconsistency was reported above. */
    if (ret != NULL)
        *ret = rep;

    return (ret);
}

/*
 * Check the replication level of the vdev spec against the current pool.
 * Calls get_replication() to make sure the new spec is self-consistent.  If
 * the pool has a consistent replication level, then we ignore any errors.
 * Otherwise, report any difference between the two.
 */
static int
check_replication(nvlist_t *config, nvlist_t *newroot)
{
    nvlist_t **child;
    uint_t children;
    replication_level_t *current = NULL, *new;
    int ret;

    /*
     * If we have a current pool configuration, check to see if it's
     * self-consistent.  If not, simply return success.
     */
    if (config != NULL) {
        nvlist_t *nvroot;

        verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
        if ((current = get_replication(nvroot, B_FALSE)) == NULL)
            return (0);
    }
    /*
     * for spares there may be no children, and therefore no
     * replication level to check
     */
    if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
        &child, &children) != 0) || (children == 0)) {
        free(current);
        return (0);
    }

    /*
     * If all we have is logs then there's no replication level to check.
     */
    if (num_logs(newroot) == children) {
        free(current);
        return (0);
    }

    /*
     * Get the replication level of the new vdev spec, reporting any
     * inconsistencies found.
     */
    if ((new = get_replication(newroot, B_TRUE)) == NULL) {
        free(current);
        return (-1);
    }

    /*
     * Check to see if the new vdev spec matches the replication level of
     * the current pool.
     */
    ret = 0;
    if (current != NULL) {
        if (strcmp(current->zprl_type, new->zprl_type) != 0) {
            vdev_error(gettext(
                "mismatched replication level: pool uses %s "
                "and new vdev is %s\n"),
                current->zprl_type, new->zprl_type);
            ret = -1;
        } else if (current->zprl_parity != new->zprl_parity) {
            vdev_error(gettext(
                "mismatched replication level: pool uses %llu "
                "device parity and new vdev uses %llu\n"),
                current->zprl_parity, new->zprl_parity);
            ret = -1;
        } else if (current->zprl_children != new->zprl_children) {
            vdev_error(gettext(
                "mismatched replication level: pool uses %llu-way "
                "%s and new vdev uses %llu-way %s\n"),
                current->zprl_children, current->zprl_type,
                new->zprl_children, new->zprl_type);
            ret = -1;
        }
    }

    free(new);
    if (current != NULL)
        free(current);

    return (ret);
}

/*
 * Go through and find any whole disks in the vdev specification, labelling
 * them as appropriate.  When constructing the vdev spec, we were unable to
 * open this device in order to provide a devid.  Now that we have labelled
 * the disk and know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Recurses over children, spares, and l2cache devices; returns 0 on success
 * or -1 (with an error printed) on failure.
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv)
{
    nvlist_t **child;
    uint_t c, children;
    char *type, *path, *diskname;
    char buf[MAXPATHLEN];
    uint64_t wholedisk;
    int fd;
    int ret;
    ddi_devid_t devid;
    char *minor = NULL, *devid_str = NULL;

    verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

    if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
        &child, &children) != 0) {

        /* Leaf vdev: only whole disks need labeling. */
        if (strcmp(type, VDEV_TYPE_DISK) != 0)
            return (0);

        /*
         * We have a disk device.  Get the path to the device
         * and see if it's a whole disk by appending the backup
         * slice and stat()ing the device.
         */
        verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
        if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
            &wholedisk) != 0 || !wholedisk)
            return (0);

        diskname = strrchr(path, '/');
        assert(diskname != NULL);
        diskname++;
        if (zpool_label_disk(g_zfs, zhp, diskname) == -1)
            return (-1);

        /*
         * Fill in the devid, now that we've labeled the disk.
         */
        (void) snprintf(buf, sizeof (buf), "%ss0", path);
        if ((fd = open(buf, O_RDONLY)) < 0) {
            (void) fprintf(stderr,
                gettext("cannot open '%s': %s\n"),
                buf, strerror(errno));
            return (-1);
        }

        if (devid_get(fd, &devid) == 0) {
            if (devid_get_minor_name(fd, &minor) == 0 &&
                (devid_str = devid_str_encode(devid, minor)) !=
                NULL) {
                verify(nvlist_add_string(nv,
                    ZPOOL_CONFIG_DEVID, devid_str) == 0);
            }
            if (devid_str != NULL)
                devid_str_free(devid_str);
            if (minor != NULL)
                devid_str_free(minor);
            devid_free(devid);
        }

        /*
         * Update the path to refer to the 's0' slice.  The presence of
         * the 'whole_disk' field indicates to the CLI that we should
         * chop off the slice number when displaying the device in
         * future output.
         */
        verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);

        (void) close(fd);

        return (0);
    }

    /* Interior vdev: recurse into all children. */
    for (c = 0; c < children; c++)
        if ((ret = make_disks(zhp, child[c])) != 0)
            return (ret);

    if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
        &child, &children) == 0)
        for (c = 0; c < children; c++)
            if ((ret = make_disks(zhp, child[c])) != 0)
                return (ret);

    if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
        &child, &children) == 0)
        for (c = 0; c < children; c++)
            if ((ret = make_disks(zhp, child[c])) != 0)
                return (ret);

    return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
834 */ 835 static boolean_t 836 is_spare(nvlist_t *config, const char *path) 837 { 838 int fd; 839 pool_state_t state; 840 char *name = NULL; 841 nvlist_t *label; 842 uint64_t guid, spareguid; 843 nvlist_t *nvroot; 844 nvlist_t **spares; 845 uint_t i, nspares; 846 boolean_t inuse; 847 848 if ((fd = open(path, O_RDONLY)) < 0) 849 return (B_FALSE); 850 851 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 852 !inuse || 853 state != POOL_STATE_SPARE || 854 zpool_read_label(fd, &label) != 0) { 855 free(name); 856 (void) close(fd); 857 return (B_FALSE); 858 } 859 free(name); 860 861 (void) close(fd); 862 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 863 nvlist_free(label); 864 865 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 866 &nvroot) == 0); 867 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 868 &spares, &nspares) == 0) { 869 for (i = 0; i < nspares; i++) { 870 verify(nvlist_lookup_uint64(spares[i], 871 ZPOOL_CONFIG_GUID, &spareguid) == 0); 872 if (spareguid == guid) 873 return (B_TRUE); 874 } 875 } 876 877 return (B_FALSE); 878 } 879 880 /* 881 * Go through and find any devices that are in use. We rely on libdiskmgt for 882 * the majority of this task. 883 */ 884 static int 885 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 886 int isspare) 887 { 888 nvlist_t **child; 889 uint_t c, children; 890 char *type, *path; 891 int ret; 892 char buf[MAXPATHLEN]; 893 uint64_t wholedisk; 894 895 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 896 897 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 898 &child, &children) != 0) { 899 900 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 901 902 /* 903 * As a generic check, we look to see if this is a replace of a 904 * hot spare within the same pool. If so, we allow it 905 * regardless of what libdiskmgt or zpool_in_use() says. 
906 */ 907 if (isreplacing) { 908 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 909 &wholedisk) == 0 && wholedisk) 910 (void) snprintf(buf, sizeof (buf), "%ss0", 911 path); 912 else 913 (void) strlcpy(buf, path, sizeof (buf)); 914 if (is_spare(config, buf)) 915 return (0); 916 } 917 918 if (strcmp(type, VDEV_TYPE_DISK) == 0 || 919 strcmp(type, VDEV_TYPE_FILE) == 0) 920 ret = check_file(path, force, isspare); 921 922 return (ret); 923 } 924 925 for (c = 0; c < children; c++) 926 if ((ret = check_in_use(config, child[c], force, 927 isreplacing, B_FALSE)) != 0) 928 return (ret); 929 930 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 931 &child, &children) == 0) 932 for (c = 0; c < children; c++) 933 if ((ret = check_in_use(config, child[c], force, 934 isreplacing, B_TRUE)) != 0) 935 return (ret); 936 937 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 938 &child, &children) == 0) 939 for (c = 0; c < children; c++) 940 if ((ret = check_in_use(config, child[c], force, 941 isreplacing, B_FALSE)) != 0) 942 return (ret); 943 944 return (0); 945 } 946 947 static const char * 948 is_grouping(const char *type, int *mindev, int *maxdev) 949 { 950 if (strncmp(type, "raidz", 5) == 0) { 951 const char *p = type + 5; 952 char *end; 953 long nparity; 954 955 if (*p == '\0') { 956 nparity = 1; 957 } else if (*p == '0') { 958 return (NULL); /* no zero prefixes allowed */ 959 } else { 960 errno = 0; 961 nparity = strtol(p, &end, 10); 962 if (errno != 0 || nparity < 1 || nparity >= 255 || 963 *end != '\0') 964 return (NULL); 965 } 966 967 if (mindev != NULL) 968 *mindev = nparity + 1; 969 if (maxdev != NULL) 970 *maxdev = 255; 971 return (VDEV_TYPE_RAIDZ); 972 } 973 974 if (maxdev != NULL) 975 *maxdev = INT_MAX; 976 977 if (strcmp(type, "mirror") == 0) { 978 if (mindev != NULL) 979 *mindev = 2; 980 return (VDEV_TYPE_MIRROR); 981 } 982 983 if (strcmp(type, "spare") == 0) { 984 if (mindev != NULL) 985 *mindev = 1; 986 return (VDEV_TYPE_SPARE); 987 } 988 989 if 
(strcmp(type, "log") == 0) { 990 if (mindev != NULL) 991 *mindev = 1; 992 return (VDEV_TYPE_LOG); 993 } 994 995 if (strcmp(type, "cache") == 0) { 996 if (mindev != NULL) 997 *mindev = 1; 998 return (VDEV_TYPE_L2CACHE); 999 } 1000 1001 return (NULL); 1002 } 1003 1004 /* 1005 * Construct a syntactically valid vdev specification, 1006 * and ensure that all devices and files exist and can be opened. 1007 * Note: we don't bother freeing anything in the error paths 1008 * because the program is just going to exit anyway. 1009 */ 1010 nvlist_t * 1011 construct_spec(int argc, char **argv) 1012 { 1013 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1014 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1015 const char *type; 1016 uint64_t is_log; 1017 boolean_t seen_logs; 1018 1019 top = NULL; 1020 toplevels = 0; 1021 spares = NULL; 1022 l2cache = NULL; 1023 nspares = 0; 1024 nlogs = 0; 1025 nl2cache = 0; 1026 is_log = B_FALSE; 1027 seen_logs = B_FALSE; 1028 1029 while (argc > 0) { 1030 nv = NULL; 1031 1032 /* 1033 * If it's a mirror or raidz, the subsequent arguments are 1034 * its leaves -- until we encounter the next mirror or raidz. 1035 */ 1036 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { 1037 nvlist_t **child = NULL; 1038 int c, children = 0; 1039 1040 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1041 if (spares != NULL) { 1042 (void) fprintf(stderr, 1043 gettext("invalid vdev " 1044 "specification: 'spare' can be " 1045 "specified only once\n")); 1046 return (NULL); 1047 } 1048 is_log = B_FALSE; 1049 } 1050 1051 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1052 if (seen_logs) { 1053 (void) fprintf(stderr, 1054 gettext("invalid vdev " 1055 "specification: 'log' can be " 1056 "specified only once\n")); 1057 return (NULL); 1058 } 1059 seen_logs = B_TRUE; 1060 is_log = B_TRUE; 1061 argc--; 1062 argv++; 1063 /* 1064 * A log is not a real grouping device. 1065 * We just set is_log and continue. 
1066 */ 1067 continue; 1068 } 1069 1070 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1071 if (l2cache != NULL) { 1072 (void) fprintf(stderr, 1073 gettext("invalid vdev " 1074 "specification: 'cache' can be " 1075 "specified only once\n")); 1076 return (NULL); 1077 } 1078 is_log = B_FALSE; 1079 } 1080 1081 if (is_log) { 1082 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1083 (void) fprintf(stderr, 1084 gettext("invalid vdev " 1085 "specification: unsupported 'log' " 1086 "device: %s\n"), type); 1087 return (NULL); 1088 } 1089 nlogs++; 1090 } 1091 1092 for (c = 1; c < argc; c++) { 1093 if (is_grouping(argv[c], NULL, NULL) != NULL) 1094 break; 1095 children++; 1096 child = realloc(child, 1097 children * sizeof (nvlist_t *)); 1098 if (child == NULL) 1099 zpool_no_memory(); 1100 if ((nv = make_leaf_vdev(argv[c], B_FALSE)) 1101 == NULL) 1102 return (NULL); 1103 child[children - 1] = nv; 1104 } 1105 1106 if (children < mindev) { 1107 (void) fprintf(stderr, gettext("invalid vdev " 1108 "specification: %s requires at least %d " 1109 "devices\n"), argv[0], mindev); 1110 return (NULL); 1111 } 1112 1113 if (children > maxdev) { 1114 (void) fprintf(stderr, gettext("invalid vdev " 1115 "specification: %s supports no more than " 1116 "%d devices\n"), argv[0], maxdev); 1117 return (NULL); 1118 } 1119 1120 argc -= c; 1121 argv += c; 1122 1123 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1124 spares = child; 1125 nspares = children; 1126 continue; 1127 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1128 l2cache = child; 1129 nl2cache = children; 1130 continue; 1131 } else { 1132 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1133 0) == 0); 1134 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1135 type) == 0); 1136 verify(nvlist_add_uint64(nv, 1137 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1138 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1139 verify(nvlist_add_uint64(nv, 1140 ZPOOL_CONFIG_NPARITY, 1141 mindev - 1) == 0); 1142 } 1143 verify(nvlist_add_nvlist_array(nv, 1144 ZPOOL_CONFIG_CHILDREN, 
child, 1145 children) == 0); 1146 1147 for (c = 0; c < children; c++) 1148 nvlist_free(child[c]); 1149 free(child); 1150 } 1151 } else { 1152 /* 1153 * We have a device. Pass off to make_leaf_vdev() to 1154 * construct the appropriate nvlist describing the vdev. 1155 */ 1156 if ((nv = make_leaf_vdev(argv[0], is_log)) == NULL) 1157 return (NULL); 1158 if (is_log) 1159 nlogs++; 1160 argc--; 1161 argv++; 1162 } 1163 1164 toplevels++; 1165 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1166 if (top == NULL) 1167 zpool_no_memory(); 1168 top[toplevels - 1] = nv; 1169 } 1170 1171 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1172 (void) fprintf(stderr, gettext("invalid vdev " 1173 "specification: at least one toplevel vdev must be " 1174 "specified\n")); 1175 return (NULL); 1176 } 1177 1178 if (seen_logs && nlogs == 0) { 1179 (void) fprintf(stderr, gettext("invalid vdev specification: " 1180 "log requires at least 1 device\n")); 1181 return (NULL); 1182 } 1183 1184 /* 1185 * Finally, create nvroot and add all top-level vdevs to it. 
	 */
	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    top, toplevels) == 0);
	if (nspares != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, nspares) == 0);
	if (nl2cache != 0)
		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    l2cache, nl2cache) == 0);

	/*
	 * nvlist_add_nvlist_array() copies its input, so the per-child
	 * nvlists and the arrays that held them are now redundant and can
	 * be released.
	 */
	for (t = 0; t < toplevels; t++)
		nvlist_free(top[t]);
	for (t = 0; t < nspares; t++)
		nvlist_free(spares[t]);
	for (t = 0; t < nl2cache; t++)
		nvlist_free(l2cache[t]);
	if (spares)
		free(spares);
	if (l2cache)
		free(l2cache);
	free(top);

	return (nvroot);
}

/*
 * Build the vdev specification for 'zpool split'.  If any devices are
 * listed in 'argv', construct (and, unless this is a dry run, label) a
 * new root from them, rejecting grouping keywords such as 'mirror' or
 * 'raidz' since only plain devices make sense as split targets.  Then
 * ask libzfs to perform the split; on success the returned nvlist
 * describes the new pool's root (caller frees it).  Returns NULL on any
 * failure, after printing a diagnostic.
 */
nvlist_t *
split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
    splitflags_t flags, int argc, char **argv)
{
	nvlist_t *newroot = NULL, **child;
	uint_t c, children;

	if (argc > 0) {
		if ((newroot = construct_spec(argc, argv)) == NULL) {
			(void) fprintf(stderr, gettext("Unable to build a "
			    "pool from the specified devices\n"));
			return (NULL);
		}

		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
			nvlist_free(newroot);
			return (NULL);
		}

		/* avoid any tricks in the spec */
		verify(nvlist_lookup_nvlist_array(newroot,
		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
		for (c = 0; c < children; c++) {
			char *path;
			const char *type;
			int min, max;

			/*
			 * Every child of the spec must be a leaf with a
			 * path; a path that parses as a grouping keyword
			 * means the user tried to nest a group vdev.
			 */
			verify(nvlist_lookup_string(child[c],
			    ZPOOL_CONFIG_PATH, &path) == 0);
			if ((type = is_grouping(path, &min, &max)) != NULL) {
				(void) fprintf(stderr, gettext("Cannot use "
				    "'%s' as a device for splitting\n"), type);
				nvlist_free(newroot);
				return (NULL);
			}
		}
	}

	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
		/*
		 * zpool_vdev_split() may have allocated a root for us when
		 * argc == 0 (newroot still NULL here otherwise).
		 */
		if (newroot != NULL)
			nvlist_free(newroot);
		return (NULL);
	}

	return (newroot);
}

/*
 * Get and validate the contents of the given vdev specification. This ensures
 * that the nvlist returned is well-formed, that all the devices exist, and that
 * they are not currently in use by any other known consumer. The 'poolconfig'
 * parameter is the current configuration of the pool when adding devices to an
 * existing pool, and is used to perform additional checks, such as changing the
 * replication level of the pool. It can be 'NULL' to indicate that this is a
 * new pool. The 'force' flag controls whether devices should be forcefully
 * added, even if they appear in use.
 */
nvlist_t *
make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
    boolean_t isreplacing, boolean_t dryrun, int argc, char **argv)
{
	nvlist_t *newroot;
	nvlist_t *poolconfig = NULL;
	/* record 'force' globally so vdev_error() can tailor its header */
	is_force = force;

	/*
	 * Construct the vdev specification. If this is successful, we know
	 * that we have a valid specification, and that all devices can be
	 * opened.
	 */
	if ((newroot = construct_spec(argc, argv)) == NULL)
		return (NULL);

	if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
		return (NULL);

	/*
	 * Validate each device to make sure that it's not shared with another
	 * subsystem. We do this even if 'force' is set, because there are some
	 * uses (such as a dedicated dump device) that even '-f' cannot
	 * override.
	 */
	if (check_in_use(poolconfig, newroot, force, isreplacing,
	    B_FALSE) != 0) {
		nvlist_free(newroot);
		return (NULL);
	}

	/*
	 * Check the replication level of the given vdevs and report any errors
	 * found. We include the existing pool spec, if any, as we need to
	 * catch changes against the existing replication level.
	 */
	if (check_rep && check_replication(poolconfig, newroot) != 0) {
		nvlist_free(newroot);
		return (NULL);
	}

	/*
	 * Run through the vdev specification and label any whole disks found.
	 */
	if (!dryrun && make_disks(zhp, newroot) != 0) {
		nvlist_free(newroot);
		return (NULL);
	}

	return (newroot);
}