1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * This file contains all the routines used when modifying on-disk SPA state. 29 * This includes opening, importing, destroying, exporting a pool, and syncing a 30 * pool. 31 */ 32 33 #include <sys/zfs_context.h> 34 #include <sys/fm/fs/zfs.h> 35 #include <sys/spa_impl.h> 36 #include <sys/zio.h> 37 #include <sys/zio_checksum.h> 38 #include <sys/zio_compress.h> 39 #include <sys/dmu.h> 40 #include <sys/dmu_tx.h> 41 #include <sys/zap.h> 42 #include <sys/zil.h> 43 #include <sys/vdev_impl.h> 44 #include <sys/metaslab.h> 45 #include <sys/uberblock_impl.h> 46 #include <sys/txg.h> 47 #include <sys/avl.h> 48 #include <sys/dmu_traverse.h> 49 #include <sys/dmu_objset.h> 50 #include <sys/unique.h> 51 #include <sys/dsl_pool.h> 52 #include <sys/dsl_dataset.h> 53 #include <sys/dsl_dir.h> 54 #include <sys/dsl_prop.h> 55 #include <sys/dsl_synctask.h> 56 #include <sys/fs/zfs.h> 57 #include <sys/arc.h> 58 #include <sys/callb.h> 59 #include <sys/systeminfo.h> 60 #include <sys/sunddi.h> 61 #include <sys/spa_boot.h> 62 63 #include "zfs_prop.h" 64 #include "zfs_comutil.h" 65 66 int zio_taskq_threads = 8; 67 68 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); 69 static boolean_t spa_has_active_shared_spare(spa_t *spa); 70 71 /* 72 * ========================================================================== 73 * SPA properties routines 74 * ========================================================================== 75 */ 76 77 /* 78 * Add a (source=src, propname=propval) list to an nvlist. 79 */ 80 static void 81 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 82 uint64_t intval, zprop_source_t src) 83 { 84 const char *propname = zpool_prop_to_name(prop); 85 nvlist_t *propval; 86 87 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 88 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 89 90 if (strval != NULL) 91 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 92 else 93 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 94 95 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 96 nvlist_free(propval); 97 } 98 99 /* 100 * Get property values from the spa configuration. 
101 */ 102 static void 103 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 104 { 105 uint64_t size = spa_get_space(spa); 106 uint64_t used = spa_get_alloc(spa); 107 uint64_t cap, version; 108 zprop_source_t src = ZPROP_SRC_NONE; 109 spa_config_dirent_t *dp; 110 111 /* 112 * readonly properties 113 */ 114 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 0, src); 115 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 116 spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); 117 spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); 118 119 cap = (size == 0) ? 0 : (used * 100 / size); 120 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 121 122 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 123 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 124 spa->spa_root_vdev->vdev_state, src); 125 126 /* 127 * settable properties that are not stored in the pool property object. 128 */ 129 version = spa_version(spa); 130 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 131 src = ZPROP_SRC_DEFAULT; 132 else 133 src = ZPROP_SRC_LOCAL; 134 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 135 136 if (spa->spa_root != NULL) 137 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 138 0, ZPROP_SRC_LOCAL); 139 140 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 141 if (dp->scd_path == NULL) { 142 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 143 "none", 0, ZPROP_SRC_LOCAL); 144 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 145 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 146 dp->scd_path, 0, ZPROP_SRC_LOCAL); 147 } 148 } 149 } 150 151 /* 152 * Get zpool property values. 153 */ 154 int 155 spa_prop_get(spa_t *spa, nvlist_t **nvp) 156 { 157 zap_cursor_t zc; 158 zap_attribute_t za; 159 objset_t *mos = spa->spa_meta_objset; 160 int err; 161 162 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 163 164 /* 165 * Get properties from the spa config. 166 */ 167 spa_prop_get_config(spa, nvp); 168 169 mutex_enter(&spa->spa_props_lock); 170 /* If no pool property object, no more prop to get. */ 171 if (spa->spa_pool_props_object == 0) { 172 mutex_exit(&spa->spa_props_lock); 173 return (0); 174 } 175 176 /* 177 * Get properties from the MOS pool property object. 
178 */ 179 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 180 (err = zap_cursor_retrieve(&zc, &za)) == 0; 181 zap_cursor_advance(&zc)) { 182 uint64_t intval = 0; 183 char *strval = NULL; 184 zprop_source_t src = ZPROP_SRC_DEFAULT; 185 zpool_prop_t prop; 186 187 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 188 continue; 189 190 switch (za.za_integer_length) { 191 case 8: 192 /* integer property */ 193 if (za.za_first_integer != 194 zpool_prop_default_numeric(prop)) 195 src = ZPROP_SRC_LOCAL; 196 197 if (prop == ZPOOL_PROP_BOOTFS) { 198 dsl_pool_t *dp; 199 dsl_dataset_t *ds = NULL; 200 201 dp = spa_get_dsl(spa); 202 rw_enter(&dp->dp_config_rwlock, RW_READER); 203 if (err = dsl_dataset_hold_obj(dp, 204 za.za_first_integer, FTAG, &ds)) { 205 rw_exit(&dp->dp_config_rwlock); 206 break; 207 } 208 209 strval = kmem_alloc( 210 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 211 KM_SLEEP); 212 dsl_dataset_name(ds, strval); 213 dsl_dataset_rele(ds, FTAG); 214 rw_exit(&dp->dp_config_rwlock); 215 } else { 216 strval = NULL; 217 intval = za.za_first_integer; 218 } 219 220 spa_prop_add_list(*nvp, prop, strval, intval, src); 221 222 if (strval != NULL) 223 kmem_free(strval, 224 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 225 226 break; 227 228 case 1: 229 /* string property */ 230 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 231 err = zap_lookup(mos, spa->spa_pool_props_object, 232 za.za_name, 1, za.za_num_integers, strval); 233 if (err) { 234 kmem_free(strval, za.za_num_integers); 235 break; 236 } 237 spa_prop_add_list(*nvp, prop, strval, 0, src); 238 kmem_free(strval, za.za_num_integers); 239 break; 240 241 default: 242 break; 243 } 244 } 245 zap_cursor_fini(&zc); 246 mutex_exit(&spa->spa_props_lock); 247 out: 248 if (err && err != ENOENT) { 249 nvlist_free(*nvp); 250 *nvp = NULL; 251 return (err); 252 } 253 254 return (0); 255 } 256 257 /* 258 * Validate the given pool properties nvlist and modify the list 259 * for the property values to be set. 
260 */ 261 static int 262 spa_prop_validate(spa_t *spa, nvlist_t *props) 263 { 264 nvpair_t *elem; 265 int error = 0, reset_bootfs = 0; 266 uint64_t objnum; 267 268 elem = NULL; 269 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 270 zpool_prop_t prop; 271 char *propname, *strval; 272 uint64_t intval; 273 objset_t *os; 274 char *slash; 275 276 propname = nvpair_name(elem); 277 278 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 279 return (EINVAL); 280 281 switch (prop) { 282 case ZPOOL_PROP_VERSION: 283 error = nvpair_value_uint64(elem, &intval); 284 if (!error && 285 (intval < spa_version(spa) || intval > SPA_VERSION)) 286 error = EINVAL; 287 break; 288 289 case ZPOOL_PROP_DELEGATION: 290 case ZPOOL_PROP_AUTOREPLACE: 291 case ZPOOL_PROP_LISTSNAPS: 292 error = nvpair_value_uint64(elem, &intval); 293 if (!error && intval > 1) 294 error = EINVAL; 295 break; 296 297 case ZPOOL_PROP_BOOTFS: 298 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 299 error = ENOTSUP; 300 break; 301 } 302 303 /* 304 * Make sure the vdev config is bootable 305 */ 306 if (!vdev_is_bootable(spa->spa_root_vdev)) { 307 error = ENOTSUP; 308 break; 309 } 310 311 reset_bootfs = 1; 312 313 error = nvpair_value_string(elem, &strval); 314 315 if (!error) { 316 uint64_t compress; 317 318 if (strval == NULL || strval[0] == '\0') { 319 objnum = zpool_prop_default_numeric( 320 ZPOOL_PROP_BOOTFS); 321 break; 322 } 323 324 if (error = dmu_objset_open(strval, DMU_OST_ZFS, 325 DS_MODE_USER | DS_MODE_READONLY, &os)) 326 break; 327 328 /* We don't support gzip bootable datasets */ 329 if ((error = dsl_prop_get_integer(strval, 330 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 331 &compress, NULL)) == 0 && 332 !BOOTFS_COMPRESS_VALID(compress)) { 333 error = ENOTSUP; 334 } else { 335 objnum = dmu_objset_id(os); 336 } 337 dmu_objset_close(os); 338 } 339 break; 340 case ZPOOL_PROP_FAILUREMODE: 341 error = nvpair_value_uint64(elem, &intval); 342 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 343 intval > ZIO_FAILURE_MODE_PANIC)) 344 error = EINVAL; 345 346 /* 347 * This is a special case which only occurs when 348 * the pool has completely failed. This allows 349 * the user to change the in-core failmode property 350 * without syncing it out to disk (I/Os might 351 * currently be blocked). We do this by returning 352 * EIO to the caller (spa_prop_set) to trick it 353 * into thinking we encountered a property validation 354 * error. 
355 */ 356 if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) { 357 spa->spa_failmode = intval; 358 error = EIO; 359 } 360 break; 361 362 case ZPOOL_PROP_CACHEFILE: 363 if ((error = nvpair_value_string(elem, &strval)) != 0) 364 break; 365 366 if (strval[0] == '\0') 367 break; 368 369 if (strcmp(strval, "none") == 0) 370 break; 371 372 if (strval[0] != '/') { 373 error = EINVAL; 374 break; 375 } 376 377 slash = strrchr(strval, '/'); 378 ASSERT(slash != NULL); 379 380 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 381 strcmp(slash, "/..") == 0) 382 error = EINVAL; 383 break; 384 } 385 386 if (error) 387 break; 388 } 389 390 if (!error && reset_bootfs) { 391 error = nvlist_remove(props, 392 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 393 394 if (!error) { 395 error = nvlist_add_uint64(props, 396 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 397 } 398 } 399 400 return (error); 401 } 402 403 int 404 spa_prop_set(spa_t *spa, nvlist_t *nvp) 405 { 406 int error; 407 408 if ((error = spa_prop_validate(spa, nvp)) != 0) 409 return (error); 410 411 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 412 spa, nvp, 3)); 413 } 414 415 /* 416 * If the bootfs property value is dsobj, clear it. 417 */ 418 void 419 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 420 { 421 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 422 VERIFY(zap_remove(spa->spa_meta_objset, 423 spa->spa_pool_props_object, 424 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 425 spa->spa_bootfs = 0; 426 } 427 } 428 429 /* 430 * ========================================================================== 431 * SPA state manipulation (open/create/destroy/import/export) 432 * ========================================================================== 433 */ 434 435 static int 436 spa_error_entry_compare(const void *a, const void *b) 437 { 438 spa_error_entry_t *sa = (spa_error_entry_t *)a; 439 spa_error_entry_t *sb = (spa_error_entry_t *)b; 440 int ret; 441 442 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 443 sizeof (zbookmark_t)); 444 445 if (ret < 0) 446 return (-1); 447 else if (ret > 0) 448 return (1); 449 else 450 return (0); 451 } 452 453 /* 454 * Utility function which retrieves copies of the current logs and 455 * re-initializes them in the process. 456 */ 457 void 458 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 459 { 460 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 461 462 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 463 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 464 465 avl_create(&spa->spa_errlist_scrub, 466 spa_error_entry_compare, sizeof (spa_error_entry_t), 467 offsetof(spa_error_entry_t, se_avl)); 468 avl_create(&spa->spa_errlist_last, 469 spa_error_entry_compare, sizeof (spa_error_entry_t), 470 offsetof(spa_error_entry_t, se_avl)); 471 } 472 473 /* 474 * Activate an uninitialized pool. 
475 */ 476 static void 477 spa_activate(spa_t *spa) 478 { 479 int t; 480 481 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 482 483 spa->spa_state = POOL_STATE_ACTIVE; 484 485 spa->spa_normal_class = metaslab_class_create(); 486 spa->spa_log_class = metaslab_class_create(); 487 488 for (t = 0; t < ZIO_TYPES; t++) { 489 spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 490 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 491 TASKQ_PREPOPULATE); 492 spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 493 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 494 TASKQ_PREPOPULATE); 495 } 496 497 list_create(&spa->spa_dirty_list, sizeof (vdev_t), 498 offsetof(vdev_t, vdev_dirty_node)); 499 list_create(&spa->spa_zio_list, sizeof (zio_t), 500 offsetof(zio_t, zio_link_node)); 501 502 txg_list_create(&spa->spa_vdev_txg_list, 503 offsetof(struct vdev, vdev_txg_node)); 504 505 avl_create(&spa->spa_errlist_scrub, 506 spa_error_entry_compare, sizeof (spa_error_entry_t), 507 offsetof(spa_error_entry_t, se_avl)); 508 avl_create(&spa->spa_errlist_last, 509 spa_error_entry_compare, sizeof (spa_error_entry_t), 510 offsetof(spa_error_entry_t, se_avl)); 511 } 512 513 /* 514 * Opposite of spa_activate(). 515 */ 516 static void 517 spa_deactivate(spa_t *spa) 518 { 519 int t; 520 521 ASSERT(spa->spa_sync_on == B_FALSE); 522 ASSERT(spa->spa_dsl_pool == NULL); 523 ASSERT(spa->spa_root_vdev == NULL); 524 525 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 526 527 txg_list_destroy(&spa->spa_vdev_txg_list); 528 529 list_destroy(&spa->spa_dirty_list); 530 list_destroy(&spa->spa_zio_list); 531 532 for (t = 0; t < ZIO_TYPES; t++) { 533 taskq_destroy(spa->spa_zio_issue_taskq[t]); 534 taskq_destroy(spa->spa_zio_intr_taskq[t]); 535 spa->spa_zio_issue_taskq[t] = NULL; 536 spa->spa_zio_intr_taskq[t] = NULL; 537 } 538 539 metaslab_class_destroy(spa->spa_normal_class); 540 spa->spa_normal_class = NULL; 541 542 metaslab_class_destroy(spa->spa_log_class); 543 spa->spa_log_class = NULL; 544 545 /* 546 * If this was part of an import or the open otherwise failed, we may 547 * still have errors left in the queues. Empty them just in case. 548 */ 549 spa_errlog_drain(spa); 550 551 avl_destroy(&spa->spa_errlist_scrub); 552 avl_destroy(&spa->spa_errlist_last); 553 554 spa->spa_state = POOL_STATE_UNINITIALIZED; 555 } 556 557 /* 558 * Verify a pool configuration, and construct the vdev tree appropriately. This 559 * will create all the necessary vdevs in the appropriate layout, with each vdev 560 * in the CLOSED state. This will prep the pool before open/creation/import. 561 * All vdev validation is done by the vdev_alloc() routine. 562 */ 563 static int 564 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 565 uint_t id, int atype) 566 { 567 nvlist_t **child; 568 uint_t c, children; 569 int error; 570 571 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 572 return (error); 573 574 if ((*vdp)->vdev_ops->vdev_op_leaf) 575 return (0); 576 577 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 578 &child, &children) != 0) { 579 vdev_free(*vdp); 580 *vdp = NULL; 581 return (EINVAL); 582 } 583 584 for (c = 0; c < children; c++) { 585 vdev_t *vd; 586 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 587 atype)) != 0) { 588 vdev_free(*vdp); 589 *vdp = NULL; 590 return (error); 591 } 592 } 593 594 ASSERT(*vdp != NULL); 595 596 return (0); 597 } 598 599 /* 600 * Opposite of spa_load(). 
601 */ 602 static void 603 spa_unload(spa_t *spa) 604 { 605 int i; 606 607 /* 608 * Stop async tasks. 609 */ 610 spa_async_suspend(spa); 611 612 /* 613 * Stop syncing. 614 */ 615 if (spa->spa_sync_on) { 616 txg_sync_stop(spa->spa_dsl_pool); 617 spa->spa_sync_on = B_FALSE; 618 } 619 620 /* 621 * Wait for any outstanding prefetch I/O to complete. 622 */ 623 spa_config_enter(spa, RW_WRITER, FTAG); 624 spa_config_exit(spa, FTAG); 625 626 /* 627 * Drop and purge level 2 cache 628 */ 629 spa_l2cache_drop(spa); 630 631 /* 632 * Close the dsl pool. 633 */ 634 if (spa->spa_dsl_pool) { 635 dsl_pool_close(spa->spa_dsl_pool); 636 spa->spa_dsl_pool = NULL; 637 } 638 639 /* 640 * Close all vdevs. 641 */ 642 if (spa->spa_root_vdev) 643 vdev_free(spa->spa_root_vdev); 644 ASSERT(spa->spa_root_vdev == NULL); 645 646 for (i = 0; i < spa->spa_spares.sav_count; i++) 647 vdev_free(spa->spa_spares.sav_vdevs[i]); 648 if (spa->spa_spares.sav_vdevs) { 649 kmem_free(spa->spa_spares.sav_vdevs, 650 spa->spa_spares.sav_count * sizeof (void *)); 651 spa->spa_spares.sav_vdevs = NULL; 652 } 653 if (spa->spa_spares.sav_config) { 654 nvlist_free(spa->spa_spares.sav_config); 655 spa->spa_spares.sav_config = NULL; 656 } 657 spa->spa_spares.sav_count = 0; 658 659 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 660 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 661 if (spa->spa_l2cache.sav_vdevs) { 662 kmem_free(spa->spa_l2cache.sav_vdevs, 663 spa->spa_l2cache.sav_count * sizeof (void *)); 664 spa->spa_l2cache.sav_vdevs = NULL; 665 } 666 if (spa->spa_l2cache.sav_config) { 667 nvlist_free(spa->spa_l2cache.sav_config); 668 spa->spa_l2cache.sav_config = NULL; 669 } 670 spa->spa_l2cache.sav_count = 0; 671 672 spa->spa_async_suspended = 0; 673 } 674 675 /* 676 * Load (or re-load) the current list of vdevs describing the active spares for 677 * this pool. When this is called, we have some form of basic information in 678 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 679 * then re-generate a more complete list including status information. 680 */ 681 static void 682 spa_load_spares(spa_t *spa) 683 { 684 nvlist_t **spares; 685 uint_t nspares; 686 int i; 687 vdev_t *vd, *tvd; 688 689 /* 690 * First, close and free any existing spare vdevs. 691 */ 692 for (i = 0; i < spa->spa_spares.sav_count; i++) { 693 vd = spa->spa_spares.sav_vdevs[i]; 694 695 /* Undo the call to spa_activate() below */ 696 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 697 B_FALSE)) != NULL && tvd->vdev_isspare) 698 spa_spare_remove(tvd); 699 vdev_close(vd); 700 vdev_free(vd); 701 } 702 703 if (spa->spa_spares.sav_vdevs) 704 kmem_free(spa->spa_spares.sav_vdevs, 705 spa->spa_spares.sav_count * sizeof (void *)); 706 707 if (spa->spa_spares.sav_config == NULL) 708 nspares = 0; 709 else 710 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 711 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 712 713 spa->spa_spares.sav_count = (int)nspares; 714 spa->spa_spares.sav_vdevs = NULL; 715 716 if (nspares == 0) 717 return; 718 719 /* 720 * Construct the array of vdevs, opening them to get status in the 721 * process. For each spare, there is potentially two different vdev_t 722 * structures associated with it: one in the list of spares (used only 723 * for basic validation purposes) and one in the active vdev 724 * configuration (if it's spared in). During this phase we open and 725 * validate each vdev on the spare list. If the vdev also exists in the 726 * active configuration, then we also mark this vdev as an active spare. 
727 */ 728 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 729 KM_SLEEP); 730 for (i = 0; i < spa->spa_spares.sav_count; i++) { 731 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 732 VDEV_ALLOC_SPARE) == 0); 733 ASSERT(vd != NULL); 734 735 spa->spa_spares.sav_vdevs[i] = vd; 736 737 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 738 B_FALSE)) != NULL) { 739 if (!tvd->vdev_isspare) 740 spa_spare_add(tvd); 741 742 /* 743 * We only mark the spare active if we were successfully 744 * able to load the vdev. Otherwise, importing a pool 745 * with a bad active spare would result in strange 746 * behavior, because multiple pool would think the spare 747 * is actively in use. 748 * 749 * There is a vulnerability here to an equally bizarre 750 * circumstance, where a dead active spare is later 751 * brought back to life (onlined or otherwise). Given 752 * the rarity of this scenario, and the extra complexity 753 * it adds, we ignore the possibility. 754 */ 755 if (!vdev_is_dead(tvd)) 756 spa_spare_activate(tvd); 757 } 758 759 if (vdev_open(vd) != 0) 760 continue; 761 762 vd->vdev_top = vd; 763 if (vdev_validate_aux(vd) == 0) 764 spa_spare_add(vd); 765 } 766 767 /* 768 * Recompute the stashed list of spares, with status information 769 * this time. 770 */ 771 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 772 DATA_TYPE_NVLIST_ARRAY) == 0); 773 774 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 775 KM_SLEEP); 776 for (i = 0; i < spa->spa_spares.sav_count; i++) 777 spares[i] = vdev_config_generate(spa, 778 spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 779 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 780 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 781 for (i = 0; i < spa->spa_spares.sav_count; i++) 782 nvlist_free(spares[i]); 783 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 784 } 785 786 /* 787 * Load (or re-load) the current list of vdevs describing the active l2cache for 788 * this pool. When this is called, we have some form of basic information in 789 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 790 * then re-generate a more complete list including status information. 791 * Devices which are already active have their details maintained, and are 792 * not re-opened. 793 */ 794 static void 795 spa_load_l2cache(spa_t *spa) 796 { 797 nvlist_t **l2cache; 798 uint_t nl2cache; 799 int i, j, oldnvdevs; 800 uint64_t guid, size; 801 vdev_t *vd, **oldvdevs, **newvdevs; 802 spa_aux_vdev_t *sav = &spa->spa_l2cache; 803 804 if (sav->sav_config != NULL) { 805 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 806 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 807 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 808 } else { 809 nl2cache = 0; 810 } 811 812 oldvdevs = sav->sav_vdevs; 813 oldnvdevs = sav->sav_count; 814 sav->sav_vdevs = NULL; 815 sav->sav_count = 0; 816 817 /* 818 * Process new nvlist of vdevs. 819 */ 820 for (i = 0; i < nl2cache; i++) { 821 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 822 &guid) == 0); 823 824 newvdevs[i] = NULL; 825 for (j = 0; j < oldnvdevs; j++) { 826 vd = oldvdevs[j]; 827 if (vd != NULL && guid == vd->vdev_guid) { 828 /* 829 * Retain previous vdev for add/remove ops. 
830 */ 831 newvdevs[i] = vd; 832 oldvdevs[j] = NULL; 833 break; 834 } 835 } 836 837 if (newvdevs[i] == NULL) { 838 /* 839 * Create new vdev 840 */ 841 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 842 VDEV_ALLOC_L2CACHE) == 0); 843 ASSERT(vd != NULL); 844 newvdevs[i] = vd; 845 846 /* 847 * Commit this vdev as an l2cache device, 848 * even if it fails to open. 849 */ 850 spa_l2cache_add(vd); 851 852 vd->vdev_top = vd; 853 vd->vdev_aux = sav; 854 855 spa_l2cache_activate(vd); 856 857 if (vdev_open(vd) != 0) 858 continue; 859 860 (void) vdev_validate_aux(vd); 861 862 if (!vdev_is_dead(vd)) { 863 size = vdev_get_rsize(vd); 864 l2arc_add_vdev(spa, vd, 865 VDEV_LABEL_START_SIZE, 866 size - VDEV_LABEL_START_SIZE); 867 } 868 } 869 } 870 871 /* 872 * Purge vdevs that were dropped 873 */ 874 for (i = 0; i < oldnvdevs; i++) { 875 uint64_t pool; 876 877 vd = oldvdevs[i]; 878 if (vd != NULL) { 879 if (spa_mode & FWRITE && 880 spa_l2cache_exists(vd->vdev_guid, &pool) && 881 pool != 0ULL && 882 l2arc_vdev_present(vd)) { 883 l2arc_remove_vdev(vd); 884 } 885 (void) vdev_close(vd); 886 spa_l2cache_remove(vd); 887 } 888 } 889 890 if (oldvdevs) 891 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 892 893 if (sav->sav_config == NULL) 894 goto out; 895 896 sav->sav_vdevs = newvdevs; 897 sav->sav_count = (int)nl2cache; 898 899 /* 900 * Recompute the stashed list of l2cache devices, with status 901 * information this time. 902 */ 903 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 904 DATA_TYPE_NVLIST_ARRAY) == 0); 905 906 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 907 for (i = 0; i < sav->sav_count; i++) 908 l2cache[i] = vdev_config_generate(spa, 909 sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 910 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 911 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 912 out: 913 for (i = 0; i < sav->sav_count; i++) 914 nvlist_free(l2cache[i]); 915 if (sav->sav_count) 916 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 917 } 918 919 static int 920 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 921 { 922 dmu_buf_t *db; 923 char *packed = NULL; 924 size_t nvsize = 0; 925 int error; 926 *value = NULL; 927 928 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 929 nvsize = *(uint64_t *)db->db_data; 930 dmu_buf_rele(db, FTAG); 931 932 packed = kmem_alloc(nvsize, KM_SLEEP); 933 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 934 if (error == 0) 935 error = nvlist_unpack(packed, nvsize, value, 0); 936 kmem_free(packed, nvsize); 937 938 return (error); 939 } 940 941 /* 942 * Checks to see if the given vdev could not be opened, in which case we post a 943 * sysevent to notify the autoreplace code that the device has been removed. 
944 */ 945 static void 946 spa_check_removed(vdev_t *vd) 947 { 948 int c; 949 950 for (c = 0; c < vd->vdev_children; c++) 951 spa_check_removed(vd->vdev_child[c]); 952 953 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 954 zfs_post_autoreplace(vd->vdev_spa, vd); 955 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 956 } 957 } 958 959 /* 960 * Check for missing log devices 961 */ 962 int 963 spa_check_logs(spa_t *spa) 964 { 965 switch (spa->spa_log_state) { 966 case SPA_LOG_MISSING: 967 /* need to recheck in case slog has been restored */ 968 case SPA_LOG_UNKNOWN: 969 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 970 DS_FIND_CHILDREN)) { 971 spa->spa_log_state = SPA_LOG_MISSING; 972 return (1); 973 } 974 break; 975 976 case SPA_LOG_CLEAR: 977 (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, 978 DS_FIND_CHILDREN); 979 break; 980 } 981 spa->spa_log_state = SPA_LOG_GOOD; 982 return (0); 983 } 984 985 /* 986 * Load an existing storage pool, using the pool's builtin spa_config as a 987 * source of configuration information. 988 */ 989 static int 990 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 991 { 992 int error = 0; 993 nvlist_t *nvroot = NULL; 994 vdev_t *rvd; 995 uberblock_t *ub = &spa->spa_uberblock; 996 uint64_t config_cache_txg = spa->spa_config_txg; 997 uint64_t pool_guid; 998 uint64_t version; 999 zio_t *zio; 1000 uint64_t autoreplace = 0; 1001 char *ereport = FM_EREPORT_ZFS_POOL; 1002 1003 spa->spa_load_state = state; 1004 1005 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1006 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1007 error = EINVAL; 1008 goto out; 1009 } 1010 1011 /* 1012 * Versioning wasn't explicitly added to the label until later, so if 1013 * it's not present treat it as the initial version. 1014 */ 1015 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1016 version = SPA_VERSION_INITIAL; 1017 1018 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1019 &spa->spa_config_txg); 1020 1021 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1022 spa_guid_exists(pool_guid, 0)) { 1023 error = EEXIST; 1024 goto out; 1025 } 1026 1027 spa->spa_load_guid = pool_guid; 1028 1029 /* 1030 * Parse the configuration into a vdev tree. We explicitly set the 1031 * value that will be returned by spa_version() since parsing the 1032 * configuration requires knowing the version number. 1033 */ 1034 spa_config_enter(spa, RW_WRITER, FTAG); 1035 spa->spa_ubsync.ub_version = version; 1036 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1037 spa_config_exit(spa, FTAG); 1038 1039 if (error != 0) 1040 goto out; 1041 1042 ASSERT(spa->spa_root_vdev == rvd); 1043 ASSERT(spa_guid(spa) == pool_guid); 1044 1045 /* 1046 * Try to open all vdevs, loading each label in the process. 1047 */ 1048 error = vdev_open(rvd); 1049 if (error != 0) 1050 goto out; 1051 1052 /* 1053 * Validate the labels for all leaf vdevs. We need to grab the config 1054 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 1055 * flag. 1056 */ 1057 spa_config_enter(spa, RW_READER, FTAG); 1058 error = vdev_validate(rvd); 1059 spa_config_exit(spa, FTAG); 1060 1061 if (error != 0) 1062 goto out; 1063 1064 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1065 error = ENXIO; 1066 goto out; 1067 } 1068 1069 /* 1070 * Find the best uberblock. 
1071 */ 1072 bzero(ub, sizeof (uberblock_t)); 1073 1074 zio = zio_root(spa, NULL, NULL, 1075 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1076 vdev_uberblock_load(zio, rvd, ub); 1077 error = zio_wait(zio); 1078 1079 /* 1080 * If we weren't able to find a single valid uberblock, return failure. 1081 */ 1082 if (ub->ub_txg == 0) { 1083 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1084 VDEV_AUX_CORRUPT_DATA); 1085 error = ENXIO; 1086 goto out; 1087 } 1088 1089 /* 1090 * If the pool is newer than the code, we can't open it. 1091 */ 1092 if (ub->ub_version > SPA_VERSION) { 1093 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1094 VDEV_AUX_VERSION_NEWER); 1095 error = ENOTSUP; 1096 goto out; 1097 } 1098 1099 /* 1100 * If the vdev guid sum doesn't match the uberblock, we have an 1101 * incomplete configuration. 1102 */ 1103 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1104 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1105 VDEV_AUX_BAD_GUID_SUM); 1106 error = ENXIO; 1107 goto out; 1108 } 1109 1110 /* 1111 * Initialize internal SPA structures. 1112 */ 1113 spa->spa_state = POOL_STATE_ACTIVE; 1114 spa->spa_ubsync = spa->spa_uberblock; 1115 spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 1116 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1117 if (error) { 1118 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1119 VDEV_AUX_CORRUPT_DATA); 1120 goto out; 1121 } 1122 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1123 1124 if (zap_lookup(spa->spa_meta_objset, 1125 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1126 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1127 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1128 VDEV_AUX_CORRUPT_DATA); 1129 error = EIO; 1130 goto out; 1131 } 1132 1133 if (!mosconfig) { 1134 nvlist_t *newconfig; 1135 uint64_t hostid; 1136 1137 if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 1138 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1139 VDEV_AUX_CORRUPT_DATA); 1140 error = EIO; 1141 goto out; 1142 } 1143 1144 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 1145 &hostid) == 0) { 1146 char *hostname; 1147 unsigned long myhostid = 0; 1148 1149 VERIFY(nvlist_lookup_string(newconfig, 1150 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1151 1152 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1153 if (hostid != 0 && myhostid != 0 && 1154 (unsigned long)hostid != myhostid) { 1155 cmn_err(CE_WARN, "pool '%s' could not be " 1156 "loaded as it was last accessed by " 1157 "another system (host: %s hostid: 0x%lx). " 1158 "See: http://www.sun.com/msg/ZFS-8000-EY", 1159 spa->spa_name, hostname, 1160 (unsigned long)hostid); 1161 error = EBADF; 1162 goto out; 1163 } 1164 } 1165 1166 spa_config_set(spa, newconfig); 1167 spa_unload(spa); 1168 spa_deactivate(spa); 1169 spa_activate(spa); 1170 1171 return (spa_load(spa, newconfig, state, B_TRUE)); 1172 } 1173 1174 if (zap_lookup(spa->spa_meta_objset, 1175 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1176 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 1177 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1178 VDEV_AUX_CORRUPT_DATA); 1179 error = EIO; 1180 goto out; 1181 } 1182 1183 /* 1184 * Load the bit that tells us to use the new accounting function 1185 * (raid-z deflation). If we have an older pool, this will not 1186 * be present. 
1187 */ 1188 error = zap_lookup(spa->spa_meta_objset, 1189 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1190 sizeof (uint64_t), 1, &spa->spa_deflate); 1191 if (error != 0 && error != ENOENT) { 1192 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1193 VDEV_AUX_CORRUPT_DATA); 1194 error = EIO; 1195 goto out; 1196 } 1197 1198 /* 1199 * Load the persistent error log. If we have an older pool, this will 1200 * not be present. 1201 */ 1202 error = zap_lookup(spa->spa_meta_objset, 1203 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1204 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1205 if (error != 0 && error != ENOENT) { 1206 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1207 VDEV_AUX_CORRUPT_DATA); 1208 error = EIO; 1209 goto out; 1210 } 1211 1212 error = zap_lookup(spa->spa_meta_objset, 1213 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1214 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1215 if (error != 0 && error != ENOENT) { 1216 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1217 VDEV_AUX_CORRUPT_DATA); 1218 error = EIO; 1219 goto out; 1220 } 1221 1222 /* 1223 * Load the history object. If we have an older pool, this 1224 * will not be present. 1225 */ 1226 error = zap_lookup(spa->spa_meta_objset, 1227 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1228 sizeof (uint64_t), 1, &spa->spa_history); 1229 if (error != 0 && error != ENOENT) { 1230 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1231 VDEV_AUX_CORRUPT_DATA); 1232 error = EIO; 1233 goto out; 1234 } 1235 1236 /* 1237 * Load any hot spares for this pool. 1238 */ 1239 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1240 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1241 if (error != 0 && error != ENOENT) { 1242 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1243 VDEV_AUX_CORRUPT_DATA); 1244 error = EIO; 1245 goto out; 1246 } 1247 if (error == 0) { 1248 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1249 if (load_nvlist(spa, spa->spa_spares.sav_object, 1250 &spa->spa_spares.sav_config) != 0) { 1251 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1252 VDEV_AUX_CORRUPT_DATA); 1253 error = EIO; 1254 goto out; 1255 } 1256 1257 spa_config_enter(spa, RW_WRITER, FTAG); 1258 spa_load_spares(spa); 1259 spa_config_exit(spa, FTAG); 1260 } 1261 1262 /* 1263 * Load any level 2 ARC devices for this pool. 
1264 */ 1265 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1266 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1267 &spa->spa_l2cache.sav_object); 1268 if (error != 0 && error != ENOENT) { 1269 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1270 VDEV_AUX_CORRUPT_DATA); 1271 error = EIO; 1272 goto out; 1273 } 1274 if (error == 0) { 1275 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1276 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1277 &spa->spa_l2cache.sav_config) != 0) { 1278 vdev_set_state(rvd, B_TRUE, 1279 VDEV_STATE_CANT_OPEN, 1280 VDEV_AUX_CORRUPT_DATA); 1281 error = EIO; 1282 goto out; 1283 } 1284 1285 spa_config_enter(spa, RW_WRITER, FTAG); 1286 spa_load_l2cache(spa); 1287 spa_config_exit(spa, FTAG); 1288 } 1289 1290 if (spa_check_logs(spa)) { 1291 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1292 VDEV_AUX_BAD_LOG); 1293 error = ENXIO; 1294 ereport = FM_EREPORT_ZFS_LOG_REPLAY; 1295 goto out; 1296 } 1297 1298 1299 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1300 1301 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1302 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1303 1304 if (error && error != ENOENT) { 1305 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1306 VDEV_AUX_CORRUPT_DATA); 1307 error = EIO; 1308 goto out; 1309 } 1310 1311 if (error == 0) { 1312 (void) zap_lookup(spa->spa_meta_objset, 1313 spa->spa_pool_props_object, 1314 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1315 sizeof (uint64_t), 1, &spa->spa_bootfs); 1316 (void) zap_lookup(spa->spa_meta_objset, 1317 spa->spa_pool_props_object, 1318 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1319 sizeof (uint64_t), 1, &autoreplace); 1320 (void) zap_lookup(spa->spa_meta_objset, 1321 spa->spa_pool_props_object, 1322 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1323 sizeof (uint64_t), 1, &spa->spa_delegation); 1324 (void) zap_lookup(spa->spa_meta_objset, 1325 spa->spa_pool_props_object, 1326 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1327 sizeof (uint64_t), 1, &spa->spa_failmode); 1328 } 1329 1330 /* 1331 * If the 'autoreplace' property is set, then post a resource notifying 1332 * the ZFS DE that it should not issue any faults for unopenable 1333 * devices. We also iterate over the vdevs, and post a sysevent for any 1334 * unopenable vdevs so that the normal autoreplace handler can take 1335 * over. 1336 */ 1337 if (autoreplace && state != SPA_LOAD_TRYIMPORT) 1338 spa_check_removed(spa->spa_root_vdev); 1339 1340 /* 1341 * Load the vdev state for all toplevel vdevs. 1342 */ 1343 vdev_load(rvd); 1344 1345 /* 1346 * Propagate the leaf DTLs we just loaded all the way up the tree. 1347 */ 1348 spa_config_enter(spa, RW_WRITER, FTAG); 1349 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1350 spa_config_exit(spa, FTAG); 1351 1352 /* 1353 * Check the state of the root vdev. If it can't be opened, it 1354 * indicates one or more toplevel vdevs are faulted. 1355 */ 1356 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1357 error = ENXIO; 1358 goto out; 1359 } 1360 1361 if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 1362 dmu_tx_t *tx; 1363 int need_update = B_FALSE; 1364 int c; 1365 1366 /* 1367 * Claim log blocks that haven't been committed yet. 1368 * This must all happen in a single txg. 
1369 */ 1370 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1371 spa_first_txg(spa)); 1372 (void) dmu_objset_find(spa->spa_name, 1373 zil_claim, tx, DS_FIND_CHILDREN); 1374 dmu_tx_commit(tx); 1375 1376 spa->spa_sync_on = B_TRUE; 1377 txg_sync_start(spa->spa_dsl_pool); 1378 1379 /* 1380 * Wait for all claims to sync. 1381 */ 1382 txg_wait_synced(spa->spa_dsl_pool, 0); 1383 1384 /* 1385 * If the config cache is stale, or we have uninitialized 1386 * metaslabs (see spa_vdev_add()), then update the config. 1387 */ 1388 if (config_cache_txg != spa->spa_config_txg || 1389 state == SPA_LOAD_IMPORT) 1390 need_update = B_TRUE; 1391 1392 for (c = 0; c < rvd->vdev_children; c++) 1393 if (rvd->vdev_child[c]->vdev_ms_array == 0) 1394 need_update = B_TRUE; 1395 1396 /* 1397 * Update the config cache asychronously in case we're the 1398 * root pool, in which case the config cache isn't writable yet. 1399 */ 1400 if (need_update) 1401 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1402 } 1403 1404 error = 0; 1405 out: 1406 spa->spa_minref = refcount_count(&spa->spa_refcount); 1407 if (error && error != EBADF) 1408 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1409 spa->spa_load_state = SPA_LOAD_NONE; 1410 spa->spa_ena = 0; 1411 1412 return (error); 1413 } 1414 1415 /* 1416 * Pool Open/Import 1417 * 1418 * The import case is identical to an open except that the configuration is sent 1419 * down from userland, instead of grabbed from the configuration cache. For the 1420 * case of an open, the pool configuration will exist in the 1421 * POOL_STATE_UNINITIALIZED state. 1422 * 1423 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1424 * the same time open the pool, without having to keep around the spa_t in some 1425 * ambiguous state. 1426 */ 1427 static int 1428 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1429 { 1430 spa_t *spa; 1431 int error; 1432 int locked = B_FALSE; 1433 1434 *spapp = NULL; 1435 1436 /* 1437 * As disgusting as this is, we need to support recursive calls to this 1438 * function because dsl_dir_open() is called during spa_load(), and ends 1439 * up calling spa_open() again. The real fix is to figure out how to 1440 * avoid dsl_dir_open() calling this in the first place. 1441 */ 1442 if (mutex_owner(&spa_namespace_lock) != curthread) { 1443 mutex_enter(&spa_namespace_lock); 1444 locked = B_TRUE; 1445 } 1446 1447 if ((spa = spa_lookup(pool)) == NULL) { 1448 if (locked) 1449 mutex_exit(&spa_namespace_lock); 1450 return (ENOENT); 1451 } 1452 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1453 1454 spa_activate(spa); 1455 1456 error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1457 1458 if (error == EBADF) { 1459 /* 1460 * If vdev_validate() returns failure (indicated by 1461 * EBADF), it indicates that one of the vdevs indicates 1462 * that the pool has been exported or destroyed. If 1463 * this is the case, the config cache is out of sync and 1464 * we should remove the pool from the namespace. 1465 */ 1466 spa_unload(spa); 1467 spa_deactivate(spa); 1468 spa_config_sync(spa, B_TRUE, B_TRUE); 1469 spa_remove(spa); 1470 if (locked) 1471 mutex_exit(&spa_namespace_lock); 1472 return (ENOENT); 1473 } 1474 1475 if (error) { 1476 /* 1477 * We can't open the pool, but we still have useful 1478 * information: the state of each vdev after the 1479 * attempted vdev_open(). Return this to the user. 
1480 */ 1481 if (config != NULL && spa->spa_root_vdev != NULL) { 1482 spa_config_enter(spa, RW_READER, FTAG); 1483 *config = spa_config_generate(spa, NULL, -1ULL, 1484 B_TRUE); 1485 spa_config_exit(spa, FTAG); 1486 } 1487 spa_unload(spa); 1488 spa_deactivate(spa); 1489 spa->spa_last_open_failed = B_TRUE; 1490 if (locked) 1491 mutex_exit(&spa_namespace_lock); 1492 *spapp = NULL; 1493 return (error); 1494 } else { 1495 spa->spa_last_open_failed = B_FALSE; 1496 } 1497 } 1498 1499 spa_open_ref(spa, tag); 1500 1501 if (locked) 1502 mutex_exit(&spa_namespace_lock); 1503 1504 *spapp = spa; 1505 1506 if (config != NULL) { 1507 spa_config_enter(spa, RW_READER, FTAG); 1508 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1509 spa_config_exit(spa, FTAG); 1510 } 1511 1512 return (0); 1513 } 1514 1515 int 1516 spa_open(const char *name, spa_t **spapp, void *tag) 1517 { 1518 return (spa_open_common(name, spapp, tag, NULL)); 1519 } 1520 1521 /* 1522 * Lookup the given spa_t, incrementing the inject count in the process, 1523 * preventing it from being exported or destroyed. 1524 */ 1525 spa_t * 1526 spa_inject_addref(char *name) 1527 { 1528 spa_t *spa; 1529 1530 mutex_enter(&spa_namespace_lock); 1531 if ((spa = spa_lookup(name)) == NULL) { 1532 mutex_exit(&spa_namespace_lock); 1533 return (NULL); 1534 } 1535 spa->spa_inject_ref++; 1536 mutex_exit(&spa_namespace_lock); 1537 1538 return (spa); 1539 } 1540 1541 void 1542 spa_inject_delref(spa_t *spa) 1543 { 1544 mutex_enter(&spa_namespace_lock); 1545 spa->spa_inject_ref--; 1546 mutex_exit(&spa_namespace_lock); 1547 } 1548 1549 /* 1550 * Add spares device information to the nvlist. 1551 */ 1552 static void 1553 spa_add_spares(spa_t *spa, nvlist_t *config) 1554 { 1555 nvlist_t **spares; 1556 uint_t i, nspares; 1557 nvlist_t *nvroot; 1558 uint64_t guid; 1559 vdev_stat_t *vs; 1560 uint_t vsc; 1561 uint64_t pool; 1562 1563 if (spa->spa_spares.sav_count == 0) 1564 return; 1565 1566 VERIFY(nvlist_lookup_nvlist(config, 1567 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1568 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1569 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1570 if (nspares != 0) { 1571 VERIFY(nvlist_add_nvlist_array(nvroot, 1572 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1573 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1574 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1575 1576 /* 1577 * Go through and find any spares which have since been 1578 * repurposed as an active spare. If this is the case, update 1579 * their status appropriately. 1580 */ 1581 for (i = 0; i < nspares; i++) { 1582 VERIFY(nvlist_lookup_uint64(spares[i], 1583 ZPOOL_CONFIG_GUID, &guid) == 0); 1584 if (spa_spare_exists(guid, &pool, NULL) && 1585 pool != 0ULL) { 1586 VERIFY(nvlist_lookup_uint64_array( 1587 spares[i], ZPOOL_CONFIG_STATS, 1588 (uint64_t **)&vs, &vsc) == 0); 1589 vs->vs_state = VDEV_STATE_CANT_OPEN; 1590 vs->vs_aux = VDEV_AUX_SPARED; 1591 } 1592 } 1593 } 1594 } 1595 1596 /* 1597 * Add l2cache device information to the nvlist, including vdev stats. 
1598 */ 1599 static void 1600 spa_add_l2cache(spa_t *spa, nvlist_t *config) 1601 { 1602 nvlist_t **l2cache; 1603 uint_t i, j, nl2cache; 1604 nvlist_t *nvroot; 1605 uint64_t guid; 1606 vdev_t *vd; 1607 vdev_stat_t *vs; 1608 uint_t vsc; 1609 1610 if (spa->spa_l2cache.sav_count == 0) 1611 return; 1612 1613 spa_config_enter(spa, RW_READER, FTAG); 1614 1615 VERIFY(nvlist_lookup_nvlist(config, 1616 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 1617 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 1618 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1619 if (nl2cache != 0) { 1620 VERIFY(nvlist_add_nvlist_array(nvroot, 1621 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1622 VERIFY(nvlist_lookup_nvlist_array(nvroot, 1623 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1624 1625 /* 1626 * Update level 2 cache device stats. 1627 */ 1628 1629 for (i = 0; i < nl2cache; i++) { 1630 VERIFY(nvlist_lookup_uint64(l2cache[i], 1631 ZPOOL_CONFIG_GUID, &guid) == 0); 1632 1633 vd = NULL; 1634 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 1635 if (guid == 1636 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 1637 vd = spa->spa_l2cache.sav_vdevs[j]; 1638 break; 1639 } 1640 } 1641 ASSERT(vd != NULL); 1642 1643 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 1644 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 1645 vdev_get_stats(vd, vs); 1646 } 1647 } 1648 1649 spa_config_exit(spa, FTAG); 1650 } 1651 1652 int 1653 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1654 { 1655 int error; 1656 spa_t *spa; 1657 1658 *config = NULL; 1659 error = spa_open_common(name, &spa, FTAG, config); 1660 1661 if (spa && *config != NULL) { 1662 VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 1663 spa_get_errlog_size(spa)) == 0); 1664 1665 spa_add_spares(spa, *config); 1666 spa_add_l2cache(spa, *config); 1667 } 1668 1669 /* 1670 * We want to get the alternate root even for faulted pools, so we cheat 1671 * and call spa_lookup() directly. 1672 */ 1673 if (altroot) { 1674 if (spa == NULL) { 1675 mutex_enter(&spa_namespace_lock); 1676 spa = spa_lookup(name); 1677 if (spa) 1678 spa_altroot(spa, altroot, buflen); 1679 else 1680 altroot[0] = '\0'; 1681 spa = NULL; 1682 mutex_exit(&spa_namespace_lock); 1683 } else { 1684 spa_altroot(spa, altroot, buflen); 1685 } 1686 } 1687 1688 if (spa != NULL) 1689 spa_close(spa, FTAG); 1690 1691 return (error); 1692 } 1693 1694 /* 1695 * Validate that the auxiliary device array is well formed. We must have an 1696 * array of nvlists, each which describes a valid leaf vdev. If this is an 1697 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 1698 * specified, as long as they are well-formed. 1699 */ 1700 static int 1701 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 1702 spa_aux_vdev_t *sav, const char *config, uint64_t version, 1703 vdev_labeltype_t label) 1704 { 1705 nvlist_t **dev; 1706 uint_t i, ndev; 1707 vdev_t *vd; 1708 int error; 1709 1710 /* 1711 * It's acceptable to have no devs specified. 1712 */ 1713 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 1714 return (0); 1715 1716 if (ndev == 0) 1717 return (EINVAL); 1718 1719 /* 1720 * Make sure the pool is formatted with a version that supports this 1721 * device type. 1722 */ 1723 if (spa_version(spa) < version) 1724 return (ENOTSUP); 1725 1726 /* 1727 * Set the pending device list so we correctly handle device in-use 1728 * checking. 
1729 */ 1730 sav->sav_pending = dev; 1731 sav->sav_npending = ndev; 1732 1733 for (i = 0; i < ndev; i++) { 1734 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 1735 mode)) != 0) 1736 goto out; 1737 1738 if (!vd->vdev_ops->vdev_op_leaf) { 1739 vdev_free(vd); 1740 error = EINVAL; 1741 goto out; 1742 } 1743 1744 /* 1745 * The L2ARC currently only supports disk devices. 1746 */ 1747 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 1748 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 1749 error = ENOTBLK; 1750 goto out; 1751 } 1752 1753 vd->vdev_top = vd; 1754 1755 if ((error = vdev_open(vd)) == 0 && 1756 (error = vdev_label_init(vd, crtxg, label)) == 0) { 1757 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 1758 vd->vdev_guid) == 0); 1759 } 1760 1761 vdev_free(vd); 1762 1763 if (error && 1764 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 1765 goto out; 1766 else 1767 error = 0; 1768 } 1769 1770 out: 1771 sav->sav_pending = NULL; 1772 sav->sav_npending = 0; 1773 return (error); 1774 } 1775 1776 static int 1777 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 1778 { 1779 int error; 1780 1781 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 1782 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 1783 VDEV_LABEL_SPARE)) != 0) { 1784 return (error); 1785 } 1786 1787 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 1788 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 1789 VDEV_LABEL_L2CACHE)); 1790 } 1791 1792 static void 1793 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 1794 const char *config) 1795 { 1796 int i; 1797 1798 if (sav->sav_config != NULL) { 1799 nvlist_t **olddevs; 1800 uint_t oldndevs; 1801 nvlist_t **newdevs; 1802 1803 /* 1804 * Generate new dev list by concatentating with the 1805 * current dev list. 1806 */ 1807 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 1808 &olddevs, &oldndevs) == 0); 1809 1810 newdevs = kmem_alloc(sizeof (void *) * 1811 (ndevs + oldndevs), KM_SLEEP); 1812 for (i = 0; i < oldndevs; i++) 1813 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 1814 KM_SLEEP) == 0); 1815 for (i = 0; i < ndevs; i++) 1816 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 1817 KM_SLEEP) == 0); 1818 1819 VERIFY(nvlist_remove(sav->sav_config, config, 1820 DATA_TYPE_NVLIST_ARRAY) == 0); 1821 1822 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1823 config, newdevs, ndevs + oldndevs) == 0); 1824 for (i = 0; i < oldndevs + ndevs; i++) 1825 nvlist_free(newdevs[i]); 1826 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 1827 } else { 1828 /* 1829 * Generate a new dev list. 
1830 */ 1831 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 1832 KM_SLEEP) == 0); 1833 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 1834 devs, ndevs) == 0); 1835 } 1836 } 1837 1838 /* 1839 * Stop and drop level 2 ARC devices 1840 */ 1841 void 1842 spa_l2cache_drop(spa_t *spa) 1843 { 1844 vdev_t *vd; 1845 int i; 1846 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1847 1848 for (i = 0; i < sav->sav_count; i++) { 1849 uint64_t pool; 1850 1851 vd = sav->sav_vdevs[i]; 1852 ASSERT(vd != NULL); 1853 1854 if (spa_mode & FWRITE && 1855 spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && 1856 l2arc_vdev_present(vd)) { 1857 l2arc_remove_vdev(vd); 1858 } 1859 if (vd->vdev_isl2cache) 1860 spa_l2cache_remove(vd); 1861 vdev_clear_stats(vd); 1862 (void) vdev_close(vd); 1863 } 1864 } 1865 1866 /* 1867 * Pool Creation 1868 */ 1869 int 1870 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 1871 const char *history_str, nvlist_t *zplprops) 1872 { 1873 spa_t *spa; 1874 char *altroot = NULL; 1875 vdev_t *rvd; 1876 dsl_pool_t *dp; 1877 dmu_tx_t *tx; 1878 int c, error = 0; 1879 uint64_t txg = TXG_INITIAL; 1880 nvlist_t **spares, **l2cache; 1881 uint_t nspares, nl2cache; 1882 uint64_t version; 1883 1884 /* 1885 * If this pool already exists, return failure. 1886 */ 1887 mutex_enter(&spa_namespace_lock); 1888 if (spa_lookup(pool) != NULL) { 1889 mutex_exit(&spa_namespace_lock); 1890 return (EEXIST); 1891 } 1892 1893 /* 1894 * Allocate a new spa_t structure. 1895 */ 1896 (void) nvlist_lookup_string(props, 1897 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 1898 spa = spa_add(pool, altroot); 1899 spa_activate(spa); 1900 1901 spa->spa_uberblock.ub_txg = txg - 1; 1902 1903 if (props && (error = spa_prop_validate(spa, props))) { 1904 spa_unload(spa); 1905 spa_deactivate(spa); 1906 spa_remove(spa); 1907 mutex_exit(&spa_namespace_lock); 1908 return (error); 1909 } 1910 1911 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 1912 &version) != 0) 1913 version = SPA_VERSION; 1914 ASSERT(version <= SPA_VERSION); 1915 spa->spa_uberblock.ub_version = version; 1916 spa->spa_ubsync = spa->spa_uberblock; 1917 1918 /* 1919 * Create the root vdev. 1920 */ 1921 spa_config_enter(spa, RW_WRITER, FTAG); 1922 1923 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1924 1925 ASSERT(error != 0 || rvd != NULL); 1926 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 1927 1928 if (error == 0 && !zfs_allocatable_devs(nvroot)) 1929 error = EINVAL; 1930 1931 if (error == 0 && 1932 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 1933 (error = spa_validate_aux(spa, nvroot, txg, 1934 VDEV_ALLOC_ADD)) == 0) { 1935 for (c = 0; c < rvd->vdev_children; c++) 1936 vdev_init(rvd->vdev_child[c], txg); 1937 vdev_config_dirty(rvd); 1938 } 1939 1940 spa_config_exit(spa, FTAG); 1941 1942 if (error != 0) { 1943 spa_unload(spa); 1944 spa_deactivate(spa); 1945 spa_remove(spa); 1946 mutex_exit(&spa_namespace_lock); 1947 return (error); 1948 } 1949 1950 /* 1951 * Get the list of spares, if specified. 
1952 */ 1953 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1954 &spares, &nspares) == 0) { 1955 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 1956 KM_SLEEP) == 0); 1957 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1958 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 1959 spa_config_enter(spa, RW_WRITER, FTAG); 1960 spa_load_spares(spa); 1961 spa_config_exit(spa, FTAG); 1962 spa->spa_spares.sav_sync = B_TRUE; 1963 } 1964 1965 /* 1966 * Get the list of level 2 cache devices, if specified. 1967 */ 1968 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1969 &l2cache, &nl2cache) == 0) { 1970 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 1971 NV_UNIQUE_NAME, KM_SLEEP) == 0); 1972 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 1973 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 1974 spa_config_enter(spa, RW_WRITER, FTAG); 1975 spa_load_l2cache(spa); 1976 spa_config_exit(spa, FTAG); 1977 spa->spa_l2cache.sav_sync = B_TRUE; 1978 } 1979 1980 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 1981 spa->spa_meta_objset = dp->dp_meta_objset; 1982 1983 tx = dmu_tx_create_assigned(dp, txg); 1984 1985 /* 1986 * Create the pool config object. 1987 */ 1988 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1989 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 1990 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1991 1992 if (zap_add(spa->spa_meta_objset, 1993 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1994 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 1995 cmn_err(CE_PANIC, "failed to add pool config"); 1996 } 1997 1998 /* Newly created pools with the right version are always deflated. */ 1999 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2000 spa->spa_deflate = TRUE; 2001 if (zap_add(spa->spa_meta_objset, 2002 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2003 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2004 cmn_err(CE_PANIC, "failed to add deflate"); 2005 } 2006 } 2007 2008 /* 2009 * Create the deferred-free bplist object. Turn off compression 2010 * because sync-to-convergence takes longer if the blocksize 2011 * keeps changing. 2012 */ 2013 spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 2014 1 << 14, tx); 2015 dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 2016 ZIO_COMPRESS_OFF, tx); 2017 2018 if (zap_add(spa->spa_meta_objset, 2019 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2020 sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 2021 cmn_err(CE_PANIC, "failed to add bplist"); 2022 } 2023 2024 /* 2025 * Create the pool's history object. 2026 */ 2027 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2028 spa_history_create_obj(spa, tx); 2029 2030 /* 2031 * Set pool properties. 2032 */ 2033 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2034 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2035 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2036 if (props) 2037 spa_sync_props(spa, props, CRED(), tx); 2038 2039 dmu_tx_commit(tx); 2040 2041 spa->spa_sync_on = B_TRUE; 2042 txg_sync_start(spa->spa_dsl_pool); 2043 2044 /* 2045 * We explicitly wait for the first transaction to complete so that our 2046 * bean counters are appropriately updated. 
2047 */ 2048 txg_wait_synced(spa->spa_dsl_pool, txg); 2049 2050 spa_config_sync(spa, B_FALSE, B_TRUE); 2051 2052 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2053 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2054 2055 mutex_exit(&spa_namespace_lock); 2056 2057 spa->spa_minref = refcount_count(&spa->spa_refcount); 2058 2059 return (0); 2060 } 2061 2062 /* 2063 * Import the given pool into the system. We set up the necessary spa_t and 2064 * then call spa_load() to do the dirty work. 2065 */ 2066 static int 2067 spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, 2068 boolean_t isroot, boolean_t allowfaulted) 2069 { 2070 spa_t *spa; 2071 char *altroot = NULL; 2072 int error, loaderr; 2073 nvlist_t *nvroot; 2074 nvlist_t **spares, **l2cache; 2075 uint_t nspares, nl2cache; 2076 2077 /* 2078 * If a pool with this name exists, return failure. 2079 */ 2080 mutex_enter(&spa_namespace_lock); 2081 if (spa_lookup(pool) != NULL) { 2082 mutex_exit(&spa_namespace_lock); 2083 return (EEXIST); 2084 } 2085 2086 /* 2087 * Create and initialize the spa structure. 2088 */ 2089 (void) nvlist_lookup_string(props, 2090 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2091 spa = spa_add(pool, altroot); 2092 spa_activate(spa); 2093 2094 if (allowfaulted) 2095 spa->spa_import_faulted = B_TRUE; 2096 spa->spa_is_root = isroot; 2097 2098 /* 2099 * Pass off the heavy lifting to spa_load(). 2100 * Pass TRUE for mosconfig (unless this is a root pool) because 2101 * the user-supplied config is actually the one to trust when 2102 * doing an import. 2103 */ 2104 loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); 2105 2106 spa_config_enter(spa, RW_WRITER, FTAG); 2107 /* 2108 * Toss any existing sparelist, as it doesn't have any validity anymore, 2109 * and conflicts with spa_has_spare(). 2110 */ 2111 if (!isroot && spa->spa_spares.sav_config) { 2112 nvlist_free(spa->spa_spares.sav_config); 2113 spa->spa_spares.sav_config = NULL; 2114 spa_load_spares(spa); 2115 } 2116 if (!isroot && spa->spa_l2cache.sav_config) { 2117 nvlist_free(spa->spa_l2cache.sav_config); 2118 spa->spa_l2cache.sav_config = NULL; 2119 spa_load_l2cache(spa); 2120 } 2121 2122 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2123 &nvroot) == 0); 2124 if (error == 0) 2125 error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); 2126 if (error == 0) 2127 error = spa_validate_aux(spa, nvroot, -1ULL, 2128 VDEV_ALLOC_L2CACHE); 2129 spa_config_exit(spa, FTAG); 2130 2131 if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 2132 if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { 2133 /* 2134 * If we failed to load the pool, but 'allowfaulted' is 2135 * set, then manually set the config as if the config 2136 * passed in was specified in the cache file. 2137 */ 2138 error = 0; 2139 spa->spa_import_faulted = B_FALSE; 2140 if (spa->spa_config == NULL) { 2141 spa_config_enter(spa, RW_READER, FTAG); 2142 spa->spa_config = spa_config_generate(spa, 2143 NULL, -1ULL, B_TRUE); 2144 spa_config_exit(spa, FTAG); 2145 } 2146 spa_unload(spa); 2147 spa_deactivate(spa); 2148 spa_config_sync(spa, B_FALSE, B_TRUE); 2149 } else { 2150 spa_unload(spa); 2151 spa_deactivate(spa); 2152 spa_remove(spa); 2153 } 2154 mutex_exit(&spa_namespace_lock); 2155 return (error); 2156 } 2157 2158 /* 2159 * Override any spares and level 2 cache devices as specified by 2160 * the user, as these may have correct device names/devids, etc. 
2161 */ 2162 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2163 &spares, &nspares) == 0) { 2164 if (spa->spa_spares.sav_config) 2165 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2166 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2167 else 2168 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2169 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2170 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2171 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2172 spa_config_enter(spa, RW_WRITER, FTAG); 2173 spa_load_spares(spa); 2174 spa_config_exit(spa, FTAG); 2175 spa->spa_spares.sav_sync = B_TRUE; 2176 } 2177 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2178 &l2cache, &nl2cache) == 0) { 2179 if (spa->spa_l2cache.sav_config) 2180 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2181 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2182 else 2183 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2184 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2185 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2186 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2187 spa_config_enter(spa, RW_WRITER, FTAG); 2188 spa_load_l2cache(spa); 2189 spa_config_exit(spa, FTAG); 2190 spa->spa_l2cache.sav_sync = B_TRUE; 2191 } 2192 2193 if (spa_mode & FWRITE) { 2194 /* 2195 * Update the config cache to include the newly-imported pool. 2196 */ 2197 spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); 2198 } 2199 2200 spa->spa_import_faulted = B_FALSE; 2201 mutex_exit(&spa_namespace_lock); 2202 2203 return (0); 2204 } 2205 2206 #ifdef _KERNEL 2207 /* 2208 * Build a "root" vdev for a top level vdev read in from a rootpool 2209 * device label. 2210 */ 2211 static void 2212 spa_build_rootpool_config(nvlist_t *config) 2213 { 2214 nvlist_t *nvtop, *nvroot; 2215 uint64_t pgid; 2216 2217 /* 2218 * Add this top-level vdev to the child array. 2219 */ 2220 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) 2221 == 0); 2222 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) 2223 == 0); 2224 2225 /* 2226 * Put this pool's top-level vdevs into a root vdev. 2227 */ 2228 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2229 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) 2230 == 0); 2231 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2232 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2233 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2234 &nvtop, 1) == 0); 2235 2236 /* 2237 * Replace the existing vdev_tree with the new root vdev in 2238 * this pool's configuration (remove the old, add the new). 2239 */ 2240 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2241 nvlist_free(nvroot); 2242 } 2243 2244 /* 2245 * Get the root pool information from the root disk, then import the root pool 2246 * during the system boot up time. 
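 *
 * As context for the helpers below, spa_build_rootpool_config() above
 * reshapes the label config roughly as follows (illustrative layout only):
 *
 *	before:  ZPOOL_CONFIG_VDEV_TREE = { type = "disk" (or "mirror"),
 *	             guid = <top-level vdev guid>, path = ..., ... }
 *	after:   ZPOOL_CONFIG_VDEV_TREE = { type = "root", id = 0,
 *	             guid = <pool guid>,
 *	             children = [ <the original top-level vdev nvlist> ] }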
2247 */ 2248 extern nvlist_t *vdev_disk_read_rootlabel(char *, char *); 2249 2250 int 2251 spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, 2252 uint64_t *besttxg) 2253 { 2254 nvlist_t *config; 2255 uint64_t txg; 2256 2257 if ((config = vdev_disk_read_rootlabel(devpath, devid)) == NULL) 2258 return (-1); 2259 2260 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2261 2262 if (bestconf != NULL) 2263 *bestconf = config; 2264 *besttxg = txg; 2265 return (0); 2266 } 2267 2268 boolean_t 2269 spa_rootdev_validate(nvlist_t *nv) 2270 { 2271 uint64_t ival; 2272 2273 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || 2274 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || 2275 nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) 2276 return (B_FALSE); 2277 2278 return (B_TRUE); 2279 } 2280 2281 2282 /* 2283 * Given the boot device's physical path or devid, check if the device 2284 * is in a valid state. If so, return the configuration from the vdev 2285 * label. 2286 */ 2287 int 2288 spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) 2289 { 2290 nvlist_t *conf = NULL; 2291 uint64_t txg = 0; 2292 nvlist_t *nvtop, **child; 2293 char *type; 2294 char *bootpath = NULL; 2295 uint_t children, c; 2296 char *tmp; 2297 2298 if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) 2299 *tmp = '\0'; 2300 if (spa_check_rootconf(devpath, devid, &conf, &txg) < 0) { 2301 cmn_err(CE_NOTE, "error reading device label"); 2302 nvlist_free(conf); 2303 return (EINVAL); 2304 } 2305 if (txg == 0) { 2306 cmn_err(CE_NOTE, "this device is detached"); 2307 nvlist_free(conf); 2308 return (EINVAL); 2309 } 2310 2311 VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, 2312 &nvtop) == 0); 2313 VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); 2314 2315 if (strcmp(type, VDEV_TYPE_DISK) == 0) { 2316 if (spa_rootdev_validate(nvtop)) { 2317 goto out; 2318 } else { 2319 nvlist_free(conf); 2320 return (EINVAL); 2321 } 2322 } 2323 2324 ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); 2325 2326 VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, 2327 &child, &children) == 0); 2328 2329 /* 2330 * Go thru vdevs in the mirror to see if the given device 2331 * has the most recent txg. Only the device with the most 2332 * recent txg has valid information and should be booted. 2333 */ 2334 for (c = 0; c < children; c++) { 2335 char *cdevid, *cpath; 2336 uint64_t tmptxg; 2337 2338 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, 2339 &cpath) != 0) 2340 return (EINVAL); 2341 if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, 2342 &cdevid) != 0) 2343 return (EINVAL); 2344 if ((spa_check_rootconf(cpath, cdevid, NULL, 2345 &tmptxg) == 0) && (tmptxg > txg)) { 2346 txg = tmptxg; 2347 VERIFY(nvlist_lookup_string(child[c], 2348 ZPOOL_CONFIG_PATH, &bootpath) == 0); 2349 } 2350 } 2351 2352 /* Does the best device match the one we've booted from? */ 2353 if (bootpath) { 2354 cmn_err(CE_NOTE, "try booting from '%s'", bootpath); 2355 return (EINVAL); 2356 } 2357 out: 2358 *bestconf = conf; 2359 return (0); 2360 } 2361 2362 /* 2363 * Import a root pool. 2364 * 2365 * For x86. devpath_list will consist of devid and/or physpath name of 2366 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2367 * The GRUB "findroot" command will return the vdev we should boot. 
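 *
 * (Illustrative x86 sketch only, assuming just a physical path is known;
 * whether a devid is also available is platform- and driver-dependent:
 *
 *	error = spa_import_rootpool("/pci@1f,0/ide@d/disk@0,0:a", NULL);
 * )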
2368 * 2369 * For Sparc, devpath_list consists the physpath name of the booting device 2370 * no matter the rootpool is a single device pool or a mirrored pool. 2371 * e.g. 2372 * "/pci@1f,0/ide@d/disk@0,0:a" 2373 */ 2374 int 2375 spa_import_rootpool(char *devpath, char *devid) 2376 { 2377 nvlist_t *conf = NULL; 2378 char *pname; 2379 int error; 2380 2381 /* 2382 * Get the vdev pathname and configuation from the most 2383 * recently updated vdev (highest txg). 2384 */ 2385 if (error = spa_get_rootconf(devpath, devid, &conf)) 2386 goto msg_out; 2387 2388 /* 2389 * Add type "root" vdev to the config. 2390 */ 2391 spa_build_rootpool_config(conf); 2392 2393 VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); 2394 2395 /* 2396 * We specify 'allowfaulted' for this to be treated like spa_open() 2397 * instead of spa_import(). This prevents us from marking vdevs as 2398 * persistently unavailable, and generates FMA ereports as if it were a 2399 * pool open, not import. 2400 */ 2401 error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); 2402 if (error == EEXIST) 2403 error = 0; 2404 2405 nvlist_free(conf); 2406 return (error); 2407 2408 msg_out: 2409 cmn_err(CE_NOTE, "\n" 2410 " *************************************************** \n" 2411 " * This device is not bootable! * \n" 2412 " * It is either offlined or detached or faulted. * \n" 2413 " * Please try to boot from a different device. * \n" 2414 " *************************************************** "); 2415 2416 return (error); 2417 } 2418 #endif 2419 2420 /* 2421 * Import a non-root pool into the system. 2422 */ 2423 int 2424 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2425 { 2426 return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); 2427 } 2428 2429 int 2430 spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) 2431 { 2432 return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); 2433 } 2434 2435 2436 /* 2437 * This (illegal) pool name is used when temporarily importing a spa_t in order 2438 * to get the vdev stats associated with the imported devices. 2439 */ 2440 #define TRYIMPORT_NAME "$import" 2441 2442 nvlist_t * 2443 spa_tryimport(nvlist_t *tryconfig) 2444 { 2445 nvlist_t *config = NULL; 2446 char *poolname; 2447 spa_t *spa; 2448 uint64_t state; 2449 2450 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 2451 return (NULL); 2452 2453 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 2454 return (NULL); 2455 2456 /* 2457 * Create and initialize the spa structure. 2458 */ 2459 mutex_enter(&spa_namespace_lock); 2460 spa = spa_add(TRYIMPORT_NAME, NULL); 2461 spa_activate(spa); 2462 2463 /* 2464 * Pass off the heavy lifting to spa_load(). 2465 * Pass TRUE for mosconfig because the user-supplied config 2466 * is actually the one to trust when doing an import. 2467 */ 2468 (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 2469 2470 /* 2471 * If 'tryconfig' was at least parsable, return the current config. 
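 *
 * A typical discovery sequence in a hypothetical in-kernel caller looks
 * roughly like the sketch below (in practice zpool(1M)/libzfs drives this
 * from userland via ioctls):
 *
 *	nvlist_t *label_config;	(assembled from the vdev labels)
 *	nvlist_t *newconfig;
 *	char *name;
 *
 *	if ((newconfig = spa_tryimport(label_config)) != NULL) {
 *		VERIFY(nvlist_lookup_string(newconfig,
 *		    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
 *		error = spa_import(name, label_config, NULL);
 *		nvlist_free(newconfig);
 *	}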
2472 */ 2473 if (spa->spa_root_vdev != NULL) { 2474 spa_config_enter(spa, RW_READER, FTAG); 2475 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2476 spa_config_exit(spa, FTAG); 2477 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 2478 poolname) == 0); 2479 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 2480 state) == 0); 2481 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 2482 spa->spa_uberblock.ub_timestamp) == 0); 2483 2484 /* 2485 * If the bootfs property exists on this pool then we 2486 * copy it out so that external consumers can tell which 2487 * pools are bootable. 2488 */ 2489 if (spa->spa_bootfs) { 2490 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2491 2492 /* 2493 * We have to play games with the name since the 2494 * pool was opened as TRYIMPORT_NAME. 2495 */ 2496 if (dsl_dsobj_to_dsname(spa->spa_name, 2497 spa->spa_bootfs, tmpname) == 0) { 2498 char *cp; 2499 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 2500 2501 cp = strchr(tmpname, '/'); 2502 if (cp == NULL) { 2503 (void) strlcpy(dsname, tmpname, 2504 MAXPATHLEN); 2505 } else { 2506 (void) snprintf(dsname, MAXPATHLEN, 2507 "%s/%s", poolname, ++cp); 2508 } 2509 VERIFY(nvlist_add_string(config, 2510 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 2511 kmem_free(dsname, MAXPATHLEN); 2512 } 2513 kmem_free(tmpname, MAXPATHLEN); 2514 } 2515 2516 /* 2517 * Add the list of hot spares and level 2 cache devices. 2518 */ 2519 spa_add_spares(spa, config); 2520 spa_add_l2cache(spa, config); 2521 } 2522 2523 spa_unload(spa); 2524 spa_deactivate(spa); 2525 spa_remove(spa); 2526 mutex_exit(&spa_namespace_lock); 2527 2528 return (config); 2529 } 2530 2531 /* 2532 * Pool export/destroy 2533 * 2534 * The act of destroying or exporting a pool is very simple. We make sure there 2535 * is no more pending I/O and any references to the pool are gone. Then, we 2536 * update the pool state and sync all the labels to disk, removing the 2537 * configuration from the cache afterwards. 2538 */ 2539 static int 2540 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 2541 boolean_t force) 2542 { 2543 spa_t *spa; 2544 2545 if (oldconfig) 2546 *oldconfig = NULL; 2547 2548 if (!(spa_mode & FWRITE)) 2549 return (EROFS); 2550 2551 mutex_enter(&spa_namespace_lock); 2552 if ((spa = spa_lookup(pool)) == NULL) { 2553 mutex_exit(&spa_namespace_lock); 2554 return (ENOENT); 2555 } 2556 2557 /* 2558 * Put a hold on the pool, drop the namespace lock, stop async tasks, 2559 * reacquire the namespace lock, and see if we can export. 2560 */ 2561 spa_open_ref(spa, FTAG); 2562 mutex_exit(&spa_namespace_lock); 2563 spa_async_suspend(spa); 2564 mutex_enter(&spa_namespace_lock); 2565 spa_close(spa, FTAG); 2566 2567 /* 2568 * The pool will be in core if it's openable, 2569 * in which case we can modify its state. 2570 */ 2571 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 2572 /* 2573 * Objsets may be open only because they're dirty, so we 2574 * have to force it to sync before checking spa_refcnt. 2575 */ 2576 txg_wait_synced(spa->spa_dsl_pool, 0); 2577 2578 /* 2579 * A pool cannot be exported or destroyed if there are active 2580 * references. If we are resetting a pool, allow references by 2581 * fault injection handlers. 
2582 */ 2583 if (!spa_refcount_zero(spa) || 2584 (spa->spa_inject_ref != 0 && 2585 new_state != POOL_STATE_UNINITIALIZED)) { 2586 spa_async_resume(spa); 2587 mutex_exit(&spa_namespace_lock); 2588 return (EBUSY); 2589 } 2590 2591 /* 2592 * A pool cannot be exported if it has an active shared spare. 2593 * This is to prevent other pools stealing the active spare 2594 * from an exported pool. At user's own will, such pool can 2595 * be forcedly exported. 2596 */ 2597 if (!force && new_state == POOL_STATE_EXPORTED && 2598 spa_has_active_shared_spare(spa)) { 2599 spa_async_resume(spa); 2600 mutex_exit(&spa_namespace_lock); 2601 return (EXDEV); 2602 } 2603 2604 /* 2605 * We want this to be reflected on every label, 2606 * so mark them all dirty. spa_unload() will do the 2607 * final sync that pushes these changes out. 2608 */ 2609 if (new_state != POOL_STATE_UNINITIALIZED) { 2610 spa_config_enter(spa, RW_WRITER, FTAG); 2611 spa->spa_state = new_state; 2612 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 2613 vdev_config_dirty(spa->spa_root_vdev); 2614 spa_config_exit(spa, FTAG); 2615 } 2616 } 2617 2618 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 2619 2620 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2621 spa_unload(spa); 2622 spa_deactivate(spa); 2623 } 2624 2625 if (oldconfig && spa->spa_config) 2626 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 2627 2628 if (new_state != POOL_STATE_UNINITIALIZED) { 2629 spa_config_sync(spa, B_TRUE, B_TRUE); 2630 spa_remove(spa); 2631 } 2632 mutex_exit(&spa_namespace_lock); 2633 2634 return (0); 2635 } 2636 2637 /* 2638 * Destroy a storage pool. 2639 */ 2640 int 2641 spa_destroy(char *pool) 2642 { 2643 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); 2644 } 2645 2646 /* 2647 * Export a storage pool. 2648 */ 2649 int 2650 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) 2651 { 2652 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); 2653 } 2654 2655 /* 2656 * Similar to spa_export(), this unloads the spa_t without actually removing it 2657 * from the namespace in any way. 2658 */ 2659 int 2660 spa_reset(char *pool) 2661 { 2662 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 2663 B_FALSE)); 2664 } 2665 2666 /* 2667 * ========================================================================== 2668 * Device manipulation 2669 * ========================================================================== 2670 */ 2671 2672 /* 2673 * Add a device to a storage pool. 
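 *
 * For example (sketch only; device paths hypothetical, error handling
 * elided), adding a new two-way mirror as an extra top-level vdev:
 *
 *	nvlist_t *c[2], *mirror, *root;
 *
 *	(build c[0] and c[1] as "disk" nvlists with ZPOOL_CONFIG_TYPE and
 *	ZPOOL_CONFIG_PATH, as in the spa_create() sketch earlier)
 *
 *	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_MIRROR) == 0);
 *	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
 *	    c, 2) == 0);
 *	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 *	    VDEV_TYPE_ROOT) == 0);
 *	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 *	    &mirror, 1) == 0);
 *	error = spa_vdev_add(spa, root);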
2674 */ 2675 int 2676 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2677 { 2678 uint64_t txg; 2679 int c, error; 2680 vdev_t *rvd = spa->spa_root_vdev; 2681 vdev_t *vd, *tvd; 2682 nvlist_t **spares, **l2cache; 2683 uint_t nspares, nl2cache; 2684 2685 txg = spa_vdev_enter(spa); 2686 2687 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 2688 VDEV_ALLOC_ADD)) != 0) 2689 return (spa_vdev_exit(spa, NULL, txg, error)); 2690 2691 spa->spa_pending_vdev = vd; 2692 2693 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 2694 &nspares) != 0) 2695 nspares = 0; 2696 2697 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 2698 &nl2cache) != 0) 2699 nl2cache = 0; 2700 2701 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) { 2702 spa->spa_pending_vdev = NULL; 2703 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 2704 } 2705 2706 if (vd->vdev_children != 0) { 2707 if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 2708 spa->spa_pending_vdev = NULL; 2709 return (spa_vdev_exit(spa, vd, txg, error)); 2710 } 2711 } 2712 2713 /* 2714 * We must validate the spares and l2cache devices after checking the 2715 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 2716 */ 2717 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) { 2718 spa->spa_pending_vdev = NULL; 2719 return (spa_vdev_exit(spa, vd, txg, error)); 2720 } 2721 2722 spa->spa_pending_vdev = NULL; 2723 2724 /* 2725 * Transfer each new top-level vdev from vd to rvd. 2726 */ 2727 for (c = 0; c < vd->vdev_children; c++) { 2728 tvd = vd->vdev_child[c]; 2729 vdev_remove_child(vd, tvd); 2730 tvd->vdev_id = rvd->vdev_children; 2731 vdev_add_child(rvd, tvd); 2732 vdev_config_dirty(tvd); 2733 } 2734 2735 if (nspares != 0) { 2736 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 2737 ZPOOL_CONFIG_SPARES); 2738 spa_load_spares(spa); 2739 spa->spa_spares.sav_sync = B_TRUE; 2740 } 2741 2742 if (nl2cache != 0) { 2743 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 2744 ZPOOL_CONFIG_L2CACHE); 2745 spa_load_l2cache(spa); 2746 spa->spa_l2cache.sav_sync = B_TRUE; 2747 } 2748 2749 /* 2750 * We have to be careful when adding new vdevs to an existing pool. 2751 * If other threads start allocating from these vdevs before we 2752 * sync the config cache, and we lose power, then upon reboot we may 2753 * fail to open the pool because there are DVAs that the config cache 2754 * can't translate. Therefore, we first add the vdevs without 2755 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 2756 * and then let spa_config_update() initialize the new metaslabs. 2757 * 2758 * spa_load() checks for added-but-not-initialized vdevs, so that 2759 * if we lose power at any point in this sequence, the remaining 2760 * steps will be completed the next time we load the pool. 2761 */ 2762 (void) spa_vdev_exit(spa, vd, txg, 0); 2763 2764 mutex_enter(&spa_namespace_lock); 2765 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2766 mutex_exit(&spa_namespace_lock); 2767 2768 return (0); 2769 } 2770 2771 /* 2772 * Attach a device to a mirror. The arguments are the path to any device 2773 * in the mirror, and the nvroot for the new device. If the path specifies 2774 * a device that is not mirrored, we automatically insert the mirror vdev. 
2775 * 2776 * If 'replacing' is specified, the new device is intended to replace the 2777 * existing device; in this case the two devices are made into their own 2778 * mirror using the 'replacing' vdev, which is functionally identical to 2779 * the mirror vdev (it actually reuses all the same ops) but has a few 2780 * extra rules: you can't attach to it after it's been created, and upon 2781 * completion of resilvering, the first disk (the one being replaced) 2782 * is automatically detached. 2783 */ 2784 int 2785 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2786 { 2787 uint64_t txg, open_txg; 2788 vdev_t *rvd = spa->spa_root_vdev; 2789 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 2790 vdev_ops_t *pvops; 2791 dmu_tx_t *tx; 2792 char *oldvdpath, *newvdpath; 2793 int newvd_isspare; 2794 int error; 2795 2796 txg = spa_vdev_enter(spa); 2797 2798 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 2799 2800 if (oldvd == NULL) 2801 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2802 2803 if (!oldvd->vdev_ops->vdev_op_leaf) 2804 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2805 2806 pvd = oldvd->vdev_parent; 2807 2808 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 2809 VDEV_ALLOC_ADD)) != 0) 2810 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 2811 2812 if (newrootvd->vdev_children != 1) 2813 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2814 2815 newvd = newrootvd->vdev_child[0]; 2816 2817 if (!newvd->vdev_ops->vdev_op_leaf) 2818 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2819 2820 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2821 return (spa_vdev_exit(spa, newrootvd, txg, error)); 2822 2823 /* 2824 * Spares can't replace logs 2825 */ 2826 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 2827 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2828 2829 if (!replacing) { 2830 /* 2831 * For attach, the only allowable parent is a mirror or the root 2832 * vdev. 2833 */ 2834 if (pvd->vdev_ops != &vdev_mirror_ops && 2835 pvd->vdev_ops != &vdev_root_ops) 2836 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2837 2838 pvops = &vdev_mirror_ops; 2839 } else { 2840 /* 2841 * Active hot spares can only be replaced by inactive hot 2842 * spares. 2843 */ 2844 if (pvd->vdev_ops == &vdev_spare_ops && 2845 pvd->vdev_child[1] == oldvd && 2846 !spa_has_spare(spa, newvd->vdev_guid)) 2847 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2848 2849 /* 2850 * If the source is a hot spare, and the parent isn't already a 2851 * spare, then we want to create a new hot spare. Otherwise, we 2852 * want to create a replacing vdev. The user is not allowed to 2853 * attach to a spared vdev child unless the 'isspare' state is 2854 * the same (spare replaces spare, non-spare replaces 2855 * non-spare). 2856 */ 2857 if (pvd->vdev_ops == &vdev_replacing_ops) 2858 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2859 else if (pvd->vdev_ops == &vdev_spare_ops && 2860 newvd->vdev_isspare != oldvd->vdev_isspare) 2861 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 2862 else if (pvd->vdev_ops != &vdev_spare_ops && 2863 newvd->vdev_isspare) 2864 pvops = &vdev_spare_ops; 2865 else 2866 pvops = &vdev_replacing_ops; 2867 } 2868 2869 /* 2870 * Compare the new device size with the replaceable/attachable 2871 * device size. 
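 *
 * For instance (numbers illustrative only): if vdev_get_rsize(oldvd) is
 * 72GB, attaching a 68GB newvd fails with EOVERFLOW while an 80GB newvd is
 * accepted; likewise, a newvd whose vdev_ashift exceeds the top-level
 * vdev's ashift is rejected with EDOM below.  Callers reach this point as
 * spa_vdev_attach(spa, oldvd_guid, nvroot, B_FALSE) for a plain attach, or
 * with B_TRUE to replace, where nvroot carries a single new leaf built the
 * same way as in the spa_vdev_add() sketch.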
2872 */ 2873 if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2874 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2875 2876 /* 2877 * The new device cannot have a higher alignment requirement 2878 * than the top-level vdev. 2879 */ 2880 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2881 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2882 2883 /* 2884 * If this is an in-place replacement, update oldvd's path and devid 2885 * to make it distinguishable from newvd, and unopenable from now on. 2886 */ 2887 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2888 spa_strfree(oldvd->vdev_path); 2889 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2890 KM_SLEEP); 2891 (void) sprintf(oldvd->vdev_path, "%s/%s", 2892 newvd->vdev_path, "old"); 2893 if (oldvd->vdev_devid != NULL) { 2894 spa_strfree(oldvd->vdev_devid); 2895 oldvd->vdev_devid = NULL; 2896 } 2897 } 2898 2899 /* 2900 * If the parent is not a mirror, or if we're replacing, insert the new 2901 * mirror/replacing/spare vdev above oldvd. 2902 */ 2903 if (pvd->vdev_ops != pvops) 2904 pvd = vdev_add_parent(oldvd, pvops); 2905 2906 ASSERT(pvd->vdev_top->vdev_parent == rvd); 2907 ASSERT(pvd->vdev_ops == pvops); 2908 ASSERT(oldvd->vdev_parent == pvd); 2909 2910 /* 2911 * Extract the new device from its root and add it to pvd. 2912 */ 2913 vdev_remove_child(newrootvd, newvd); 2914 newvd->vdev_id = pvd->vdev_children; 2915 vdev_add_child(pvd, newvd); 2916 2917 /* 2918 * If newvd is smaller than oldvd, but larger than its rsize, 2919 * the addition of newvd may have decreased our parent's asize. 2920 */ 2921 pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 2922 2923 tvd = newvd->vdev_top; 2924 ASSERT(pvd->vdev_top == tvd); 2925 ASSERT(tvd->vdev_parent == rvd); 2926 2927 vdev_config_dirty(tvd); 2928 2929 /* 2930 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2931 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2932 */ 2933 open_txg = txg + TXG_CONCURRENT_STATES - 1; 2934 2935 mutex_enter(&newvd->vdev_dtl_lock); 2936 space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2937 open_txg - TXG_INITIAL + 1); 2938 mutex_exit(&newvd->vdev_dtl_lock); 2939 2940 if (newvd->vdev_isspare) 2941 spa_spare_activate(newvd); 2942 oldvdpath = spa_strdup(vdev_description(oldvd)); 2943 newvdpath = spa_strdup(vdev_description(newvd)); 2944 newvd_isspare = newvd->vdev_isspare; 2945 2946 /* 2947 * Mark newvd's DTL dirty in this txg. 2948 */ 2949 vdev_dirty(tvd, VDD_DTL, newvd, txg); 2950 2951 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2952 2953 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2954 if (dmu_tx_assign(tx, TXG_WAIT) == 0) { 2955 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, 2956 CRED(), "%s vdev=%s %s vdev=%s", 2957 replacing && newvd_isspare ? "spare in" : 2958 replacing ? "replace" : "attach", newvdpath, 2959 replacing ? "for" : "to", oldvdpath); 2960 dmu_tx_commit(tx); 2961 } else { 2962 dmu_tx_abort(tx); 2963 } 2964 2965 spa_strfree(oldvdpath); 2966 spa_strfree(newvdpath); 2967 2968 /* 2969 * Kick off a resilver to update newvd. 2970 */ 2971 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 2972 2973 return (0); 2974 } 2975 2976 /* 2977 * Detach a device from a mirror or replacing vdev. 2978 * If 'replace_done' is specified, only detach if the parent 2979 * is a replacing vdev. 
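 *
 * An explicit caller-driven detach of one side of a two-way mirror is
 * simply (sketch):
 *
 *	error = spa_vdev_detach(spa, leaf_vdev_guid, B_FALSE);
 *
 * whereas spa_vdev_resilver_done() below passes replace_done == B_TRUE so
 * that only completed 'replacing'/'spare' parents are collapsed.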
2980 */ 2981 int 2982 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2983 { 2984 uint64_t txg; 2985 int c, t, error; 2986 vdev_t *rvd = spa->spa_root_vdev; 2987 vdev_t *vd, *pvd, *cvd, *tvd; 2988 boolean_t unspare = B_FALSE; 2989 uint64_t unspare_guid; 2990 size_t len; 2991 2992 txg = spa_vdev_enter(spa); 2993 2994 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 2995 2996 if (vd == NULL) 2997 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2998 2999 if (!vd->vdev_ops->vdev_op_leaf) 3000 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3001 3002 pvd = vd->vdev_parent; 3003 3004 /* 3005 * If replace_done is specified, only remove this device if it's 3006 * the first child of a replacing vdev. For the 'spare' vdev, either 3007 * disk can be removed. 3008 */ 3009 if (replace_done) { 3010 if (pvd->vdev_ops == &vdev_replacing_ops) { 3011 if (vd->vdev_id != 0) 3012 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3013 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3014 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3015 } 3016 } 3017 3018 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3019 spa_version(spa) >= SPA_VERSION_SPARES); 3020 3021 /* 3022 * Only mirror, replacing, and spare vdevs support detach. 3023 */ 3024 if (pvd->vdev_ops != &vdev_replacing_ops && 3025 pvd->vdev_ops != &vdev_mirror_ops && 3026 pvd->vdev_ops != &vdev_spare_ops) 3027 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3028 3029 /* 3030 * If there's only one replica, you can't detach it. 3031 */ 3032 if (pvd->vdev_children <= 1) 3033 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3034 3035 /* 3036 * If all siblings have non-empty DTLs, this device may have the only 3037 * valid copy of the data, which means we cannot safely detach it. 3038 * 3039 * XXX -- as in the vdev_offline() case, we really want a more 3040 * precise DTL check. 3041 */ 3042 for (c = 0; c < pvd->vdev_children; c++) { 3043 uint64_t dirty; 3044 3045 cvd = pvd->vdev_child[c]; 3046 if (cvd == vd) 3047 continue; 3048 if (vdev_is_dead(cvd)) 3049 continue; 3050 mutex_enter(&cvd->vdev_dtl_lock); 3051 dirty = cvd->vdev_dtl_map.sm_space | 3052 cvd->vdev_dtl_scrub.sm_space; 3053 mutex_exit(&cvd->vdev_dtl_lock); 3054 if (!dirty) 3055 break; 3056 } 3057 3058 /* 3059 * If we are a replacing or spare vdev, then we can always detach the 3060 * latter child, as that is how one cancels the operation. 3061 */ 3062 if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 3063 c == pvd->vdev_children) 3064 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3065 3066 /* 3067 * If we are detaching the second disk from a replacing vdev, then 3068 * check to see if we changed the original vdev's path to have "/old" 3069 * at the end in spa_vdev_attach(). If so, undo that change now. 3070 */ 3071 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3072 pvd->vdev_child[0]->vdev_path != NULL && 3073 pvd->vdev_child[1]->vdev_path != NULL) { 3074 ASSERT(pvd->vdev_child[1] == vd); 3075 cvd = pvd->vdev_child[0]; 3076 len = strlen(vd->vdev_path); 3077 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3078 strcmp(cvd->vdev_path + len, "/old") == 0) { 3079 spa_strfree(cvd->vdev_path); 3080 cvd->vdev_path = spa_strdup(vd->vdev_path); 3081 } 3082 } 3083 3084 /* 3085 * If we are detaching the original disk from a spare, then it implies 3086 * that the spare should become a real disk, and be removed from the 3087 * active spare list for the pool. 
3088 */ 3089 if (pvd->vdev_ops == &vdev_spare_ops && 3090 vd->vdev_id == 0) 3091 unspare = B_TRUE; 3092 3093 /* 3094 * Erase the disk labels so the disk can be used for other things. 3095 * This must be done after all other error cases are handled, 3096 * but before we disembowel vd (so we can still do I/O to it). 3097 * But if we can't do it, don't treat the error as fatal -- 3098 * it may be that the unwritability of the disk is the reason 3099 * it's being detached! 3100 */ 3101 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3102 3103 /* 3104 * Remove vd from its parent and compact the parent's children. 3105 */ 3106 vdev_remove_child(pvd, vd); 3107 vdev_compact_children(pvd); 3108 3109 /* 3110 * Remember one of the remaining children so we can get tvd below. 3111 */ 3112 cvd = pvd->vdev_child[0]; 3113 3114 /* 3115 * If we need to remove the remaining child from the list of hot spares, 3116 * do it now, marking the vdev as no longer a spare in the process. We 3117 * must do this before vdev_remove_parent(), because that can change the 3118 * GUID if it creates a new toplevel GUID. 3119 */ 3120 if (unspare) { 3121 ASSERT(cvd->vdev_isspare); 3122 spa_spare_remove(cvd); 3123 unspare_guid = cvd->vdev_guid; 3124 } 3125 3126 /* 3127 * If the parent mirror/replacing vdev only has one child, 3128 * the parent is no longer needed. Remove it from the tree. 3129 */ 3130 if (pvd->vdev_children == 1) 3131 vdev_remove_parent(cvd); 3132 3133 /* 3134 * We don't set tvd until now because the parent we just removed 3135 * may have been the previous top-level vdev. 3136 */ 3137 tvd = cvd->vdev_top; 3138 ASSERT(tvd->vdev_parent == rvd); 3139 3140 /* 3141 * Reevaluate the parent vdev state. 3142 */ 3143 vdev_propagate_state(cvd); 3144 3145 /* 3146 * If the device we just detached was smaller than the others, it may be 3147 * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 3148 * can't fail because the existing metaslabs are already in core, so 3149 * there's nothing to read from disk. 3150 */ 3151 VERIFY(vdev_metaslab_init(tvd, txg) == 0); 3152 3153 vdev_config_dirty(tvd); 3154 3155 /* 3156 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3157 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3158 * But first make sure we're not on any *other* txg's DTL list, to 3159 * prevent vd from being accessed after it's freed. 3160 */ 3161 for (t = 0; t < TXG_SIZE; t++) 3162 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3163 vd->vdev_detached = B_TRUE; 3164 vdev_dirty(tvd, VDD_DTL, vd, txg); 3165 3166 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3167 3168 error = spa_vdev_exit(spa, vd, txg, 0); 3169 3170 /* 3171 * If this was the removal of the original device in a hot spare vdev, 3172 * then we want to go through and remove the device from the hot spare 3173 * list of every other pool. 3174 */ 3175 if (unspare) { 3176 spa = NULL; 3177 mutex_enter(&spa_namespace_lock); 3178 while ((spa = spa_next(spa)) != NULL) { 3179 if (spa->spa_state != POOL_STATE_ACTIVE) 3180 continue; 3181 3182 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3183 } 3184 mutex_exit(&spa_namespace_lock); 3185 } 3186 3187 return (error); 3188 } 3189 3190 /* 3191 * Remove a spares vdev from the nvlist config. 
3192 */ 3193 static int 3194 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare, 3195 nvlist_t **spares, int nspares, vdev_t *vd) 3196 { 3197 nvlist_t *nv, **newspares; 3198 int i, j; 3199 3200 nv = NULL; 3201 for (i = 0; i < nspares; i++) { 3202 uint64_t theguid; 3203 3204 VERIFY(nvlist_lookup_uint64(spares[i], 3205 ZPOOL_CONFIG_GUID, &theguid) == 0); 3206 if (theguid == guid) { 3207 nv = spares[i]; 3208 break; 3209 } 3210 } 3211 3212 /* 3213 * Only remove the hot spare if it's not currently in use in this pool. 3214 */ 3215 if (nv == NULL && vd == NULL) 3216 return (ENOENT); 3217 3218 if (nv == NULL && vd != NULL) 3219 return (ENOTSUP); 3220 3221 if (!unspare && nv != NULL && vd != NULL) 3222 return (EBUSY); 3223 3224 if (nspares == 1) { 3225 newspares = NULL; 3226 } else { 3227 newspares = kmem_alloc((nspares - 1) * sizeof (void *), 3228 KM_SLEEP); 3229 for (i = 0, j = 0; i < nspares; i++) { 3230 if (spares[i] != nv) 3231 VERIFY(nvlist_dup(spares[i], 3232 &newspares[j++], KM_SLEEP) == 0); 3233 } 3234 } 3235 3236 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES, 3237 DATA_TYPE_NVLIST_ARRAY) == 0); 3238 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3239 ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0); 3240 for (i = 0; i < nspares - 1; i++) 3241 nvlist_free(newspares[i]); 3242 kmem_free(newspares, (nspares - 1) * sizeof (void *)); 3243 3244 return (0); 3245 } 3246 3247 /* 3248 * Remove an l2cache vdev from the nvlist config. 3249 */ 3250 static int 3251 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache, 3252 int nl2cache, vdev_t *vd) 3253 { 3254 nvlist_t *nv, **newl2cache; 3255 int i, j; 3256 3257 nv = NULL; 3258 for (i = 0; i < nl2cache; i++) { 3259 uint64_t theguid; 3260 3261 VERIFY(nvlist_lookup_uint64(l2cache[i], 3262 ZPOOL_CONFIG_GUID, &theguid) == 0); 3263 if (theguid == guid) { 3264 nv = l2cache[i]; 3265 break; 3266 } 3267 } 3268 3269 if (vd == NULL) { 3270 for (i = 0; i < nl2cache; i++) { 3271 if (sav->sav_vdevs[i]->vdev_guid == guid) { 3272 vd = sav->sav_vdevs[i]; 3273 break; 3274 } 3275 } 3276 } 3277 3278 if (nv == NULL && vd == NULL) 3279 return (ENOENT); 3280 3281 if (nv == NULL && vd != NULL) 3282 return (ENOTSUP); 3283 3284 if (nl2cache == 1) { 3285 newl2cache = NULL; 3286 } else { 3287 newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *), 3288 KM_SLEEP); 3289 for (i = 0, j = 0; i < nl2cache; i++) { 3290 if (l2cache[i] != nv) 3291 VERIFY(nvlist_dup(l2cache[i], 3292 &newl2cache[j++], KM_SLEEP) == 0); 3293 } 3294 } 3295 3296 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 3297 DATA_TYPE_NVLIST_ARRAY) == 0); 3298 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3299 ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0); 3300 for (i = 0; i < nl2cache - 1; i++) 3301 nvlist_free(newl2cache[i]); 3302 kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *)); 3303 3304 return (0); 3305 } 3306 3307 /* 3308 * Remove a device from the pool. Currently, this supports removing only hot 3309 * spares and level 2 ARC devices. 
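 *
 * Typical usage (sketch): remove an unused spare or cache device by guid:
 *
 *	error = spa_vdev_remove(spa, aux_vdev_guid, B_FALSE);
 *
 * The 'unspare' flag is passed as B_TRUE only by spa_vdev_detach() above,
 * when detaching the original disk of a spared group requires pulling the
 * spare out of every other pool's spare list as well.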
3310 */ 3311 int 3312 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3313 { 3314 vdev_t *vd; 3315 nvlist_t **spares, **l2cache; 3316 uint_t nspares, nl2cache; 3317 int error = 0; 3318 3319 spa_config_enter(spa, RW_WRITER, FTAG); 3320 3321 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3322 3323 if (spa->spa_spares.sav_vdevs != NULL && 3324 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3325 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { 3326 if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare, 3327 spares, nspares, vd)) != 0) 3328 goto cache; 3329 spa_load_spares(spa); 3330 spa->spa_spares.sav_sync = B_TRUE; 3331 goto out; 3332 } 3333 3334 cache: 3335 if (spa->spa_l2cache.sav_vdevs != NULL && 3336 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3337 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { 3338 if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid, 3339 l2cache, nl2cache, vd)) != 0) 3340 goto out; 3341 spa_load_l2cache(spa); 3342 spa->spa_l2cache.sav_sync = B_TRUE; 3343 } 3344 3345 out: 3346 spa_config_exit(spa, FTAG); 3347 return (error); 3348 } 3349 3350 /* 3351 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3352 * current spared, so we can detach it. 3353 */ 3354 static vdev_t * 3355 spa_vdev_resilver_done_hunt(vdev_t *vd) 3356 { 3357 vdev_t *newvd, *oldvd; 3358 int c; 3359 3360 for (c = 0; c < vd->vdev_children; c++) { 3361 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 3362 if (oldvd != NULL) 3363 return (oldvd); 3364 } 3365 3366 /* 3367 * Check for a completed replacement. 3368 */ 3369 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 3370 oldvd = vd->vdev_child[0]; 3371 newvd = vd->vdev_child[1]; 3372 3373 mutex_enter(&newvd->vdev_dtl_lock); 3374 if (newvd->vdev_dtl_map.sm_space == 0 && 3375 newvd->vdev_dtl_scrub.sm_space == 0) { 3376 mutex_exit(&newvd->vdev_dtl_lock); 3377 return (oldvd); 3378 } 3379 mutex_exit(&newvd->vdev_dtl_lock); 3380 } 3381 3382 /* 3383 * Check for a completed resilver with the 'unspare' flag set. 3384 */ 3385 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 3386 newvd = vd->vdev_child[0]; 3387 oldvd = vd->vdev_child[1]; 3388 3389 mutex_enter(&newvd->vdev_dtl_lock); 3390 if (newvd->vdev_unspare && 3391 newvd->vdev_dtl_map.sm_space == 0 && 3392 newvd->vdev_dtl_scrub.sm_space == 0) { 3393 newvd->vdev_unspare = 0; 3394 mutex_exit(&newvd->vdev_dtl_lock); 3395 return (oldvd); 3396 } 3397 mutex_exit(&newvd->vdev_dtl_lock); 3398 } 3399 3400 return (NULL); 3401 } 3402 3403 static void 3404 spa_vdev_resilver_done(spa_t *spa) 3405 { 3406 vdev_t *vd; 3407 vdev_t *pvd; 3408 uint64_t guid; 3409 uint64_t pguid = 0; 3410 3411 spa_config_enter(spa, RW_READER, FTAG); 3412 3413 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 3414 guid = vd->vdev_guid; 3415 /* 3416 * If we have just finished replacing a hot spared device, then 3417 * we need to detach the parent's first child (the original hot 3418 * spare) as well. 
3419 */ 3420 pvd = vd->vdev_parent; 3421 if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3422 pvd->vdev_id == 0) { 3423 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 3424 ASSERT(pvd->vdev_parent->vdev_children == 2); 3425 pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 3426 } 3427 spa_config_exit(spa, FTAG); 3428 if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 3429 return; 3430 if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 3431 return; 3432 spa_config_enter(spa, RW_READER, FTAG); 3433 } 3434 3435 spa_config_exit(spa, FTAG); 3436 } 3437 3438 /* 3439 * Update the stored path for this vdev. Dirty the vdev configuration, relying 3440 * on spa_vdev_enter/exit() to synchronize the labels and cache. 3441 */ 3442 int 3443 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 3444 { 3445 vdev_t *vd; 3446 uint64_t txg; 3447 3448 txg = spa_vdev_enter(spa); 3449 3450 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { 3451 /* 3452 * Determine if this is a reference to a hot spare device. If 3453 * it is, update the path manually as there is no associated 3454 * vdev_t that can be synced to disk. 3455 */ 3456 nvlist_t **spares; 3457 uint_t i, nspares; 3458 3459 if (spa->spa_spares.sav_config != NULL) { 3460 VERIFY(nvlist_lookup_nvlist_array( 3461 spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 3462 &spares, &nspares) == 0); 3463 for (i = 0; i < nspares; i++) { 3464 uint64_t theguid; 3465 VERIFY(nvlist_lookup_uint64(spares[i], 3466 ZPOOL_CONFIG_GUID, &theguid) == 0); 3467 if (theguid == guid) { 3468 VERIFY(nvlist_add_string(spares[i], 3469 ZPOOL_CONFIG_PATH, newpath) == 0); 3470 spa_load_spares(spa); 3471 spa->spa_spares.sav_sync = B_TRUE; 3472 return (spa_vdev_exit(spa, NULL, txg, 3473 0)); 3474 } 3475 } 3476 } 3477 3478 return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 3479 } 3480 3481 if (!vd->vdev_ops->vdev_op_leaf) 3482 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3483 3484 spa_strfree(vd->vdev_path); 3485 vd->vdev_path = spa_strdup(newpath); 3486 3487 vdev_config_dirty(vd->vdev_top); 3488 3489 return (spa_vdev_exit(spa, NULL, txg, 0)); 3490 } 3491 3492 /* 3493 * ========================================================================== 3494 * SPA Scrubbing 3495 * ========================================================================== 3496 */ 3497 3498 int 3499 spa_scrub(spa_t *spa, pool_scrub_type_t type) 3500 { 3501 ASSERT(!spa_config_held(spa, RW_WRITER)); 3502 3503 if ((uint_t)type >= POOL_SCRUB_TYPES) 3504 return (ENOTSUP); 3505 3506 /* 3507 * If a resilver was requested, but there is no DTL on a 3508 * writeable leaf device, we have nothing to do. 
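 *
 * For reference, external callers drive this entry point roughly as
 * (sketch):
 *
 *	error = spa_scrub(spa, POOL_SCRUB_EVERYTHING);	(start a full scrub)
 *	error = spa_scrub(spa, POOL_SCRUB_NONE);	(cancel it)
 *
 * Resilvers are normally requested asynchronously via
 * spa_async_request(spa, SPA_ASYNC_RESILVER), which ends up calling this
 * function from spa_async_thread().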
3509 */ 3510 if (type == POOL_SCRUB_RESILVER && 3511 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 3512 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 3513 return (0); 3514 } 3515 3516 if (type == POOL_SCRUB_EVERYTHING && 3517 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 3518 spa->spa_dsl_pool->dp_scrub_isresilver) 3519 return (EBUSY); 3520 3521 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 3522 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 3523 } else if (type == POOL_SCRUB_NONE) { 3524 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 3525 } else { 3526 return (EINVAL); 3527 } 3528 } 3529 3530 /* 3531 * ========================================================================== 3532 * SPA async task processing 3533 * ========================================================================== 3534 */ 3535 3536 static void 3537 spa_async_remove(spa_t *spa, vdev_t *vd) 3538 { 3539 int c; 3540 3541 if (vd->vdev_remove_wanted) { 3542 vd->vdev_remove_wanted = 0; 3543 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 3544 vdev_clear(spa, vd, B_TRUE); 3545 vdev_config_dirty(vd->vdev_top); 3546 } 3547 3548 for (c = 0; c < vd->vdev_children; c++) 3549 spa_async_remove(spa, vd->vdev_child[c]); 3550 } 3551 3552 static void 3553 spa_async_thread(spa_t *spa) 3554 { 3555 int tasks, i; 3556 uint64_t txg; 3557 3558 ASSERT(spa->spa_sync_on); 3559 3560 mutex_enter(&spa->spa_async_lock); 3561 tasks = spa->spa_async_tasks; 3562 spa->spa_async_tasks = 0; 3563 mutex_exit(&spa->spa_async_lock); 3564 3565 /* 3566 * See if the config needs to be updated. 3567 */ 3568 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 3569 mutex_enter(&spa_namespace_lock); 3570 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3571 mutex_exit(&spa_namespace_lock); 3572 } 3573 3574 /* 3575 * See if any devices need to be marked REMOVED. 3576 * 3577 * XXX - We avoid doing this when we are in 3578 * I/O failure state since spa_vdev_enter() grabs 3579 * the namespace lock and would not be able to obtain 3580 * the writer config lock. 3581 */ 3582 if (tasks & SPA_ASYNC_REMOVE && 3583 spa_state(spa) != POOL_STATE_IO_FAILURE) { 3584 txg = spa_vdev_enter(spa); 3585 spa_async_remove(spa, spa->spa_root_vdev); 3586 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 3587 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 3588 for (i = 0; i < spa->spa_spares.sav_count; i++) 3589 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 3590 (void) spa_vdev_exit(spa, NULL, txg, 0); 3591 } 3592 3593 /* 3594 * If any devices are done replacing, detach them. 3595 */ 3596 if (tasks & SPA_ASYNC_RESILVER_DONE) 3597 spa_vdev_resilver_done(spa); 3598 3599 /* 3600 * Kick off a resilver. 3601 */ 3602 if (tasks & SPA_ASYNC_RESILVER) 3603 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 3604 3605 /* 3606 * Let the world know that we're done. 
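 *
 * A producer of async work never runs these tasks directly; it latches a
 * bit and lets the dispatcher start this thread.  A hypothetical removal
 * request, for example, looks roughly like:
 *
 *	vd->vdev_remove_wanted = B_TRUE;
 *	spa_async_request(spa, SPA_ASYNC_REMOVE);
 *
 * and is picked up the next time spa_async_dispatch() runs, e.g. at the
 * end of spa_sync().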
3607 */ 3608 mutex_enter(&spa->spa_async_lock); 3609 spa->spa_async_thread = NULL; 3610 cv_broadcast(&spa->spa_async_cv); 3611 mutex_exit(&spa->spa_async_lock); 3612 thread_exit(); 3613 } 3614 3615 void 3616 spa_async_suspend(spa_t *spa) 3617 { 3618 mutex_enter(&spa->spa_async_lock); 3619 spa->spa_async_suspended++; 3620 while (spa->spa_async_thread != NULL) 3621 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 3622 mutex_exit(&spa->spa_async_lock); 3623 } 3624 3625 void 3626 spa_async_resume(spa_t *spa) 3627 { 3628 mutex_enter(&spa->spa_async_lock); 3629 ASSERT(spa->spa_async_suspended != 0); 3630 spa->spa_async_suspended--; 3631 mutex_exit(&spa->spa_async_lock); 3632 } 3633 3634 static void 3635 spa_async_dispatch(spa_t *spa) 3636 { 3637 mutex_enter(&spa->spa_async_lock); 3638 if (spa->spa_async_tasks && !spa->spa_async_suspended && 3639 spa->spa_async_thread == NULL && 3640 rootdir != NULL && !vn_is_readonly(rootdir)) 3641 spa->spa_async_thread = thread_create(NULL, 0, 3642 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 3643 mutex_exit(&spa->spa_async_lock); 3644 } 3645 3646 void 3647 spa_async_request(spa_t *spa, int task) 3648 { 3649 mutex_enter(&spa->spa_async_lock); 3650 spa->spa_async_tasks |= task; 3651 mutex_exit(&spa->spa_async_lock); 3652 } 3653 3654 /* 3655 * ========================================================================== 3656 * SPA syncing routines 3657 * ========================================================================== 3658 */ 3659 3660 static void 3661 spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3662 { 3663 bplist_t *bpl = &spa->spa_sync_bplist; 3664 dmu_tx_t *tx; 3665 blkptr_t blk; 3666 uint64_t itor = 0; 3667 zio_t *zio; 3668 int error; 3669 uint8_t c = 1; 3670 3671 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3672 3673 while (bplist_iterate(bpl, &itor, &blk) == 0) 3674 zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3675 3676 error = zio_wait(zio); 3677 ASSERT3U(error, ==, 0); 3678 3679 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3680 bplist_vacate(bpl, tx); 3681 3682 /* 3683 * Pre-dirty the first block so we sync to convergence faster. 3684 * (Usually only the first block is needed.) 3685 */ 3686 dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3687 dmu_tx_commit(tx); 3688 } 3689 3690 static void 3691 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 3692 { 3693 char *packed = NULL; 3694 size_t bufsize; 3695 size_t nvsize = 0; 3696 dmu_buf_t *db; 3697 3698 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 3699 3700 /* 3701 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 3702 * information. This avoids the dbuf_will_dirty() path and 3703 * saves us a pre-read to get data we don't actually care about. 
3704 */ 3705 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 3706 packed = kmem_alloc(bufsize, KM_SLEEP); 3707 3708 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 3709 KM_SLEEP) == 0); 3710 bzero(packed + nvsize, bufsize - nvsize); 3711 3712 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 3713 3714 kmem_free(packed, bufsize); 3715 3716 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 3717 dmu_buf_will_dirty(db, tx); 3718 *(uint64_t *)db->db_data = nvsize; 3719 dmu_buf_rele(db, FTAG); 3720 } 3721 3722 static void 3723 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 3724 const char *config, const char *entry) 3725 { 3726 nvlist_t *nvroot; 3727 nvlist_t **list; 3728 int i; 3729 3730 if (!sav->sav_sync) 3731 return; 3732 3733 /* 3734 * Update the MOS nvlist describing the list of available devices. 3735 * spa_validate_aux() will have already made sure this nvlist is 3736 * valid and the vdevs are labeled appropriately. 3737 */ 3738 if (sav->sav_object == 0) { 3739 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 3740 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 3741 sizeof (uint64_t), tx); 3742 VERIFY(zap_update(spa->spa_meta_objset, 3743 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 3744 &sav->sav_object, tx) == 0); 3745 } 3746 3747 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3748 if (sav->sav_count == 0) { 3749 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 3750 } else { 3751 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 3752 for (i = 0; i < sav->sav_count; i++) 3753 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 3754 B_FALSE, B_FALSE, B_TRUE); 3755 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 3756 sav->sav_count) == 0); 3757 for (i = 0; i < sav->sav_count; i++) 3758 nvlist_free(list[i]); 3759 kmem_free(list, sav->sav_count * sizeof (void *)); 3760 } 3761 3762 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 3763 nvlist_free(nvroot); 3764 3765 sav->sav_sync = B_FALSE; 3766 } 3767 3768 static void 3769 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3770 { 3771 nvlist_t *config; 3772 3773 if (list_is_empty(&spa->spa_dirty_list)) 3774 return; 3775 3776 config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 3777 3778 if (spa->spa_config_syncing) 3779 nvlist_free(spa->spa_config_syncing); 3780 spa->spa_config_syncing = config; 3781 3782 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3783 } 3784 3785 /* 3786 * Set zpool properties. 3787 */ 3788 static void 3789 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 3790 { 3791 spa_t *spa = arg1; 3792 objset_t *mos = spa->spa_meta_objset; 3793 nvlist_t *nvp = arg2; 3794 nvpair_t *elem; 3795 uint64_t intval; 3796 char *strval; 3797 zpool_prop_t prop; 3798 const char *propname; 3799 zprop_type_t proptype; 3800 spa_config_dirent_t *dp; 3801 3802 elem = NULL; 3803 while ((elem = nvlist_next_nvpair(nvp, elem))) { 3804 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 3805 case ZPOOL_PROP_VERSION: 3806 /* 3807 * Only set version for non-zpool-creation cases 3808 * (set/import). spa_create() needs special care 3809 * for version setting. 
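 *
 * (For context, the nvlist walked here is the one handed to spa_prop_set()
 * or spa_create(); a minimal sketch of building such a list, error handling
 * elided:
 *
 *	nvlist_t *props;
 *
 *	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1) == 0);
 *	VERIFY(nvlist_add_string(props,
 *	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") == 0);
 *	error = spa_prop_set(spa, props);
 *	nvlist_free(props);
 * )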
3810 */ 3811 if (tx->tx_txg != TXG_INITIAL) { 3812 VERIFY(nvpair_value_uint64(elem, 3813 &intval) == 0); 3814 ASSERT(intval <= SPA_VERSION); 3815 ASSERT(intval >= spa_version(spa)); 3816 spa->spa_uberblock.ub_version = intval; 3817 vdev_config_dirty(spa->spa_root_vdev); 3818 } 3819 break; 3820 3821 case ZPOOL_PROP_ALTROOT: 3822 /* 3823 * 'altroot' is a non-persistent property. It should 3824 * have been set temporarily at creation or import time. 3825 */ 3826 ASSERT(spa->spa_root != NULL); 3827 break; 3828 3829 case ZPOOL_PROP_CACHEFILE: 3830 /* 3831 * 'cachefile' is a non-persistent property, but note 3832 * an async request that the config cache needs to be 3833 * udpated. 3834 */ 3835 VERIFY(nvpair_value_string(elem, &strval) == 0); 3836 3837 dp = kmem_alloc(sizeof (spa_config_dirent_t), 3838 KM_SLEEP); 3839 3840 if (strval[0] == '\0') 3841 dp->scd_path = spa_strdup(spa_config_path); 3842 else if (strcmp(strval, "none") == 0) 3843 dp->scd_path = NULL; 3844 else 3845 dp->scd_path = spa_strdup(strval); 3846 3847 list_insert_head(&spa->spa_config_list, dp); 3848 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3849 break; 3850 default: 3851 /* 3852 * Set pool property values in the poolprops mos object. 3853 */ 3854 mutex_enter(&spa->spa_props_lock); 3855 if (spa->spa_pool_props_object == 0) { 3856 objset_t *mos = spa->spa_meta_objset; 3857 3858 VERIFY((spa->spa_pool_props_object = 3859 zap_create(mos, DMU_OT_POOL_PROPS, 3860 DMU_OT_NONE, 0, tx)) > 0); 3861 3862 VERIFY(zap_update(mos, 3863 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 3864 8, 1, &spa->spa_pool_props_object, tx) 3865 == 0); 3866 } 3867 mutex_exit(&spa->spa_props_lock); 3868 3869 /* normalize the property name */ 3870 propname = zpool_prop_to_name(prop); 3871 proptype = zpool_prop_get_type(prop); 3872 3873 if (nvpair_type(elem) == DATA_TYPE_STRING) { 3874 ASSERT(proptype == PROP_TYPE_STRING); 3875 VERIFY(nvpair_value_string(elem, &strval) == 0); 3876 VERIFY(zap_update(mos, 3877 spa->spa_pool_props_object, propname, 3878 1, strlen(strval) + 1, strval, tx) == 0); 3879 3880 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 3881 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 3882 3883 if (proptype == PROP_TYPE_INDEX) { 3884 const char *unused; 3885 VERIFY(zpool_prop_index_to_string( 3886 prop, intval, &unused) == 0); 3887 } 3888 VERIFY(zap_update(mos, 3889 spa->spa_pool_props_object, propname, 3890 8, 1, &intval, tx) == 0); 3891 } else { 3892 ASSERT(0); /* not allowed */ 3893 } 3894 3895 switch (prop) { 3896 case ZPOOL_PROP_DELEGATION: 3897 spa->spa_delegation = intval; 3898 break; 3899 case ZPOOL_PROP_BOOTFS: 3900 spa->spa_bootfs = intval; 3901 break; 3902 case ZPOOL_PROP_FAILUREMODE: 3903 spa->spa_failmode = intval; 3904 break; 3905 default: 3906 break; 3907 } 3908 } 3909 3910 /* log internal history if this is not a zpool create */ 3911 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 3912 tx->tx_txg != TXG_INITIAL) { 3913 spa_history_internal_log(LOG_POOL_PROPSET, 3914 spa, tx, cr, "%s %lld %s", 3915 nvpair_name(elem), intval, spa->spa_name); 3916 } 3917 } 3918 } 3919 3920 /* 3921 * Sync the specified transaction group. New blocks may be dirtied as 3922 * part of the process, so we iterate until it converges. 
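 *
 * Informally: pass 1 writes the dirty datasets, which dirties the MOS
 * objects and space maps that describe them; pass 2 writes those, and so
 * on.  The loop below exits after the first pass that leaves no vdevs on
 * this txg's dirty-vdev list.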
3923 */ 3924 void 3925 spa_sync(spa_t *spa, uint64_t txg) 3926 { 3927 dsl_pool_t *dp = spa->spa_dsl_pool; 3928 objset_t *mos = spa->spa_meta_objset; 3929 bplist_t *bpl = &spa->spa_sync_bplist; 3930 vdev_t *rvd = spa->spa_root_vdev; 3931 vdev_t *vd; 3932 dmu_tx_t *tx; 3933 int dirty_vdevs; 3934 3935 /* 3936 * Lock out configuration changes. 3937 */ 3938 spa_config_enter(spa, RW_READER, FTAG); 3939 3940 spa->spa_syncing_txg = txg; 3941 spa->spa_sync_pass = 0; 3942 3943 VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3944 3945 tx = dmu_tx_create_assigned(dp, txg); 3946 3947 /* 3948 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 3949 * set spa_deflate if we have no raid-z vdevs. 3950 */ 3951 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 3952 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 3953 int i; 3954 3955 for (i = 0; i < rvd->vdev_children; i++) { 3956 vd = rvd->vdev_child[i]; 3957 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 3958 break; 3959 } 3960 if (i == rvd->vdev_children) { 3961 spa->spa_deflate = TRUE; 3962 VERIFY(0 == zap_add(spa->spa_meta_objset, 3963 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3964 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 3965 } 3966 } 3967 3968 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 3969 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 3970 dsl_pool_create_origin(dp, tx); 3971 3972 /* Keeping the origin open increases spa_minref */ 3973 spa->spa_minref += 3; 3974 } 3975 3976 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 3977 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 3978 dsl_pool_upgrade_clones(dp, tx); 3979 } 3980 3981 /* 3982 * If anything has changed in this txg, push the deferred frees 3983 * from the previous txg. If not, leave them alone so that we 3984 * don't generate work on an otherwise idle system. 3985 */ 3986 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 3987 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 3988 !txg_list_empty(&dp->dp_sync_tasks, txg)) 3989 spa_sync_deferred_frees(spa, txg); 3990 3991 /* 3992 * Iterate to convergence. 3993 */ 3994 do { 3995 spa->spa_sync_pass++; 3996 3997 spa_sync_config_object(spa, tx); 3998 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 3999 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4000 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4001 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4002 spa_errlog_sync(spa, txg); 4003 dsl_pool_sync(dp, txg); 4004 4005 dirty_vdevs = 0; 4006 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 4007 vdev_sync(vd, txg); 4008 dirty_vdevs++; 4009 } 4010 4011 bplist_sync(bpl, tx); 4012 } while (dirty_vdevs); 4013 4014 bplist_close(bpl); 4015 4016 dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 4017 4018 /* 4019 * Rewrite the vdev configuration (which includes the uberblock) 4020 * to commit the transaction group. 4021 * 4022 * If there are no dirty vdevs, we sync the uberblock to a few 4023 * random top-level vdevs that are known to be visible in the 4024 * config cache (see spa_vdev_add() for details). If there *are* 4025 * dirty vdevs -- or if the sync to our random subset fails -- 4026 * then sync the uberblock to all vdevs. 
	 */
	if (list_is_empty(&spa->spa_dirty_list)) {
		vdev_t *svd[SPA_DVAS_PER_BP];
		int svdcount = 0;
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
				continue;
			svd[svdcount++] = vd;
			if (svdcount == SPA_DVAS_PER_BP)
				break;
		}
		vdev_config_sync(svd, svdcount, txg);
	} else {
		vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg);
	}
	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_traverse_wanted = B_TRUE;
	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = B_FALSE;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools. We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state. All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks. The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
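		 *
		 * spa_async_suspend() does not return until the async
		 * thread has exited, so by the time we retake the lock it
		 * is safe to unload, deactivate, and remove the spa.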
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (l2cache) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, once as a spare and
 * once as a replacement.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event. The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h. The payload will be
 * filled in from the spa and (optionally) the vdev. This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
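 *
 * The body below is compiled only under _KERNEL; in userland this function is
 * an empty stub. If attaching any attribute fails, we fall through to 'done',
 * free whatever attribute list was built, and free the event.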
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}
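
/*
 * Usage sketch (illustrative only): a caller that has just removed the vdev
 * 'vd' from pool 'spa' might post the corresponding sysevent using the
 * ESC_ZFS_VDEV_REMOVE class from sys/sysevent/eventdefs.h:
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * Passing a NULL vdev omits the ZFS_EV_VDEV_* attributes, yielding a
 * pool-scoped event.
 */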