/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>

#ifdef	_KERNEL
#include <sys/zone.h>
#include <sys/bootprops.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_tune,			/* fill from zio_taskq_tune_* */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
};

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_TUNE	{ zti_mode_tune, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};

enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
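
/*
 * Illustrative note (added; not part of the original source): with the
 * defaults above, the WRITE row of zio_taskqs[][] expands roughly as
 * follows when spa_activate() creates the taskqs, assuming the ZIO_TYPES
 * ordering given in the comment above:
 *
 *	issue:       ZTI_TUNE   -> taskq sized to 80% of online CPUs
 *	issue_high:  ZTI_FIX(5) -> taskq with 5 threads
 *	intr:        ZTI_FIX(8) -> taskq with 8 threads
 *	intr_high:   ZTI_FIX(5) -> taskq with 5 threads
 */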
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t alloc;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get.
*/ 222 if (spa->spa_pool_props_object == 0) { 223 mutex_exit(&spa->spa_props_lock); 224 return (0); 225 } 226 227 /* 228 * Get properties from the MOS pool property object. 229 */ 230 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 231 (err = zap_cursor_retrieve(&zc, &za)) == 0; 232 zap_cursor_advance(&zc)) { 233 uint64_t intval = 0; 234 char *strval = NULL; 235 zprop_source_t src = ZPROP_SRC_DEFAULT; 236 zpool_prop_t prop; 237 238 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 239 continue; 240 241 switch (za.za_integer_length) { 242 case 8: 243 /* integer property */ 244 if (za.za_first_integer != 245 zpool_prop_default_numeric(prop)) 246 src = ZPROP_SRC_LOCAL; 247 248 if (prop == ZPOOL_PROP_BOOTFS) { 249 dsl_pool_t *dp; 250 dsl_dataset_t *ds = NULL; 251 252 dp = spa_get_dsl(spa); 253 rw_enter(&dp->dp_config_rwlock, RW_READER); 254 if (err = dsl_dataset_hold_obj(dp, 255 za.za_first_integer, FTAG, &ds)) { 256 rw_exit(&dp->dp_config_rwlock); 257 break; 258 } 259 260 strval = kmem_alloc( 261 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 262 KM_SLEEP); 263 dsl_dataset_name(ds, strval); 264 dsl_dataset_rele(ds, FTAG); 265 rw_exit(&dp->dp_config_rwlock); 266 } else { 267 strval = NULL; 268 intval = za.za_first_integer; 269 } 270 271 spa_prop_add_list(*nvp, prop, strval, intval, src); 272 273 if (strval != NULL) 274 kmem_free(strval, 275 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 276 277 break; 278 279 case 1: 280 /* string property */ 281 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 282 err = zap_lookup(mos, spa->spa_pool_props_object, 283 za.za_name, 1, za.za_num_integers, strval); 284 if (err) { 285 kmem_free(strval, za.za_num_integers); 286 break; 287 } 288 spa_prop_add_list(*nvp, prop, strval, 0, src); 289 kmem_free(strval, za.za_num_integers); 290 break; 291 292 default: 293 break; 294 } 295 } 296 zap_cursor_fini(&zc); 297 mutex_exit(&spa->spa_props_lock); 298 out: 299 if (err && err != ENOENT) { 300 nvlist_free(*nvp); 301 *nvp = NULL; 302 return (err); 303 } 304 305 return (0); 306 } 307 308 /* 309 * Validate the given pool properties nvlist and modify the list 310 * for the property values to be set. 311 */ 312 static int 313 spa_prop_validate(spa_t *spa, nvlist_t *props) 314 { 315 nvpair_t *elem; 316 int error = 0, reset_bootfs = 0; 317 uint64_t objnum; 318 319 elem = NULL; 320 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 321 zpool_prop_t prop; 322 char *propname, *strval; 323 uint64_t intval; 324 objset_t *os; 325 char *slash; 326 327 propname = nvpair_name(elem); 328 329 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 330 return (EINVAL); 331 332 switch (prop) { 333 case ZPOOL_PROP_VERSION: 334 error = nvpair_value_uint64(elem, &intval); 335 if (!error && 336 (intval < spa_version(spa) || intval > SPA_VERSION)) 337 error = EINVAL; 338 break; 339 340 case ZPOOL_PROP_DELEGATION: 341 case ZPOOL_PROP_AUTOREPLACE: 342 case ZPOOL_PROP_LISTSNAPS: 343 case ZPOOL_PROP_AUTOEXPAND: 344 error = nvpair_value_uint64(elem, &intval); 345 if (!error && intval > 1) 346 error = EINVAL; 347 break; 348 349 case ZPOOL_PROP_BOOTFS: 350 /* 351 * If the pool version is less than SPA_VERSION_BOOTFS, 352 * or the pool is still being created (version == 0), 353 * the bootfs property cannot be set. 
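			 *
			 * (Added illustrative note, not part of the original
			 * comment: the checks that follow also require the
			 * named dataset, e.g. a hypothetical "tank/ROOT/be",
			 * to be a ZFS filesystem (DMU_OST_ZFS) whose
			 * compression setting satisfies
			 * BOOTFS_COMPRESS_VALID(), and the vdev layout must
			 * pass vdev_is_bootable().)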
354 */ 355 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 356 error = ENOTSUP; 357 break; 358 } 359 360 /* 361 * Make sure the vdev config is bootable 362 */ 363 if (!vdev_is_bootable(spa->spa_root_vdev)) { 364 error = ENOTSUP; 365 break; 366 } 367 368 reset_bootfs = 1; 369 370 error = nvpair_value_string(elem, &strval); 371 372 if (!error) { 373 uint64_t compress; 374 375 if (strval == NULL || strval[0] == '\0') { 376 objnum = zpool_prop_default_numeric( 377 ZPOOL_PROP_BOOTFS); 378 break; 379 } 380 381 if (error = dmu_objset_hold(strval, FTAG, &os)) 382 break; 383 384 /* Must be ZPL and not gzip compressed. */ 385 386 if (dmu_objset_type(os) != DMU_OST_ZFS) { 387 error = ENOTSUP; 388 } else if ((error = dsl_prop_get_integer(strval, 389 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 390 &compress, NULL)) == 0 && 391 !BOOTFS_COMPRESS_VALID(compress)) { 392 error = ENOTSUP; 393 } else { 394 objnum = dmu_objset_id(os); 395 } 396 dmu_objset_rele(os, FTAG); 397 } 398 break; 399 400 case ZPOOL_PROP_FAILUREMODE: 401 error = nvpair_value_uint64(elem, &intval); 402 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 403 intval > ZIO_FAILURE_MODE_PANIC)) 404 error = EINVAL; 405 406 /* 407 * This is a special case which only occurs when 408 * the pool has completely failed. This allows 409 * the user to change the in-core failmode property 410 * without syncing it out to disk (I/Os might 411 * currently be blocked). We do this by returning 412 * EIO to the caller (spa_prop_set) to trick it 413 * into thinking we encountered a property validation 414 * error. 415 */ 416 if (!error && spa_suspended(spa)) { 417 spa->spa_failmode = intval; 418 error = EIO; 419 } 420 break; 421 422 case ZPOOL_PROP_CACHEFILE: 423 if ((error = nvpair_value_string(elem, &strval)) != 0) 424 break; 425 426 if (strval[0] == '\0') 427 break; 428 429 if (strcmp(strval, "none") == 0) 430 break; 431 432 if (strval[0] != '/') { 433 error = EINVAL; 434 break; 435 } 436 437 slash = strrchr(strval, '/'); 438 ASSERT(slash != NULL); 439 440 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 441 strcmp(slash, "/..") == 0) 442 error = EINVAL; 443 break; 444 445 case ZPOOL_PROP_DEDUPDITTO: 446 if (spa_version(spa) < SPA_VERSION_DEDUP) 447 error = ENOTSUP; 448 else 449 error = nvpair_value_uint64(elem, &intval); 450 if (error == 0 && 451 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 452 error = EINVAL; 453 break; 454 } 455 456 if (error) 457 break; 458 } 459 460 if (!error && reset_bootfs) { 461 error = nvlist_remove(props, 462 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 463 464 if (!error) { 465 error = nvlist_add_uint64(props, 466 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 467 } 468 } 469 470 return (error); 471 } 472 473 void 474 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 475 { 476 char *cachefile; 477 spa_config_dirent_t *dp; 478 479 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 480 &cachefile) != 0) 481 return; 482 483 dp = kmem_alloc(sizeof (spa_config_dirent_t), 484 KM_SLEEP); 485 486 if (cachefile[0] == '\0') 487 dp->scd_path = spa_strdup(spa_config_path); 488 else if (strcmp(cachefile, "none") == 0) 489 dp->scd_path = NULL; 490 else 491 dp->scd_path = spa_strdup(cachefile); 492 493 list_insert_head(&spa->spa_config_list, dp); 494 if (need_sync) 495 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 496 } 497 498 int 499 spa_prop_set(spa_t *spa, nvlist_t *nvp) 500 { 501 int error; 502 nvpair_t *elem; 503 boolean_t need_sync = B_FALSE; 504 zpool_prop_t prop; 505 506 if ((error = 
spa_prop_validate(spa, nvp)) != 0) 507 return (error); 508 509 elem = NULL; 510 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 511 if ((prop = zpool_name_to_prop( 512 nvpair_name(elem))) == ZPROP_INVAL) 513 return (EINVAL); 514 515 if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 516 continue; 517 518 need_sync = B_TRUE; 519 break; 520 } 521 522 if (need_sync) 523 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 524 spa, nvp, 3)); 525 else 526 return (0); 527 } 528 529 /* 530 * If the bootfs property value is dsobj, clear it. 531 */ 532 void 533 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 534 { 535 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 536 VERIFY(zap_remove(spa->spa_meta_objset, 537 spa->spa_pool_props_object, 538 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 539 spa->spa_bootfs = 0; 540 } 541 } 542 543 /* 544 * ========================================================================== 545 * SPA state manipulation (open/create/destroy/import/export) 546 * ========================================================================== 547 */ 548 549 static int 550 spa_error_entry_compare(const void *a, const void *b) 551 { 552 spa_error_entry_t *sa = (spa_error_entry_t *)a; 553 spa_error_entry_t *sb = (spa_error_entry_t *)b; 554 int ret; 555 556 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 557 sizeof (zbookmark_t)); 558 559 if (ret < 0) 560 return (-1); 561 else if (ret > 0) 562 return (1); 563 else 564 return (0); 565 } 566 567 /* 568 * Utility function which retrieves copies of the current logs and 569 * re-initializes them in the process. 570 */ 571 void 572 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 573 { 574 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 575 576 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 577 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 578 579 avl_create(&spa->spa_errlist_scrub, 580 spa_error_entry_compare, sizeof (spa_error_entry_t), 581 offsetof(spa_error_entry_t, se_avl)); 582 avl_create(&spa->spa_errlist_last, 583 spa_error_entry_compare, sizeof (spa_error_entry_t), 584 offsetof(spa_error_entry_t, se_avl)); 585 } 586 587 /* 588 * Activate an uninitialized pool. 
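 *
 * (Added note summarizing the code below rather than the original comment:
 * activation creates the normal and log metaslab classes, the per-I/O-type
 * zio taskqs described by zio_taskqs[][], the config/state dirty lists,
 * and the error-list AVL trees; spa_deactivate() tears all of these down
 * again.)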
589 */ 590 static void 591 spa_activate(spa_t *spa, int mode) 592 { 593 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 594 595 spa->spa_state = POOL_STATE_ACTIVE; 596 spa->spa_mode = mode; 597 598 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 599 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 600 601 for (int t = 0; t < ZIO_TYPES; t++) { 602 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 603 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 604 enum zti_modes mode = ztip->zti_mode; 605 uint_t value = ztip->zti_value; 606 char name[32]; 607 608 (void) snprintf(name, sizeof (name), 609 "%s_%s", zio_type_name[t], zio_taskq_types[q]); 610 611 if (mode == zti_mode_tune) { 612 mode = zio_taskq_tune_mode; 613 value = zio_taskq_tune_value; 614 if (mode == zti_mode_tune) 615 mode = zti_mode_online_percent; 616 } 617 618 switch (mode) { 619 case zti_mode_fixed: 620 ASSERT3U(value, >=, 1); 621 value = MAX(value, 1); 622 623 spa->spa_zio_taskq[t][q] = taskq_create(name, 624 value, maxclsyspri, 50, INT_MAX, 625 TASKQ_PREPOPULATE); 626 break; 627 628 case zti_mode_online_percent: 629 spa->spa_zio_taskq[t][q] = taskq_create(name, 630 value, maxclsyspri, 50, INT_MAX, 631 TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); 632 break; 633 634 case zti_mode_null: 635 spa->spa_zio_taskq[t][q] = NULL; 636 break; 637 638 case zti_mode_tune: 639 default: 640 panic("unrecognized mode for " 641 "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) " 642 "in spa_activate()", 643 t, q, mode, value); 644 break; 645 } 646 } 647 } 648 649 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 650 offsetof(vdev_t, vdev_config_dirty_node)); 651 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 652 offsetof(vdev_t, vdev_state_dirty_node)); 653 654 txg_list_create(&spa->spa_vdev_txg_list, 655 offsetof(struct vdev, vdev_txg_node)); 656 657 avl_create(&spa->spa_errlist_scrub, 658 spa_error_entry_compare, sizeof (spa_error_entry_t), 659 offsetof(spa_error_entry_t, se_avl)); 660 avl_create(&spa->spa_errlist_last, 661 spa_error_entry_compare, sizeof (spa_error_entry_t), 662 offsetof(spa_error_entry_t, se_avl)); 663 } 664 665 /* 666 * Opposite of spa_activate(). 667 */ 668 static void 669 spa_deactivate(spa_t *spa) 670 { 671 ASSERT(spa->spa_sync_on == B_FALSE); 672 ASSERT(spa->spa_dsl_pool == NULL); 673 ASSERT(spa->spa_root_vdev == NULL); 674 ASSERT(spa->spa_async_zio_root == NULL); 675 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 676 677 txg_list_destroy(&spa->spa_vdev_txg_list); 678 679 list_destroy(&spa->spa_config_dirty_list); 680 list_destroy(&spa->spa_state_dirty_list); 681 682 for (int t = 0; t < ZIO_TYPES; t++) { 683 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 684 if (spa->spa_zio_taskq[t][q] != NULL) 685 taskq_destroy(spa->spa_zio_taskq[t][q]); 686 spa->spa_zio_taskq[t][q] = NULL; 687 } 688 } 689 690 metaslab_class_destroy(spa->spa_normal_class); 691 spa->spa_normal_class = NULL; 692 693 metaslab_class_destroy(spa->spa_log_class); 694 spa->spa_log_class = NULL; 695 696 /* 697 * If this was part of an import or the open otherwise failed, we may 698 * still have errors left in the queues. Empty them just in case. 699 */ 700 spa_errlog_drain(spa); 701 702 avl_destroy(&spa->spa_errlist_scrub); 703 avl_destroy(&spa->spa_errlist_last); 704 705 spa->spa_state = POOL_STATE_UNINITIALIZED; 706 } 707 708 /* 709 * Verify a pool configuration, and construct the vdev tree appropriately. 
This 710 * will create all the necessary vdevs in the appropriate layout, with each vdev 711 * in the CLOSED state. This will prep the pool before open/creation/import. 712 * All vdev validation is done by the vdev_alloc() routine. 713 */ 714 static int 715 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 716 uint_t id, int atype) 717 { 718 nvlist_t **child; 719 uint_t children; 720 int error; 721 722 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 723 return (error); 724 725 if ((*vdp)->vdev_ops->vdev_op_leaf) 726 return (0); 727 728 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 729 &child, &children); 730 731 if (error == ENOENT) 732 return (0); 733 734 if (error) { 735 vdev_free(*vdp); 736 *vdp = NULL; 737 return (EINVAL); 738 } 739 740 for (int c = 0; c < children; c++) { 741 vdev_t *vd; 742 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 743 atype)) != 0) { 744 vdev_free(*vdp); 745 *vdp = NULL; 746 return (error); 747 } 748 } 749 750 ASSERT(*vdp != NULL); 751 752 return (0); 753 } 754 755 /* 756 * Opposite of spa_load(). 757 */ 758 static void 759 spa_unload(spa_t *spa) 760 { 761 int i; 762 763 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 764 765 /* 766 * Stop async tasks. 767 */ 768 spa_async_suspend(spa); 769 770 /* 771 * Stop syncing. 772 */ 773 if (spa->spa_sync_on) { 774 txg_sync_stop(spa->spa_dsl_pool); 775 spa->spa_sync_on = B_FALSE; 776 } 777 778 /* 779 * Wait for any outstanding async I/O to complete. 780 */ 781 if (spa->spa_async_zio_root != NULL) { 782 (void) zio_wait(spa->spa_async_zio_root); 783 spa->spa_async_zio_root = NULL; 784 } 785 786 /* 787 * Close the dsl pool. 788 */ 789 if (spa->spa_dsl_pool) { 790 dsl_pool_close(spa->spa_dsl_pool); 791 spa->spa_dsl_pool = NULL; 792 } 793 794 ddt_unload(spa); 795 796 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 797 798 /* 799 * Drop and purge level 2 cache 800 */ 801 spa_l2cache_drop(spa); 802 803 /* 804 * Close all vdevs. 805 */ 806 if (spa->spa_root_vdev) 807 vdev_free(spa->spa_root_vdev); 808 ASSERT(spa->spa_root_vdev == NULL); 809 810 for (i = 0; i < spa->spa_spares.sav_count; i++) 811 vdev_free(spa->spa_spares.sav_vdevs[i]); 812 if (spa->spa_spares.sav_vdevs) { 813 kmem_free(spa->spa_spares.sav_vdevs, 814 spa->spa_spares.sav_count * sizeof (void *)); 815 spa->spa_spares.sav_vdevs = NULL; 816 } 817 if (spa->spa_spares.sav_config) { 818 nvlist_free(spa->spa_spares.sav_config); 819 spa->spa_spares.sav_config = NULL; 820 } 821 spa->spa_spares.sav_count = 0; 822 823 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 824 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 825 if (spa->spa_l2cache.sav_vdevs) { 826 kmem_free(spa->spa_l2cache.sav_vdevs, 827 spa->spa_l2cache.sav_count * sizeof (void *)); 828 spa->spa_l2cache.sav_vdevs = NULL; 829 } 830 if (spa->spa_l2cache.sav_config) { 831 nvlist_free(spa->spa_l2cache.sav_config); 832 spa->spa_l2cache.sav_config = NULL; 833 } 834 spa->spa_l2cache.sav_count = 0; 835 836 spa->spa_async_suspended = 0; 837 838 spa_config_exit(spa, SCL_ALL, FTAG); 839 } 840 841 /* 842 * Load (or re-load) the current list of vdevs describing the active spares for 843 * this pool. When this is called, we have some form of basic information in 844 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 845 * then re-generate a more complete list including status information. 
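 *
 * (Added illustrative note, not part of the original comment: after
 * spa_load_spares() runs, each element of the regenerated
 * ZPOOL_CONFIG_SPARES array comes from vdev_config_generate() with status
 * information requested, so it reflects the outcome of the vdev_open()
 * and vdev_validate_aux() calls below in addition to the basic identity,
 * such as the guid, that the caller-supplied config contained.)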
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different
	 * vdev_t structures associated with it: one in the list of spares
	 * (used only for basic validation purposes) and one in the active
	 * vdev configuration (if it's spared in).  During this phase we open
	 * and validate each vdev on the spare list.  If the vdev also exists
	 * in the active configuration, then we also mark this vdev as an
	 * active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
940 */ 941 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 942 DATA_TYPE_NVLIST_ARRAY) == 0); 943 944 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 945 KM_SLEEP); 946 for (i = 0; i < spa->spa_spares.sav_count; i++) 947 spares[i] = vdev_config_generate(spa, 948 spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 949 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 950 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 951 for (i = 0; i < spa->spa_spares.sav_count; i++) 952 nvlist_free(spares[i]); 953 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 954 } 955 956 /* 957 * Load (or re-load) the current list of vdevs describing the active l2cache for 958 * this pool. When this is called, we have some form of basic information in 959 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 960 * then re-generate a more complete list including status information. 961 * Devices which are already active have their details maintained, and are 962 * not re-opened. 963 */ 964 static void 965 spa_load_l2cache(spa_t *spa) 966 { 967 nvlist_t **l2cache; 968 uint_t nl2cache; 969 int i, j, oldnvdevs; 970 uint64_t guid; 971 vdev_t *vd, **oldvdevs, **newvdevs; 972 spa_aux_vdev_t *sav = &spa->spa_l2cache; 973 974 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 975 976 if (sav->sav_config != NULL) { 977 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 978 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 979 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 980 } else { 981 nl2cache = 0; 982 } 983 984 oldvdevs = sav->sav_vdevs; 985 oldnvdevs = sav->sav_count; 986 sav->sav_vdevs = NULL; 987 sav->sav_count = 0; 988 989 /* 990 * Process new nvlist of vdevs. 991 */ 992 for (i = 0; i < nl2cache; i++) { 993 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 994 &guid) == 0); 995 996 newvdevs[i] = NULL; 997 for (j = 0; j < oldnvdevs; j++) { 998 vd = oldvdevs[j]; 999 if (vd != NULL && guid == vd->vdev_guid) { 1000 /* 1001 * Retain previous vdev for add/remove ops. 1002 */ 1003 newvdevs[i] = vd; 1004 oldvdevs[j] = NULL; 1005 break; 1006 } 1007 } 1008 1009 if (newvdevs[i] == NULL) { 1010 /* 1011 * Create new vdev 1012 */ 1013 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1014 VDEV_ALLOC_L2CACHE) == 0); 1015 ASSERT(vd != NULL); 1016 newvdevs[i] = vd; 1017 1018 /* 1019 * Commit this vdev as an l2cache device, 1020 * even if it fails to open. 1021 */ 1022 spa_l2cache_add(vd); 1023 1024 vd->vdev_top = vd; 1025 vd->vdev_aux = sav; 1026 1027 spa_l2cache_activate(vd); 1028 1029 if (vdev_open(vd) != 0) 1030 continue; 1031 1032 (void) vdev_validate_aux(vd); 1033 1034 if (!vdev_is_dead(vd)) 1035 l2arc_add_vdev(spa, vd); 1036 } 1037 } 1038 1039 /* 1040 * Purge vdevs that were dropped 1041 */ 1042 for (i = 0; i < oldnvdevs; i++) { 1043 uint64_t pool; 1044 1045 vd = oldvdevs[i]; 1046 if (vd != NULL) { 1047 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1048 pool != 0ULL && l2arc_vdev_present(vd)) 1049 l2arc_remove_vdev(vd); 1050 (void) vdev_close(vd); 1051 spa_l2cache_remove(vd); 1052 } 1053 } 1054 1055 if (oldvdevs) 1056 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1057 1058 if (sav->sav_config == NULL) 1059 goto out; 1060 1061 sav->sav_vdevs = newvdevs; 1062 sav->sav_count = (int)nl2cache; 1063 1064 /* 1065 * Recompute the stashed list of l2cache devices, with status 1066 * information this time. 
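	 *
	 * (Added note, mirroring the spares case above: each entry is
	 * rebuilt with vdev_config_generate(), so the cached
	 * ZPOOL_CONFIG_L2CACHE array reflects the devices' current state
	 * rather than whatever was last written out.)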
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load the slog device state from the config object since it's possible
 * that the label does not contain the most up-to-date information.
 */
void
spa_load_log_state(spa_t *spa, nvlist_t *nv)
{
	vdev_t *ovd, *rvd = spa->spa_root_vdev;

	/*
	 * Load the original root vdev tree from the passed config.
1134 */ 1135 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1136 VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1137 1138 for (int c = 0; c < rvd->vdev_children; c++) { 1139 vdev_t *cvd = rvd->vdev_child[c]; 1140 if (cvd->vdev_islog) 1141 vdev_load_log_state(cvd, ovd->vdev_child[c]); 1142 } 1143 vdev_free(ovd); 1144 spa_config_exit(spa, SCL_ALL, FTAG); 1145 } 1146 1147 /* 1148 * Check for missing log devices 1149 */ 1150 int 1151 spa_check_logs(spa_t *spa) 1152 { 1153 switch (spa->spa_log_state) { 1154 case SPA_LOG_MISSING: 1155 /* need to recheck in case slog has been restored */ 1156 case SPA_LOG_UNKNOWN: 1157 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1158 DS_FIND_CHILDREN)) { 1159 spa->spa_log_state = SPA_LOG_MISSING; 1160 return (1); 1161 } 1162 break; 1163 } 1164 return (0); 1165 } 1166 1167 static void 1168 spa_aux_check_removed(spa_aux_vdev_t *sav) 1169 { 1170 for (int i = 0; i < sav->sav_count; i++) 1171 spa_check_removed(sav->sav_vdevs[i]); 1172 } 1173 1174 void 1175 spa_claim_notify(zio_t *zio) 1176 { 1177 spa_t *spa = zio->io_spa; 1178 1179 if (zio->io_error) 1180 return; 1181 1182 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1183 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1184 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1185 mutex_exit(&spa->spa_props_lock); 1186 } 1187 1188 typedef struct spa_load_error { 1189 uint64_t sle_metadata_count; 1190 uint64_t sle_data_count; 1191 } spa_load_error_t; 1192 1193 static void 1194 spa_load_verify_done(zio_t *zio) 1195 { 1196 blkptr_t *bp = zio->io_bp; 1197 spa_load_error_t *sle = zio->io_private; 1198 dmu_object_type_t type = BP_GET_TYPE(bp); 1199 int error = zio->io_error; 1200 1201 if (error) { 1202 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1203 type != DMU_OT_INTENT_LOG) 1204 atomic_add_64(&sle->sle_metadata_count, 1); 1205 else 1206 atomic_add_64(&sle->sle_data_count, 1); 1207 } 1208 zio_data_buf_free(zio->io_data, zio->io_size); 1209 } 1210 1211 /*ARGSUSED*/ 1212 static int 1213 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1214 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1215 { 1216 if (bp != NULL) { 1217 zio_t *rio = arg; 1218 size_t size = BP_GET_PSIZE(bp); 1219 void *data = zio_data_buf_alloc(size); 1220 1221 zio_nowait(zio_read(rio, spa, bp, data, size, 1222 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1223 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1224 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1225 } 1226 return (0); 1227 } 1228 1229 static int 1230 spa_load_verify(spa_t *spa) 1231 { 1232 zio_t *rio; 1233 spa_load_error_t sle = { 0 }; 1234 zpool_rewind_policy_t policy; 1235 boolean_t verify_ok = B_FALSE; 1236 int error; 1237 1238 rio = zio_root(spa, NULL, &sle, 1239 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1240 1241 error = traverse_pool(spa, spa->spa_verify_min_txg, 1242 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1243 1244 (void) zio_wait(rio); 1245 1246 zpool_get_rewind_policy(spa->spa_config, &policy); 1247 1248 spa->spa_load_meta_errors = sle.sle_metadata_count; 1249 spa->spa_load_data_errors = sle.sle_data_count; 1250 1251 if (!error && sle.sle_metadata_count <= policy.zrp_maxmeta && 1252 sle.sle_data_count <= policy.zrp_maxdata) { 1253 verify_ok = B_TRUE; 1254 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1255 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1256 } else { 1257 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1258 } 1259 1260 if (error) { 1261 
if (error != ENXIO && error != EIO) 1262 error = EIO; 1263 return (error); 1264 } 1265 1266 return (verify_ok ? 0 : EIO); 1267 } 1268 1269 /* 1270 * Load an existing storage pool, using the pool's builtin spa_config as a 1271 * source of configuration information. 1272 */ 1273 static int 1274 spa_load(spa_t *spa, spa_load_state_t state, int mosconfig) 1275 { 1276 int error = 0; 1277 nvlist_t *nvconfig, *nvroot = NULL; 1278 vdev_t *rvd; 1279 uberblock_t *ub = &spa->spa_uberblock; 1280 uint64_t config_cache_txg = spa->spa_config_txg; 1281 uint64_t pool_guid; 1282 uint64_t version; 1283 uint64_t autoreplace = 0; 1284 int orig_mode = spa->spa_mode; 1285 char *ereport = FM_EREPORT_ZFS_POOL; 1286 nvlist_t *config = spa->spa_config; 1287 1288 /* 1289 * If this is an untrusted config, access the pool in read-only mode. 1290 * This prevents things like resilvering recently removed devices. 1291 */ 1292 if (!mosconfig) 1293 spa->spa_mode = FREAD; 1294 1295 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1296 1297 spa->spa_load_state = state; 1298 1299 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 1300 nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 1301 error = EINVAL; 1302 goto out; 1303 } 1304 1305 /* 1306 * Versioning wasn't explicitly added to the label until later, so if 1307 * it's not present treat it as the initial version. 1308 */ 1309 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 1310 version = SPA_VERSION_INITIAL; 1311 1312 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1313 &spa->spa_config_txg); 1314 1315 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1316 spa_guid_exists(pool_guid, 0)) { 1317 error = EEXIST; 1318 goto out; 1319 } 1320 1321 spa->spa_load_guid = pool_guid; 1322 1323 /* 1324 * Create "The Godfather" zio to hold all async IOs 1325 */ 1326 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1327 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1328 1329 /* 1330 * Parse the configuration into a vdev tree. We explicitly set the 1331 * value that will be returned by spa_version() since parsing the 1332 * configuration requires knowing the version number. 1333 */ 1334 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1335 spa->spa_ubsync.ub_version = version; 1336 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 1337 spa_config_exit(spa, SCL_ALL, FTAG); 1338 1339 if (error != 0) 1340 goto out; 1341 1342 ASSERT(spa->spa_root_vdev == rvd); 1343 ASSERT(spa_guid(spa) == pool_guid); 1344 1345 /* 1346 * Try to open all vdevs, loading each label in the process. 1347 */ 1348 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1349 error = vdev_open(rvd); 1350 spa_config_exit(spa, SCL_ALL, FTAG); 1351 if (error != 0) 1352 goto out; 1353 1354 /* 1355 * We need to validate the vdev labels against the configuration that 1356 * we have in hand, which is dependent on the setting of mosconfig. If 1357 * mosconfig is true then we're validating the vdev labels based on 1358 * that config. Otherwise, we're validating against the cached config 1359 * (zpool.cache) that was read when we loaded the zfs module, and then 1360 * later we will recursively call spa_load() and validate against 1361 * the vdev config. 
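	 *
	 * (Added note summarizing the flow below, not part of the original
	 * comment: on the first pass mosconfig is false, so spa->spa_mode
	 * was forced to FREAD above; once the config object has been read
	 * from the MOS, spa_config_set() is called and spa_load() re-invokes
	 * itself with mosconfig set to B_TRUE.)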
1362 */ 1363 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1364 error = vdev_validate(rvd); 1365 spa_config_exit(spa, SCL_ALL, FTAG); 1366 if (error != 0) 1367 goto out; 1368 1369 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 1370 error = ENXIO; 1371 goto out; 1372 } 1373 1374 /* 1375 * Find the best uberblock. 1376 */ 1377 vdev_uberblock_load(NULL, rvd, ub); 1378 1379 /* 1380 * If we weren't able to find a single valid uberblock, return failure. 1381 */ 1382 if (ub->ub_txg == 0) { 1383 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1384 VDEV_AUX_CORRUPT_DATA); 1385 error = ENXIO; 1386 goto out; 1387 } 1388 1389 /* 1390 * If the pool is newer than the code, we can't open it. 1391 */ 1392 if (ub->ub_version > SPA_VERSION) { 1393 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1394 VDEV_AUX_VERSION_NEWER); 1395 error = ENOTSUP; 1396 goto out; 1397 } 1398 1399 /* 1400 * If the vdev guid sum doesn't match the uberblock, we have an 1401 * incomplete configuration. 1402 */ 1403 if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 1404 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1405 VDEV_AUX_BAD_GUID_SUM); 1406 error = ENXIO; 1407 goto out; 1408 } 1409 1410 /* 1411 * Initialize internal SPA structures. 1412 */ 1413 spa->spa_state = POOL_STATE_ACTIVE; 1414 spa->spa_ubsync = spa->spa_uberblock; 1415 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 1416 TXG_INITIAL : spa_last_synced_txg(spa) - TXG_DEFER_SIZE; 1417 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 1418 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 1419 spa->spa_claim_max_txg = spa->spa_first_txg; 1420 1421 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1422 if (error) { 1423 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1424 VDEV_AUX_CORRUPT_DATA); 1425 error = EIO; 1426 goto out; 1427 } 1428 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1429 1430 if (zap_lookup(spa->spa_meta_objset, 1431 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 1432 sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 1433 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1434 VDEV_AUX_CORRUPT_DATA); 1435 error = EIO; 1436 goto out; 1437 } 1438 1439 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) { 1440 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1441 VDEV_AUX_CORRUPT_DATA); 1442 error = EIO; 1443 goto out; 1444 } 1445 1446 if (!mosconfig) { 1447 uint64_t hostid; 1448 1449 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1450 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1451 char *hostname; 1452 unsigned long myhostid = 0; 1453 1454 VERIFY(nvlist_lookup_string(nvconfig, 1455 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1456 1457 #ifdef _KERNEL 1458 myhostid = zone_get_hostid(NULL); 1459 #else /* _KERNEL */ 1460 /* 1461 * We're emulating the system's hostid in userland, so 1462 * we can't use zone_get_hostid(). 1463 */ 1464 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1465 #endif /* _KERNEL */ 1466 if (hostid != 0 && myhostid != 0 && 1467 hostid != myhostid) { 1468 cmn_err(CE_WARN, "pool '%s' could not be " 1469 "loaded as it was last accessed by " 1470 "another system (host: %s hostid: 0x%lx). 
" 1471 "See: http://www.sun.com/msg/ZFS-8000-EY", 1472 spa_name(spa), hostname, 1473 (unsigned long)hostid); 1474 error = EBADF; 1475 goto out; 1476 } 1477 } 1478 1479 spa_config_set(spa, nvconfig); 1480 spa_unload(spa); 1481 spa_deactivate(spa); 1482 spa_activate(spa, orig_mode); 1483 1484 return (spa_load(spa, state, B_TRUE)); 1485 } 1486 1487 if (zap_lookup(spa->spa_meta_objset, 1488 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 1489 sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj) != 0) { 1490 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1491 VDEV_AUX_CORRUPT_DATA); 1492 error = EIO; 1493 goto out; 1494 } 1495 1496 /* 1497 * Load the bit that tells us to use the new accounting function 1498 * (raid-z deflation). If we have an older pool, this will not 1499 * be present. 1500 */ 1501 error = zap_lookup(spa->spa_meta_objset, 1502 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 1503 sizeof (uint64_t), 1, &spa->spa_deflate); 1504 if (error != 0 && error != ENOENT) { 1505 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1506 VDEV_AUX_CORRUPT_DATA); 1507 error = EIO; 1508 goto out; 1509 } 1510 1511 /* 1512 * Load the persistent error log. If we have an older pool, this will 1513 * not be present. 1514 */ 1515 error = zap_lookup(spa->spa_meta_objset, 1516 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 1517 sizeof (uint64_t), 1, &spa->spa_errlog_last); 1518 if (error != 0 && error != ENOENT) { 1519 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1520 VDEV_AUX_CORRUPT_DATA); 1521 error = EIO; 1522 goto out; 1523 } 1524 1525 error = zap_lookup(spa->spa_meta_objset, 1526 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 1527 sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 1528 if (error != 0 && error != ENOENT) { 1529 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1530 VDEV_AUX_CORRUPT_DATA); 1531 error = EIO; 1532 goto out; 1533 } 1534 1535 /* 1536 * Load the history object. If we have an older pool, this 1537 * will not be present. 1538 */ 1539 error = zap_lookup(spa->spa_meta_objset, 1540 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 1541 sizeof (uint64_t), 1, &spa->spa_history); 1542 if (error != 0 && error != ENOENT) { 1543 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1544 VDEV_AUX_CORRUPT_DATA); 1545 error = EIO; 1546 goto out; 1547 } 1548 1549 /* 1550 * Load any hot spares for this pool. 1551 */ 1552 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1553 DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); 1554 if (error != 0 && error != ENOENT) { 1555 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1556 VDEV_AUX_CORRUPT_DATA); 1557 error = EIO; 1558 goto out; 1559 } 1560 if (error == 0) { 1561 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 1562 if (load_nvlist(spa, spa->spa_spares.sav_object, 1563 &spa->spa_spares.sav_config) != 0) { 1564 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1565 VDEV_AUX_CORRUPT_DATA); 1566 error = EIO; 1567 goto out; 1568 } 1569 1570 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1571 spa_load_spares(spa); 1572 spa_config_exit(spa, SCL_ALL, FTAG); 1573 } 1574 1575 /* 1576 * Load any level 2 ARC devices for this pool. 
1577 */ 1578 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1579 DMU_POOL_L2CACHE, sizeof (uint64_t), 1, 1580 &spa->spa_l2cache.sav_object); 1581 if (error != 0 && error != ENOENT) { 1582 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1583 VDEV_AUX_CORRUPT_DATA); 1584 error = EIO; 1585 goto out; 1586 } 1587 if (error == 0) { 1588 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 1589 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 1590 &spa->spa_l2cache.sav_config) != 0) { 1591 vdev_set_state(rvd, B_TRUE, 1592 VDEV_STATE_CANT_OPEN, 1593 VDEV_AUX_CORRUPT_DATA); 1594 error = EIO; 1595 goto out; 1596 } 1597 1598 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1599 spa_load_l2cache(spa); 1600 spa_config_exit(spa, SCL_ALL, FTAG); 1601 } 1602 1603 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 1604 1605 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1606 DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 1607 1608 if (error && error != ENOENT) { 1609 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 1610 VDEV_AUX_CORRUPT_DATA); 1611 error = EIO; 1612 goto out; 1613 } 1614 1615 if (error == 0) { 1616 (void) zap_lookup(spa->spa_meta_objset, 1617 spa->spa_pool_props_object, 1618 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 1619 sizeof (uint64_t), 1, &spa->spa_bootfs); 1620 (void) zap_lookup(spa->spa_meta_objset, 1621 spa->spa_pool_props_object, 1622 zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1623 sizeof (uint64_t), 1, &autoreplace); 1624 spa->spa_autoreplace = (autoreplace != 0); 1625 (void) zap_lookup(spa->spa_meta_objset, 1626 spa->spa_pool_props_object, 1627 zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 1628 sizeof (uint64_t), 1, &spa->spa_delegation); 1629 (void) zap_lookup(spa->spa_meta_objset, 1630 spa->spa_pool_props_object, 1631 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 1632 sizeof (uint64_t), 1, &spa->spa_failmode); 1633 (void) zap_lookup(spa->spa_meta_objset, 1634 spa->spa_pool_props_object, 1635 zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1636 sizeof (uint64_t), 1, &spa->spa_autoexpand); 1637 (void) zap_lookup(spa->spa_meta_objset, 1638 spa->spa_pool_props_object, 1639 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO), 1640 sizeof (uint64_t), 1, &spa->spa_dedup_ditto); 1641 } 1642 1643 /* 1644 * If the 'autoreplace' property is set, then post a resource notifying 1645 * the ZFS DE that it should not issue any faults for unopenable 1646 * devices. We also iterate over the vdevs, and post a sysevent for any 1647 * unopenable vdevs so that the normal autoreplace handler can take 1648 * over. 1649 */ 1650 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 1651 spa_check_removed(spa->spa_root_vdev); 1652 /* 1653 * For the import case, this is done in spa_import(), because 1654 * at this point we're using the spare definitions from 1655 * the MOS config, not necessarily from the userland config. 1656 */ 1657 if (state != SPA_LOAD_IMPORT) { 1658 spa_aux_check_removed(&spa->spa_spares); 1659 spa_aux_check_removed(&spa->spa_l2cache); 1660 } 1661 } 1662 1663 /* 1664 * Load the vdev state for all toplevel vdevs. 1665 */ 1666 vdev_load(rvd); 1667 1668 /* 1669 * Propagate the leaf DTLs we just loaded all the way up the tree. 1670 */ 1671 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1672 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 1673 spa_config_exit(spa, SCL_ALL, FTAG); 1674 1675 /* 1676 * Check the state of the root vdev. If it can't be opened, it 1677 * indicates one or more toplevel vdevs are faulted. 
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Load the DDTs (dedup tables).
	 */
	error = ddt_load(spa);
	if (error != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	spa_update_dspace(spa);

	if (state != SPA_LOAD_TRYIMPORT) {
		error = spa_load_verify(spa);
		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			goto out;
		}
	}

	/*
	 * Load the intent log state and check log integrity.
	 */
	VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	spa_load_log_state(spa, nvroot);
	nvlist_free(nvconfig);

	if (spa_check_logs(spa)) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LOG);
		error = ENXIO;
		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
		goto out;
	}

	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
	    spa->spa_load_max_txg == UINT64_MAX)) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;

		ASSERT(state != SPA_LOAD_TRYIMPORT);

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
		 * invoked from zil_claim_log_block()'s i/o done callback.
		 * Price of rollback is that we abandon the log.
		 */
		spa->spa_claiming = B_TRUE;

		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa_name(spa),
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_claiming = B_FALSE;

		spa->spa_log_state = SPA_LOG_GOOD;
		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.  We sync up to the highest
		 * claimed log block birth time so that claimed log blocks
		 * don't appear to be from the future.  spa_claim_max_txg
		 * will have been set for us by either zil_check_log_chain()
		 * (invoked from spa_check_logs()) or zil_claim() above.
		 */
		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 *
		 * If spa_load_verbatim is true, trust the current
		 * in-core spa_config and update the disk labels.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim ||
		    state == SPA_LOAD_RECOVER)
			need_update = B_TRUE;

		for (int c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		/*
		 * Check all DTLs to see if anything needs resilvering.
		 */
		if (vdev_resilver_needed(rvd, NULL, NULL))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		/*
		 * Delete any inconsistent datasets.
		 */
		(void) dmu_objset_find(spa_name(spa),
		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);

		/*
		 * Clean up any stale temporary dataset userrefs.
1796 */ 1797 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 1798 } 1799 1800 error = 0; 1801 out: 1802 1803 spa->spa_minref = refcount_count(&spa->spa_refcount); 1804 if (error && error != EBADF) 1805 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1806 spa->spa_load_state = SPA_LOAD_NONE; 1807 spa->spa_ena = 0; 1808 1809 return (error); 1810 } 1811 1812 static int 1813 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 1814 { 1815 spa_unload(spa); 1816 spa_deactivate(spa); 1817 1818 spa->spa_load_max_txg--; 1819 1820 spa_activate(spa, spa_mode_global); 1821 spa_async_suspend(spa); 1822 1823 return (spa_load(spa, state, mosconfig)); 1824 } 1825 1826 static int 1827 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 1828 uint64_t max_request, boolean_t extreme) 1829 { 1830 nvlist_t *config = NULL; 1831 int load_error, rewind_error; 1832 uint64_t safe_rollback_txg; 1833 uint64_t min_txg; 1834 1835 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 1836 spa->spa_load_max_txg = spa->spa_load_txg; 1837 spa->spa_log_state = SPA_LOG_CLEAR; 1838 } else { 1839 spa->spa_load_max_txg = max_request; 1840 } 1841 1842 load_error = rewind_error = spa_load(spa, state, mosconfig); 1843 if (load_error == 0) 1844 return (0); 1845 1846 if (spa->spa_root_vdev != NULL) 1847 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1848 1849 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 1850 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 1851 1852 /* specific txg requested */ 1853 if (spa->spa_load_max_txg != UINT64_MAX && !extreme) { 1854 nvlist_free(config); 1855 return (load_error); 1856 } 1857 1858 /* Price of rolling back is discarding txgs, including log */ 1859 if (state == SPA_LOAD_RECOVER) 1860 spa->spa_log_state = SPA_LOG_CLEAR; 1861 1862 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1863 safe_rollback_txg = spa->spa_uberblock.ub_txg - TXG_DEFER_SIZE; 1864 1865 min_txg = extreme ? TXG_INITIAL : safe_rollback_txg; 1866 while (rewind_error && (spa->spa_uberblock.ub_txg >= min_txg)) { 1867 if (spa->spa_load_max_txg < safe_rollback_txg) 1868 spa->spa_extreme_rewind = B_TRUE; 1869 rewind_error = spa_load_retry(spa, state, mosconfig); 1870 } 1871 1872 if (config) 1873 spa_rewind_data_to_nvlist(spa, config); 1874 1875 spa->spa_extreme_rewind = B_FALSE; 1876 spa->spa_load_max_txg = UINT64_MAX; 1877 1878 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 1879 spa_config_set(spa, config); 1880 1881 return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); 1882 } 1883 1884 /* 1885 * Pool Open/Import 1886 * 1887 * The import case is identical to an open except that the configuration is sent 1888 * down from userland, instead of grabbed from the configuration cache. For the 1889 * case of an open, the pool configuration will exist in the 1890 * POOL_STATE_UNINITIALIZED state. 1891 * 1892 * The stats information (gen/count/ustats) is used to gather vdev statistics at 1893 * the same time open the pool, without having to keep around the spa_t in some 1894 * ambiguous state. 
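 *
 * (Added note, a sketch of the rewind path rather than original text: when
 * the caller supplies a rewind policy nvlist, as spa_open_rewind() does,
 * zpool_get_rewind_policy() may select SPA_LOAD_RECOVER, in which case
 * spa_load_best() below passes the policy's requested txg as the load
 * ceiling and retries spa_load() at successively older txgs instead of
 * failing outright.)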
1895 */ 1896 static int 1897 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 1898 nvlist_t **config) 1899 { 1900 spa_t *spa; 1901 boolean_t norewind; 1902 boolean_t extreme; 1903 zpool_rewind_policy_t policy; 1904 spa_load_state_t state = SPA_LOAD_OPEN; 1905 int error; 1906 int locked = B_FALSE; 1907 1908 *spapp = NULL; 1909 1910 zpool_get_rewind_policy(nvpolicy, &policy); 1911 if (policy.zrp_request & ZPOOL_DO_REWIND) 1912 state = SPA_LOAD_RECOVER; 1913 norewind = (policy.zrp_request == ZPOOL_NO_REWIND); 1914 extreme = ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0); 1915 1916 /* 1917 * As disgusting as this is, we need to support recursive calls to this 1918 * function because dsl_dir_open() is called during spa_load(), and ends 1919 * up calling spa_open() again. The real fix is to figure out how to 1920 * avoid dsl_dir_open() calling this in the first place. 1921 */ 1922 if (mutex_owner(&spa_namespace_lock) != curthread) { 1923 mutex_enter(&spa_namespace_lock); 1924 locked = B_TRUE; 1925 } 1926 1927 if ((spa = spa_lookup(pool)) == NULL) { 1928 if (locked) 1929 mutex_exit(&spa_namespace_lock); 1930 return (ENOENT); 1931 } 1932 1933 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1934 1935 spa_activate(spa, spa_mode_global); 1936 1937 if (spa->spa_last_open_failed && norewind) { 1938 if (config != NULL && spa->spa_config) 1939 VERIFY(nvlist_dup(spa->spa_config, 1940 config, KM_SLEEP) == 0); 1941 spa_deactivate(spa); 1942 if (locked) 1943 mutex_exit(&spa_namespace_lock); 1944 return (spa->spa_last_open_failed); 1945 } 1946 1947 if (state != SPA_LOAD_RECOVER) 1948 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 1949 1950 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 1951 extreme); 1952 1953 if (error == EBADF) { 1954 /* 1955 * If vdev_validate() returns failure (indicated by 1956 * EBADF), it indicates that one of the vdevs indicates 1957 * that the pool has been exported or destroyed. If 1958 * this is the case, the config cache is out of sync and 1959 * we should remove the pool from the namespace. 1960 */ 1961 spa_unload(spa); 1962 spa_deactivate(spa); 1963 spa_config_sync(spa, B_TRUE, B_TRUE); 1964 spa_remove(spa); 1965 if (locked) 1966 mutex_exit(&spa_namespace_lock); 1967 return (ENOENT); 1968 } 1969 1970 if (error) { 1971 /* 1972 * We can't open the pool, but we still have useful 1973 * information: the state of each vdev after the 1974 * attempted vdev_open(). Return this to the user. 
1975 */ 1976 if (config != NULL && spa->spa_config) 1977 VERIFY(nvlist_dup(spa->spa_config, config, 1978 KM_SLEEP) == 0); 1979 spa_unload(spa); 1980 spa_deactivate(spa); 1981 spa->spa_last_open_failed = error; 1982 if (locked) 1983 mutex_exit(&spa_namespace_lock); 1984 *spapp = NULL; 1985 return (error); 1986 } 1987 1988 } 1989 1990 spa_open_ref(spa, tag); 1991 1992 1993 if (config != NULL) 1994 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 1995 1996 if (locked) { 1997 spa->spa_last_open_failed = 0; 1998 spa->spa_last_ubsync_txg = 0; 1999 spa->spa_load_txg = 0; 2000 mutex_exit(&spa_namespace_lock); 2001 } 2002 2003 *spapp = spa; 2004 2005 return (0); 2006 } 2007 2008 int 2009 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2010 nvlist_t **config) 2011 { 2012 return (spa_open_common(name, spapp, tag, policy, config)); 2013 } 2014 2015 int 2016 spa_open(const char *name, spa_t **spapp, void *tag) 2017 { 2018 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2019 } 2020 2021 /* 2022 * Lookup the given spa_t, incrementing the inject count in the process, 2023 * preventing it from being exported or destroyed. 2024 */ 2025 spa_t * 2026 spa_inject_addref(char *name) 2027 { 2028 spa_t *spa; 2029 2030 mutex_enter(&spa_namespace_lock); 2031 if ((spa = spa_lookup(name)) == NULL) { 2032 mutex_exit(&spa_namespace_lock); 2033 return (NULL); 2034 } 2035 spa->spa_inject_ref++; 2036 mutex_exit(&spa_namespace_lock); 2037 2038 return (spa); 2039 } 2040 2041 void 2042 spa_inject_delref(spa_t *spa) 2043 { 2044 mutex_enter(&spa_namespace_lock); 2045 spa->spa_inject_ref--; 2046 mutex_exit(&spa_namespace_lock); 2047 } 2048 2049 /* 2050 * Add spares device information to the nvlist. 2051 */ 2052 static void 2053 spa_add_spares(spa_t *spa, nvlist_t *config) 2054 { 2055 nvlist_t **spares; 2056 uint_t i, nspares; 2057 nvlist_t *nvroot; 2058 uint64_t guid; 2059 vdev_stat_t *vs; 2060 uint_t vsc; 2061 uint64_t pool; 2062 2063 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2064 2065 if (spa->spa_spares.sav_count == 0) 2066 return; 2067 2068 VERIFY(nvlist_lookup_nvlist(config, 2069 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2070 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2071 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2072 if (nspares != 0) { 2073 VERIFY(nvlist_add_nvlist_array(nvroot, 2074 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2075 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2076 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2077 2078 /* 2079 * Go through and find any spares which have since been 2080 * repurposed as an active spare. If this is the case, update 2081 * their status appropriately. 2082 */ 2083 for (i = 0; i < nspares; i++) { 2084 VERIFY(nvlist_lookup_uint64(spares[i], 2085 ZPOOL_CONFIG_GUID, &guid) == 0); 2086 if (spa_spare_exists(guid, &pool, NULL) && 2087 pool != 0ULL) { 2088 VERIFY(nvlist_lookup_uint64_array( 2089 spares[i], ZPOOL_CONFIG_STATS, 2090 (uint64_t **)&vs, &vsc) == 0); 2091 vs->vs_state = VDEV_STATE_CANT_OPEN; 2092 vs->vs_aux = VDEV_AUX_SPARED; 2093 } 2094 } 2095 } 2096 } 2097 2098 /* 2099 * Add l2cache device information to the nvlist, including vdev stats. 
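 *
 * The devices are published as an nvlist array under ZPOOL_CONFIG_L2CACHE
 * inside the config's ZPOOL_CONFIG_VDEV_TREE nvlist, mirroring the layout
 * spa_add_spares() uses for ZPOOL_CONFIG_SPARES.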
2100 */ 2101 static void 2102 spa_add_l2cache(spa_t *spa, nvlist_t *config) 2103 { 2104 nvlist_t **l2cache; 2105 uint_t i, j, nl2cache; 2106 nvlist_t *nvroot; 2107 uint64_t guid; 2108 vdev_t *vd; 2109 vdev_stat_t *vs; 2110 uint_t vsc; 2111 2112 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2113 2114 if (spa->spa_l2cache.sav_count == 0) 2115 return; 2116 2117 VERIFY(nvlist_lookup_nvlist(config, 2118 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2119 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2120 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2121 if (nl2cache != 0) { 2122 VERIFY(nvlist_add_nvlist_array(nvroot, 2123 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2124 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2125 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2126 2127 /* 2128 * Update level 2 cache device stats. 2129 */ 2130 2131 for (i = 0; i < nl2cache; i++) { 2132 VERIFY(nvlist_lookup_uint64(l2cache[i], 2133 ZPOOL_CONFIG_GUID, &guid) == 0); 2134 2135 vd = NULL; 2136 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2137 if (guid == 2138 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2139 vd = spa->spa_l2cache.sav_vdevs[j]; 2140 break; 2141 } 2142 } 2143 ASSERT(vd != NULL); 2144 2145 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2146 ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); 2147 vdev_get_stats(vd, vs); 2148 } 2149 } 2150 } 2151 2152 int 2153 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2154 { 2155 int error; 2156 spa_t *spa; 2157 2158 *config = NULL; 2159 error = spa_open_common(name, &spa, FTAG, NULL, config); 2160 2161 if (spa != NULL) { 2162 /* 2163 * This still leaves a window of inconsistency where the spares 2164 * or l2cache devices could change and the config would be 2165 * self-inconsistent. 2166 */ 2167 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2168 2169 if (*config != NULL) { 2170 VERIFY(nvlist_add_uint64(*config, 2171 ZPOOL_CONFIG_ERRCOUNT, 2172 spa_get_errlog_size(spa)) == 0); 2173 2174 if (spa_suspended(spa)) 2175 VERIFY(nvlist_add_uint64(*config, 2176 ZPOOL_CONFIG_SUSPENDED, 2177 spa->spa_failmode) == 0); 2178 2179 spa_add_spares(spa, *config); 2180 spa_add_l2cache(spa, *config); 2181 } 2182 } 2183 2184 /* 2185 * We want to get the alternate root even for faulted pools, so we cheat 2186 * and call spa_lookup() directly. 2187 */ 2188 if (altroot) { 2189 if (spa == NULL) { 2190 mutex_enter(&spa_namespace_lock); 2191 spa = spa_lookup(name); 2192 if (spa) 2193 spa_altroot(spa, altroot, buflen); 2194 else 2195 altroot[0] = '\0'; 2196 spa = NULL; 2197 mutex_exit(&spa_namespace_lock); 2198 } else { 2199 spa_altroot(spa, altroot, buflen); 2200 } 2201 } 2202 2203 if (spa != NULL) { 2204 spa_config_exit(spa, SCL_CONFIG, FTAG); 2205 spa_close(spa, FTAG); 2206 } 2207 2208 return (error); 2209 } 2210 2211 /* 2212 * Validate that the auxiliary device array is well formed. We must have an 2213 * array of nvlists, each which describes a valid leaf vdev. If this is an 2214 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2215 * specified, as long as they are well-formed. 2216 */ 2217 static int 2218 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2219 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2220 vdev_labeltype_t label) 2221 { 2222 nvlist_t **dev; 2223 uint_t i, ndev; 2224 vdev_t *vd; 2225 int error; 2226 2227 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2228 2229 /* 2230 * It's acceptable to have no devs specified. 
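 *
 * When devs are given, each entry in the array is a leaf vdev nvlist,
 * for illustration along the lines of (the path is a placeholder):
 *
 *	type='disk' path='/dev/dsk/c1t0d0s0'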
2231 */ 2232 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2233 return (0); 2234 2235 if (ndev == 0) 2236 return (EINVAL); 2237 2238 /* 2239 * Make sure the pool is formatted with a version that supports this 2240 * device type. 2241 */ 2242 if (spa_version(spa) < version) 2243 return (ENOTSUP); 2244 2245 /* 2246 * Set the pending device list so we correctly handle device in-use 2247 * checking. 2248 */ 2249 sav->sav_pending = dev; 2250 sav->sav_npending = ndev; 2251 2252 for (i = 0; i < ndev; i++) { 2253 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2254 mode)) != 0) 2255 goto out; 2256 2257 if (!vd->vdev_ops->vdev_op_leaf) { 2258 vdev_free(vd); 2259 error = EINVAL; 2260 goto out; 2261 } 2262 2263 /* 2264 * The L2ARC currently only supports disk devices in 2265 * kernel context. For user-level testing, we allow it. 2266 */ 2267 #ifdef _KERNEL 2268 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2269 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2270 error = ENOTBLK; 2271 goto out; 2272 } 2273 #endif 2274 vd->vdev_top = vd; 2275 2276 if ((error = vdev_open(vd)) == 0 && 2277 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2278 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2279 vd->vdev_guid) == 0); 2280 } 2281 2282 vdev_free(vd); 2283 2284 if (error && 2285 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2286 goto out; 2287 else 2288 error = 0; 2289 } 2290 2291 out: 2292 sav->sav_pending = NULL; 2293 sav->sav_npending = 0; 2294 return (error); 2295 } 2296 2297 static int 2298 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2299 { 2300 int error; 2301 2302 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2303 2304 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2305 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2306 VDEV_LABEL_SPARE)) != 0) { 2307 return (error); 2308 } 2309 2310 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2311 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2312 VDEV_LABEL_L2CACHE)); 2313 } 2314 2315 static void 2316 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2317 const char *config) 2318 { 2319 int i; 2320 2321 if (sav->sav_config != NULL) { 2322 nvlist_t **olddevs; 2323 uint_t oldndevs; 2324 nvlist_t **newdevs; 2325 2326 /* 2327 * Generate new dev list by concatentating with the 2328 * current dev list. 2329 */ 2330 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2331 &olddevs, &oldndevs) == 0); 2332 2333 newdevs = kmem_alloc(sizeof (void *) * 2334 (ndevs + oldndevs), KM_SLEEP); 2335 for (i = 0; i < oldndevs; i++) 2336 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2337 KM_SLEEP) == 0); 2338 for (i = 0; i < ndevs; i++) 2339 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2340 KM_SLEEP) == 0); 2341 2342 VERIFY(nvlist_remove(sav->sav_config, config, 2343 DATA_TYPE_NVLIST_ARRAY) == 0); 2344 2345 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2346 config, newdevs, ndevs + oldndevs) == 0); 2347 for (i = 0; i < oldndevs + ndevs; i++) 2348 nvlist_free(newdevs[i]); 2349 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2350 } else { 2351 /* 2352 * Generate a new dev list. 
2353 */ 2354 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2355 KM_SLEEP) == 0); 2356 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2357 devs, ndevs) == 0); 2358 } 2359 } 2360 2361 /* 2362 * Stop and drop level 2 ARC devices 2363 */ 2364 void 2365 spa_l2cache_drop(spa_t *spa) 2366 { 2367 vdev_t *vd; 2368 int i; 2369 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2370 2371 for (i = 0; i < sav->sav_count; i++) { 2372 uint64_t pool; 2373 2374 vd = sav->sav_vdevs[i]; 2375 ASSERT(vd != NULL); 2376 2377 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2378 pool != 0ULL && l2arc_vdev_present(vd)) 2379 l2arc_remove_vdev(vd); 2380 if (vd->vdev_isl2cache) 2381 spa_l2cache_remove(vd); 2382 vdev_clear_stats(vd); 2383 (void) vdev_close(vd); 2384 } 2385 } 2386 2387 /* 2388 * Pool Creation 2389 */ 2390 int 2391 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2392 const char *history_str, nvlist_t *zplprops) 2393 { 2394 spa_t *spa; 2395 char *altroot = NULL; 2396 vdev_t *rvd; 2397 dsl_pool_t *dp; 2398 dmu_tx_t *tx; 2399 int error = 0; 2400 uint64_t txg = TXG_INITIAL; 2401 nvlist_t **spares, **l2cache; 2402 uint_t nspares, nl2cache; 2403 uint64_t version; 2404 2405 /* 2406 * If this pool already exists, return failure. 2407 */ 2408 mutex_enter(&spa_namespace_lock); 2409 if (spa_lookup(pool) != NULL) { 2410 mutex_exit(&spa_namespace_lock); 2411 return (EEXIST); 2412 } 2413 2414 /* 2415 * Allocate a new spa_t structure. 2416 */ 2417 (void) nvlist_lookup_string(props, 2418 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2419 spa = spa_add(pool, NULL, altroot); 2420 spa_activate(spa, spa_mode_global); 2421 2422 if (props && (error = spa_prop_validate(spa, props))) { 2423 spa_deactivate(spa); 2424 spa_remove(spa); 2425 mutex_exit(&spa_namespace_lock); 2426 return (error); 2427 } 2428 2429 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2430 &version) != 0) 2431 version = SPA_VERSION; 2432 ASSERT(version <= SPA_VERSION); 2433 2434 spa->spa_first_txg = txg; 2435 spa->spa_uberblock.ub_txg = txg - 1; 2436 spa->spa_uberblock.ub_version = version; 2437 spa->spa_ubsync = spa->spa_uberblock; 2438 2439 /* 2440 * Create "The Godfather" zio to hold all async IOs 2441 */ 2442 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2443 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2444 2445 /* 2446 * Create the root vdev. 2447 */ 2448 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2449 2450 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2451 2452 ASSERT(error != 0 || rvd != NULL); 2453 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2454 2455 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2456 error = EINVAL; 2457 2458 if (error == 0 && 2459 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2460 (error = spa_validate_aux(spa, nvroot, txg, 2461 VDEV_ALLOC_ADD)) == 0) { 2462 for (int c = 0; c < rvd->vdev_children; c++) { 2463 vdev_metaslab_set_size(rvd->vdev_child[c]); 2464 vdev_expand(rvd->vdev_child[c], txg); 2465 } 2466 } 2467 2468 spa_config_exit(spa, SCL_ALL, FTAG); 2469 2470 if (error != 0) { 2471 spa_unload(spa); 2472 spa_deactivate(spa); 2473 spa_remove(spa); 2474 mutex_exit(&spa_namespace_lock); 2475 return (error); 2476 } 2477 2478 /* 2479 * Get the list of spares, if specified. 
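 *
 * (These arrive as an nvlist array under ZPOOL_CONFIG_SPARES in the
 * caller-supplied nvroot and were already checked against the pool
 * version by spa_validate_aux() above.)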
2480 */ 2481 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2482 &spares, &nspares) == 0) { 2483 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2484 KM_SLEEP) == 0); 2485 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2486 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2487 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2488 spa_load_spares(spa); 2489 spa_config_exit(spa, SCL_ALL, FTAG); 2490 spa->spa_spares.sav_sync = B_TRUE; 2491 } 2492 2493 /* 2494 * Get the list of level 2 cache devices, if specified. 2495 */ 2496 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2497 &l2cache, &nl2cache) == 0) { 2498 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2499 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2500 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2501 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2502 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2503 spa_load_l2cache(spa); 2504 spa_config_exit(spa, SCL_ALL, FTAG); 2505 spa->spa_l2cache.sav_sync = B_TRUE; 2506 } 2507 2508 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2509 spa->spa_meta_objset = dp->dp_meta_objset; 2510 2511 /* 2512 * Create DDTs (dedup tables). 2513 */ 2514 ddt_create(spa); 2515 2516 spa_update_dspace(spa); 2517 2518 tx = dmu_tx_create_assigned(dp, txg); 2519 2520 /* 2521 * Create the pool config object. 2522 */ 2523 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2524 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2525 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2526 2527 if (zap_add(spa->spa_meta_objset, 2528 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2529 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2530 cmn_err(CE_PANIC, "failed to add pool config"); 2531 } 2532 2533 /* Newly created pools with the right version are always deflated. */ 2534 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2535 spa->spa_deflate = TRUE; 2536 if (zap_add(spa->spa_meta_objset, 2537 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2538 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2539 cmn_err(CE_PANIC, "failed to add deflate"); 2540 } 2541 } 2542 2543 /* 2544 * Create the deferred-free bplist object. Turn off compression 2545 * because sync-to-convergence takes longer if the blocksize 2546 * keeps changing. 2547 */ 2548 spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 2549 1 << 14, tx); 2550 dmu_object_set_compress(spa->spa_meta_objset, 2551 spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); 2552 2553 if (zap_add(spa->spa_meta_objset, 2554 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 2555 sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { 2556 cmn_err(CE_PANIC, "failed to add bplist"); 2557 } 2558 2559 /* 2560 * Create the pool's history object. 2561 */ 2562 if (version >= SPA_VERSION_ZPOOL_HISTORY) 2563 spa_history_create_obj(spa, tx); 2564 2565 /* 2566 * Set pool properties. 
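 *
 * The defaults below may be overridden by the caller-supplied props
 * nvlist, which spa_sync_props() applies in this same transaction
 * (see the props != NULL case just below).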
2567 */ 2568 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 2569 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2570 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 2571 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 2572 2573 if (props != NULL) { 2574 spa_configfile_set(spa, props, B_FALSE); 2575 spa_sync_props(spa, props, CRED(), tx); 2576 } 2577 2578 dmu_tx_commit(tx); 2579 2580 spa->spa_sync_on = B_TRUE; 2581 txg_sync_start(spa->spa_dsl_pool); 2582 2583 /* 2584 * We explicitly wait for the first transaction to complete so that our 2585 * bean counters are appropriately updated. 2586 */ 2587 txg_wait_synced(spa->spa_dsl_pool, txg); 2588 2589 spa_config_sync(spa, B_FALSE, B_TRUE); 2590 2591 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 2592 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 2593 spa_history_log_version(spa, LOG_POOL_CREATE); 2594 2595 spa->spa_minref = refcount_count(&spa->spa_refcount); 2596 2597 mutex_exit(&spa_namespace_lock); 2598 2599 return (0); 2600 } 2601 2602 #ifdef _KERNEL 2603 /* 2604 * Get the root pool information from the root disk, then import the root pool 2605 * during the system boot up time. 2606 */ 2607 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 2608 2609 static nvlist_t * 2610 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 2611 { 2612 nvlist_t *config; 2613 nvlist_t *nvtop, *nvroot; 2614 uint64_t pgid; 2615 2616 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 2617 return (NULL); 2618 2619 /* 2620 * Add this top-level vdev to the child array. 2621 */ 2622 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2623 &nvtop) == 0); 2624 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 2625 &pgid) == 0); 2626 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 2627 2628 /* 2629 * Put this pool's top-level vdevs into a root vdev. 2630 */ 2631 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2632 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 2633 VDEV_TYPE_ROOT) == 0); 2634 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 2635 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 2636 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 2637 &nvtop, 1) == 0); 2638 2639 /* 2640 * Replace the existing vdev_tree with the new root vdev in 2641 * this pool's configuration (remove the old, add the new). 2642 */ 2643 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 2644 nvlist_free(nvroot); 2645 return (config); 2646 } 2647 2648 /* 2649 * Walk the vdev tree and see if we can find a device with "better" 2650 * configuration. A configuration is "better" if the label on that 2651 * device has a more recent txg. 2652 */ 2653 static void 2654 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 2655 { 2656 for (int c = 0; c < vd->vdev_children; c++) 2657 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 2658 2659 if (vd->vdev_ops->vdev_op_leaf) { 2660 nvlist_t *label; 2661 uint64_t label_txg; 2662 2663 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 2664 &label) != 0) 2665 return; 2666 2667 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 2668 &label_txg) == 0); 2669 2670 /* 2671 * Do we have a better boot device? 
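 *
 * "Better" means the label's pool txg is strictly newer than the best
 * seen so far; on a tie the earlier candidate is kept.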
2672 */ 2673 if (label_txg > *txg) { 2674 *txg = label_txg; 2675 *avd = vd; 2676 } 2677 nvlist_free(label); 2678 } 2679 } 2680 2681 /* 2682 * Import a root pool. 2683 * 2684 * For x86. devpath_list will consist of devid and/or physpath name of 2685 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 2686 * The GRUB "findroot" command will return the vdev we should boot. 2687 * 2688 * For Sparc, devpath_list consists the physpath name of the booting device 2689 * no matter the rootpool is a single device pool or a mirrored pool. 2690 * e.g. 2691 * "/pci@1f,0/ide@d/disk@0,0:a" 2692 */ 2693 int 2694 spa_import_rootpool(char *devpath, char *devid) 2695 { 2696 spa_t *spa; 2697 vdev_t *rvd, *bvd, *avd = NULL; 2698 nvlist_t *config, *nvtop; 2699 uint64_t guid, txg; 2700 char *pname; 2701 int error; 2702 2703 /* 2704 * Read the label from the boot device and generate a configuration. 2705 */ 2706 config = spa_generate_rootconf(devpath, devid, &guid); 2707 #if defined(_OBP) && defined(_KERNEL) 2708 if (config == NULL) { 2709 if (strstr(devpath, "/iscsi/ssd") != NULL) { 2710 /* iscsi boot */ 2711 get_iscsi_bootpath_phy(devpath); 2712 config = spa_generate_rootconf(devpath, devid, &guid); 2713 } 2714 } 2715 #endif 2716 if (config == NULL) { 2717 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 2718 devpath); 2719 return (EIO); 2720 } 2721 2722 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 2723 &pname) == 0); 2724 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 2725 2726 mutex_enter(&spa_namespace_lock); 2727 if ((spa = spa_lookup(pname)) != NULL) { 2728 /* 2729 * Remove the existing root pool from the namespace so that we 2730 * can replace it with the correct config we just read in. 2731 */ 2732 spa_remove(spa); 2733 } 2734 2735 spa = spa_add(pname, config, NULL); 2736 spa->spa_is_root = B_TRUE; 2737 spa->spa_load_verbatim = B_TRUE; 2738 2739 /* 2740 * Build up a vdev tree based on the boot device's label config. 2741 */ 2742 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2743 &nvtop) == 0); 2744 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2745 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 2746 VDEV_ALLOC_ROOTPOOL); 2747 spa_config_exit(spa, SCL_ALL, FTAG); 2748 if (error) { 2749 mutex_exit(&spa_namespace_lock); 2750 nvlist_free(config); 2751 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 2752 pname); 2753 return (error); 2754 } 2755 2756 /* 2757 * Get the boot vdev. 2758 */ 2759 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2760 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 2761 (u_longlong_t)guid); 2762 error = ENOENT; 2763 goto out; 2764 } 2765 2766 /* 2767 * Determine if there is a better boot device. 2768 */ 2769 avd = bvd; 2770 spa_alt_rootvdev(rvd, &avd, &txg); 2771 if (avd != bvd) { 2772 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 2773 "try booting from '%s'", avd->vdev_path); 2774 error = EINVAL; 2775 goto out; 2776 } 2777 2778 /* 2779 * If the boot device is part of a spare vdev then ensure that 2780 * we're booting off the active spare. 2781 */ 2782 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2783 !bvd->vdev_isspare) { 2784 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 2785 "try booting from '%s'", 2786 bvd->vdev_parent->vdev_child[1]->vdev_path); 2787 error = EINVAL; 2788 goto out; 2789 } 2790 2791 error = 0; 2792 spa_history_log_version(spa, LOG_POOL_IMPORT); 2793 out: 2794 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2795 vdev_free(rvd); 2796 spa_config_exit(spa, SCL_ALL, FTAG); 2797 mutex_exit(&spa_namespace_lock); 2798 2799 nvlist_free(config); 2800 return (error); 2801 } 2802 2803 #endif 2804 2805 /* 2806 * Take a pool and insert it into the namespace as if it had been loaded at 2807 * boot. 2808 */ 2809 int 2810 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 2811 { 2812 spa_t *spa; 2813 zpool_rewind_policy_t policy; 2814 char *altroot = NULL; 2815 2816 mutex_enter(&spa_namespace_lock); 2817 if (spa_lookup(pool) != NULL) { 2818 mutex_exit(&spa_namespace_lock); 2819 return (EEXIST); 2820 } 2821 2822 (void) nvlist_lookup_string(props, 2823 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2824 spa = spa_add(pool, config, altroot); 2825 2826 zpool_get_rewind_policy(config, &policy); 2827 spa->spa_load_max_txg = policy.zrp_txg; 2828 2829 spa->spa_load_verbatim = B_TRUE; 2830 2831 if (props != NULL) 2832 spa_configfile_set(spa, props, B_FALSE); 2833 2834 spa_config_sync(spa, B_FALSE, B_TRUE); 2835 2836 mutex_exit(&spa_namespace_lock); 2837 spa_history_log_version(spa, LOG_POOL_IMPORT); 2838 2839 return (0); 2840 } 2841 2842 /* 2843 * Import a non-root pool into the system. 2844 */ 2845 int 2846 spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 2847 { 2848 spa_t *spa; 2849 char *altroot = NULL; 2850 spa_load_state_t state = SPA_LOAD_IMPORT; 2851 zpool_rewind_policy_t policy; 2852 int error; 2853 nvlist_t *nvroot; 2854 nvlist_t **spares, **l2cache; 2855 uint_t nspares, nl2cache; 2856 2857 /* 2858 * If a pool with this name exists, return failure. 2859 */ 2860 mutex_enter(&spa_namespace_lock); 2861 if ((spa = spa_lookup(pool)) != NULL) { 2862 mutex_exit(&spa_namespace_lock); 2863 return (EEXIST); 2864 } 2865 2866 zpool_get_rewind_policy(config, &policy); 2867 if (policy.zrp_request & ZPOOL_DO_REWIND) 2868 state = SPA_LOAD_RECOVER; 2869 2870 /* 2871 * Create and initialize the spa structure. 2872 */ 2873 (void) nvlist_lookup_string(props, 2874 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2875 spa = spa_add(pool, config, altroot); 2876 spa_activate(spa, spa_mode_global); 2877 2878 /* 2879 * Don't start async tasks until we know everything is healthy. 2880 */ 2881 spa_async_suspend(spa); 2882 2883 /* 2884 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 2885 * because the user-supplied config is actually the one to trust when 2886 * doing an import. 2887 */ 2888 if (state != SPA_LOAD_RECOVER) 2889 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2890 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 2891 ((policy.zrp_request & ZPOOL_EXTREME_REWIND) != 0)); 2892 2893 /* 2894 * Propagate anything learned about failing or best txgs 2895 * back to caller 2896 */ 2897 spa_rewind_data_to_nvlist(spa, config); 2898 2899 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2900 /* 2901 * Toss any existing sparelist, as it doesn't have any validity 2902 * anymore, and conflicts with spa_has_spare(). 
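 *
 * Clearing sav_config and then reloading empties the in-core spare
 * and l2cache lists; the validated, user-supplied lists are
 * installed further below.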
2903 */ 2904 if (spa->spa_spares.sav_config) { 2905 nvlist_free(spa->spa_spares.sav_config); 2906 spa->spa_spares.sav_config = NULL; 2907 spa_load_spares(spa); 2908 } 2909 if (spa->spa_l2cache.sav_config) { 2910 nvlist_free(spa->spa_l2cache.sav_config); 2911 spa->spa_l2cache.sav_config = NULL; 2912 spa_load_l2cache(spa); 2913 } 2914 2915 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 2916 &nvroot) == 0); 2917 if (error == 0) 2918 error = spa_validate_aux(spa, nvroot, -1ULL, 2919 VDEV_ALLOC_SPARE); 2920 if (error == 0) 2921 error = spa_validate_aux(spa, nvroot, -1ULL, 2922 VDEV_ALLOC_L2CACHE); 2923 spa_config_exit(spa, SCL_ALL, FTAG); 2924 2925 if (props != NULL) 2926 spa_configfile_set(spa, props, B_FALSE); 2927 2928 if (error != 0 || (props && spa_writeable(spa) && 2929 (error = spa_prop_set(spa, props)))) { 2930 spa_unload(spa); 2931 spa_deactivate(spa); 2932 spa_remove(spa); 2933 mutex_exit(&spa_namespace_lock); 2934 return (error); 2935 } 2936 2937 spa_async_resume(spa); 2938 2939 /* 2940 * Override any spares and level 2 cache devices as specified by 2941 * the user, as these may have correct device names/devids, etc. 2942 */ 2943 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2944 &spares, &nspares) == 0) { 2945 if (spa->spa_spares.sav_config) 2946 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 2947 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 2948 else 2949 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 2950 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2951 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2952 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2953 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2954 spa_load_spares(spa); 2955 spa_config_exit(spa, SCL_ALL, FTAG); 2956 spa->spa_spares.sav_sync = B_TRUE; 2957 } 2958 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2959 &l2cache, &nl2cache) == 0) { 2960 if (spa->spa_l2cache.sav_config) 2961 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 2962 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 2963 else 2964 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2965 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2966 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2967 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2968 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2969 spa_load_l2cache(spa); 2970 spa_config_exit(spa, SCL_ALL, FTAG); 2971 spa->spa_l2cache.sav_sync = B_TRUE; 2972 } 2973 2974 /* 2975 * Check for any removed devices. 2976 */ 2977 if (spa->spa_autoreplace) { 2978 spa_aux_check_removed(&spa->spa_spares); 2979 spa_aux_check_removed(&spa->spa_l2cache); 2980 } 2981 2982 if (spa_writeable(spa)) { 2983 /* 2984 * Update the config cache to include the newly-imported pool. 2985 */ 2986 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 2987 } 2988 2989 /* 2990 * It's possible that the pool was expanded while it was exported. 2991 * We kick off an async task to handle this for us. 2992 */ 2993 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 2994 2995 mutex_exit(&spa_namespace_lock); 2996 spa_history_log_version(spa, LOG_POOL_IMPORT); 2997 2998 return (0); 2999 } 3000 3001 3002 /* 3003 * This (illegal) pool name is used when temporarily importing a spa_t in order 3004 * to get the vdev stats associated with the imported devices. 
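 *
 * The '$' prefix keeps the name from colliding with any real pool,
 * since user-visible pool names cannot contain that character.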
3005 */ 3006 #define TRYIMPORT_NAME "$import" 3007 3008 nvlist_t * 3009 spa_tryimport(nvlist_t *tryconfig) 3010 { 3011 nvlist_t *config = NULL; 3012 char *poolname; 3013 spa_t *spa; 3014 uint64_t state; 3015 int error; 3016 3017 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3018 return (NULL); 3019 3020 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3021 return (NULL); 3022 3023 /* 3024 * Create and initialize the spa structure. 3025 */ 3026 mutex_enter(&spa_namespace_lock); 3027 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3028 spa_activate(spa, FREAD); 3029 3030 /* 3031 * Pass off the heavy lifting to spa_load(). 3032 * Pass TRUE for mosconfig because the user-supplied config 3033 * is actually the one to trust when doing an import. 3034 */ 3035 error = spa_load(spa, SPA_LOAD_TRYIMPORT, B_TRUE); 3036 3037 /* 3038 * If 'tryconfig' was at least parsable, return the current config. 3039 */ 3040 if (spa->spa_root_vdev != NULL) { 3041 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3042 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3043 poolname) == 0); 3044 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3045 state) == 0); 3046 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3047 spa->spa_uberblock.ub_timestamp) == 0); 3048 3049 /* 3050 * If the bootfs property exists on this pool then we 3051 * copy it out so that external consumers can tell which 3052 * pools are bootable. 3053 */ 3054 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3055 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3056 3057 /* 3058 * We have to play games with the name since the 3059 * pool was opened as TRYIMPORT_NAME. 3060 */ 3061 if (dsl_dsobj_to_dsname(spa_name(spa), 3062 spa->spa_bootfs, tmpname) == 0) { 3063 char *cp; 3064 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3065 3066 cp = strchr(tmpname, '/'); 3067 if (cp == NULL) { 3068 (void) strlcpy(dsname, tmpname, 3069 MAXPATHLEN); 3070 } else { 3071 (void) snprintf(dsname, MAXPATHLEN, 3072 "%s/%s", poolname, ++cp); 3073 } 3074 VERIFY(nvlist_add_string(config, 3075 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3076 kmem_free(dsname, MAXPATHLEN); 3077 } 3078 kmem_free(tmpname, MAXPATHLEN); 3079 } 3080 3081 /* 3082 * Add the list of hot spares and level 2 cache devices. 3083 */ 3084 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3085 spa_add_spares(spa, config); 3086 spa_add_l2cache(spa, config); 3087 spa_config_exit(spa, SCL_CONFIG, FTAG); 3088 } 3089 3090 spa_unload(spa); 3091 spa_deactivate(spa); 3092 spa_remove(spa); 3093 mutex_exit(&spa_namespace_lock); 3094 3095 return (config); 3096 } 3097 3098 /* 3099 * Pool export/destroy 3100 * 3101 * The act of destroying or exporting a pool is very simple. We make sure there 3102 * is no more pending I/O and any references to the pool are gone. Then, we 3103 * update the pool state and sync all the labels to disk, removing the 3104 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3105 * we don't sync the labels or remove the configuration cache. 
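 *
 * For illustration, the public entry points below (spa_destroy(),
 * spa_export(), spa_reset()) all funnel into this common routine;
 * e.g. a plain export of a placeholder pool name is
 *
 *	error = spa_export("tank", NULL, B_FALSE, B_FALSE);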
3106 */ 3107 static int 3108 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3109 boolean_t force, boolean_t hardforce) 3110 { 3111 spa_t *spa; 3112 3113 if (oldconfig) 3114 *oldconfig = NULL; 3115 3116 if (!(spa_mode_global & FWRITE)) 3117 return (EROFS); 3118 3119 mutex_enter(&spa_namespace_lock); 3120 if ((spa = spa_lookup(pool)) == NULL) { 3121 mutex_exit(&spa_namespace_lock); 3122 return (ENOENT); 3123 } 3124 3125 /* 3126 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3127 * reacquire the namespace lock, and see if we can export. 3128 */ 3129 spa_open_ref(spa, FTAG); 3130 mutex_exit(&spa_namespace_lock); 3131 spa_async_suspend(spa); 3132 mutex_enter(&spa_namespace_lock); 3133 spa_close(spa, FTAG); 3134 3135 /* 3136 * The pool will be in core if it's openable, 3137 * in which case we can modify its state. 3138 */ 3139 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3140 /* 3141 * Objsets may be open only because they're dirty, so we 3142 * have to force it to sync before checking spa_refcnt. 3143 */ 3144 txg_wait_synced(spa->spa_dsl_pool, 0); 3145 3146 /* 3147 * A pool cannot be exported or destroyed if there are active 3148 * references. If we are resetting a pool, allow references by 3149 * fault injection handlers. 3150 */ 3151 if (!spa_refcount_zero(spa) || 3152 (spa->spa_inject_ref != 0 && 3153 new_state != POOL_STATE_UNINITIALIZED)) { 3154 spa_async_resume(spa); 3155 mutex_exit(&spa_namespace_lock); 3156 return (EBUSY); 3157 } 3158 3159 /* 3160 * A pool cannot be exported if it has an active shared spare. 3161 * This is to prevent other pools stealing the active spare 3162 * from an exported pool. At user's own will, such pool can 3163 * be forcedly exported. 3164 */ 3165 if (!force && new_state == POOL_STATE_EXPORTED && 3166 spa_has_active_shared_spare(spa)) { 3167 spa_async_resume(spa); 3168 mutex_exit(&spa_namespace_lock); 3169 return (EXDEV); 3170 } 3171 3172 /* 3173 * We want this to be reflected on every label, 3174 * so mark them all dirty. spa_unload() will do the 3175 * final sync that pushes these changes out. 3176 */ 3177 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3178 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3179 spa->spa_state = new_state; 3180 spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 3181 vdev_config_dirty(spa->spa_root_vdev); 3182 spa_config_exit(spa, SCL_ALL, FTAG); 3183 } 3184 } 3185 3186 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3187 3188 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3189 spa_unload(spa); 3190 spa_deactivate(spa); 3191 } 3192 3193 if (oldconfig && spa->spa_config) 3194 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3195 3196 if (new_state != POOL_STATE_UNINITIALIZED) { 3197 if (!hardforce) 3198 spa_config_sync(spa, B_TRUE, B_TRUE); 3199 spa_remove(spa); 3200 } 3201 mutex_exit(&spa_namespace_lock); 3202 3203 return (0); 3204 } 3205 3206 /* 3207 * Destroy a storage pool. 3208 */ 3209 int 3210 spa_destroy(char *pool) 3211 { 3212 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3213 B_FALSE, B_FALSE)); 3214 } 3215 3216 /* 3217 * Export a storage pool. 3218 */ 3219 int 3220 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3221 boolean_t hardforce) 3222 { 3223 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3224 force, hardforce)); 3225 } 3226 3227 /* 3228 * Similar to spa_export(), this unloads the spa_t without actually removing it 3229 * from the namespace in any way. 
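 *
 * Note that spa_export_common() tolerates outstanding fault-injection
 * references (spa_inject_ref) in this POOL_STATE_UNINITIALIZED case.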
3230 */ 3231 int 3232 spa_reset(char *pool) 3233 { 3234 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3235 B_FALSE, B_FALSE)); 3236 } 3237 3238 /* 3239 * ========================================================================== 3240 * Device manipulation 3241 * ========================================================================== 3242 */ 3243 3244 /* 3245 * Add a device to a storage pool. 3246 */ 3247 int 3248 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3249 { 3250 uint64_t txg, id; 3251 int error; 3252 vdev_t *rvd = spa->spa_root_vdev; 3253 vdev_t *vd, *tvd; 3254 nvlist_t **spares, **l2cache; 3255 uint_t nspares, nl2cache; 3256 3257 txg = spa_vdev_enter(spa); 3258 3259 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3260 VDEV_ALLOC_ADD)) != 0) 3261 return (spa_vdev_exit(spa, NULL, txg, error)); 3262 3263 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3264 3265 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3266 &nspares) != 0) 3267 nspares = 0; 3268 3269 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3270 &nl2cache) != 0) 3271 nl2cache = 0; 3272 3273 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3274 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3275 3276 if (vd->vdev_children != 0 && 3277 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3278 return (spa_vdev_exit(spa, vd, txg, error)); 3279 3280 /* 3281 * We must validate the spares and l2cache devices after checking the 3282 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3283 */ 3284 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3285 return (spa_vdev_exit(spa, vd, txg, error)); 3286 3287 /* 3288 * Transfer each new top-level vdev from vd to rvd. 3289 */ 3290 for (int c = 0; c < vd->vdev_children; c++) { 3291 3292 /* 3293 * Set the vdev id to the first hole, if one exists. 3294 */ 3295 for (id = 0; id < rvd->vdev_children; id++) { 3296 if (rvd->vdev_child[id]->vdev_ishole) { 3297 vdev_free(rvd->vdev_child[id]); 3298 break; 3299 } 3300 } 3301 tvd = vd->vdev_child[c]; 3302 vdev_remove_child(vd, tvd); 3303 tvd->vdev_id = id; 3304 vdev_add_child(rvd, tvd); 3305 vdev_config_dirty(tvd); 3306 } 3307 3308 if (nspares != 0) { 3309 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3310 ZPOOL_CONFIG_SPARES); 3311 spa_load_spares(spa); 3312 spa->spa_spares.sav_sync = B_TRUE; 3313 } 3314 3315 if (nl2cache != 0) { 3316 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3317 ZPOOL_CONFIG_L2CACHE); 3318 spa_load_l2cache(spa); 3319 spa->spa_l2cache.sav_sync = B_TRUE; 3320 } 3321 3322 /* 3323 * We have to be careful when adding new vdevs to an existing pool. 3324 * If other threads start allocating from these vdevs before we 3325 * sync the config cache, and we lose power, then upon reboot we may 3326 * fail to open the pool because there are DVAs that the config cache 3327 * can't translate. Therefore, we first add the vdevs without 3328 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3329 * and then let spa_config_update() initialize the new metaslabs. 3330 * 3331 * spa_load() checks for added-but-not-initialized vdevs, so that 3332 * if we lose power at any point in this sequence, the remaining 3333 * steps will be completed the next time we load the pool. 
3334 */ 3335 (void) spa_vdev_exit(spa, vd, txg, 0); 3336 3337 mutex_enter(&spa_namespace_lock); 3338 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3339 mutex_exit(&spa_namespace_lock); 3340 3341 return (0); 3342 } 3343 3344 /* 3345 * Attach a device to a mirror. The arguments are the path to any device 3346 * in the mirror, and the nvroot for the new device. If the path specifies 3347 * a device that is not mirrored, we automatically insert the mirror vdev. 3348 * 3349 * If 'replacing' is specified, the new device is intended to replace the 3350 * existing device; in this case the two devices are made into their own 3351 * mirror using the 'replacing' vdev, which is functionally identical to 3352 * the mirror vdev (it actually reuses all the same ops) but has a few 3353 * extra rules: you can't attach to it after it's been created, and upon 3354 * completion of resilvering, the first disk (the one being replaced) 3355 * is automatically detached. 3356 */ 3357 int 3358 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3359 { 3360 uint64_t txg, open_txg; 3361 vdev_t *rvd = spa->spa_root_vdev; 3362 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3363 vdev_ops_t *pvops; 3364 char *oldvdpath, *newvdpath; 3365 int newvd_isspare; 3366 int error; 3367 3368 txg = spa_vdev_enter(spa); 3369 3370 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3371 3372 if (oldvd == NULL) 3373 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3374 3375 if (!oldvd->vdev_ops->vdev_op_leaf) 3376 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3377 3378 pvd = oldvd->vdev_parent; 3379 3380 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3381 VDEV_ALLOC_ADD)) != 0) 3382 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3383 3384 if (newrootvd->vdev_children != 1) 3385 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3386 3387 newvd = newrootvd->vdev_child[0]; 3388 3389 if (!newvd->vdev_ops->vdev_op_leaf) 3390 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3391 3392 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3393 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3394 3395 /* 3396 * Spares can't replace logs 3397 */ 3398 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3399 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3400 3401 if (!replacing) { 3402 /* 3403 * For attach, the only allowable parent is a mirror or the root 3404 * vdev. 3405 */ 3406 if (pvd->vdev_ops != &vdev_mirror_ops && 3407 pvd->vdev_ops != &vdev_root_ops) 3408 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3409 3410 pvops = &vdev_mirror_ops; 3411 } else { 3412 /* 3413 * Active hot spares can only be replaced by inactive hot 3414 * spares. 3415 */ 3416 if (pvd->vdev_ops == &vdev_spare_ops && 3417 pvd->vdev_child[1] == oldvd && 3418 !spa_has_spare(spa, newvd->vdev_guid)) 3419 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3420 3421 /* 3422 * If the source is a hot spare, and the parent isn't already a 3423 * spare, then we want to create a new hot spare. Otherwise, we 3424 * want to create a replacing vdev. The user is not allowed to 3425 * attach to a spared vdev child unless the 'isspare' state is 3426 * the same (spare replaces spare, non-spare replaces 3427 * non-spare). 
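 *
 * Summarized (this mirrors the checks just below):
 *	parent is a replacing vdev              -> reject (ENOTSUP)
 *	parent is a spare, isspare mismatch     -> reject (ENOTSUP)
 *	parent not a spare, newvd is a spare    -> use vdev_spare_ops
 *	otherwise                               -> use vdev_replacing_ops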
3428 */ 3429 if (pvd->vdev_ops == &vdev_replacing_ops) 3430 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3431 else if (pvd->vdev_ops == &vdev_spare_ops && 3432 newvd->vdev_isspare != oldvd->vdev_isspare) 3433 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3434 else if (pvd->vdev_ops != &vdev_spare_ops && 3435 newvd->vdev_isspare) 3436 pvops = &vdev_spare_ops; 3437 else 3438 pvops = &vdev_replacing_ops; 3439 } 3440 3441 /* 3442 * Make sure the new device is big enough. 3443 */ 3444 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3445 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3446 3447 /* 3448 * The new device cannot have a higher alignment requirement 3449 * than the top-level vdev. 3450 */ 3451 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3452 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3453 3454 /* 3455 * If this is an in-place replacement, update oldvd's path and devid 3456 * to make it distinguishable from newvd, and unopenable from now on. 3457 */ 3458 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3459 spa_strfree(oldvd->vdev_path); 3460 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3461 KM_SLEEP); 3462 (void) sprintf(oldvd->vdev_path, "%s/%s", 3463 newvd->vdev_path, "old"); 3464 if (oldvd->vdev_devid != NULL) { 3465 spa_strfree(oldvd->vdev_devid); 3466 oldvd->vdev_devid = NULL; 3467 } 3468 } 3469 3470 /* 3471 * If the parent is not a mirror, or if we're replacing, insert the new 3472 * mirror/replacing/spare vdev above oldvd. 3473 */ 3474 if (pvd->vdev_ops != pvops) 3475 pvd = vdev_add_parent(oldvd, pvops); 3476 3477 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3478 ASSERT(pvd->vdev_ops == pvops); 3479 ASSERT(oldvd->vdev_parent == pvd); 3480 3481 /* 3482 * Extract the new device from its root and add it to pvd. 3483 */ 3484 vdev_remove_child(newrootvd, newvd); 3485 newvd->vdev_id = pvd->vdev_children; 3486 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3487 vdev_add_child(pvd, newvd); 3488 3489 tvd = newvd->vdev_top; 3490 ASSERT(pvd->vdev_top == tvd); 3491 ASSERT(tvd->vdev_parent == rvd); 3492 3493 vdev_config_dirty(tvd); 3494 3495 /* 3496 * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3497 * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3498 */ 3499 open_txg = txg + TXG_CONCURRENT_STATES - 1; 3500 3501 vdev_dtl_dirty(newvd, DTL_MISSING, 3502 TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3503 3504 if (newvd->vdev_isspare) { 3505 spa_spare_activate(newvd); 3506 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3507 } 3508 3509 oldvdpath = spa_strdup(oldvd->vdev_path); 3510 newvdpath = spa_strdup(newvd->vdev_path); 3511 newvd_isspare = newvd->vdev_isspare; 3512 3513 /* 3514 * Mark newvd's DTL dirty in this txg. 3515 */ 3516 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3517 3518 (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3519 3520 spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 3521 CRED(), "%s vdev=%s %s vdev=%s", 3522 replacing && newvd_isspare ? "spare in" : 3523 replacing ? "replace" : "attach", newvdpath, 3524 replacing ? "for" : "to", oldvdpath); 3525 3526 spa_strfree(oldvdpath); 3527 spa_strfree(newvdpath); 3528 3529 /* 3530 * Kick off a resilver to update newvd. 3531 */ 3532 VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3533 3534 return (0); 3535 } 3536 3537 /* 3538 * Detach a device from a mirror or replacing vdev. 3539 * If 'replace_done' is specified, only detach if the parent 3540 * is a replacing vdev. 
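 *
 * For example (illustrative only), the resilver-done path detaches the
 * replaced disk with
 *
 *	spa_vdev_detach(spa, guid, pguid, B_TRUE);
 *
 * passing the expected parent guid so a changed topology is caught.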
3541 */ 3542 int 3543 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3544 { 3545 uint64_t txg; 3546 int error; 3547 vdev_t *rvd = spa->spa_root_vdev; 3548 vdev_t *vd, *pvd, *cvd, *tvd; 3549 boolean_t unspare = B_FALSE; 3550 uint64_t unspare_guid; 3551 size_t len; 3552 3553 txg = spa_vdev_enter(spa); 3554 3555 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3556 3557 if (vd == NULL) 3558 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3559 3560 if (!vd->vdev_ops->vdev_op_leaf) 3561 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3562 3563 pvd = vd->vdev_parent; 3564 3565 /* 3566 * If the parent/child relationship is not as expected, don't do it. 3567 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 3568 * vdev that's replacing B with C. The user's intent in replacing 3569 * is to go from M(A,B) to M(A,C). If the user decides to cancel 3570 * the replace by detaching C, the expected behavior is to end up 3571 * M(A,B). But suppose that right after deciding to detach C, 3572 * the replacement of B completes. We would have M(A,C), and then 3573 * ask to detach C, which would leave us with just A -- not what 3574 * the user wanted. To prevent this, we make sure that the 3575 * parent/child relationship hasn't changed -- in this example, 3576 * that C's parent is still the replacing vdev R. 3577 */ 3578 if (pvd->vdev_guid != pguid && pguid != 0) 3579 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3580 3581 /* 3582 * If replace_done is specified, only remove this device if it's 3583 * the first child of a replacing vdev. For the 'spare' vdev, either 3584 * disk can be removed. 3585 */ 3586 if (replace_done) { 3587 if (pvd->vdev_ops == &vdev_replacing_ops) { 3588 if (vd->vdev_id != 0) 3589 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3590 } else if (pvd->vdev_ops != &vdev_spare_ops) { 3591 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3592 } 3593 } 3594 3595 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 3596 spa_version(spa) >= SPA_VERSION_SPARES); 3597 3598 /* 3599 * Only mirror, replacing, and spare vdevs support detach. 3600 */ 3601 if (pvd->vdev_ops != &vdev_replacing_ops && 3602 pvd->vdev_ops != &vdev_mirror_ops && 3603 pvd->vdev_ops != &vdev_spare_ops) 3604 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3605 3606 /* 3607 * If this device has the only valid copy of some data, 3608 * we cannot safely detach it. 3609 */ 3610 if (vdev_dtl_required(vd)) 3611 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3612 3613 ASSERT(pvd->vdev_children >= 2); 3614 3615 /* 3616 * If we are detaching the second disk from a replacing vdev, then 3617 * check to see if we changed the original vdev's path to have "/old" 3618 * at the end in spa_vdev_attach(). If so, undo that change now. 3619 */ 3620 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 3621 pvd->vdev_child[0]->vdev_path != NULL && 3622 pvd->vdev_child[1]->vdev_path != NULL) { 3623 ASSERT(pvd->vdev_child[1] == vd); 3624 cvd = pvd->vdev_child[0]; 3625 len = strlen(vd->vdev_path); 3626 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 3627 strcmp(cvd->vdev_path + len, "/old") == 0) { 3628 spa_strfree(cvd->vdev_path); 3629 cvd->vdev_path = spa_strdup(vd->vdev_path); 3630 } 3631 } 3632 3633 /* 3634 * If we are detaching the original disk from a spare, then it implies 3635 * that the spare should become a real disk, and be removed from the 3636 * active spare list for the pool. 
3637 */ 3638 if (pvd->vdev_ops == &vdev_spare_ops && 3639 vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 3640 unspare = B_TRUE; 3641 3642 /* 3643 * Erase the disk labels so the disk can be used for other things. 3644 * This must be done after all other error cases are handled, 3645 * but before we disembowel vd (so we can still do I/O to it). 3646 * But if we can't do it, don't treat the error as fatal -- 3647 * it may be that the unwritability of the disk is the reason 3648 * it's being detached! 3649 */ 3650 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3651 3652 /* 3653 * Remove vd from its parent and compact the parent's children. 3654 */ 3655 vdev_remove_child(pvd, vd); 3656 vdev_compact_children(pvd); 3657 3658 /* 3659 * Remember one of the remaining children so we can get tvd below. 3660 */ 3661 cvd = pvd->vdev_child[0]; 3662 3663 /* 3664 * If we need to remove the remaining child from the list of hot spares, 3665 * do it now, marking the vdev as no longer a spare in the process. 3666 * We must do this before vdev_remove_parent(), because that can 3667 * change the GUID if it creates a new toplevel GUID. For a similar 3668 * reason, we must remove the spare now, in the same txg as the detach; 3669 * otherwise someone could attach a new sibling, change the GUID, and 3670 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 3671 */ 3672 if (unspare) { 3673 ASSERT(cvd->vdev_isspare); 3674 spa_spare_remove(cvd); 3675 unspare_guid = cvd->vdev_guid; 3676 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3677 } 3678 3679 /* 3680 * If the parent mirror/replacing vdev only has one child, 3681 * the parent is no longer needed. Remove it from the tree. 3682 */ 3683 if (pvd->vdev_children == 1) 3684 vdev_remove_parent(cvd); 3685 3686 /* 3687 * We don't set tvd until now because the parent we just removed 3688 * may have been the previous top-level vdev. 3689 */ 3690 tvd = cvd->vdev_top; 3691 ASSERT(tvd->vdev_parent == rvd); 3692 3693 /* 3694 * Reevaluate the parent vdev state. 3695 */ 3696 vdev_propagate_state(cvd); 3697 3698 /* 3699 * If the 'autoexpand' property is set on the pool then automatically 3700 * try to expand the size of the pool. For example if the device we 3701 * just detached was smaller than the others, it may be possible to 3702 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 3703 * first so that we can obtain the updated sizes of the leaf vdevs. 3704 */ 3705 if (spa->spa_autoexpand) { 3706 vdev_reopen(tvd); 3707 vdev_expand(tvd, txg); 3708 } 3709 3710 vdev_config_dirty(tvd); 3711 3712 /* 3713 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 3714 * vd->vdev_detached is set and free vd's DTL object in syncing context. 3715 * But first make sure we're not on any *other* txg's DTL list, to 3716 * prevent vd from being accessed after it's freed. 3717 */ 3718 for (int t = 0; t < TXG_SIZE; t++) 3719 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 3720 vd->vdev_detached = B_TRUE; 3721 vdev_dirty(tvd, VDD_DTL, vd, txg); 3722 3723 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 3724 3725 error = spa_vdev_exit(spa, vd, txg, 0); 3726 3727 /* 3728 * If this was the removal of the original device in a hot spare vdev, 3729 * then we want to go through and remove the device from the hot spare 3730 * list of every other pool. 
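 *
 * Hot spares can be shared across pools, which is why the loop below
 * walks every active pool and calls spa_vdev_remove() on the unspared
 * guid.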
3731 */ 3732 if (unspare) { 3733 spa_t *myspa = spa; 3734 spa = NULL; 3735 mutex_enter(&spa_namespace_lock); 3736 while ((spa = spa_next(spa)) != NULL) { 3737 if (spa->spa_state != POOL_STATE_ACTIVE) 3738 continue; 3739 if (spa == myspa) 3740 continue; 3741 spa_open_ref(spa, FTAG); 3742 mutex_exit(&spa_namespace_lock); 3743 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 3744 mutex_enter(&spa_namespace_lock); 3745 spa_close(spa, FTAG); 3746 } 3747 mutex_exit(&spa_namespace_lock); 3748 } 3749 3750 return (error); 3751 } 3752 3753 static nvlist_t * 3754 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 3755 { 3756 for (int i = 0; i < count; i++) { 3757 uint64_t guid; 3758 3759 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 3760 &guid) == 0); 3761 3762 if (guid == target_guid) 3763 return (nvpp[i]); 3764 } 3765 3766 return (NULL); 3767 } 3768 3769 static void 3770 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 3771 nvlist_t *dev_to_remove) 3772 { 3773 nvlist_t **newdev = NULL; 3774 3775 if (count > 1) 3776 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 3777 3778 for (int i = 0, j = 0; i < count; i++) { 3779 if (dev[i] == dev_to_remove) 3780 continue; 3781 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 3782 } 3783 3784 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 3785 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 3786 3787 for (int i = 0; i < count - 1; i++) 3788 nvlist_free(newdev[i]); 3789 3790 if (count > 1) 3791 kmem_free(newdev, (count - 1) * sizeof (void *)); 3792 } 3793 3794 /* 3795 * Removing a device from the vdev namespace requires several steps 3796 * and can take a significant amount of time. As a result we use 3797 * the spa_vdev_config_[enter/exit] functions which allow us to 3798 * grab and release the spa_config_lock while still holding the namespace 3799 * lock. During each step the configuration is synced out. 3800 */ 3801 3802 /* 3803 * Evacuate the device. 3804 */ 3805 int 3806 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 3807 { 3808 int error = 0; 3809 uint64_t txg; 3810 3811 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3812 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 3813 ASSERT(vd == vd->vdev_top); 3814 3815 /* 3816 * Evacuate the device. We don't hold the config lock as writer 3817 * since we need to do I/O but we do keep the 3818 * spa_namespace_lock held. Once this completes the device 3819 * should no longer have any blocks allocated on it. 3820 */ 3821 if (vd->vdev_islog) { 3822 error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 3823 NULL, DS_FIND_CHILDREN); 3824 } else { 3825 error = ENOTSUP; /* until we have bp rewrite */ 3826 } 3827 3828 txg_wait_synced(spa_get_dsl(spa), 0); 3829 3830 if (error) 3831 return (error); 3832 3833 /* 3834 * The evacuation succeeded. Remove any remaining MOS metadata 3835 * associated with this vdev, and wait for these changes to sync. 3836 */ 3837 txg = spa_vdev_config_enter(spa); 3838 vd->vdev_removing = B_TRUE; 3839 vdev_dirty(vd, 0, NULL, txg); 3840 vdev_config_dirty(vd); 3841 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 3842 3843 return (0); 3844 } 3845 3846 /* 3847 * Complete the removal by cleaning up the namespace. 
3848 */ 3849 void 3850 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 3851 { 3852 vdev_t *rvd = spa->spa_root_vdev; 3853 uint64_t id = vd->vdev_id; 3854 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 3855 3856 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3857 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3858 ASSERT(vd == vd->vdev_top); 3859 3860 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3861 3862 if (list_link_active(&vd->vdev_state_dirty_node)) 3863 vdev_state_clean(vd); 3864 if (list_link_active(&vd->vdev_config_dirty_node)) 3865 vdev_config_clean(vd); 3866 3867 vdev_free(vd); 3868 3869 if (last_vdev) { 3870 vdev_compact_children(rvd); 3871 } else { 3872 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 3873 vdev_add_child(rvd, vd); 3874 } 3875 vdev_config_dirty(rvd); 3876 3877 /* 3878 * Reassess the health of our root vdev. 3879 */ 3880 vdev_reopen(rvd); 3881 } 3882 3883 /* 3884 * Remove a device from the pool. Currently, this supports removing only hot 3885 * spares, slogs, and level 2 ARC devices. 3886 */ 3887 int 3888 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 3889 { 3890 vdev_t *vd; 3891 metaslab_group_t *mg; 3892 nvlist_t **spares, **l2cache, *nv; 3893 uint64_t txg = 0; 3894 uint_t nspares, nl2cache; 3895 int error = 0; 3896 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 3897 3898 if (!locked) 3899 txg = spa_vdev_enter(spa); 3900 3901 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3902 3903 if (spa->spa_spares.sav_vdevs != NULL && 3904 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3905 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 3906 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 3907 /* 3908 * Only remove the hot spare if it's not currently in use 3909 * in this pool. 3910 */ 3911 if (vd == NULL || unspare) { 3912 spa_vdev_remove_aux(spa->spa_spares.sav_config, 3913 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 3914 spa_load_spares(spa); 3915 spa->spa_spares.sav_sync = B_TRUE; 3916 } else { 3917 error = EBUSY; 3918 } 3919 } else if (spa->spa_l2cache.sav_vdevs != NULL && 3920 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3921 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 3922 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 3923 /* 3924 * Cache devices can always be removed. 3925 */ 3926 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 3927 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 3928 spa_load_l2cache(spa); 3929 spa->spa_l2cache.sav_sync = B_TRUE; 3930 } else if (vd != NULL && vd->vdev_islog) { 3931 ASSERT(!locked); 3932 ASSERT(vd == vd->vdev_top); 3933 3934 /* 3935 * XXX - Once we have bp-rewrite this should 3936 * become the common case. 3937 */ 3938 3939 mg = vd->vdev_mg; 3940 3941 /* 3942 * Stop allocating from this vdev. 3943 */ 3944 metaslab_group_passivate(mg); 3945 3946 /* 3947 * Wait for the youngest allocations and frees to sync, 3948 * and then wait for the deferral of those frees to finish. 3949 */ 3950 spa_vdev_config_exit(spa, NULL, 3951 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 3952 3953 /* 3954 * Attempt to evacuate the vdev. 3955 */ 3956 error = spa_vdev_remove_evacuate(spa, vd); 3957 3958 txg = spa_vdev_config_enter(spa); 3959 3960 /* 3961 * If we couldn't evacuate the vdev, unwind. 3962 */ 3963 if (error) { 3964 metaslab_group_activate(mg); 3965 return (spa_vdev_exit(spa, NULL, txg, error)); 3966 } 3967 3968 /* 3969 * Clean up the vdev namespace. 
3970 */ 3971 spa_vdev_remove_from_namespace(spa, vd); 3972 3973 } else if (vd != NULL) { 3974 /* 3975 * Normal vdevs cannot be removed (yet). 3976 */ 3977 error = ENOTSUP; 3978 } else { 3979 /* 3980 * There is no vdev of any kind with the specified guid. 3981 */ 3982 error = ENOENT; 3983 } 3984 3985 if (!locked) 3986 return (spa_vdev_exit(spa, NULL, txg, error)); 3987 3988 return (error); 3989 } 3990 3991 /* 3992 * Find any device that's done replacing, or a vdev marked 'unspare' that's 3993 * currently spared, so we can detach it. 3994 */ 3995 static vdev_t * 3996 spa_vdev_resilver_done_hunt(vdev_t *vd) 3997 { 3998 vdev_t *newvd, *oldvd; 3999 4000 for (int c = 0; c < vd->vdev_children; c++) { 4001 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 4002 if (oldvd != NULL) 4003 return (oldvd); 4004 } 4005 4006 /* 4007 * Check for a completed replacement. 4008 */ 4009 if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 4010 oldvd = vd->vdev_child[0]; 4011 newvd = vd->vdev_child[1]; 4012 4013 if (vdev_dtl_empty(newvd, DTL_MISSING) && 4014 !vdev_dtl_required(oldvd)) 4015 return (oldvd); 4016 } 4017 4018 /* 4019 * Check for a completed resilver with the 'unspare' flag set. 4020 */ 4021 if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 4022 newvd = vd->vdev_child[0]; 4023 oldvd = vd->vdev_child[1]; 4024 4025 if (newvd->vdev_unspare && 4026 vdev_dtl_empty(newvd, DTL_MISSING) && 4027 !vdev_dtl_required(oldvd)) { 4028 newvd->vdev_unspare = 0; 4029 return (oldvd); 4030 } 4031 } 4032 4033 return (NULL); 4034 } 4035 4036 static void 4037 spa_vdev_resilver_done(spa_t *spa) 4038 { 4039 vdev_t *vd, *pvd, *ppvd; 4040 uint64_t guid, sguid, pguid, ppguid; 4041 4042 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4043 4044 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4045 pvd = vd->vdev_parent; 4046 ppvd = pvd->vdev_parent; 4047 guid = vd->vdev_guid; 4048 pguid = pvd->vdev_guid; 4049 ppguid = ppvd->vdev_guid; 4050 sguid = 0; 4051 /* 4052 * If we have just finished replacing a hot spared device, then 4053 * we need to detach the parent's first child (the original hot 4054 * spare) as well. 4055 */ 4056 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 4057 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4058 ASSERT(ppvd->vdev_children == 2); 4059 sguid = ppvd->vdev_child[1]->vdev_guid; 4060 } 4061 spa_config_exit(spa, SCL_ALL, FTAG); 4062 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4063 return; 4064 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4065 return; 4066 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4067 } 4068 4069 spa_config_exit(spa, SCL_ALL, FTAG); 4070 } 4071 4072 /* 4073 * Update the stored path or FRU for this vdev.
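 *
 * Callers normally use the thin wrappers defined below; an illustrative
 * sketch (the device path is just an example):
 *
 *	error = spa_vdev_setpath(spa, guid, "/dev/dsk/c1t0d0s0");
 *	error = spa_vdev_setfru(spa, guid, newfru);
 *
 * Either call returns ENOENT if no vdev has the given guid, and ENOTSUP
 * if that vdev is not a leaf.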
4074 */ 4075 int 4076 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4077 boolean_t ispath) 4078 { 4079 vdev_t *vd; 4080 4081 spa_vdev_state_enter(spa, SCL_ALL); 4082 4083 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4084 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 4085 4086 if (!vd->vdev_ops->vdev_op_leaf) 4087 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4088 4089 if (ispath) { 4090 spa_strfree(vd->vdev_path); 4091 vd->vdev_path = spa_strdup(value); 4092 } else { 4093 if (vd->vdev_fru != NULL) 4094 spa_strfree(vd->vdev_fru); 4095 vd->vdev_fru = spa_strdup(value); 4096 } 4097 4098 return (spa_vdev_state_exit(spa, vd, 0)); 4099 } 4100 4101 int 4102 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4103 { 4104 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4105 } 4106 4107 int 4108 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4109 { 4110 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4111 } 4112 4113 /* 4114 * ========================================================================== 4115 * SPA Scrubbing 4116 * ========================================================================== 4117 */ 4118 4119 int 4120 spa_scrub(spa_t *spa, pool_scrub_type_t type) 4121 { 4122 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4123 4124 if ((uint_t)type >= POOL_SCRUB_TYPES) 4125 return (ENOTSUP); 4126 4127 /* 4128 * If a resilver was requested, but there is no DTL on a 4129 * writeable leaf device, we have nothing to do. 4130 */ 4131 if (type == POOL_SCRUB_RESILVER && 4132 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4133 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 4134 return (0); 4135 } 4136 4137 if (type == POOL_SCRUB_EVERYTHING && 4138 spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 4139 spa->spa_dsl_pool->dp_scrub_isresilver) 4140 return (EBUSY); 4141 4142 if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 4143 return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 4144 } else if (type == POOL_SCRUB_NONE) { 4145 return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 4146 } else { 4147 return (EINVAL); 4148 } 4149 } 4150 4151 /* 4152 * ========================================================================== 4153 * SPA async task processing 4154 * ========================================================================== 4155 */ 4156 4157 static void 4158 spa_async_remove(spa_t *spa, vdev_t *vd) 4159 { 4160 if (vd->vdev_remove_wanted) { 4161 vd->vdev_remove_wanted = 0; 4162 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 4163 4164 /* 4165 * We want to clear the stats, but we don't want to do a full 4166 * vdev_clear() as that will cause us to throw away 4167 * degraded/faulted state as well as attempt to reopen the 4168 * device, all of which is a waste. 
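 * Instead, just zero the error counters by hand and mark the top-level
 * vdev state-dirty so the change is picked up by the next sync: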
4169 */ 4170 vd->vdev_stat.vs_read_errors = 0; 4171 vd->vdev_stat.vs_write_errors = 0; 4172 vd->vdev_stat.vs_checksum_errors = 0; 4173 4174 vdev_state_dirty(vd->vdev_top); 4175 } 4176 4177 for (int c = 0; c < vd->vdev_children; c++) 4178 spa_async_remove(spa, vd->vdev_child[c]); 4179 } 4180 4181 static void 4182 spa_async_probe(spa_t *spa, vdev_t *vd) 4183 { 4184 if (vd->vdev_probe_wanted) { 4185 vd->vdev_probe_wanted = 0; 4186 vdev_reopen(vd); /* vdev_open() does the actual probe */ 4187 } 4188 4189 for (int c = 0; c < vd->vdev_children; c++) 4190 spa_async_probe(spa, vd->vdev_child[c]); 4191 } 4192 4193 static void 4194 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 4195 { 4196 sysevent_id_t eid; 4197 nvlist_t *attr; 4198 char *physpath; 4199 4200 if (!spa->spa_autoexpand) 4201 return; 4202 4203 for (int c = 0; c < vd->vdev_children; c++) { 4204 vdev_t *cvd = vd->vdev_child[c]; 4205 spa_async_autoexpand(spa, cvd); 4206 } 4207 4208 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 4209 return; 4210 4211 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 4212 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 4213 4214 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4215 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 4216 4217 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 4218 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 4219 4220 nvlist_free(attr); 4221 kmem_free(physpath, MAXPATHLEN); 4222 } 4223 4224 static void 4225 spa_async_thread(spa_t *spa) 4226 { 4227 int tasks; 4228 4229 ASSERT(spa->spa_sync_on); 4230 4231 mutex_enter(&spa->spa_async_lock); 4232 tasks = spa->spa_async_tasks; 4233 spa->spa_async_tasks = 0; 4234 mutex_exit(&spa->spa_async_lock); 4235 4236 /* 4237 * See if the config needs to be updated. 4238 */ 4239 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 4240 uint64_t old_space, new_space; 4241 4242 mutex_enter(&spa_namespace_lock); 4243 old_space = metaslab_class_get_space(spa_normal_class(spa)); 4244 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4245 new_space = metaslab_class_get_space(spa_normal_class(spa)); 4246 mutex_exit(&spa_namespace_lock); 4247 4248 /* 4249 * If the pool grew as a result of the config update, 4250 * then log an internal history event. 4251 */ 4252 if (new_space != old_space) { 4253 spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 4254 spa, NULL, CRED(), 4255 "pool '%s' size: %llu(+%llu)", 4256 spa_name(spa), new_space, new_space - old_space); 4257 } 4258 } 4259 4260 /* 4261 * See if any devices need to be marked REMOVED. 4262 */ 4263 if (tasks & SPA_ASYNC_REMOVE) { 4264 spa_vdev_state_enter(spa, SCL_NONE); 4265 spa_async_remove(spa, spa->spa_root_vdev); 4266 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 4267 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 4268 for (int i = 0; i < spa->spa_spares.sav_count; i++) 4269 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 4270 (void) spa_vdev_state_exit(spa, NULL, 0); 4271 } 4272 4273 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 4274 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4275 spa_async_autoexpand(spa, spa->spa_root_vdev); 4276 spa_config_exit(spa, SCL_CONFIG, FTAG); 4277 } 4278 4279 /* 4280 * See if any devices need to be probed. 4281 */ 4282 if (tasks & SPA_ASYNC_PROBE) { 4283 spa_vdev_state_enter(spa, SCL_NONE); 4284 spa_async_probe(spa, spa->spa_root_vdev); 4285 (void) spa_vdev_state_exit(spa, NULL, 0); 4286 } 4287 4288 /* 4289 * If any devices are done replacing, detach them. 
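 * spa_vdev_resilver_done(), defined earlier in this file, repeatedly hunts
 * the vdev tree for a completed replacing or 'unspare' vdev and detaches
 * each one it finds.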
4290 */ 4291 if (tasks & SPA_ASYNC_RESILVER_DONE) 4292 spa_vdev_resilver_done(spa); 4293 4294 /* 4295 * Kick off a resilver. 4296 */ 4297 if (tasks & SPA_ASYNC_RESILVER) 4298 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 4299 4300 /* 4301 * Let the world know that we're done. 4302 */ 4303 mutex_enter(&spa->spa_async_lock); 4304 spa->spa_async_thread = NULL; 4305 cv_broadcast(&spa->spa_async_cv); 4306 mutex_exit(&spa->spa_async_lock); 4307 thread_exit(); 4308 } 4309 4310 void 4311 spa_async_suspend(spa_t *spa) 4312 { 4313 mutex_enter(&spa->spa_async_lock); 4314 spa->spa_async_suspended++; 4315 while (spa->spa_async_thread != NULL) 4316 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 4317 mutex_exit(&spa->spa_async_lock); 4318 } 4319 4320 void 4321 spa_async_resume(spa_t *spa) 4322 { 4323 mutex_enter(&spa->spa_async_lock); 4324 ASSERT(spa->spa_async_suspended != 0); 4325 spa->spa_async_suspended--; 4326 mutex_exit(&spa->spa_async_lock); 4327 } 4328 4329 static void 4330 spa_async_dispatch(spa_t *spa) 4331 { 4332 mutex_enter(&spa->spa_async_lock); 4333 if (spa->spa_async_tasks && !spa->spa_async_suspended && 4334 spa->spa_async_thread == NULL && 4335 rootdir != NULL && !vn_is_readonly(rootdir)) 4336 spa->spa_async_thread = thread_create(NULL, 0, 4337 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 4338 mutex_exit(&spa->spa_async_lock); 4339 } 4340 4341 void 4342 spa_async_request(spa_t *spa, int task) 4343 { 4344 mutex_enter(&spa->spa_async_lock); 4345 spa->spa_async_tasks |= task; 4346 mutex_exit(&spa->spa_async_lock); 4347 } 4348 4349 /* 4350 * ========================================================================== 4351 * SPA syncing routines 4352 * ========================================================================== 4353 */ 4354 static void 4355 spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg) 4356 { 4357 blkptr_t blk; 4358 uint64_t itor = 0; 4359 uint8_t c = 1; 4360 4361 while (bplist_iterate(bpl, &itor, &blk) == 0) { 4362 ASSERT(blk.blk_birth < txg); 4363 zio_free(spa, txg, &blk); 4364 } 4365 4366 bplist_vacate(bpl, tx); 4367 4368 /* 4369 * Pre-dirty the first block so we sync to convergence faster. 4370 * (Usually only the first block is needed.) 4371 */ 4372 dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx); 4373 } 4374 4375 static void 4376 spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 4377 { 4378 zio_t *zio = arg; 4379 4380 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 4381 zio->io_flags)); 4382 } 4383 4384 static void 4385 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 4386 { 4387 char *packed = NULL; 4388 size_t bufsize; 4389 size_t nvsize = 0; 4390 dmu_buf_t *db; 4391 4392 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 4393 4394 /* 4395 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 4396 * information. This avoids the dbuf_will_dirty() path and 4397 * saves us a pre-read to get data we don't actually care about. 
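 *
 * In other words, nvsize is the packed size of the nvlist, bufsize is
 * nvsize rounded up to a multiple of SPA_CONFIG_BLOCKSIZE, and the tail
 * of the buffer is zeroed so no uninitialized memory is written out.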
4398 */ 4399 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 4400 packed = kmem_alloc(bufsize, KM_SLEEP); 4401 4402 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 4403 KM_SLEEP) == 0); 4404 bzero(packed + nvsize, bufsize - nvsize); 4405 4406 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 4407 4408 kmem_free(packed, bufsize); 4409 4410 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4411 dmu_buf_will_dirty(db, tx); 4412 *(uint64_t *)db->db_data = nvsize; 4413 dmu_buf_rele(db, FTAG); 4414 } 4415 4416 static void 4417 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 4418 const char *config, const char *entry) 4419 { 4420 nvlist_t *nvroot; 4421 nvlist_t **list; 4422 int i; 4423 4424 if (!sav->sav_sync) 4425 return; 4426 4427 /* 4428 * Update the MOS nvlist describing the list of available devices. 4429 * spa_validate_aux() will have already made sure this nvlist is 4430 * valid and the vdevs are labeled appropriately. 4431 */ 4432 if (sav->sav_object == 0) { 4433 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 4434 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 4435 sizeof (uint64_t), tx); 4436 VERIFY(zap_update(spa->spa_meta_objset, 4437 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 4438 &sav->sav_object, tx) == 0); 4439 } 4440 4441 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4442 if (sav->sav_count == 0) { 4443 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 4444 } else { 4445 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 4446 for (i = 0; i < sav->sav_count; i++) 4447 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 4448 B_FALSE, B_FALSE, B_TRUE); 4449 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 4450 sav->sav_count) == 0); 4451 for (i = 0; i < sav->sav_count; i++) 4452 nvlist_free(list[i]); 4453 kmem_free(list, sav->sav_count * sizeof (void *)); 4454 } 4455 4456 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 4457 nvlist_free(nvroot); 4458 4459 sav->sav_sync = B_FALSE; 4460 } 4461 4462 static void 4463 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 4464 { 4465 nvlist_t *config; 4466 4467 if (list_is_empty(&spa->spa_config_dirty_list)) 4468 return; 4469 4470 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4471 4472 config = spa_config_generate(spa, spa->spa_root_vdev, 4473 dmu_tx_get_txg(tx), B_FALSE); 4474 4475 spa_config_exit(spa, SCL_STATE, FTAG); 4476 4477 if (spa->spa_config_syncing) 4478 nvlist_free(spa->spa_config_syncing); 4479 spa->spa_config_syncing = config; 4480 4481 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 4482 } 4483 4484 /* 4485 * Set zpool properties. 4486 */ 4487 static void 4488 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 4489 { 4490 spa_t *spa = arg1; 4491 objset_t *mos = spa->spa_meta_objset; 4492 nvlist_t *nvp = arg2; 4493 nvpair_t *elem; 4494 uint64_t intval; 4495 char *strval; 4496 zpool_prop_t prop; 4497 const char *propname; 4498 zprop_type_t proptype; 4499 4500 mutex_enter(&spa->spa_props_lock); 4501 4502 elem = NULL; 4503 while ((elem = nvlist_next_nvpair(nvp, elem))) { 4504 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 4505 case ZPOOL_PROP_VERSION: 4506 /* 4507 * Only set version for non-zpool-creation cases 4508 * (set/import). spa_create() needs special care 4509 * for version setting. 
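 * For the set/import case handled here, the new version is recorded in
 * the in-core uberblock and the root vdev config is dirtied so the
 * change is written out with this txg: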
4510 */ 4511 if (tx->tx_txg != TXG_INITIAL) { 4512 VERIFY(nvpair_value_uint64(elem, 4513 &intval) == 0); 4514 ASSERT(intval <= SPA_VERSION); 4515 ASSERT(intval >= spa_version(spa)); 4516 spa->spa_uberblock.ub_version = intval; 4517 vdev_config_dirty(spa->spa_root_vdev); 4518 } 4519 break; 4520 4521 case ZPOOL_PROP_ALTROOT: 4522 /* 4523 * 'altroot' is a non-persistent property. It should 4524 * have been set temporarily at creation or import time. 4525 */ 4526 ASSERT(spa->spa_root != NULL); 4527 break; 4528 4529 case ZPOOL_PROP_CACHEFILE: 4530 /* 4531 * 'cachefile' is also a non-persistent property. 4532 */ 4533 break; 4534 default: 4535 /* 4536 * Set pool property values in the poolprops mos object. 4537 */ 4538 if (spa->spa_pool_props_object == 0) { 4539 VERIFY((spa->spa_pool_props_object = 4540 zap_create(mos, DMU_OT_POOL_PROPS, 4541 DMU_OT_NONE, 0, tx)) > 0); 4542 4543 VERIFY(zap_update(mos, 4544 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 4545 8, 1, &spa->spa_pool_props_object, tx) 4546 == 0); 4547 } 4548 4549 /* normalize the property name */ 4550 propname = zpool_prop_to_name(prop); 4551 proptype = zpool_prop_get_type(prop); 4552 4553 if (nvpair_type(elem) == DATA_TYPE_STRING) { 4554 ASSERT(proptype == PROP_TYPE_STRING); 4555 VERIFY(nvpair_value_string(elem, &strval) == 0); 4556 VERIFY(zap_update(mos, 4557 spa->spa_pool_props_object, propname, 4558 1, strlen(strval) + 1, strval, tx) == 0); 4559 4560 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 4561 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 4562 4563 if (proptype == PROP_TYPE_INDEX) { 4564 const char *unused; 4565 VERIFY(zpool_prop_index_to_string( 4566 prop, intval, &unused) == 0); 4567 } 4568 VERIFY(zap_update(mos, 4569 spa->spa_pool_props_object, propname, 4570 8, 1, &intval, tx) == 0); 4571 } else { 4572 ASSERT(0); /* not allowed */ 4573 } 4574 4575 switch (prop) { 4576 case ZPOOL_PROP_DELEGATION: 4577 spa->spa_delegation = intval; 4578 break; 4579 case ZPOOL_PROP_BOOTFS: 4580 spa->spa_bootfs = intval; 4581 break; 4582 case ZPOOL_PROP_FAILUREMODE: 4583 spa->spa_failmode = intval; 4584 break; 4585 case ZPOOL_PROP_AUTOEXPAND: 4586 spa->spa_autoexpand = intval; 4587 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4588 break; 4589 case ZPOOL_PROP_DEDUPDITTO: 4590 spa->spa_dedup_ditto = intval; 4591 break; 4592 default: 4593 break; 4594 } 4595 } 4596 4597 /* log internal history if this is not a zpool create */ 4598 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 4599 tx->tx_txg != TXG_INITIAL) { 4600 spa_history_internal_log(LOG_POOL_PROPSET, 4601 spa, tx, cr, "%s %lld %s", 4602 nvpair_name(elem), intval, spa_name(spa)); 4603 } 4604 } 4605 4606 mutex_exit(&spa->spa_props_lock); 4607 } 4608 4609 /* 4610 * Sync the specified transaction group. New blocks may be dirtied as 4611 * part of the process, so we iterate until it converges. 4612 */ 4613 void 4614 spa_sync(spa_t *spa, uint64_t txg) 4615 { 4616 dsl_pool_t *dp = spa->spa_dsl_pool; 4617 objset_t *mos = spa->spa_meta_objset; 4618 bplist_t *defer_bpl = &spa->spa_deferred_bplist; 4619 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 4620 vdev_t *rvd = spa->spa_root_vdev; 4621 vdev_t *vd; 4622 dmu_tx_t *tx; 4623 int error; 4624 4625 /* 4626 * Lock out configuration changes. 4627 */ 4628 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4629 4630 spa->spa_syncing_txg = txg; 4631 spa->spa_sync_pass = 0; 4632 4633 /* 4634 * If there are any pending vdev state changes, convert them 4635 * into config changes that go out with this transaction group.
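 *
 * The outer loop below re-checks the state-dirty list each time around
 * because new vdevs may have been state-dirtied while the locks were
 * dropped and re-taken.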
4636 */ 4637 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4638 while (list_head(&spa->spa_state_dirty_list) != NULL) { 4639 /* 4640 * We need the write lock here because, for aux vdevs, 4641 * calling vdev_config_dirty() modifies sav_config. 4642 * This is ugly and will become unnecessary when we 4643 * eliminate the aux vdev wart by integrating all vdevs 4644 * into the root vdev tree. 4645 */ 4646 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4647 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 4648 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 4649 vdev_state_clean(vd); 4650 vdev_config_dirty(vd); 4651 } 4652 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 4653 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 4654 } 4655 spa_config_exit(spa, SCL_STATE, FTAG); 4656 4657 VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); 4658 4659 tx = dmu_tx_create_assigned(dp, txg); 4660 4661 /* 4662 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 4663 * set spa_deflate if we have no raid-z vdevs. 4664 */ 4665 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 4666 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 4667 int i; 4668 4669 for (i = 0; i < rvd->vdev_children; i++) { 4670 vd = rvd->vdev_child[i]; 4671 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 4672 break; 4673 } 4674 if (i == rvd->vdev_children) { 4675 spa->spa_deflate = TRUE; 4676 VERIFY(0 == zap_add(spa->spa_meta_objset, 4677 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 4678 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 4679 } 4680 } 4681 4682 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 4683 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 4684 dsl_pool_create_origin(dp, tx); 4685 4686 /* Keeping the origin open increases spa_minref */ 4687 spa->spa_minref += 3; 4688 } 4689 4690 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 4691 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 4692 dsl_pool_upgrade_clones(dp, tx); 4693 } 4694 4695 /* 4696 * If anything has changed in this txg, push the deferred frees 4697 * from the previous txg. If not, leave them alone so that we 4698 * don't generate work on an otherwise idle system. 4699 */ 4700 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 4701 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 4702 !txg_list_empty(&dp->dp_sync_tasks, txg)) 4703 spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); 4704 4705 /* 4706 * Iterate to convergence. 4707 */ 4708 do { 4709 int pass = ++spa->spa_sync_pass; 4710 4711 spa_sync_config_object(spa, tx); 4712 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 4713 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 4714 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 4715 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 4716 spa_errlog_sync(spa, txg); 4717 dsl_pool_sync(dp, txg); 4718 4719 if (pass <= SYNC_PASS_DEFERRED_FREE) { 4720 zio_t *zio = zio_root(spa, NULL, NULL, 0); 4721 bplist_sync(free_bpl, spa_sync_free, zio, tx); 4722 VERIFY(zio_wait(zio) == 0); 4723 } else { 4724 bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); 4725 } 4726 4727 ddt_sync(spa, txg); 4728 4729 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 4730 vdev_sync(vd, txg); 4731 4732 } while (dmu_objset_is_dirty(mos, txg)); 4733 4734 ASSERT(free_bpl->bpl_queue == NULL); 4735 4736 bplist_close(defer_bpl); 4737 4738 /* 4739 * Rewrite the vdev configuration (which includes the uberblock) 4740 * to commit the transaction group. 
4741 * 4742 * If there are no dirty vdevs, we sync the uberblock to a few 4743 * random top-level vdevs that are known to be visible in the 4744 * config cache (see spa_vdev_add() for a complete description). 4745 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 4746 */ 4747 for (;;) { 4748 /* 4749 * We hold SCL_STATE to prevent vdev open/close/etc. 4750 * while we're attempting to write the vdev labels. 4751 */ 4752 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 4753 4754 if (list_is_empty(&spa->spa_config_dirty_list)) { 4755 vdev_t *svd[SPA_DVAS_PER_BP]; 4756 int svdcount = 0; 4757 int children = rvd->vdev_children; 4758 int c0 = spa_get_random(children); 4759 4760 for (int c = 0; c < children; c++) { 4761 vd = rvd->vdev_child[(c0 + c) % children]; 4762 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 4763 continue; 4764 svd[svdcount++] = vd; 4765 if (svdcount == SPA_DVAS_PER_BP) 4766 break; 4767 } 4768 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 4769 if (error != 0) 4770 error = vdev_config_sync(svd, svdcount, txg, 4771 B_TRUE); 4772 } else { 4773 error = vdev_config_sync(rvd->vdev_child, 4774 rvd->vdev_children, txg, B_FALSE); 4775 if (error != 0) 4776 error = vdev_config_sync(rvd->vdev_child, 4777 rvd->vdev_children, txg, B_TRUE); 4778 } 4779 4780 spa_config_exit(spa, SCL_STATE, FTAG); 4781 4782 if (error == 0) 4783 break; 4784 zio_suspend(spa, NULL); 4785 zio_resume_wait(spa); 4786 } 4787 dmu_tx_commit(tx); 4788 4789 /* 4790 * Clear the dirty config list. 4791 */ 4792 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 4793 vdev_config_clean(vd); 4794 4795 /* 4796 * Now that the new config has synced transactionally, 4797 * let it become visible to the config cache. 4798 */ 4799 if (spa->spa_config_syncing != NULL) { 4800 spa_config_set(spa, spa->spa_config_syncing); 4801 spa->spa_config_txg = txg; 4802 spa->spa_config_syncing = NULL; 4803 } 4804 4805 spa->spa_ubsync = spa->spa_uberblock; 4806 4807 dsl_pool_sync_done(dp, txg); 4808 4809 /* 4810 * Update usable space statistics. 4811 */ 4812 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 4813 vdev_sync_done(vd, txg); 4814 4815 spa_update_dspace(spa); 4816 4817 /* 4818 * It had better be the case that we didn't dirty anything 4819 * since vdev_config_sync(). 4820 */ 4821 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 4822 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 4823 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 4824 ASSERT(defer_bpl->bpl_queue == NULL); 4825 ASSERT(free_bpl->bpl_queue == NULL); 4826 4827 spa->spa_sync_pass = 0; 4828 4829 spa_config_exit(spa, SCL_CONFIG, FTAG); 4830 4831 spa_handle_ignored_writes(spa); 4832 4833 /* 4834 * If any async tasks have been requested, kick them off. 4835 */ 4836 spa_async_dispatch(spa); 4837 } 4838 4839 /* 4840 * Sync all pools. We don't want to hold the namespace lock across these 4841 * operations, so we take a reference on the spa_t and drop the lock during the 4842 * sync. 
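 *
 * The same reference/drop-lock pattern is used by spa_evict_all() below;
 * in sketch form:
 *
 *	spa_open_ref(spa, FTAG);
 *	mutex_exit(&spa_namespace_lock);
 *	... potentially long operation, e.g. txg_wait_synced() ...
 *	mutex_enter(&spa_namespace_lock);
 *	spa_close(spa, FTAG);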
4843 */ 4844 void 4845 spa_sync_allpools(void) 4846 { 4847 spa_t *spa = NULL; 4848 mutex_enter(&spa_namespace_lock); 4849 while ((spa = spa_next(spa)) != NULL) { 4850 if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) 4851 continue; 4852 spa_open_ref(spa, FTAG); 4853 mutex_exit(&spa_namespace_lock); 4854 txg_wait_synced(spa_get_dsl(spa), 0); 4855 mutex_enter(&spa_namespace_lock); 4856 spa_close(spa, FTAG); 4857 } 4858 mutex_exit(&spa_namespace_lock); 4859 } 4860 4861 /* 4862 * ========================================================================== 4863 * Miscellaneous routines 4864 * ========================================================================== 4865 */ 4866 4867 /* 4868 * Remove all pools in the system. 4869 */ 4870 void 4871 spa_evict_all(void) 4872 { 4873 spa_t *spa; 4874 4875 /* 4876 * Remove all cached state. All pools should be closed now, 4877 * so every spa in the AVL tree should be unreferenced. 4878 */ 4879 mutex_enter(&spa_namespace_lock); 4880 while ((spa = spa_next(NULL)) != NULL) { 4881 /* 4882 * Stop async tasks. The async thread may need to detach 4883 * a device that's been replaced, which requires grabbing 4884 * spa_namespace_lock, so we must drop it here. 4885 */ 4886 spa_open_ref(spa, FTAG); 4887 mutex_exit(&spa_namespace_lock); 4888 spa_async_suspend(spa); 4889 mutex_enter(&spa_namespace_lock); 4890 spa_close(spa, FTAG); 4891 4892 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4893 spa_unload(spa); 4894 spa_deactivate(spa); 4895 } 4896 spa_remove(spa); 4897 } 4898 mutex_exit(&spa_namespace_lock); 4899 } 4900 4901 vdev_t * 4902 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 4903 { 4904 vdev_t *vd; 4905 int i; 4906 4907 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 4908 return (vd); 4909 4910 if (aux) { 4911 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 4912 vd = spa->spa_l2cache.sav_vdevs[i]; 4913 if (vd->vdev_guid == guid) 4914 return (vd); 4915 } 4916 4917 for (i = 0; i < spa->spa_spares.sav_count; i++) { 4918 vd = spa->spa_spares.sav_vdevs[i]; 4919 if (vd->vdev_guid == guid) 4920 return (vd); 4921 } 4922 } 4923 4924 return (NULL); 4925 } 4926 4927 void 4928 spa_upgrade(spa_t *spa, uint64_t version) 4929 { 4930 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4931 4932 /* 4933 * This should only be called for a non-faulted pool, and since a 4934 * future version would result in an unopenable pool, this shouldn't be 4935 * possible. 4936 */ 4937 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 4938 ASSERT(version >= spa->spa_uberblock.ub_version); 4939 4940 spa->spa_uberblock.ub_version = version; 4941 vdev_config_dirty(spa->spa_root_vdev); 4942 4943 spa_config_exit(spa, SCL_ALL, FTAG); 4944 4945 txg_wait_synced(spa_get_dsl(spa), 0); 4946 } 4947 4948 boolean_t 4949 spa_has_spare(spa_t *spa, uint64_t guid) 4950 { 4951 int i; 4952 uint64_t spareguid; 4953 spa_aux_vdev_t *sav = &spa->spa_spares; 4954 4955 for (i = 0; i < sav->sav_count; i++) 4956 if (sav->sav_vdevs[i]->vdev_guid == guid) 4957 return (B_TRUE); 4958 4959 for (i = 0; i < sav->sav_npending; i++) { 4960 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 4961 &spareguid) == 0 && spareguid == guid) 4962 return (B_TRUE); 4963 } 4964 4965 return (B_FALSE); 4966 } 4967 4968 /* 4969 * Check if a pool has an active shared spare device. 
4970 * Note: reference count of an active spare is 2, as a spare and as a replace 4971 */ 4972 static boolean_t 4973 spa_has_active_shared_spare(spa_t *spa) 4974 { 4975 int i, refcnt; 4976 uint64_t pool; 4977 spa_aux_vdev_t *sav = &spa->spa_spares; 4978 4979 for (i = 0; i < sav->sav_count; i++) { 4980 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 4981 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 4982 refcnt > 2) 4983 return (B_TRUE); 4984 } 4985 4986 return (B_FALSE); 4987 } 4988 4989 /* 4990 * Post a sysevent corresponding to the given event. The 'name' must be one of 4991 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 4992 * filled in from the spa and (optionally) the vdev. This doesn't do anything 4993 * in the userland libzpool, as we don't want consumers to misinterpret ztest 4994 * or zdb as real changes. 4995 */ 4996 void 4997 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 4998 { 4999 #ifdef _KERNEL 5000 sysevent_t *ev; 5001 sysevent_attr_list_t *attr = NULL; 5002 sysevent_value_t value; 5003 sysevent_id_t eid; 5004 5005 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 5006 SE_SLEEP); 5007 5008 value.value_type = SE_DATA_TYPE_STRING; 5009 value.value.sv_string = spa_name(spa); 5010 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 5011 goto done; 5012 5013 value.value_type = SE_DATA_TYPE_UINT64; 5014 value.value.sv_uint64 = spa_guid(spa); 5015 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 5016 goto done; 5017 5018 if (vd) { 5019 value.value_type = SE_DATA_TYPE_UINT64; 5020 value.value.sv_uint64 = vd->vdev_guid; 5021 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 5022 SE_SLEEP) != 0) 5023 goto done; 5024 5025 if (vd->vdev_path) { 5026 value.value_type = SE_DATA_TYPE_STRING; 5027 value.value.sv_string = vd->vdev_path; 5028 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 5029 &value, SE_SLEEP) != 0) 5030 goto done; 5031 } 5032 } 5033 5034 if (sysevent_attach_attributes(ev, attr) != 0) 5035 goto done; 5036 attr = NULL; 5037 5038 (void) log_sysevent(ev, SE_SLEEP, &eid); 5039 5040 done: 5041 if (attr) 5042 sysevent_free_attr(attr); 5043 sysevent_free(ev); 5044 #endif 5045 } 5046
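
/*
 * Illustrative only: a kernel caller that has just changed pool or vdev
 * state would post an event using one of the names defined in
 * sys/sysevent/eventdefs.h (ESC_ZFS_VDEV_REMOVE is assumed here as an
 * example of such a name):
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * In userland builds (ztest, zdb) the call is a no-op, as noted above.
 */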