/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
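 * This sets up the metaslab class, the per-I/O-type issue and interrupt
 * taskqs, the dirty vdev list, the per-txg vdev list, and the error-log
 * AVL trees; spa_deactivate() tears the same state back down.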
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
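	 * Grabbing the config lock as writer and immediately dropping it
	 * waits for any I/O that still holds the lock as reader (such as
	 * traversal prefetch) to drain.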
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) ||
	    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg) && mosconfig)) {
		error = EINVAL;
		goto out;
	}

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > UBERBLOCK_VERSION) {
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all top level vdevs.  We need to grab the
	 * config lock because all label I/O is done with the
	 * ZIO_FLAG_CONFIG_HELD flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_load(rvd);
	spa_config_exit(spa, FTAG);

	if (error)
		goto out;

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
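		 * dmu_objset_find() walks every dataset in the pool and
		 * invokes zil_claim() on each one inside this single
		 * assigned transaction.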
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time we open the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(spa->spa_root_vdev == rvd);

	if (rvd == NULL) {
		error = EINVAL;
	} else {
		if ((error = vdev_create(rvd, txg)) == 0) {
			for (c = 0; c < rvd->vdev_children; c++)
				vdev_init(rvd->vdev_child[c], txg);
			vdev_config_dirty(rvd);
		}
	}

	spa_config_exit(spa, FTAG);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
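	 * For now, allow the detach only if at least one other live child
	 * has completely empty DTLs.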
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Find any device that's done replacing, so we can detach it.
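 * A replacing vdev is considered done once its second child (the new
 * device) has both an empty DTL map and an empty scrub DTL, i.e. the
 * resilver has completed.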
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	uint64_t guid;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
"resilver" : "scrub", 1604 spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 1605 1606 spa_config_enter(spa, RW_WRITER, FTAG); 1607 vdev_reopen(rvd); /* purge all vdev caches */ 1608 vdev_config_dirty(rvd); /* rewrite all disk labels */ 1609 vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1610 spa_config_exit(spa, FTAG); 1611 1612 mutex_enter(&spa->spa_scrub_lock); 1613 spa->spa_scrub_errors = 0; 1614 spa->spa_scrub_active = 1; 1615 ASSERT(spa->spa_scrub_inflight == 0); 1616 ASSERT(spa->spa_scrub_throttled == 0); 1617 1618 while (!spa->spa_scrub_stop) { 1619 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1620 while (spa->spa_scrub_suspended) { 1621 spa->spa_scrub_active = 0; 1622 cv_broadcast(&spa->spa_scrub_cv); 1623 cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1624 spa->spa_scrub_active = 1; 1625 } 1626 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1627 1628 if (spa->spa_scrub_restart_txg != 0) 1629 break; 1630 1631 mutex_exit(&spa->spa_scrub_lock); 1632 error = traverse_more(th); 1633 mutex_enter(&spa->spa_scrub_lock); 1634 if (error != EAGAIN) 1635 break; 1636 1637 while (spa->spa_scrub_throttled > 0) 1638 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1639 } 1640 1641 while (spa->spa_scrub_inflight) 1642 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1643 1644 spa->spa_scrub_active = 0; 1645 cv_broadcast(&spa->spa_scrub_cv); 1646 1647 mutex_exit(&spa->spa_scrub_lock); 1648 1649 spa_config_enter(spa, RW_WRITER, FTAG); 1650 1651 mutex_enter(&spa->spa_scrub_lock); 1652 1653 /* 1654 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 1655 * AND the spa config lock to synchronize with any config changes 1656 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 1657 */ 1658 if (spa->spa_scrub_restart_txg != 0) 1659 error = ERESTART; 1660 1661 if (spa->spa_scrub_stop) 1662 error = EINTR; 1663 1664 /* 1665 * Even if there were uncorrectable errors, we consider the scrub 1666 * completed. The downside is that if there is a transient error during 1667 * a resilver, we won't resilver the data properly to the target. But 1668 * if the damage is permanent (more likely) we will resilver forever, 1669 * which isn't really acceptable. Since there is enough information for 1670 * the user to know what has failed and why, this seems like a more 1671 * tractable approach. 1672 */ 1673 complete = (error == 0); 1674 1675 dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1676 scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1677 spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1678 error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1679 1680 mutex_exit(&spa->spa_scrub_lock); 1681 1682 /* 1683 * If the scrub/resilver completed, update all DTLs to reflect this. 1684 * Whether it succeeded or not, vacate all temporary scrub DTLs. 1685 */ 1686 vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1687 complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1688 vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1689 spa_errlog_rotate(spa); 1690 1691 spa_config_exit(spa, FTAG); 1692 1693 mutex_enter(&spa->spa_scrub_lock); 1694 1695 /* 1696 * We may have finished replacing a device. 1697 * Let the async thread assess this and handle the detach. 1698 */ 1699 spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1700 1701 /* 1702 * If we were told to restart, our final act is to start a new scrub. 1703 */ 1704 if (error == ERESTART) 1705 spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
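 * Each pass syncs the config object, the error logs, the DSL pool, and any
 * vdevs dirtied along the way; we stop once a pass dirties no vdevs.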
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		spa_errlog_sync(spa, txg);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

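/*
 * Illustrative sketch only, not part of the original file: one way a
 * caller might use spa_lookup_by_guid().  The vdev tree can change
 * underneath a caller that does not hold the config lock, so the lookup
 * is bracketed with spa_config_enter()/spa_config_exit() as a reader.
 * The helper name below is hypothetical.
 */
static boolean_t
spa_guid_exists_sketch(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;
	boolean_t found;

	/* Hold the config lock so the vdev tree stays stable. */
	spa_config_enter(spa, RW_READER, FTAG);
	vd = spa_lookup_by_guid(spa, guid);
	found = (vd != NULL);
	spa_config_exit(spa, FTAG);

	return (found);
}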