/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
    spa_error_entry_t *sa = (spa_error_entry_t *)a;
    spa_error_entry_t *sb = (spa_error_entry_t *)b;
    int ret;

    ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
        sizeof (zbookmark_t));

    if (ret < 0)
        return (-1);
    else if (ret > 0)
        return (1);
    else
        return (0);
}
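
/*
 * The bcmp() above yields an arbitrary but total byte-wise ordering of
 * zbookmark_t structures; the error-list AVL trees only require a
 * consistent ordering and the ability to detect duplicate bookmarks,
 * not a semantically meaningful sort.
 */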

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
    ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

    bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
    bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
    int t;

    ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

    spa->spa_state = POOL_STATE_ACTIVE;

    spa->spa_normal_class = metaslab_class_create();

    /*
     * Create one issue and one interrupt taskq per zio type, each
     * with 8 threads at maxclsyspri and a prepopulated cache of
     * 50 task entries.
     */
    for (t = 0; t < ZIO_TYPES; t++) {
        spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
            8, maxclsyspri, 50, INT_MAX,
            TASKQ_PREPOPULATE);
        spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
            8, maxclsyspri, 50, INT_MAX,
            TASKQ_PREPOPULATE);
    }

    rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

    list_create(&spa->spa_dirty_list, sizeof (vdev_t),
        offsetof(vdev_t, vdev_dirty_node));

    txg_list_create(&spa->spa_vdev_txg_list,
        offsetof(struct vdev, vdev_txg_node));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
    int t;

    ASSERT(spa->spa_sync_on == B_FALSE);
    ASSERT(spa->spa_dsl_pool == NULL);
    ASSERT(spa->spa_root_vdev == NULL);

    ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

    txg_list_destroy(&spa->spa_vdev_txg_list);

    list_destroy(&spa->spa_dirty_list);

    rw_destroy(&spa->spa_traverse_lock);

    for (t = 0; t < ZIO_TYPES; t++) {
        taskq_destroy(spa->spa_zio_issue_taskq[t]);
        taskq_destroy(spa->spa_zio_intr_taskq[t]);
        spa->spa_zio_issue_taskq[t] = NULL;
        spa->spa_zio_intr_taskq[t] = NULL;
    }

    metaslab_class_destroy(spa->spa_normal_class);
    spa->spa_normal_class = NULL;

    /*
     * If this was part of an import or the open otherwise failed, we may
     * still have errors left in the queues.  Empty them just in case.
     */
    spa_errlog_drain(spa);

    avl_destroy(&spa->spa_errlist_scrub);
    avl_destroy(&spa->spa_errlist_last);

    spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
    nvlist_t **child;
    uint_t c, children;
    vdev_t *vd;

    if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
        return (NULL);

    if (vd->vdev_ops->vdev_op_leaf)
        return (vd);

    if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
        &child, &children) != 0) {
        vdev_free(vd);
        return (NULL);
    }

    for (c = 0; c < children; c++) {
        if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
            vdev_free(vd);
            return (NULL);
        }
    }

    return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
    /*
     * Stop async tasks.
     */
    spa_async_suspend(spa);

    /*
     * Stop syncing.
     */
    if (spa->spa_sync_on) {
        txg_sync_stop(spa->spa_dsl_pool);
        spa->spa_sync_on = B_FALSE;
    }

    /*
     * Wait for any outstanding prefetch I/O to complete.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);
    spa_config_exit(spa, FTAG);

    /*
     * Close the dsl pool.
     */
    if (spa->spa_dsl_pool) {
        dsl_pool_close(spa->spa_dsl_pool);
        spa->spa_dsl_pool = NULL;
    }

    /*
     * Close all vdevs.
     */
    if (spa->spa_root_vdev)
        vdev_free(spa->spa_root_vdev);
    ASSERT(spa->spa_root_vdev == NULL);

    spa->spa_async_suspended = 0;
}
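
/*
 * spa_load() below proceeds in stages: parse the supplied config into
 * a vdev tree, open the vdevs and read their labels, select the best
 * uberblock, open the DSL pool, and then, if the supplied config was
 * not the trusted copy, restart using the config stored in the MOS.
 * Every failure path funnels through the 'out' label so the load state
 * is reset and an ereport is posted before returning.
 */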

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
    int error = 0;
    uint64_t config_cache_txg = spa->spa_config_txg;
    nvlist_t *nvroot = NULL;
    vdev_t *rvd;
    uberblock_t *ub = &spa->spa_uberblock;
    uint64_t pool_guid;
    zio_t *zio;

    spa->spa_load_state = state;
    if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
        nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
        error = EINVAL;
        goto out;
    }

    (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
        &spa->spa_config_txg);

    if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
        spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
        spa_guid_exists(pool_guid, 0)) {
        error = EEXIST;
        goto out;
    }

    /*
     * Parse the configuration into a vdev tree.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);
    rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
    spa_config_exit(spa, FTAG);

    if (rvd == NULL) {
        error = EINVAL;
        goto out;
    }

    ASSERT(spa->spa_root_vdev == rvd);
    ASSERT(spa_guid(spa) == pool_guid);

    /*
     * Try to open all vdevs, loading each label in the process.
     */
    if (vdev_open(rvd) != 0) {
        error = ENXIO;
        goto out;
    }

    /*
     * Find the best uberblock.
     */
    bzero(ub, sizeof (uberblock_t));

    zio = zio_root(spa, NULL, NULL,
        ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
    vdev_uberblock_load(zio, rvd, ub);
    error = zio_wait(zio);

    /*
     * If we weren't able to find a single valid uberblock, return failure.
     */
    if (ub->ub_txg == 0) {
        error = ENXIO;
        goto out;
    }

    /*
     * If the pool is newer than the code, we can't open it.
     */
    if (ub->ub_version > UBERBLOCK_VERSION) {
        error = ENOTSUP;
        goto out;
    }

    /*
     * If the vdev guid sum doesn't match the uberblock, we have an
     * incomplete configuration.
     */
    if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_BAD_GUID_SUM);
        error = ENXIO;
        goto out;
    }
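
    /*
     * Note that the guid sum check above is only enforced when loading
     * the trusted copy of the config from the MOS (mosconfig != 0);
     * the copy found in the labels or the cache file may be stale, so
     * a mismatch on the first pass is presumably not yet conclusive.
     */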

    /*
     * Initialize internal SPA structures.
     */
    spa->spa_state = POOL_STATE_ACTIVE;
    spa->spa_ubsync = spa->spa_uberblock;
    spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
    error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
    if (error) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        goto out;
    }
    spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

    if (zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
        sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    /*
     * If the config we were given was not read out of the MOS, unpack
     * the authoritative copy stored there and restart the load with it,
     * this time with mosconfig set.
     */
    if (!mosconfig) {
        dmu_buf_t *db;
        char *packed = NULL;
        size_t nvsize = 0;
        nvlist_t *newconfig = NULL;

        VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
            spa->spa_config_object, FTAG, &db));
        nvsize = *(uint64_t *)db->db_data;
        dmu_buf_rele(db, FTAG);

        packed = kmem_alloc(nvsize, KM_SLEEP);
        error = dmu_read(spa->spa_meta_objset,
            spa->spa_config_object, 0, nvsize, packed);
        if (error == 0)
            error = nvlist_unpack(packed, nvsize, &newconfig, 0);
        kmem_free(packed, nvsize);

        if (error) {
            vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                VDEV_AUX_CORRUPT_DATA);
            error = EIO;
            goto out;
        }

        spa_config_set(spa, newconfig);

        spa_unload(spa);
        spa_deactivate(spa);
        spa_activate(spa);

        return (spa_load(spa, newconfig, state, B_TRUE));
    }

    if (zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
        sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    /*
     * Load the persistent error log.  If we have an older pool, this will
     * not be present.
     */
    error = zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
        sizeof (uint64_t), 1, &spa->spa_errlog_last);
    if (error != 0 && error != ENOENT) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    error = zap_lookup(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
        sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
    if (error != 0 && error != ENOENT) {
        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
            VDEV_AUX_CORRUPT_DATA);
        error = EIO;
        goto out;
    }

    /*
     * Load the vdev state for all top level vdevs.  We need to grab the
     * config lock because all label I/O is done with the
     * ZIO_FLAG_CONFIG_HELD flag.
     */
    spa_config_enter(spa, RW_READER, FTAG);
    if ((error = vdev_load(rvd)) != 0) {
        spa_config_exit(spa, FTAG);
        goto out;
    }
    spa_config_exit(spa, FTAG);

    /*
     * Propagate the leaf DTLs we just loaded all the way up the tree.
     */
    spa_config_enter(spa, RW_WRITER, FTAG);
    vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
    spa_config_exit(spa, FTAG);

    /*
     * Check the state of the root vdev.  If it can't be opened, it
     * indicates one or more toplevel vdevs are faulted.
     */
    if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
        error = ENXIO;
        goto out;
    }

    /*
     * Claim log blocks that haven't been committed yet, and update all
     * top-level vdevs to sync any config changes found in vdev_load().
     * This must all happen in a single txg.
     */
    if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
        int c;
        dmu_tx_t *tx;

        spa_config_enter(spa, RW_WRITER, FTAG);
        vdev_config_dirty(rvd);
        spa_config_exit(spa, FTAG);

        tx = dmu_tx_create_assigned(spa_get_dsl(spa),
            spa_first_txg(spa));
        dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
        dmu_tx_commit(tx);

        spa->spa_sync_on = B_TRUE;
        txg_sync_start(spa->spa_dsl_pool);

        /*
         * Wait for all claims to sync.
         */
        txg_wait_synced(spa->spa_dsl_pool, 0);

        /*
         * If the config cache is stale relative to the mosconfig,
         * sync the config cache.
         */
        if (config_cache_txg != spa->spa_config_txg) {
            uint64_t txg;
            spa_config_enter(spa, RW_WRITER, FTAG);
            txg = spa_last_synced_txg(spa) + 1;
            spa_config_set(spa,
                spa_config_generate(spa, rvd, txg, 0));
            spa_config_exit(spa, FTAG);
            txg_wait_synced(spa->spa_dsl_pool, txg);
            spa_config_sync();
        }

        /*
         * If we have top-level vdevs that were added but have
         * not yet been prepared for allocation, do that now.
         * (It's safe now because the config cache is up to date,
         * so it will be able to translate the new DVAs.)
         * See comments in spa_vdev_add() for full details.
         */
        for (c = 0; c < rvd->vdev_children; c++) {
            vdev_t *tvd = rvd->vdev_child[c];
            if (tvd->vdev_ms_array == 0) {
                uint64_t txg;
                ASSERT(tvd->vdev_ms_shift == 0);
                spa_config_enter(spa, RW_WRITER, FTAG);
                txg = spa_last_synced_txg(spa) + 1;
                vdev_init(tvd, txg);
                vdev_config_dirty(tvd);
                spa_config_set(spa,
                    spa_config_generate(spa, rvd, txg, 0));
                spa_config_exit(spa, FTAG);
                txg_wait_synced(spa->spa_dsl_pool, txg);
                ASSERT(tvd->vdev_ms_shift != 0);
                ASSERT(tvd->vdev_ms_array != 0);
                spa_config_sync();
            }
        }
    }

    error = 0;
out:
    if (error)
        zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
    spa->spa_load_state = SPA_LOAD_NONE;
    spa->spa_ena = 0;

    return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
    spa_t *spa;
    int error;
    int loaded = B_FALSE;
    int locked = B_FALSE;

    *spapp = NULL;

    /*
     * As disgusting as this is, we need to support recursive calls to this
     * function because dsl_dir_open() is called during spa_load(), and ends
     * up calling spa_open() again.  The real fix is to figure out how to
     * avoid dsl_dir_open() calling this in the first place.
     */
    if (mutex_owner(&spa_namespace_lock) != curthread) {
        mutex_enter(&spa_namespace_lock);
        locked = B_TRUE;
    }

    if ((spa = spa_lookup(pool)) == NULL) {
        if (locked)
            mutex_exit(&spa_namespace_lock);
        return (ENOENT);
    }
    if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

        spa_activate(spa);

        error = spa_load(spa, spa->spa_config,
            SPA_LOAD_OPEN, B_FALSE);

        if (error == EBADF) {
            /*
             * If vdev_load() returns EBADF, it indicates that one
             * of the vdevs indicates that the pool has been
             * exported or destroyed.  If this is the case, the
             * config cache is out of sync and we should remove the
             * pool from the namespace.
             */
            spa_unload(spa);
            spa_deactivate(spa);
            spa_remove(spa);
            spa_config_sync();
            if (locked)
                mutex_exit(&spa_namespace_lock);
            return (ENOENT);
        }

        if (error) {
            /*
             * We can't open the pool, but we still have useful
             * information: the state of each vdev after the
             * attempted vdev_open().  Return this to the user.
             */
            if (config != NULL && spa->spa_root_vdev != NULL)
                *config = spa_config_generate(spa, NULL, -1ULL,
                    B_TRUE);
            spa_unload(spa);
            spa_deactivate(spa);
            spa->spa_last_open_failed = B_TRUE;
            if (locked)
                mutex_exit(&spa_namespace_lock);
            *spapp = NULL;
            return (error);
        } else {
            zfs_post_ok(spa, NULL);
            spa->spa_last_open_failed = B_FALSE;
        }

        loaded = B_TRUE;
    }

    spa_open_ref(spa, tag);
    if (locked)
        mutex_exit(&spa_namespace_lock);

    *spapp = spa;

    if (config != NULL) {
        spa_config_enter(spa, RW_READER, FTAG);
        *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
        spa_config_exit(spa, FTAG);
    }

    /*
     * If we just loaded the pool, resilver anything that's out of date.
     */
    if (loaded && (spa_mode & FWRITE))
        VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

    return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
    return (spa_open_common(name, spapp, tag, NULL));
}
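
/*
 * The inject reference count manipulated below is used by the fault
 * injection framework (e.g. zinject) to keep a pool from disappearing
 * while injection handlers reference it; spa_export_common() refuses
 * to export or destroy a pool whose spa_inject_ref is nonzero.
 */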

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
    spa_t *spa;

    mutex_enter(&spa_namespace_lock);
    if ((spa = spa_lookup(name)) == NULL) {
        mutex_exit(&spa_namespace_lock);
        return (NULL);
    }
    spa->spa_inject_ref++;
    mutex_exit(&spa_namespace_lock);

    return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
    mutex_enter(&spa_namespace_lock);
    spa->spa_inject_ref--;
    mutex_exit(&spa_namespace_lock);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
    int error;
    spa_t *spa;

    *config = NULL;
    error = spa_open_common(name, &spa, FTAG, config);

    if (spa && *config != NULL)
        VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
            spa_get_errlog_size(spa)) == 0);

    /*
     * We want to get the alternate root even for faulted pools, so we
     * cheat and call spa_lookup() directly.
     */
    if (altroot) {
        if (spa == NULL) {
            mutex_enter(&spa_namespace_lock);
            spa = spa_lookup(name);
            if (spa)
                spa_altroot(spa, altroot, buflen);
            else
                altroot[0] = '\0';
            spa = NULL;
            mutex_exit(&spa_namespace_lock);
        } else {
            spa_altroot(spa, altroot, buflen);
        }
    }

    if (spa != NULL)
        spa_close(spa, FTAG);

    return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
    spa_t *spa;
    dsl_pool_t *dp;
    dmu_tx_t *tx;
    int error;
    uint64_t txg = TXG_INITIAL;

    /*
     * If this pool already exists, return failure.
     */
    mutex_enter(&spa_namespace_lock);
    if (spa_lookup(pool) != NULL) {
        mutex_exit(&spa_namespace_lock);
        return (EEXIST);
    }
    spa = spa_add(pool);

    /*
     * Allocate a new spa_t structure.
     */
    spa_activate(spa);

    if (altroot != NULL) {
        spa->spa_root = spa_strdup(altroot);
        atomic_add_32(&spa_active_count, 1);
    }

    spa->spa_uberblock.ub_txg = txg - 1;
    spa->spa_ubsync = spa->spa_uberblock;

    error = spa_vdev_add(spa, nvroot);

    if (error) {
        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);
        return (error);
    }

    spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
    spa->spa_meta_objset = dp->dp_meta_objset;

    tx = dmu_tx_create_assigned(dp, txg);

    /*
     * Create the pool config object.
     */
    spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
        DMU_OT_PACKED_NVLIST, 1 << 14,
        DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

    if (zap_add(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
        sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
        cmn_err(CE_PANIC, "failed to add pool config");
    }

    /*
     * Create the deferred-free bplist object.  Turn off compression
     * because sync-to-convergence takes longer if the blocksize
     * keeps changing.
     */
    spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
        1 << 14, tx);
    dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
        ZIO_COMPRESS_OFF, tx);

    if (zap_add(spa->spa_meta_objset,
        DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
        sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
        cmn_err(CE_PANIC, "failed to add bplist");
    }

    dmu_tx_commit(tx);

    spa->spa_sync_on = B_TRUE;
    txg_sync_start(spa->spa_dsl_pool);

    /*
     * We explicitly wait for the first transaction to complete so that our
     * bean counters are appropriately updated.
     */
    txg_wait_synced(spa->spa_dsl_pool, txg);

    spa_config_sync();

    mutex_exit(&spa_namespace_lock);

    return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
    spa_t *spa;
    int error;

    if (!(spa_mode & FWRITE))
        return (EROFS);

    /*
     * If a pool with this name exists, return failure.
     */
    mutex_enter(&spa_namespace_lock);
    if (spa_lookup(pool) != NULL) {
        mutex_exit(&spa_namespace_lock);
        return (EEXIST);
    }

    /*
     * Create and initialize the spa structure.
     */
    spa = spa_add(pool);
    spa_activate(spa);

    /*
     * Set the alternate root, if there is one.
     */
    if (altroot != NULL) {
        spa->spa_root = spa_strdup(altroot);
        atomic_add_32(&spa_active_count, 1);
    }

    /*
     * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
     * so that we don't try to open the pool if the config is damaged.
     * Note: on success, spa_load() will update and sync the config cache.
     */
    error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

    if (error) {
        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);
        return (error);
    }

    mutex_exit(&spa_namespace_lock);

    /*
     * Resilver anything that's out of date.
     */
    if (spa_mode & FWRITE)
        VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

    return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
    nvlist_t *config = NULL;
    char *poolname;
    spa_t *spa;
    uint64_t state;

    if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
        return (NULL);

    if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
        return (NULL);

    mutex_enter(&spa_namespace_lock);
    spa = spa_add(TRYIMPORT_NAME);

    ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

    /*
     * Initialize the spa_t structure.
     */
    spa_activate(spa);

    /*
     * Pass off the heavy lifting to spa_load().  We pass TRUE for mosconfig
     * so we don't try to open the pool if the config is damaged.
     */
    (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
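
    /*
     * The return value of spa_load() is deliberately ignored: even a
     * failed load can leave behind a partially constructed vdev tree,
     * and the per-vdev state in that tree is exactly what the caller
     * needs in order to report why the import would fail.
     */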

    /*
     * If 'tryconfig' was at least parsable, return the current config.
     */
    if (spa->spa_root_vdev != NULL) {
        config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
        VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
            poolname) == 0);
        VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
            state) == 0);
    }

    spa_unload(spa);
    spa_deactivate(spa);
    spa_remove(spa);
    mutex_exit(&spa_namespace_lock);

    return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
    spa_t *spa;

    if (!(spa_mode & FWRITE))
        return (EROFS);

    mutex_enter(&spa_namespace_lock);
    if ((spa = spa_lookup(pool)) == NULL) {
        mutex_exit(&spa_namespace_lock);
        return (ENOENT);
    }

    /*
     * Put a hold on the pool, drop the namespace lock, stop async tasks,
     * reacquire the namespace lock, and see if we can export.
     */
    spa_open_ref(spa, FTAG);
    mutex_exit(&spa_namespace_lock);
    spa_async_suspend(spa);
    mutex_enter(&spa_namespace_lock);
    spa_close(spa, FTAG);

    /*
     * The pool will be in core if it's openable,
     * in which case we can modify its state.
     */
    if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
        /*
         * Objsets may be open only because they're dirty, so we
         * have to force it to sync before checking spa_refcnt.
         */
        spa_scrub_suspend(spa);
        txg_wait_synced(spa->spa_dsl_pool, 0);

        /*
         * A pool cannot be exported or destroyed if there are active
         * references.  If we are resetting a pool, allow references by
         * fault injection handlers.
         */
        if (!spa_refcount_zero(spa) ||
            (spa->spa_inject_ref != 0 &&
            new_state != POOL_STATE_UNINITIALIZED)) {
            spa_scrub_resume(spa);
            spa_async_resume(spa);
            mutex_exit(&spa_namespace_lock);
            return (EBUSY);
        }

        spa_scrub_resume(spa);
        VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

        if (spa->spa_root != NULL)
            atomic_add_32(&spa_active_count, -1);
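
        /*
         * spa_active_count was bumped when an alternate root was set
         * on this pool at create or import time; the decrement above
         * keeps the count of active altroot pools balanced.
         */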

        /*
         * We want this to be reflected on every label,
         * so mark them all dirty.  spa_unload() will do the
         * final sync that pushes these changes out.
         */
        if (new_state != POOL_STATE_UNINITIALIZED) {
            spa_config_enter(spa, RW_WRITER, FTAG);
            spa->spa_state = new_state;
            vdev_config_dirty(spa->spa_root_vdev);
            spa_config_exit(spa, FTAG);
        }
    }

    if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
        spa_unload(spa);
        spa_deactivate(spa);
    }

    if (new_state != POOL_STATE_UNINITIALIZED) {
        spa_remove(spa);
        spa_config_sync();
    }
    mutex_exit(&spa_namespace_lock);

    return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
    return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
    return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
    return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
    uint64_t txg;
    int c, c0, children, error;
    vdev_t *rvd = spa->spa_root_vdev;
    vdev_t *vd, *tvd;

    txg = spa_vdev_enter(spa);

    vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

    if (vd == NULL)
        return (spa_vdev_exit(spa, vd, txg, EINVAL));

    if (rvd == NULL) {			/* spa_create() */
        rvd = vd;
        c0 = 0;
    } else {
        c0 = rvd->vdev_children;
    }

    ASSERT(spa->spa_root_vdev == rvd);

    if ((error = vdev_create(vd, txg)) != 0)
        return (spa_vdev_exit(spa, vd, txg, error));

    children = vd->vdev_children;

    /*
     * Transfer each new top-level vdev from vd to rvd.
     */
    for (c = 0; c < children; c++) {
        tvd = vd->vdev_child[c];
        if (vd != rvd) {
            vdev_remove_child(vd, tvd);
            tvd->vdev_id = c0 + c;
            vdev_add_child(rvd, tvd);
        }
        vdev_config_dirty(tvd);
    }

    /*
     * We have to be careful when adding new vdevs to an existing pool.
     * If other threads start allocating from these vdevs before we
     * sync the config cache, and we lose power, then upon reboot we may
     * fail to open the pool because there are DVAs that the config cache
     * can't translate.  Therefore, we first add the vdevs without
     * initializing metaslabs; sync the config cache (via spa_vdev_exit());
     * initialize the metaslabs; and sync the config cache again.
     *
     * spa_load() checks for added-but-not-initialized vdevs, so that
     * if we lose power at any point in this sequence, the remaining
     * steps will be completed the next time we load the pool.
     */
    if (vd != rvd) {
        (void) spa_vdev_exit(spa, vd, txg, 0);
        txg = spa_vdev_enter(spa);
        vd = NULL;
    }

    /*
     * Now that the config is safely on disk, we can use the new space.
     */
    for (c = 0; c < children; c++) {
        tvd = rvd->vdev_child[c0 + c];
        ASSERT(tvd->vdev_ms_array == 0);
        vdev_init(tvd, txg);
        vdev_config_dirty(tvd);
    }

    return (spa_vdev_exit(spa, vd, txg, 0));
}
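
/*
 * Each topology change in this file is bracketed by spa_vdev_enter()
 * and spa_vdev_exit().  The enter side takes the namespace and config
 * locks and returns the txg in which the change will commit; the exit
 * side pushes the change out -- waiting for that txg to sync and
 * updating the config cache -- before handing back the supplied error
 * code.
 */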

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
    uint64_t txg, open_txg;
    int error;
    vdev_t *rvd = spa->spa_root_vdev;
    vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
    vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

    txg = spa_vdev_enter(spa);

    oldvd = vdev_lookup_by_guid(rvd, guid);

    if (oldvd == NULL)
        return (spa_vdev_exit(spa, NULL, txg, ENODEV));

    if (!oldvd->vdev_ops->vdev_op_leaf)
        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

    pvd = oldvd->vdev_parent;

    /*
     * The parent must be a mirror or the root, unless we're replacing;
     * in that case, the parent can be anything but another replacing vdev.
     */
    if (pvd->vdev_ops != &vdev_mirror_ops &&
        pvd->vdev_ops != &vdev_root_ops &&
        (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

    newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

    if (newrootvd == NULL || newrootvd->vdev_children != 1)
        return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

    newvd = newrootvd->vdev_child[0];

    if (!newvd->vdev_ops->vdev_op_leaf)
        return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

    if ((error = vdev_create(newrootvd, txg)) != 0)
        return (spa_vdev_exit(spa, newrootvd, txg, error));

    /*
     * Compare the new device size with the replaceable/attachable
     * device size.
     */
    if (newvd->vdev_psize < vdev_get_rsize(oldvd))
        return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

    if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
        return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

    /*
     * If this is an in-place replacement, update oldvd's path and devid
     * to make it distinguishable from newvd, and unopenable from now on.
     */
    if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
        spa_strfree(oldvd->vdev_path);
        oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
            KM_SLEEP);
        (void) sprintf(oldvd->vdev_path, "%s/%s",
            newvd->vdev_path, "old");
        if (oldvd->vdev_devid != NULL) {
            spa_strfree(oldvd->vdev_devid);
            oldvd->vdev_devid = NULL;
        }
    }

    /*
     * If the parent is not a mirror, or if we're replacing,
     * insert the new mirror/replacing vdev above oldvd.
     */
    if (pvd->vdev_ops != pvops)
        pvd = vdev_add_parent(oldvd, pvops);

    ASSERT(pvd->vdev_top->vdev_parent == rvd);
    ASSERT(pvd->vdev_ops == pvops);
    ASSERT(oldvd->vdev_parent == pvd);

    /*
     * Extract the new device from its root and add it to pvd.
     */
    vdev_remove_child(newrootvd, newvd);
    newvd->vdev_id = pvd->vdev_children;
    vdev_add_child(pvd, newvd);

    /*
     * If newvd is smaller than oldvd, but larger than its rsize,
     * the addition of newvd may have decreased our parent's asize.
     */
    pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

    tvd = newvd->vdev_top;
    ASSERT(pvd->vdev_top == tvd);
    ASSERT(tvd->vdev_parent == rvd);

    vdev_config_dirty(tvd);

    /*
     * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
     * upward when spa_vdev_exit() calls vdev_dtl_reassess().
     */
    open_txg = txg + TXG_CONCURRENT_STATES - 1;

    mutex_enter(&newvd->vdev_dtl_lock);
    space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
        open_txg - TXG_INITIAL + 1);
    mutex_exit(&newvd->vdev_dtl_lock);

    dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

    /*
     * Mark newvd's DTL dirty in this txg.
     */
    vdev_dirty(tvd, VDD_DTL, txg);
    (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

    (void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

    /*
     * Kick off a resilver to update newvd.
     */
    VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

    return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
    uint64_t txg;
    int c, t, error;
    vdev_t *rvd = spa->spa_root_vdev;
    vdev_t *vd, *pvd, *cvd, *tvd;

    txg = spa_vdev_enter(spa);

    vd = vdev_lookup_by_guid(rvd, guid);

    if (vd == NULL)
        return (spa_vdev_exit(spa, NULL, txg, ENODEV));

    if (!vd->vdev_ops->vdev_op_leaf)
        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

    pvd = vd->vdev_parent;

    /*
     * If replace_done is specified, only remove this device if it's
     * the first child of a replacing vdev.
     */
    if (replace_done &&
        (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

    /*
     * Only mirror and replacing vdevs support detach.
     */
    if (pvd->vdev_ops != &vdev_replacing_ops &&
        pvd->vdev_ops != &vdev_mirror_ops)
        return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
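
    /*
     * The checks below consult each sibling's DTL (dirty time log), a
     * space map of txgs during which that device may be missing data;
     * an empty DTL means the sibling holds a complete, up-to-date copy.
     */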

    /*
     * If there's only one replica, you can't detach it.
     */
    if (pvd->vdev_children <= 1)
        return (spa_vdev_exit(spa, NULL, txg, EBUSY));

    /*
     * If all siblings have non-empty DTLs, this device may have the only
     * valid copy of the data, which means we cannot safely detach it.
     *
     * XXX -- as in the vdev_offline() case, we really want a more
     * precise DTL check.
     */
    for (c = 0; c < pvd->vdev_children; c++) {
        uint64_t dirty;

        cvd = pvd->vdev_child[c];
        if (cvd == vd)
            continue;
        if (vdev_is_dead(cvd))
            continue;
        mutex_enter(&cvd->vdev_dtl_lock);
        dirty = cvd->vdev_dtl_map.sm_space |
            cvd->vdev_dtl_scrub.sm_space;
        mutex_exit(&cvd->vdev_dtl_lock);
        if (!dirty)
            break;
    }
    if (c == pvd->vdev_children)
        return (spa_vdev_exit(spa, NULL, txg, EBUSY));

    /*
     * Erase the disk labels so the disk can be used for other things.
     * This must be done after all other error cases are handled,
     * but before we disembowel vd (so we can still do I/O to it).
     * But if we can't do it, don't treat the error as fatal --
     * it may be that the unwritability of the disk is the reason
     * it's being detached!
     */
    error = vdev_label_init(vd, 0);
    if (error)
        dprintf("unable to erase labels on %s\n", vdev_description(vd));

    /*
     * Remove vd from its parent and compact the parent's children.
     */
    vdev_remove_child(pvd, vd);
    vdev_compact_children(pvd);

    /*
     * Remember one of the remaining children so we can get tvd below.
     */
    cvd = pvd->vdev_child[0];

    /*
     * If the parent mirror/replacing vdev only has one child,
     * the parent is no longer needed.  Remove it from the tree.
     */
    if (pvd->vdev_children == 1)
        vdev_remove_parent(cvd);

    /*
     * We don't set tvd until now because the parent we just removed
     * may have been the previous top-level vdev.
     */
    tvd = cvd->vdev_top;
    ASSERT(tvd->vdev_parent == rvd);

    /*
     * Reopen this top-level vdev to reassess health after detach.
     */
    vdev_reopen(tvd);

    /*
     * If the device we just detached was smaller than the others,
     * it may be possible to add metaslabs (i.e. grow the pool).  We ignore
     * the error here because the detach still succeeded - we just weren't
     * able to reinitialize the metaslabs.  This pool is in for a world of
     * hurt, in any case.
     */
    (void) vdev_metaslab_init(tvd, txg);

    vdev_config_dirty(tvd);

    /*
     * Mark vd's DTL as dirty in this txg.
     * vdev_dtl_sync() will see that vd->vdev_detached is set
     * and free vd's DTL object in syncing context.
     * But first make sure we're not on any *other* txg's DTL list,
     * to prevent vd from being accessed after it's freed.
     */
    vdev_dirty(tvd, VDD_DTL, txg);
    vd->vdev_detached = B_TRUE;
    for (t = 0; t < TXG_SIZE; t++)
        (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
    (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

    dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

    return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
    vdev_t *newvd, *oldvd;
    int c;

    for (c = 0; c < vd->vdev_children; c++) {
        oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
        if (oldvd != NULL)
            return (oldvd);
    }

    if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
        oldvd = vd->vdev_child[0];
        newvd = vd->vdev_child[1];

        mutex_enter(&newvd->vdev_dtl_lock);
        if (newvd->vdev_dtl_map.sm_space == 0 &&
            newvd->vdev_dtl_scrub.sm_space == 0) {
            mutex_exit(&newvd->vdev_dtl_lock);
            return (oldvd);
        }
        mutex_exit(&newvd->vdev_dtl_lock);
    }

    return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
    vdev_t *vd;
    uint64_t guid;

    spa_config_enter(spa, RW_READER, FTAG);

    while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
        guid = vd->vdev_guid;
        spa_config_exit(spa, FTAG);
        if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
            return;
        spa_config_enter(spa, RW_READER, FTAG);
    }

    spa_config_exit(spa, FTAG);
}
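
/*
 * Note the locking dance in spa_vdev_replace_done() above: the config
 * lock cannot be held across spa_vdev_detach(), which grabs it again
 * through spa_vdev_enter(), so the target is remembered by guid, the
 * lock is dropped for the detach, and the hunt restarts from the root
 * once the lock is reacquired.
 */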
/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}
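/*
 * A minimal sketch of the accounting contract above: spa_scrub_io_start()
 * increments spa_scrub_inflight under spa_scrub_lock, and
 * spa_scrub_io_done() decrements it and broadcasts spa_scrub_io_cv when
 * it reaches zero, so any thread can drain outstanding scrub I/Os with
 * the idiom below (the same pattern appears in spa_scrub_thread() and
 * spa_scrub_suspend()).
 */
#if 0	/* illustrative example only */
	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight != 0)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
#endif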
/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);
	ASSERT(spa->spa_scrub_throttled == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;

		while (spa->spa_scrub_throttled > 0)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}
void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;
	int advance = ADVANCE_PRE | ADVANCE_ZIL;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do.
		 */
		if (type == POOL_SCRUB_RESILVER)
			type = POOL_SCRUB_NONE;
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);

		advance |= ADVANCE_PRUNE;
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    advance, ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}
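/*
 * A worked example of the resilver boundary math in spa_scrub() above,
 * using hypothetical numbers: if the pool-wide DTL's first segment starts
 * at txg 100 and its last segment ends at txg 200, then mintxg becomes 99
 * and maxtxg becomes 200 (capped at spa_last_synced_txg() + 1).  Since
 * (mintxg, maxtxg) is an open interval, the traverse visits txgs 100
 * through 199 and never claims either endpoint.
 */
#if 0	/* illustrative example only */
	ss = avl_first(&rvd->vdev_dtl_map.sm_root);	/* ss_start == 100 */
	mintxg = ss->ss_start - 1;			/* mintxg == 99 */
	ss = avl_last(&rvd->vdev_dtl_map.sm_root);	/* ss_end == 200 */
	maxtxg = MIN(ss->ss_end, spa_last_synced_txg(spa) + 1);
	traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
#endif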
/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL)
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}
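/*
 * A minimal sketch of the task protocol above: producers OR a bit into
 * spa_async_tasks via spa_async_request(); nothing runs until someone
 * calls spa_async_dispatch(), which spa_sync() does at the end of every
 * txg.  The scrub thread queues its follow-up work the same way.
 */
#if 0	/* illustrative example only */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);	/* set the task bit */
	spa_async_dispatch(spa);	/* spawn a worker if none is running */
#endif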
/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	spa_config_set(spa, config);

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}
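/*
 * The nvlist idiom above is a two-step pack: nvlist_size() computes the
 * XDR-encoded length, nvlist_pack() fills an exact-size buffer, and the
 * length is recorded in the object's bonus buffer so a reader knows how
 * many bytes to fetch.  Below is a sketch of the matching read side,
 * assuming this era's dmu_read()/nvlist_unpack() interfaces; it is not
 * part of this file.
 */
#if 0	/* illustrative example only */
	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;	/* length written above */
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	VERIFY(0 == dmu_read(spa->spa_meta_objset, spa->spa_config_object,
	    0, nvsize, packed));
	VERIFY(nvlist_unpack(packed, nvsize, &config, 0) == 0);
	kmem_free(packed, nvsize);
#endif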
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		spa_errlog_sync(spa, txg);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 */
	VERIFY(0 == spa_sync_labels(spa, txg));

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
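/*
 * A usage note: spa_sync() is driven by the pool's sync thread, so other
 * threads do not call it directly.  They dirty state in some txg and then
 * block until that txg has been pushed out; passing 0 waits for the
 * currently open txg, as spa_sync_allpools() below does.  Note also how
 * spa_sync() above brackets the spa_ubsync update with
 * spa_scrub_suspend()/spa_scrub_resume() so the scrub never traverses
 * from a half-updated uberblock.
 */
#if 0	/* illustrative example only */
	txg_wait_synced(spa_get_dsl(spa), 0);	/* wait for the current txg */
#endif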
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
	return (spa_active_count != 0);
}

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}
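/*
 * A usage sketch for the lookup above, with a hypothetical caller: resolve
 * a guid to a vdev while holding the config lock for read, in the spirit
 * of the vdev_lookup_by_guid() call in spa_vdev_setpath().
 */
#if 0	/* illustrative example only */
	spa_config_enter(spa, RW_READER, FTAG);
	if ((vd = spa_lookup_by_guid(spa, guid)) == NULL) {
		spa_config_exit(spa, FTAG);
		return (ENOENT);
	}
	/* ... use vd while the config lock is held ... */
	spa_config_exit(spa, FTAG);
#endif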