/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */
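/*
 * Ordering function for the per-pool error-list AVL trees; entries are
 * compared by a raw bcmp() of their zbookmark_t keys.
 */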
static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all top level vdevs.  We need to grab the
	 * config lock because all label I/O is done with the
	 * ZIO_FLAG_CONFIG_HELD flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_load(rvd);
	spa_config_exit(spa, FTAG);

	if (error)
		goto out;

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache.  For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time as opening the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdev labels indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

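/*
 * Open a pool by name.  This is a thin wrapper around spa_open_common() that
 * does not ask for a copy of the pool config.
 */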
int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

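/*
 * Open the named pool and generate its current configuration for userland,
 * including the persistent error count.  The alternate root, if any, is
 * copied into 'altroot' even when the pool itself cannot be opened.
 */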
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(spa->spa_root_vdev == rvd);

	if (rvd == NULL) {
		error = EINVAL;
	} else {
		if ((error = vdev_create(rvd, txg)) == 0) {
			for (c = 0; c < rvd->vdev_children; c++)
				vdev_init(rvd->vdev_child[c], txg);
			vdev_config_dirty(rvd);
		}
	}

	spa_config_exit(spa, FTAG);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

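/*
 * Probe the pool described by 'tryconfig' without adding it to the namespace:
 * load it under the temporary name above, generate its current configuration,
 * and then unload it again.  Returns NULL if the config cannot be parsed.
 */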
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure there
 * is no more pending I/O and any references to the pool are gone.  Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

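/*
 * Detach any 'replacing' vdevs whose resilvering has completed, dropping the
 * config lock around each spa_vdev_detach() call.
 */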
static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	uint64_t guid;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Update the stored path for this vdev.  Dirty the vdev configuration, relying
 * on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

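/*
 * Adjust the count of throttled scrub I/Os; when it drops back to zero, wake
 * up anyone waiting on spa_scrub_io_cv.
 */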
zio->io_vd : spa->spa_root_vdev; 1507789Sahrens spa->spa_scrub_errors++; 1508789Sahrens mutex_enter(&vd->vdev_stat_lock); 1509789Sahrens vd->vdev_stat.vs_scrub_errors++; 1510789Sahrens mutex_exit(&vd->vdev_stat_lock); 1511789Sahrens } 15121544Seschrock if (--spa->spa_scrub_inflight == 0) { 15131544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 15141544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 15151544Seschrock } 15161544Seschrock mutex_exit(&spa->spa_scrub_lock); 1517789Sahrens } 1518789Sahrens 1519789Sahrens static void 15201544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 15211544Seschrock zbookmark_t *zb) 1522789Sahrens { 1523789Sahrens size_t size = BP_GET_LSIZE(bp); 1524789Sahrens void *data = zio_buf_alloc(size); 1525789Sahrens 1526789Sahrens mutex_enter(&spa->spa_scrub_lock); 1527789Sahrens spa->spa_scrub_inflight++; 1528789Sahrens mutex_exit(&spa->spa_scrub_lock); 1529789Sahrens 15301544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 15311544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 15321544Seschrock 15331544Seschrock flags |= ZIO_FLAG_CANFAIL; 15341544Seschrock 1535789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 15361544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 1537789Sahrens } 1538789Sahrens 1539789Sahrens /* ARGSUSED */ 1540789Sahrens static int 1541789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1542789Sahrens { 1543789Sahrens blkptr_t *bp = &bc->bc_blkptr; 1544*1775Sbillm vdev_t *vd = spa->spa_root_vdev; 1545*1775Sbillm dva_t *dva = bp->blk_dva; 1546*1775Sbillm int needs_resilver = B_FALSE; 1547*1775Sbillm int d; 1548789Sahrens 1549*1775Sbillm if (bc->bc_errno) { 1550789Sahrens /* 1551789Sahrens * We can't scrub this block, but we can continue to scrub 1552789Sahrens * the rest of the pool. Note the error and move along. 1553789Sahrens */ 1554789Sahrens mutex_enter(&spa->spa_scrub_lock); 1555789Sahrens spa->spa_scrub_errors++; 1556789Sahrens mutex_exit(&spa->spa_scrub_lock); 1557789Sahrens 1558*1775Sbillm mutex_enter(&vd->vdev_stat_lock); 1559*1775Sbillm vd->vdev_stat.vs_scrub_errors++; 1560*1775Sbillm mutex_exit(&vd->vdev_stat_lock); 1561789Sahrens 1562789Sahrens return (ERESTART); 1563789Sahrens } 1564789Sahrens 1565789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1566789Sahrens 1567*1775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 1568*1775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 1569*1775Sbillm 1570*1775Sbillm ASSERT(vd != NULL); 1571*1775Sbillm 1572*1775Sbillm /* 1573*1775Sbillm * Keep track of how much data we've examined so that 1574*1775Sbillm * zpool(1M) status can make useful progress reports. 1575*1775Sbillm */ 1576*1775Sbillm mutex_enter(&vd->vdev_stat_lock); 1577*1775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 1578*1775Sbillm mutex_exit(&vd->vdev_stat_lock); 1579789Sahrens 1580*1775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1581*1775Sbillm if (DVA_GET_GANG(&dva[d])) { 1582*1775Sbillm /* 1583*1775Sbillm * Gang members may be spread across multiple 1584*1775Sbillm * vdevs, so the best we can do is look at the 1585*1775Sbillm * pool-wide DTL. 1586*1775Sbillm * XXX -- it would be better to change our 1587*1775Sbillm * allocation policy to ensure that this can't 1588*1775Sbillm * happen. 
1589*1775Sbillm */ 1590*1775Sbillm vd = spa->spa_root_vdev; 1591*1775Sbillm } 1592*1775Sbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 1593*1775Sbillm bp->blk_birth, 1)) 1594*1775Sbillm needs_resilver = B_TRUE; 1595789Sahrens } 1596*1775Sbillm } 1597*1775Sbillm 1598*1775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 1599789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 16001544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1601*1775Sbillm else if (needs_resilver) 1602*1775Sbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1603*1775Sbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1604789Sahrens 1605789Sahrens return (0); 1606789Sahrens } 1607789Sahrens 1608789Sahrens static void 1609789Sahrens spa_scrub_thread(spa_t *spa) 1610789Sahrens { 1611789Sahrens callb_cpr_t cprinfo; 1612789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 1613789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1614789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1615789Sahrens int error = 0; 1616789Sahrens boolean_t complete; 1617789Sahrens 1618789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1619789Sahrens 1620797Sbonwick /* 1621797Sbonwick * If we're restarting due to a snapshot create/delete, 1622797Sbonwick * wait for that to complete. 1623797Sbonwick */ 1624797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 1625797Sbonwick 16261544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 16271544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 16281544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 16291544Seschrock 16301544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 16311544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 1632789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 1633789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 16341544Seschrock spa_config_exit(spa, FTAG); 1635789Sahrens 1636789Sahrens mutex_enter(&spa->spa_scrub_lock); 1637789Sahrens spa->spa_scrub_errors = 0; 1638789Sahrens spa->spa_scrub_active = 1; 16391544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 16401544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 1641789Sahrens 1642789Sahrens while (!spa->spa_scrub_stop) { 1643789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 16441544Seschrock while (spa->spa_scrub_suspended) { 1645789Sahrens spa->spa_scrub_active = 0; 1646789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1647789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1648789Sahrens spa->spa_scrub_active = 1; 1649789Sahrens } 1650789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1651789Sahrens 1652789Sahrens if (spa->spa_scrub_restart_txg != 0) 1653789Sahrens break; 1654789Sahrens 1655789Sahrens mutex_exit(&spa->spa_scrub_lock); 1656789Sahrens error = traverse_more(th); 1657789Sahrens mutex_enter(&spa->spa_scrub_lock); 1658789Sahrens if (error != EAGAIN) 1659789Sahrens break; 16601544Seschrock 16611544Seschrock while (spa->spa_scrub_throttled > 0) 16621544Seschrock cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1663789Sahrens } 1664789Sahrens 1665789Sahrens while (spa->spa_scrub_inflight) 1666789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1667789Sahrens 16681601Sbonwick spa->spa_scrub_active = 0; 16691601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 16701601Sbonwick 16711601Sbonwick mutex_exit(&spa->spa_scrub_lock); 16721601Sbonwick 16731601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 16741601Sbonwick 16751601Sbonwick mutex_enter(&spa->spa_scrub_lock); 16761601Sbonwick 
16771601Sbonwick /* 16781601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 16791601Sbonwick * AND the spa config lock to synchronize with any config changes 16801601Sbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 16811601Sbonwick */ 1682789Sahrens if (spa->spa_scrub_restart_txg != 0) 1683789Sahrens error = ERESTART; 1684789Sahrens 16851544Seschrock if (spa->spa_scrub_stop) 16861544Seschrock error = EINTR; 16871544Seschrock 1688789Sahrens /* 16891544Seschrock * Even if there were uncorrectable errors, we consider the scrub 16901544Seschrock * completed. The downside is that if there is a transient error during 16911544Seschrock * a resilver, we won't resilver the data properly to the target. But 16921544Seschrock * if the damage is permanent (more likely) we will resilver forever, 16931544Seschrock * which isn't really acceptable. Since there is enough information for 16941544Seschrock * the user to know what has failed and why, this seems like a more 16951544Seschrock * tractable approach. 1696789Sahrens */ 16971544Seschrock complete = (error == 0); 1698789Sahrens 16991544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 17001544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1701789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1702789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1703789Sahrens 1704789Sahrens mutex_exit(&spa->spa_scrub_lock); 1705789Sahrens 1706789Sahrens /* 1707789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 1708789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 1709789Sahrens */ 1710789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1711789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1712789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 17131544Seschrock spa_errlog_rotate(spa); 17141601Sbonwick 17151544Seschrock spa_config_exit(spa, FTAG); 1716789Sahrens 1717789Sahrens mutex_enter(&spa->spa_scrub_lock); 1718789Sahrens 17191544Seschrock /* 17201544Seschrock * We may have finished replacing a device. 17211544Seschrock * Let the async thread assess this and handle the detach. 17221544Seschrock */ 17231544Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1724789Sahrens 1725789Sahrens /* 1726789Sahrens * If we were told to restart, our final act is to start a new scrub. 1727789Sahrens */ 1728789Sahrens if (error == ERESTART) 17291544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
17301544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1731789Sahrens 17321544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 17331544Seschrock spa->spa_scrub_active = 0; 17341544Seschrock spa->spa_scrub_thread = NULL; 17351544Seschrock cv_broadcast(&spa->spa_scrub_cv); 1736789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1737789Sahrens thread_exit(); 1738789Sahrens } 1739789Sahrens 1740789Sahrens void 1741789Sahrens spa_scrub_suspend(spa_t *spa) 1742789Sahrens { 1743789Sahrens mutex_enter(&spa->spa_scrub_lock); 17441544Seschrock spa->spa_scrub_suspended++; 1745789Sahrens while (spa->spa_scrub_active) { 1746789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1747789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1748789Sahrens } 1749789Sahrens while (spa->spa_scrub_inflight) 1750789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1751789Sahrens mutex_exit(&spa->spa_scrub_lock); 1752789Sahrens } 1753789Sahrens 1754789Sahrens void 1755789Sahrens spa_scrub_resume(spa_t *spa) 1756789Sahrens { 1757789Sahrens mutex_enter(&spa->spa_scrub_lock); 17581544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 17591544Seschrock if (--spa->spa_scrub_suspended == 0) 1760789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1761789Sahrens mutex_exit(&spa->spa_scrub_lock); 1762789Sahrens } 1763789Sahrens 1764789Sahrens void 1765789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 1766789Sahrens { 1767789Sahrens /* 1768789Sahrens * Something happened (e.g. snapshot create/delete) that means 1769789Sahrens * we must restart any in-progress scrubs. The itinerary will 1770789Sahrens * fix this properly. 1771789Sahrens */ 1772789Sahrens mutex_enter(&spa->spa_scrub_lock); 1773789Sahrens spa->spa_scrub_restart_txg = txg; 1774789Sahrens mutex_exit(&spa->spa_scrub_lock); 1775789Sahrens } 1776789Sahrens 17771544Seschrock int 17781544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1779789Sahrens { 1780789Sahrens space_seg_t *ss; 1781789Sahrens uint64_t mintxg, maxtxg; 1782789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1783789Sahrens 1784789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 1785789Sahrens return (ENOTSUP); 1786789Sahrens 17871544Seschrock mutex_enter(&spa->spa_scrub_lock); 17881544Seschrock 1789789Sahrens /* 1790789Sahrens * If there's a scrub or resilver already in progress, stop it. 1791789Sahrens */ 1792789Sahrens while (spa->spa_scrub_thread != NULL) { 1793789Sahrens /* 1794789Sahrens * Don't stop a resilver unless forced. 1795789Sahrens */ 17961544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 17971544Seschrock mutex_exit(&spa->spa_scrub_lock); 1798789Sahrens return (EBUSY); 17991544Seschrock } 1800789Sahrens spa->spa_scrub_stop = 1; 1801789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1802789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1803789Sahrens } 1804789Sahrens 1805789Sahrens /* 1806789Sahrens * Terminate the previous traverse. 
1807789Sahrens */ 1808789Sahrens if (spa->spa_scrub_th != NULL) { 1809789Sahrens traverse_fini(spa->spa_scrub_th); 1810789Sahrens spa->spa_scrub_th = NULL; 1811789Sahrens } 1812789Sahrens 18131544Seschrock if (rvd == NULL) { 18141544Seschrock ASSERT(spa->spa_scrub_stop == 0); 18151544Seschrock ASSERT(spa->spa_scrub_type == type); 18161544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 18171544Seschrock mutex_exit(&spa->spa_scrub_lock); 18181544Seschrock return (0); 18191544Seschrock } 1820789Sahrens 1821789Sahrens mintxg = TXG_INITIAL - 1; 1822789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 1823789Sahrens 18241544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 1825789Sahrens 18261544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 18271544Seschrock /* 18281544Seschrock * The pool-wide DTL is empty. 18291732Sbonwick * If this is a resilver, there's nothing to do except 18301732Sbonwick * check whether any in-progress replacements have completed. 18311544Seschrock */ 18321732Sbonwick if (type == POOL_SCRUB_RESILVER) { 18331544Seschrock type = POOL_SCRUB_NONE; 18341732Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 18351732Sbonwick } 18361544Seschrock } else { 18371544Seschrock /* 18381544Seschrock * The pool-wide DTL is non-empty. 18391544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 18401544Seschrock */ 18411544Seschrock if (type == POOL_SCRUB_EVERYTHING) 18421544Seschrock type = POOL_SCRUB_RESILVER; 18431544Seschrock } 1844789Sahrens 18451544Seschrock if (type == POOL_SCRUB_RESILVER) { 1846789Sahrens /* 1847789Sahrens * Determine the resilvering boundaries. 1848789Sahrens * 1849789Sahrens * Note: (mintxg, maxtxg) is an open interval, 1850789Sahrens * i.e. mintxg and maxtxg themselves are not included. 1851789Sahrens * 1852789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1853789Sahrens * so we don't claim to resilver a txg that's still changing. 
1854789Sahrens */ 1855789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 18561544Seschrock mintxg = ss->ss_start - 1; 1857789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 18581544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 1859789Sahrens } 1860789Sahrens 18611544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 18621544Seschrock 18631544Seschrock spa->spa_scrub_stop = 0; 18641544Seschrock spa->spa_scrub_type = type; 18651544Seschrock spa->spa_scrub_restart_txg = 0; 18661544Seschrock 18671544Seschrock if (type != POOL_SCRUB_NONE) { 18681544Seschrock spa->spa_scrub_mintxg = mintxg; 1869789Sahrens spa->spa_scrub_maxtxg = maxtxg; 1870789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 18711635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 18721635Sbonwick ZIO_FLAG_CANFAIL); 1873789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1874789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 1875789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1876789Sahrens } 1877789Sahrens 18781544Seschrock mutex_exit(&spa->spa_scrub_lock); 18791544Seschrock 1880789Sahrens return (0); 1881789Sahrens } 1882789Sahrens 18831544Seschrock /* 18841544Seschrock * ========================================================================== 18851544Seschrock * SPA async task processing 18861544Seschrock * ========================================================================== 18871544Seschrock */ 18881544Seschrock 18891544Seschrock static void 18901544Seschrock spa_async_reopen(spa_t *spa) 1891789Sahrens { 18921544Seschrock vdev_t *rvd = spa->spa_root_vdev; 18931544Seschrock vdev_t *tvd; 18941544Seschrock int c; 18951544Seschrock 18961544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 18971544Seschrock 18981544Seschrock for (c = 0; c < rvd->vdev_children; c++) { 18991544Seschrock tvd = rvd->vdev_child[c]; 19001544Seschrock if (tvd->vdev_reopen_wanted) { 19011544Seschrock tvd->vdev_reopen_wanted = 0; 19021544Seschrock vdev_reopen(tvd); 19031544Seschrock } 19041544Seschrock } 1905789Sahrens 19061544Seschrock spa_config_exit(spa, FTAG); 19071544Seschrock } 19081544Seschrock 19091544Seschrock static void 19101544Seschrock spa_async_thread(spa_t *spa) 19111544Seschrock { 19121544Seschrock int tasks; 19131544Seschrock 19141544Seschrock ASSERT(spa->spa_sync_on); 1915789Sahrens 19161544Seschrock mutex_enter(&spa->spa_async_lock); 19171544Seschrock tasks = spa->spa_async_tasks; 19181544Seschrock spa->spa_async_tasks = 0; 19191544Seschrock mutex_exit(&spa->spa_async_lock); 19201544Seschrock 19211544Seschrock /* 19221635Sbonwick * See if the config needs to be updated. 19231635Sbonwick */ 19241635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 19251635Sbonwick mutex_enter(&spa_namespace_lock); 19261635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 19271635Sbonwick mutex_exit(&spa_namespace_lock); 19281635Sbonwick } 19291635Sbonwick 19301635Sbonwick /* 19311544Seschrock * See if any devices need to be reopened. 19321544Seschrock */ 19331544Seschrock if (tasks & SPA_ASYNC_REOPEN) 19341544Seschrock spa_async_reopen(spa); 19351544Seschrock 19361544Seschrock /* 19371544Seschrock * If any devices are done replacing, detach them. 19381544Seschrock */ 19391544Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 1940789Sahrens spa_vdev_replace_done(spa); 1941789Sahrens 19421544Seschrock /* 19431544Seschrock * Kick off a scrub. 
19441544Seschrock */ 19451544Seschrock if (tasks & SPA_ASYNC_SCRUB) 19461544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 19471544Seschrock 19481544Seschrock /* 19491544Seschrock * Kick off a resilver. 19501544Seschrock */ 19511544Seschrock if (tasks & SPA_ASYNC_RESILVER) 19521544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 19531544Seschrock 19541544Seschrock /* 19551544Seschrock * Let the world know that we're done. 19561544Seschrock */ 19571544Seschrock mutex_enter(&spa->spa_async_lock); 19581544Seschrock spa->spa_async_thread = NULL; 19591544Seschrock cv_broadcast(&spa->spa_async_cv); 19601544Seschrock mutex_exit(&spa->spa_async_lock); 19611544Seschrock thread_exit(); 19621544Seschrock } 19631544Seschrock 19641544Seschrock void 19651544Seschrock spa_async_suspend(spa_t *spa) 19661544Seschrock { 19671544Seschrock mutex_enter(&spa->spa_async_lock); 19681544Seschrock spa->spa_async_suspended++; 19691544Seschrock while (spa->spa_async_thread != NULL) 19701544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 19711544Seschrock mutex_exit(&spa->spa_async_lock); 19721544Seschrock } 19731544Seschrock 19741544Seschrock void 19751544Seschrock spa_async_resume(spa_t *spa) 19761544Seschrock { 19771544Seschrock mutex_enter(&spa->spa_async_lock); 19781544Seschrock ASSERT(spa->spa_async_suspended != 0); 19791544Seschrock spa->spa_async_suspended--; 19801544Seschrock mutex_exit(&spa->spa_async_lock); 19811544Seschrock } 19821544Seschrock 19831544Seschrock static void 19841544Seschrock spa_async_dispatch(spa_t *spa) 19851544Seschrock { 19861544Seschrock mutex_enter(&spa->spa_async_lock); 19871544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 19881635Sbonwick spa->spa_async_thread == NULL && 19891635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 19901544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 19911544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 19921544Seschrock mutex_exit(&spa->spa_async_lock); 19931544Seschrock } 19941544Seschrock 19951544Seschrock void 19961544Seschrock spa_async_request(spa_t *spa, int task) 19971544Seschrock { 19981544Seschrock mutex_enter(&spa->spa_async_lock); 19991544Seschrock spa->spa_async_tasks |= task; 20001544Seschrock mutex_exit(&spa->spa_async_lock); 2001789Sahrens } 2002789Sahrens 2003789Sahrens /* 2004789Sahrens * ========================================================================== 2005789Sahrens * SPA syncing routines 2006789Sahrens * ========================================================================== 2007789Sahrens */ 2008789Sahrens 2009789Sahrens static void 2010789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2011789Sahrens { 2012789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2013789Sahrens dmu_tx_t *tx; 2014789Sahrens blkptr_t blk; 2015789Sahrens uint64_t itor = 0; 2016789Sahrens zio_t *zio; 2017789Sahrens int error; 2018789Sahrens uint8_t c = 1; 2019789Sahrens 2020789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2021789Sahrens 2022789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2023789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2024789Sahrens 2025789Sahrens error = zio_wait(zio); 2026789Sahrens ASSERT3U(error, ==, 0); 2027789Sahrens 2028789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2029789Sahrens bplist_vacate(bpl, tx); 2030789Sahrens 2031789Sahrens /* 2032789Sahrens * Pre-dirty the first block so we sync to convergence faster. 
2033789Sahrens * (Usually only the first block is needed.) 2034789Sahrens */ 2035789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2036789Sahrens dmu_tx_commit(tx); 2037789Sahrens } 2038789Sahrens 2039789Sahrens static void 2040789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2041789Sahrens { 2042789Sahrens nvlist_t *config; 2043789Sahrens char *packed = NULL; 2044789Sahrens size_t nvsize = 0; 2045789Sahrens dmu_buf_t *db; 2046789Sahrens 2047789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2048789Sahrens return; 2049789Sahrens 2050789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2051789Sahrens 20521635Sbonwick if (spa->spa_config_syncing) 20531635Sbonwick nvlist_free(spa->spa_config_syncing); 20541635Sbonwick spa->spa_config_syncing = config; 2055789Sahrens 2056789Sahrens VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 2057789Sahrens 2058789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 2059789Sahrens 20601544Seschrock VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 20611544Seschrock KM_SLEEP) == 0); 2062789Sahrens 2063789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 2064789Sahrens packed, tx); 2065789Sahrens 2066789Sahrens kmem_free(packed, nvsize); 2067789Sahrens 20681544Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 20691544Seschrock spa->spa_config_object, FTAG, &db)); 2070789Sahrens dmu_buf_will_dirty(db, tx); 2071789Sahrens *(uint64_t *)db->db_data = nvsize; 20721544Seschrock dmu_buf_rele(db, FTAG); 2073789Sahrens } 2074789Sahrens 2075789Sahrens /* 2076789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2077789Sahrens * part of the process, so we iterate until it converges. 2078789Sahrens */ 2079789Sahrens void 2080789Sahrens spa_sync(spa_t *spa, uint64_t txg) 2081789Sahrens { 2082789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2083789Sahrens objset_t *mos = spa->spa_meta_objset; 2084789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 20851635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 2086789Sahrens vdev_t *vd; 2087789Sahrens dmu_tx_t *tx; 2088789Sahrens int dirty_vdevs; 2089789Sahrens 2090789Sahrens /* 2091789Sahrens * Lock out configuration changes. 2092789Sahrens */ 20931544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2094789Sahrens 2095789Sahrens spa->spa_syncing_txg = txg; 2096789Sahrens spa->spa_sync_pass = 0; 2097789Sahrens 20981544Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2099789Sahrens 2100789Sahrens /* 2101789Sahrens * If anything has changed in this txg, push the deferred frees 2102789Sahrens * from the previous txg. If not, leave them alone so that we 2103789Sahrens * don't generate work on an otherwise idle system. 2104789Sahrens */ 2105789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2106789Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2107789Sahrens spa_sync_deferred_frees(spa, txg); 2108789Sahrens 2109789Sahrens /* 2110789Sahrens * Iterate to convergence. 
2111789Sahrens */
2112789Sahrens do {
2113789Sahrens spa->spa_sync_pass++;
2114789Sahrens
2115789Sahrens tx = dmu_tx_create_assigned(dp, txg);
2116789Sahrens spa_sync_config_object(spa, tx);
2117789Sahrens dmu_tx_commit(tx);
2118789Sahrens
21191544Seschrock spa_errlog_sync(spa, txg);
21201544Seschrock
2121789Sahrens dsl_pool_sync(dp, txg);
2122789Sahrens
2123789Sahrens dirty_vdevs = 0;
2124789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2125789Sahrens vdev_sync(vd, txg);
2126789Sahrens dirty_vdevs++;
2127789Sahrens }
2128789Sahrens
2129789Sahrens tx = dmu_tx_create_assigned(dp, txg);
2130789Sahrens bplist_sync(bpl, tx);
2131789Sahrens dmu_tx_commit(tx);
2132789Sahrens
2133789Sahrens } while (dirty_vdevs);
2134789Sahrens
2135789Sahrens bplist_close(bpl);
2136789Sahrens
2137789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2138789Sahrens
2139789Sahrens /*
2140789Sahrens * Rewrite the vdev configuration (which includes the uberblock)
2141789Sahrens * to commit the transaction group.
21421635Sbonwick *
21431635Sbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs.
21441635Sbonwick * Otherwise, pick a random top-level vdev that's known to be
21451635Sbonwick * visible in the config cache (see spa_vdev_add() for details).
21461635Sbonwick * If the write fails, try the next vdev until we've tried them all.
2147789Sahrens */
21481635Sbonwick if (!list_is_empty(&spa->spa_dirty_list)) {
21491635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0);
21501635Sbonwick } else {
21511635Sbonwick int children = rvd->vdev_children;
21521635Sbonwick int c0 = spa_get_random(children);
21531635Sbonwick int c;
21541635Sbonwick
21551635Sbonwick for (c = 0; c < children; c++) {
21561635Sbonwick vd = rvd->vdev_child[(c0 + c) % children];
21571635Sbonwick if (vd->vdev_ms_array == 0)
21581635Sbonwick continue;
21591635Sbonwick if (vdev_config_sync(vd, txg) == 0)
21601635Sbonwick break;
21611635Sbonwick }
21621635Sbonwick if (c == children)
21631635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0);
21641635Sbonwick }
21651635Sbonwick
21661635Sbonwick /*
21671635Sbonwick * Clear the dirty config list.
21681635Sbonwick */
21691635Sbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
21701635Sbonwick vdev_config_clean(vd);
21711635Sbonwick
21721635Sbonwick /*
21731635Sbonwick * Now that the new config has synced transactionally,
21741635Sbonwick * let it become visible to the config cache.
21751635Sbonwick */
21761635Sbonwick if (spa->spa_config_syncing != NULL) {
21771635Sbonwick spa_config_set(spa, spa->spa_config_syncing);
21781635Sbonwick spa->spa_config_txg = txg;
21791635Sbonwick spa->spa_config_syncing = NULL;
21801635Sbonwick }
2181789Sahrens
2182789Sahrens /*
2183789Sahrens * Make a stable copy of the fully synced uberblock.
2184789Sahrens * We use this as the root for pool traversals.
2185789Sahrens */
2186789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */
2187789Sahrens
2188789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */
2189789Sahrens
2190789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER);
2191789Sahrens spa->spa_traverse_wanted = 0;
2192789Sahrens spa->spa_ubsync = spa->spa_uberblock;
2193789Sahrens rw_exit(&spa->spa_traverse_lock);
2194789Sahrens
2195789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */
2196789Sahrens
2197789Sahrens /*
2198789Sahrens * Clean up the ZIL records for the synced txg.
2199789Sahrens */ 2200789Sahrens dsl_pool_zil_clean(dp); 2201789Sahrens 2202789Sahrens /* 2203789Sahrens * Update usable space statistics. 2204789Sahrens */ 2205789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2206789Sahrens vdev_sync_done(vd, txg); 2207789Sahrens 2208789Sahrens /* 2209789Sahrens * It had better be the case that we didn't dirty anything 2210789Sahrens * since spa_sync_labels(). 2211789Sahrens */ 2212789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2213789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2214789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2215789Sahrens ASSERT(bpl->bpl_queue == NULL); 2216789Sahrens 22171544Seschrock spa_config_exit(spa, FTAG); 22181544Seschrock 22191544Seschrock /* 22201544Seschrock * If any async tasks have been requested, kick them off. 22211544Seschrock */ 22221544Seschrock spa_async_dispatch(spa); 2223789Sahrens } 2224789Sahrens 2225789Sahrens /* 2226789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 2227789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 2228789Sahrens * sync. 2229789Sahrens */ 2230789Sahrens void 2231789Sahrens spa_sync_allpools(void) 2232789Sahrens { 2233789Sahrens spa_t *spa = NULL; 2234789Sahrens mutex_enter(&spa_namespace_lock); 2235789Sahrens while ((spa = spa_next(spa)) != NULL) { 2236789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 2237789Sahrens continue; 2238789Sahrens spa_open_ref(spa, FTAG); 2239789Sahrens mutex_exit(&spa_namespace_lock); 2240789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 2241789Sahrens mutex_enter(&spa_namespace_lock); 2242789Sahrens spa_close(spa, FTAG); 2243789Sahrens } 2244789Sahrens mutex_exit(&spa_namespace_lock); 2245789Sahrens } 2246789Sahrens 2247789Sahrens /* 2248789Sahrens * ========================================================================== 2249789Sahrens * Miscellaneous routines 2250789Sahrens * ========================================================================== 2251789Sahrens */ 2252789Sahrens 2253789Sahrens /* 2254789Sahrens * Remove all pools in the system. 2255789Sahrens */ 2256789Sahrens void 2257789Sahrens spa_evict_all(void) 2258789Sahrens { 2259789Sahrens spa_t *spa; 2260789Sahrens 2261789Sahrens /* 2262789Sahrens * Remove all cached state. All pools should be closed now, 2263789Sahrens * so every spa in the AVL tree should be unreferenced. 2264789Sahrens */ 2265789Sahrens mutex_enter(&spa_namespace_lock); 2266789Sahrens while ((spa = spa_next(NULL)) != NULL) { 2267789Sahrens /* 22681544Seschrock * Stop async tasks. The async thread may need to detach 22691544Seschrock * a device that's been replaced, which requires grabbing 22701544Seschrock * spa_namespace_lock, so we must drop it here. 
2271789Sahrens */ 2272789Sahrens spa_open_ref(spa, FTAG); 2273789Sahrens mutex_exit(&spa_namespace_lock); 22741544Seschrock spa_async_suspend(spa); 2275789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2276789Sahrens mutex_enter(&spa_namespace_lock); 2277789Sahrens spa_close(spa, FTAG); 2278789Sahrens 2279789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2280789Sahrens spa_unload(spa); 2281789Sahrens spa_deactivate(spa); 2282789Sahrens } 2283789Sahrens spa_remove(spa); 2284789Sahrens } 2285789Sahrens mutex_exit(&spa_namespace_lock); 2286789Sahrens } 22871544Seschrock 22881544Seschrock vdev_t * 22891544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 22901544Seschrock { 22911544Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 22921544Seschrock } 22931760Seschrock 22941760Seschrock void 22951760Seschrock spa_upgrade(spa_t *spa) 22961760Seschrock { 22971760Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 22981760Seschrock 22991760Seschrock /* 23001760Seschrock * This should only be called for a non-faulted pool, and since a 23011760Seschrock * future version would result in an unopenable pool, this shouldn't be 23021760Seschrock * possible. 23031760Seschrock */ 23041760Seschrock ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 23051760Seschrock 23061760Seschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 23071760Seschrock vdev_config_dirty(spa->spa_root_vdev); 23081760Seschrock 23091760Seschrock spa_config_exit(spa, FTAG); 23101760Seschrock } 2311
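/*
 * A minimal, user-space sketch (not part of this file; all names and data
 * below are hypothetical) of how spa_scrub() above derives the resilver
 * window from the pool-wide DTL: (mintxg, maxtxg) is an open interval taken
 * from the first and last dirty segments and clamped to the last synced
 * txg + 1 so we never claim to resilver a txg that is still changing.
 */
#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

/* Hypothetical stand-in for a DTL space-map segment: txg range [start, end). */
typedef struct dtl_seg {
	uint64_t ss_start;
	uint64_t ss_end;
} dtl_seg_t;

int
main(void)
{
	/* Assumed example data: two dirty txg ranges and the last synced txg. */
	dtl_seg_t dtl[] = { { 105, 120 }, { 140, 162 } };
	int nsegs = 2;
	uint64_t last_synced_txg = 150;
	uint64_t mintxg, maxtxg;

	if (nsegs == 0) {
		/* Empty DTL: nothing to resilver. */
		(void) printf("DTL empty; no resilver needed\n");
		return (0);
	}

	/*
	 * Open interval: back off one txg below the first dirty segment,
	 * and clamp the upper bound to last synced txg + 1.
	 */
	mintxg = dtl[0].ss_start - 1;
	maxtxg = MIN(dtl[nsegs - 1].ss_end, last_synced_txg + 1);

	(void) printf("resilver open interval: (%llu, %llu)\n",
	    (unsigned long long)mintxg, (unsigned long long)maxtxg);

	return (0);
}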
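/*
 * A second minimal, user-space sketch (hypothetical names and data, not a
 * kernel interface) of the fallback spa_sync() above uses when no vdevs are
 * dirty: start at a random top-level child and probe the children
 * round-robin, syncing the whole tree only if every candidate is skipped
 * or fails.
 */
#include <stdio.h>
#include <stdlib.h>

#define	NCHILDREN	4

/* Hypothetical per-child config sync that can fail; returns 0 on success. */
static int
try_sync_child(int c)
{
	return (c <= 1 ? -1 : 0);	/* assume children 0 and 1 fail */
}

int
main(void)
{
	int children = NCHILDREN;
	int c0 = rand() % children;	/* analogous to spa_get_random() */
	int c;

	for (c = 0; c < children; c++) {
		int child = (c0 + c) % children;
		if (try_sync_child(child) == 0) {
			(void) printf("synced config via child %d\n", child);
			break;
		}
	}
	if (c == children)
		(void) printf("all children failed; syncing the whole tree\n");

	return (0);
}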