1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 221354Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens /* 29789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 30789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 31789Sahrens * pool. 32789Sahrens */ 33789Sahrens 34789Sahrens #include <sys/zfs_context.h> 351544Seschrock #include <sys/fm/fs/zfs.h> 36789Sahrens #include <sys/spa_impl.h> 37789Sahrens #include <sys/zio.h> 38789Sahrens #include <sys/zio_checksum.h> 39789Sahrens #include <sys/zio_compress.h> 40789Sahrens #include <sys/dmu.h> 41789Sahrens #include <sys/dmu_tx.h> 42789Sahrens #include <sys/zap.h> 43789Sahrens #include <sys/zil.h> 44789Sahrens #include <sys/vdev_impl.h> 45789Sahrens #include <sys/metaslab.h> 46789Sahrens #include <sys/uberblock_impl.h> 47789Sahrens #include <sys/txg.h> 48789Sahrens #include <sys/avl.h> 49789Sahrens #include <sys/dmu_traverse.h> 50789Sahrens #include <sys/unique.h> 51789Sahrens #include <sys/dsl_pool.h> 52789Sahrens #include <sys/dsl_dir.h> 53789Sahrens #include <sys/dsl_prop.h> 54789Sahrens #include <sys/fs/zfs.h> 55789Sahrens #include <sys/callb.h> 56789Sahrens 57789Sahrens /* 58789Sahrens * ========================================================================== 59789Sahrens * SPA state manipulation (open/create/destroy/import/export) 60789Sahrens * ========================================================================== 61789Sahrens */ 62789Sahrens 631544Seschrock static int 641544Seschrock spa_error_entry_compare(const void *a, const void *b) 651544Seschrock { 661544Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 671544Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 681544Seschrock int ret; 691544Seschrock 701544Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 711544Seschrock sizeof (zbookmark_t)); 721544Seschrock 731544Seschrock if (ret < 0) 741544Seschrock return (-1); 751544Seschrock else if (ret > 0) 761544Seschrock return (1); 771544Seschrock else 781544Seschrock return (0); 791544Seschrock } 801544Seschrock 811544Seschrock /* 821544Seschrock * Utility function which retrieves copies of the current logs and 831544Seschrock * re-initializes them in the process. 841544Seschrock */ 851544Seschrock void 861544Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 871544Seschrock { 881544Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 891544Seschrock 901544Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 911544Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 921544Seschrock 931544Seschrock avl_create(&spa->spa_errlist_scrub, 941544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 951544Seschrock offsetof(spa_error_entry_t, se_avl)); 961544Seschrock avl_create(&spa->spa_errlist_last, 971544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 981544Seschrock offsetof(spa_error_entry_t, se_avl)); 991544Seschrock } 1001544Seschrock 101789Sahrens /* 102789Sahrens * Activate an uninitialized pool. 103789Sahrens */ 104789Sahrens static void 105789Sahrens spa_activate(spa_t *spa) 106789Sahrens { 107789Sahrens int t; 108789Sahrens 109789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 110789Sahrens 111789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 112789Sahrens 113789Sahrens spa->spa_normal_class = metaslab_class_create(); 114789Sahrens 115789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 116789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 117789Sahrens 8, maxclsyspri, 50, INT_MAX, 118789Sahrens TASKQ_PREPOPULATE); 119789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 120789Sahrens 8, maxclsyspri, 50, INT_MAX, 121789Sahrens TASKQ_PREPOPULATE); 122789Sahrens } 123789Sahrens 124789Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 125789Sahrens 126789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 127789Sahrens offsetof(vdev_t, vdev_dirty_node)); 128789Sahrens 129789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 130789Sahrens offsetof(struct vdev, vdev_txg_node)); 1311544Seschrock 1321544Seschrock avl_create(&spa->spa_errlist_scrub, 1331544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1341544Seschrock offsetof(spa_error_entry_t, se_avl)); 1351544Seschrock avl_create(&spa->spa_errlist_last, 1361544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1371544Seschrock offsetof(spa_error_entry_t, se_avl)); 138789Sahrens } 139789Sahrens 140789Sahrens /* 141789Sahrens * Opposite of spa_activate(). 142789Sahrens */ 143789Sahrens static void 144789Sahrens spa_deactivate(spa_t *spa) 145789Sahrens { 146789Sahrens int t; 147789Sahrens 148789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 149789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 150789Sahrens ASSERT(spa->spa_root_vdev == NULL); 151789Sahrens 152789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 153789Sahrens 154789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 155789Sahrens 156789Sahrens list_destroy(&spa->spa_dirty_list); 157789Sahrens 158789Sahrens rw_destroy(&spa->spa_traverse_lock); 159789Sahrens 160789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 161789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 162789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 163789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 164789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 165789Sahrens } 166789Sahrens 167789Sahrens metaslab_class_destroy(spa->spa_normal_class); 168789Sahrens spa->spa_normal_class = NULL; 169789Sahrens 1701544Seschrock /* 1711544Seschrock * If this was part of an import or the open otherwise failed, we may 1721544Seschrock * still have errors left in the queues. Empty them just in case. 1731544Seschrock */ 1741544Seschrock spa_errlog_drain(spa); 1751544Seschrock 1761544Seschrock avl_destroy(&spa->spa_errlist_scrub); 1771544Seschrock avl_destroy(&spa->spa_errlist_last); 1781544Seschrock 179789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 180789Sahrens } 181789Sahrens 182789Sahrens /* 183789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 184789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 185789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 186789Sahrens * All vdev validation is done by the vdev_alloc() routine. 187789Sahrens */ 188789Sahrens static vdev_t * 189789Sahrens spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 190789Sahrens { 191789Sahrens nvlist_t **child; 192789Sahrens uint_t c, children; 193789Sahrens vdev_t *vd; 194789Sahrens 195789Sahrens if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 196789Sahrens return (NULL); 197789Sahrens 198789Sahrens if (vd->vdev_ops->vdev_op_leaf) 199789Sahrens return (vd); 200789Sahrens 201789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 202789Sahrens &child, &children) != 0) { 203789Sahrens vdev_free(vd); 204789Sahrens return (NULL); 205789Sahrens } 206789Sahrens 207789Sahrens for (c = 0; c < children; c++) { 208789Sahrens if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 209789Sahrens vdev_free(vd); 210789Sahrens return (NULL); 211789Sahrens } 212789Sahrens } 213789Sahrens 214789Sahrens return (vd); 215789Sahrens } 216789Sahrens 217789Sahrens /* 218789Sahrens * Opposite of spa_load(). 219789Sahrens */ 220789Sahrens static void 221789Sahrens spa_unload(spa_t *spa) 222789Sahrens { 223789Sahrens /* 2241544Seschrock * Stop async tasks. 2251544Seschrock */ 2261544Seschrock spa_async_suspend(spa); 2271544Seschrock 2281544Seschrock /* 229789Sahrens * Stop syncing. 230789Sahrens */ 231789Sahrens if (spa->spa_sync_on) { 232789Sahrens txg_sync_stop(spa->spa_dsl_pool); 233789Sahrens spa->spa_sync_on = B_FALSE; 234789Sahrens } 235789Sahrens 236789Sahrens /* 237789Sahrens * Wait for any outstanding prefetch I/O to complete. 238789Sahrens */ 2391544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2401544Seschrock spa_config_exit(spa, FTAG); 241789Sahrens 242789Sahrens /* 243789Sahrens * Close the dsl pool. 244789Sahrens */ 245789Sahrens if (spa->spa_dsl_pool) { 246789Sahrens dsl_pool_close(spa->spa_dsl_pool); 247789Sahrens spa->spa_dsl_pool = NULL; 248789Sahrens } 249789Sahrens 250789Sahrens /* 251789Sahrens * Close all vdevs. 252789Sahrens */ 2531585Sbonwick if (spa->spa_root_vdev) 254789Sahrens vdev_free(spa->spa_root_vdev); 2551585Sbonwick ASSERT(spa->spa_root_vdev == NULL); 2561544Seschrock 2571544Seschrock spa->spa_async_suspended = 0; 258789Sahrens } 259789Sahrens 260789Sahrens /* 261789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 2621544Seschrock * source of configuration information. 263789Sahrens */ 264789Sahrens static int 2651544Seschrock spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 266789Sahrens { 267789Sahrens int error = 0; 268789Sahrens nvlist_t *nvroot = NULL; 269789Sahrens vdev_t *rvd; 270789Sahrens uberblock_t *ub = &spa->spa_uberblock; 271*1635Sbonwick uint64_t config_cache_txg = spa->spa_config_txg; 272789Sahrens uint64_t pool_guid; 273789Sahrens zio_t *zio; 274789Sahrens 2751544Seschrock spa->spa_load_state = state; 276*1635Sbonwick 277789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 278*1635Sbonwick nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || 279*1635Sbonwick (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 280*1635Sbonwick &spa->spa_config_txg) && mosconfig)) { 2811544Seschrock error = EINVAL; 2821544Seschrock goto out; 2831544Seschrock } 284789Sahrens 285*1635Sbonwick if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2861544Seschrock spa_guid_exists(pool_guid, 0)) { 2871544Seschrock error = EEXIST; 2881544Seschrock goto out; 2891544Seschrock } 290789Sahrens 291789Sahrens /* 292789Sahrens * Parse the configuration into a vdev tree. 293789Sahrens */ 2941544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 295789Sahrens rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 2961544Seschrock spa_config_exit(spa, FTAG); 297789Sahrens 2981544Seschrock if (rvd == NULL) { 2991544Seschrock error = EINVAL; 3001544Seschrock goto out; 3011544Seschrock } 302789Sahrens 3031585Sbonwick ASSERT(spa->spa_root_vdev == rvd); 304789Sahrens ASSERT(spa_guid(spa) == pool_guid); 305789Sahrens 306789Sahrens /* 307789Sahrens * Try to open all vdevs, loading each label in the process. 308789Sahrens */ 3091544Seschrock if (vdev_open(rvd) != 0) { 3101544Seschrock error = ENXIO; 3111544Seschrock goto out; 3121544Seschrock } 313789Sahrens 314789Sahrens /* 315789Sahrens * Find the best uberblock. 316789Sahrens */ 317789Sahrens bzero(ub, sizeof (uberblock_t)); 318789Sahrens 319789Sahrens zio = zio_root(spa, NULL, NULL, 320789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 321789Sahrens vdev_uberblock_load(zio, rvd, ub); 322789Sahrens error = zio_wait(zio); 323789Sahrens 324789Sahrens /* 325789Sahrens * If we weren't able to find a single valid uberblock, return failure. 326789Sahrens */ 327789Sahrens if (ub->ub_txg == 0) { 3281544Seschrock error = ENXIO; 3291544Seschrock goto out; 3301544Seschrock } 3311544Seschrock 3321544Seschrock /* 3331544Seschrock * If the pool is newer than the code, we can't open it. 3341544Seschrock */ 3351544Seschrock if (ub->ub_version > UBERBLOCK_VERSION) { 3361544Seschrock error = ENOTSUP; 3371544Seschrock goto out; 338789Sahrens } 339789Sahrens 340789Sahrens /* 341789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 342789Sahrens * incomplete configuration. 343789Sahrens */ 344*1635Sbonwick if (rvd->vdev_guid_sum != ub->ub_guid_sum && (mosconfig || 345*1635Sbonwick state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT)) { 3461544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 3471544Seschrock VDEV_AUX_BAD_GUID_SUM); 3481544Seschrock error = ENXIO; 3491544Seschrock goto out; 350789Sahrens } 351789Sahrens 352789Sahrens /* 353789Sahrens * Initialize internal SPA structures. 354789Sahrens */ 355789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 356789Sahrens spa->spa_ubsync = spa->spa_uberblock; 357789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 3581544Seschrock error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3591544Seschrock if (error) { 3601544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 3611544Seschrock VDEV_AUX_CORRUPT_DATA); 3621544Seschrock goto out; 3631544Seschrock } 364789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 365789Sahrens 3661544Seschrock if (zap_lookup(spa->spa_meta_objset, 367789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3681544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 3691544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 3701544Seschrock VDEV_AUX_CORRUPT_DATA); 3711544Seschrock error = EIO; 3721544Seschrock goto out; 3731544Seschrock } 374789Sahrens 375789Sahrens if (!mosconfig) { 376789Sahrens dmu_buf_t *db; 377789Sahrens char *packed = NULL; 378789Sahrens size_t nvsize = 0; 379789Sahrens nvlist_t *newconfig = NULL; 380789Sahrens 3811544Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 3821544Seschrock spa->spa_config_object, FTAG, &db)); 383789Sahrens nvsize = *(uint64_t *)db->db_data; 3841544Seschrock dmu_buf_rele(db, FTAG); 385789Sahrens 386789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 3871544Seschrock error = dmu_read(spa->spa_meta_objset, 388789Sahrens spa->spa_config_object, 0, nvsize, packed); 389789Sahrens if (error == 0) 390789Sahrens error = nvlist_unpack(packed, nvsize, &newconfig, 0); 391789Sahrens kmem_free(packed, nvsize); 392789Sahrens 3931544Seschrock if (error) { 3941544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 3951544Seschrock VDEV_AUX_CORRUPT_DATA); 3961544Seschrock error = EIO; 3971544Seschrock goto out; 3981544Seschrock } 399789Sahrens 400789Sahrens spa_config_set(spa, newconfig); 401789Sahrens 402789Sahrens spa_unload(spa); 403789Sahrens spa_deactivate(spa); 404789Sahrens spa_activate(spa); 405789Sahrens 4061544Seschrock return (spa_load(spa, newconfig, state, B_TRUE)); 4071544Seschrock } 4081544Seschrock 4091544Seschrock if (zap_lookup(spa->spa_meta_objset, 4101544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 4111544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 4121544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 4131544Seschrock VDEV_AUX_CORRUPT_DATA); 4141544Seschrock error = EIO; 4151544Seschrock goto out; 416789Sahrens } 417789Sahrens 4181544Seschrock /* 4191544Seschrock * Load the persistent error log. If we have an older pool, this will 4201544Seschrock * not be present. 4211544Seschrock */ 4221544Seschrock error = zap_lookup(spa->spa_meta_objset, 4231544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 4241544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_last); 4251544Seschrock if (error != 0 &&error != ENOENT) { 4261544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 4271544Seschrock VDEV_AUX_CORRUPT_DATA); 4281544Seschrock error = EIO; 4291544Seschrock goto out; 4301544Seschrock } 4311544Seschrock 4321544Seschrock error = zap_lookup(spa->spa_meta_objset, 4331544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 4341544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 4351544Seschrock if (error != 0 && error != ENOENT) { 4361544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 4371544Seschrock VDEV_AUX_CORRUPT_DATA); 4381544Seschrock error = EIO; 4391544Seschrock goto out; 4401544Seschrock } 441789Sahrens 442789Sahrens /* 4431544Seschrock * Load the vdev state for all top level vdevs. We need to grab the 4441544Seschrock * config lock because all label I/O is done with the 4451544Seschrock * ZIO_FLAG_CONFIG_HELD flag. 446789Sahrens */ 4471544Seschrock spa_config_enter(spa, RW_READER, FTAG); 448*1635Sbonwick error = vdev_load(rvd); 449*1635Sbonwick spa_config_exit(spa, FTAG); 450*1635Sbonwick 451*1635Sbonwick if (error) 4521544Seschrock goto out; 453789Sahrens 454789Sahrens /* 455789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 456789Sahrens */ 4571544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 458789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 4591544Seschrock spa_config_exit(spa, FTAG); 460789Sahrens 461789Sahrens /* 462789Sahrens * Check the state of the root vdev. If it can't be opened, it 463789Sahrens * indicates one or more toplevel vdevs are faulted. 464789Sahrens */ 4651544Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 4661544Seschrock error = ENXIO; 4671544Seschrock goto out; 4681544Seschrock } 469789Sahrens 4701544Seschrock if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 471*1635Sbonwick dmu_tx_t *tx; 472*1635Sbonwick int need_update = B_FALSE; 4731585Sbonwick int c; 4741601Sbonwick 475*1635Sbonwick /* 476*1635Sbonwick * Claim log blocks that haven't been committed yet. 477*1635Sbonwick * This must all happen in a single txg. 478*1635Sbonwick */ 4791601Sbonwick tx = dmu_tx_create_assigned(spa_get_dsl(spa), 480789Sahrens spa_first_txg(spa)); 481789Sahrens dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 482789Sahrens dmu_tx_commit(tx); 483789Sahrens 484789Sahrens spa->spa_sync_on = B_TRUE; 485789Sahrens txg_sync_start(spa->spa_dsl_pool); 486789Sahrens 487789Sahrens /* 488789Sahrens * Wait for all claims to sync. 489789Sahrens */ 490789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 4911585Sbonwick 4921585Sbonwick /* 493*1635Sbonwick * If the config cache is stale, or we have uninitialized 494*1635Sbonwick * metaslabs (see spa_vdev_add()), then update the config. 4951585Sbonwick */ 496*1635Sbonwick if (config_cache_txg != spa->spa_config_txg || 497*1635Sbonwick state == SPA_LOAD_IMPORT) 498*1635Sbonwick need_update = B_TRUE; 499*1635Sbonwick 500*1635Sbonwick for (c = 0; c < rvd->vdev_children; c++) 501*1635Sbonwick if (rvd->vdev_child[c]->vdev_ms_array == 0) 502*1635Sbonwick need_update = B_TRUE; 5031585Sbonwick 5041585Sbonwick /* 505*1635Sbonwick * Update the config cache asychronously in case we're the 506*1635Sbonwick * root pool, in which case the config cache isn't writable yet. 5071585Sbonwick */ 508*1635Sbonwick if (need_update) 509*1635Sbonwick spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 510789Sahrens } 511789Sahrens 5121544Seschrock error = 0; 5131544Seschrock out: 5141544Seschrock if (error) 5151544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 5161544Seschrock spa->spa_load_state = SPA_LOAD_NONE; 5171544Seschrock spa->spa_ena = 0; 5181544Seschrock 5191544Seschrock return (error); 520789Sahrens } 521789Sahrens 522789Sahrens /* 523789Sahrens * Pool Open/Import 524789Sahrens * 525789Sahrens * The import case is identical to an open except that the configuration is sent 526789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 527789Sahrens * case of an open, the pool configuration will exist in the 528789Sahrens * POOL_STATE_UNITIALIZED state. 529789Sahrens * 530789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 531789Sahrens * the same time open the pool, without having to keep around the spa_t in some 532789Sahrens * ambiguous state. 533789Sahrens */ 534789Sahrens static int 535789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 536789Sahrens { 537789Sahrens spa_t *spa; 538789Sahrens int error; 539789Sahrens int loaded = B_FALSE; 540789Sahrens int locked = B_FALSE; 541789Sahrens 542789Sahrens *spapp = NULL; 543789Sahrens 544789Sahrens /* 545789Sahrens * As disgusting as this is, we need to support recursive calls to this 546789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 547789Sahrens * up calling spa_open() again. The real fix is to figure out how to 548789Sahrens * avoid dsl_dir_open() calling this in the first place. 549789Sahrens */ 550789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 551789Sahrens mutex_enter(&spa_namespace_lock); 552789Sahrens locked = B_TRUE; 553789Sahrens } 554789Sahrens 555789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 556789Sahrens if (locked) 557789Sahrens mutex_exit(&spa_namespace_lock); 558789Sahrens return (ENOENT); 559789Sahrens } 560789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 561789Sahrens 562789Sahrens spa_activate(spa); 563789Sahrens 564*1635Sbonwick error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 565789Sahrens 566789Sahrens if (error == EBADF) { 567789Sahrens /* 568789Sahrens * If vdev_load() returns EBADF, it indicates that one 569789Sahrens * of the vdevs indicates that the pool has been 570789Sahrens * exported or destroyed. If this is the case, the 571789Sahrens * config cache is out of sync and we should remove the 572789Sahrens * pool from the namespace. 573789Sahrens */ 574789Sahrens spa_unload(spa); 575789Sahrens spa_deactivate(spa); 576789Sahrens spa_remove(spa); 577789Sahrens spa_config_sync(); 578789Sahrens if (locked) 579789Sahrens mutex_exit(&spa_namespace_lock); 580789Sahrens return (ENOENT); 5811544Seschrock } 5821544Seschrock 5831544Seschrock if (error) { 584789Sahrens /* 585789Sahrens * We can't open the pool, but we still have useful 586789Sahrens * information: the state of each vdev after the 587789Sahrens * attempted vdev_open(). Return this to the user. 588789Sahrens */ 589*1635Sbonwick if (config != NULL && spa->spa_root_vdev != NULL) { 590*1635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 591789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 592789Sahrens B_TRUE); 593*1635Sbonwick spa_config_exit(spa, FTAG); 594*1635Sbonwick } 595789Sahrens spa_unload(spa); 596789Sahrens spa_deactivate(spa); 5971544Seschrock spa->spa_last_open_failed = B_TRUE; 598789Sahrens if (locked) 599789Sahrens mutex_exit(&spa_namespace_lock); 600789Sahrens *spapp = NULL; 601789Sahrens return (error); 6021544Seschrock } else { 6031544Seschrock zfs_post_ok(spa, NULL); 6041544Seschrock spa->spa_last_open_failed = B_FALSE; 605789Sahrens } 606789Sahrens 607789Sahrens loaded = B_TRUE; 608789Sahrens } 609789Sahrens 610789Sahrens spa_open_ref(spa, tag); 611789Sahrens if (locked) 612789Sahrens mutex_exit(&spa_namespace_lock); 613789Sahrens 614789Sahrens *spapp = spa; 615789Sahrens 616789Sahrens if (config != NULL) { 6171544Seschrock spa_config_enter(spa, RW_READER, FTAG); 618789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6191544Seschrock spa_config_exit(spa, FTAG); 620789Sahrens } 621789Sahrens 622789Sahrens /* 623789Sahrens * If we just loaded the pool, resilver anything that's out of date. 624789Sahrens */ 625789Sahrens if (loaded && (spa_mode & FWRITE)) 626789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 627789Sahrens 628789Sahrens return (0); 629789Sahrens } 630789Sahrens 631789Sahrens int 632789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 633789Sahrens { 634789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 635789Sahrens } 636789Sahrens 6371544Seschrock /* 6381544Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 6391544Seschrock * preventing it from being exported or destroyed. 6401544Seschrock */ 6411544Seschrock spa_t * 6421544Seschrock spa_inject_addref(char *name) 6431544Seschrock { 6441544Seschrock spa_t *spa; 6451544Seschrock 6461544Seschrock mutex_enter(&spa_namespace_lock); 6471544Seschrock if ((spa = spa_lookup(name)) == NULL) { 6481544Seschrock mutex_exit(&spa_namespace_lock); 6491544Seschrock return (NULL); 6501544Seschrock } 6511544Seschrock spa->spa_inject_ref++; 6521544Seschrock mutex_exit(&spa_namespace_lock); 6531544Seschrock 6541544Seschrock return (spa); 6551544Seschrock } 6561544Seschrock 6571544Seschrock void 6581544Seschrock spa_inject_delref(spa_t *spa) 6591544Seschrock { 6601544Seschrock mutex_enter(&spa_namespace_lock); 6611544Seschrock spa->spa_inject_ref--; 6621544Seschrock mutex_exit(&spa_namespace_lock); 6631544Seschrock } 6641544Seschrock 665789Sahrens int 6661544Seschrock spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 667789Sahrens { 668789Sahrens int error; 669789Sahrens spa_t *spa; 670789Sahrens 671789Sahrens *config = NULL; 672789Sahrens error = spa_open_common(name, &spa, FTAG, config); 673789Sahrens 6741544Seschrock if (spa && *config != NULL) 6751544Seschrock VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 6761544Seschrock spa_get_errlog_size(spa)) == 0); 6771544Seschrock 6781544Seschrock /* 6791544Seschrock * We want to get the alternate root even for faulted pools, so we cheat 6801544Seschrock * and call spa_lookup() directly. 6811544Seschrock */ 6821544Seschrock if (altroot) { 6831544Seschrock if (spa == NULL) { 6841544Seschrock mutex_enter(&spa_namespace_lock); 6851544Seschrock spa = spa_lookup(name); 6861544Seschrock if (spa) 6871544Seschrock spa_altroot(spa, altroot, buflen); 6881544Seschrock else 6891544Seschrock altroot[0] = '\0'; 6901544Seschrock spa = NULL; 6911544Seschrock mutex_exit(&spa_namespace_lock); 6921544Seschrock } else { 6931544Seschrock spa_altroot(spa, altroot, buflen); 6941544Seschrock } 6951544Seschrock } 6961544Seschrock 697789Sahrens if (spa != NULL) 698789Sahrens spa_close(spa, FTAG); 699789Sahrens 700789Sahrens return (error); 701789Sahrens } 702789Sahrens 703789Sahrens /* 704789Sahrens * Pool Creation 705789Sahrens */ 706789Sahrens int 707*1635Sbonwick spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 708789Sahrens { 709789Sahrens spa_t *spa; 710*1635Sbonwick vdev_t *rvd; 711789Sahrens dsl_pool_t *dp; 712789Sahrens dmu_tx_t *tx; 713*1635Sbonwick int c, error; 714789Sahrens uint64_t txg = TXG_INITIAL; 715789Sahrens 716789Sahrens /* 717789Sahrens * If this pool already exists, return failure. 718789Sahrens */ 719789Sahrens mutex_enter(&spa_namespace_lock); 720789Sahrens if (spa_lookup(pool) != NULL) { 721789Sahrens mutex_exit(&spa_namespace_lock); 722789Sahrens return (EEXIST); 723789Sahrens } 724789Sahrens 725789Sahrens /* 726789Sahrens * Allocate a new spa_t structure. 727789Sahrens */ 728*1635Sbonwick spa = spa_add(pool, altroot); 729789Sahrens spa_activate(spa); 730789Sahrens 731789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 732789Sahrens spa->spa_ubsync = spa->spa_uberblock; 733789Sahrens 734*1635Sbonwick /* 735*1635Sbonwick * Create the root vdev. 736*1635Sbonwick */ 737*1635Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 738*1635Sbonwick 739*1635Sbonwick rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 740*1635Sbonwick 741*1635Sbonwick ASSERT(spa->spa_root_vdev == rvd); 742*1635Sbonwick 743*1635Sbonwick if (rvd == NULL) { 744*1635Sbonwick error = EINVAL; 745*1635Sbonwick } else { 746*1635Sbonwick if ((error = vdev_create(rvd, txg)) == 0) { 747*1635Sbonwick for (c = 0; c < rvd->vdev_children; c++) 748*1635Sbonwick vdev_init(rvd->vdev_child[c], txg); 749*1635Sbonwick vdev_config_dirty(rvd); 750*1635Sbonwick } 751*1635Sbonwick } 752*1635Sbonwick 753*1635Sbonwick spa_config_exit(spa, FTAG); 754789Sahrens 755789Sahrens if (error) { 756789Sahrens spa_unload(spa); 757789Sahrens spa_deactivate(spa); 758789Sahrens spa_remove(spa); 759789Sahrens mutex_exit(&spa_namespace_lock); 760789Sahrens return (error); 761789Sahrens } 762789Sahrens 763789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 764789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 765789Sahrens 766789Sahrens tx = dmu_tx_create_assigned(dp, txg); 767789Sahrens 768789Sahrens /* 769789Sahrens * Create the pool config object. 770789Sahrens */ 771789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 772789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 773789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 774789Sahrens 7751544Seschrock if (zap_add(spa->spa_meta_objset, 776789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 7771544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 7781544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 7791544Seschrock } 780789Sahrens 781789Sahrens /* 782789Sahrens * Create the deferred-free bplist object. Turn off compression 783789Sahrens * because sync-to-convergence takes longer if the blocksize 784789Sahrens * keeps changing. 785789Sahrens */ 786789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 787789Sahrens 1 << 14, tx); 788789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 789789Sahrens ZIO_COMPRESS_OFF, tx); 790789Sahrens 7911544Seschrock if (zap_add(spa->spa_meta_objset, 792789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 7931544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 7941544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 7951544Seschrock } 796789Sahrens 797789Sahrens dmu_tx_commit(tx); 798789Sahrens 799789Sahrens spa->spa_sync_on = B_TRUE; 800789Sahrens txg_sync_start(spa->spa_dsl_pool); 801789Sahrens 802789Sahrens /* 803789Sahrens * We explicitly wait for the first transaction to complete so that our 804789Sahrens * bean counters are appropriately updated. 805789Sahrens */ 806789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 807789Sahrens 808789Sahrens spa_config_sync(); 809789Sahrens 810789Sahrens mutex_exit(&spa_namespace_lock); 811789Sahrens 812789Sahrens return (0); 813789Sahrens } 814789Sahrens 815789Sahrens /* 816789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 817789Sahrens * then call spa_load() to do the dirty work. 818789Sahrens */ 819789Sahrens int 820*1635Sbonwick spa_import(const char *pool, nvlist_t *config, const char *altroot) 821789Sahrens { 822789Sahrens spa_t *spa; 823789Sahrens int error; 824789Sahrens 825789Sahrens if (!(spa_mode & FWRITE)) 826789Sahrens return (EROFS); 827789Sahrens 828789Sahrens /* 829789Sahrens * If a pool with this name exists, return failure. 830789Sahrens */ 831789Sahrens mutex_enter(&spa_namespace_lock); 832789Sahrens if (spa_lookup(pool) != NULL) { 833789Sahrens mutex_exit(&spa_namespace_lock); 834789Sahrens return (EEXIST); 835789Sahrens } 836789Sahrens 837789Sahrens /* 838*1635Sbonwick * Create and initialize the spa structure. 839789Sahrens */ 840*1635Sbonwick spa = spa_add(pool, altroot); 841789Sahrens spa_activate(spa); 842789Sahrens 843789Sahrens /* 844*1635Sbonwick * Pass off the heavy lifting to spa_load(). 8451601Sbonwick */ 846*1635Sbonwick error = spa_load(spa, config, SPA_LOAD_IMPORT, B_FALSE); 847789Sahrens 848789Sahrens if (error) { 849789Sahrens spa_unload(spa); 850789Sahrens spa_deactivate(spa); 851789Sahrens spa_remove(spa); 852789Sahrens mutex_exit(&spa_namespace_lock); 853789Sahrens return (error); 854789Sahrens } 855789Sahrens 856*1635Sbonwick /* 857*1635Sbonwick * Update the config cache to include the newly-imported pool. 858*1635Sbonwick */ 859*1635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 860*1635Sbonwick 861789Sahrens mutex_exit(&spa_namespace_lock); 862789Sahrens 863789Sahrens /* 864789Sahrens * Resilver anything that's out of date. 865789Sahrens */ 866789Sahrens if (spa_mode & FWRITE) 867789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 868789Sahrens 869789Sahrens return (0); 870789Sahrens } 871789Sahrens 872789Sahrens /* 873789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 874789Sahrens * to get the vdev stats associated with the imported devices. 875789Sahrens */ 876789Sahrens #define TRYIMPORT_NAME "$import" 877789Sahrens 878789Sahrens nvlist_t * 879789Sahrens spa_tryimport(nvlist_t *tryconfig) 880789Sahrens { 881789Sahrens nvlist_t *config = NULL; 882789Sahrens char *poolname; 883789Sahrens spa_t *spa; 884789Sahrens uint64_t state; 885789Sahrens 886789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 887789Sahrens return (NULL); 888789Sahrens 889789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 890789Sahrens return (NULL); 891789Sahrens 892*1635Sbonwick /* 893*1635Sbonwick * Create and initialize the spa structure. 894*1635Sbonwick */ 895789Sahrens mutex_enter(&spa_namespace_lock); 896*1635Sbonwick spa = spa_add(TRYIMPORT_NAME, NULL); 897789Sahrens spa_activate(spa); 898789Sahrens 899789Sahrens /* 900*1635Sbonwick * Pass off the heavy lifting to spa_load(). 901789Sahrens */ 902*1635Sbonwick (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_FALSE); 903789Sahrens 904789Sahrens /* 905789Sahrens * If 'tryconfig' was at least parsable, return the current config. 906789Sahrens */ 907789Sahrens if (spa->spa_root_vdev != NULL) { 908*1635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 909789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 910*1635Sbonwick spa_config_exit(spa, FTAG); 911789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 912789Sahrens poolname) == 0); 913789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 914789Sahrens state) == 0); 915789Sahrens } 916789Sahrens 917789Sahrens spa_unload(spa); 918789Sahrens spa_deactivate(spa); 919789Sahrens spa_remove(spa); 920789Sahrens mutex_exit(&spa_namespace_lock); 921789Sahrens 922789Sahrens return (config); 923789Sahrens } 924789Sahrens 925789Sahrens /* 926789Sahrens * Pool export/destroy 927789Sahrens * 928789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 929789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 930789Sahrens * update the pool state and sync all the labels to disk, removing the 931789Sahrens * configuration from the cache afterwards. 932789Sahrens */ 933789Sahrens static int 934789Sahrens spa_export_common(char *pool, int new_state) 935789Sahrens { 936789Sahrens spa_t *spa; 937789Sahrens 938789Sahrens if (!(spa_mode & FWRITE)) 939789Sahrens return (EROFS); 940789Sahrens 941789Sahrens mutex_enter(&spa_namespace_lock); 942789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 943789Sahrens mutex_exit(&spa_namespace_lock); 944789Sahrens return (ENOENT); 945789Sahrens } 946789Sahrens 947789Sahrens /* 9481544Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 9491544Seschrock * reacquire the namespace lock, and see if we can export. 9501544Seschrock */ 9511544Seschrock spa_open_ref(spa, FTAG); 9521544Seschrock mutex_exit(&spa_namespace_lock); 9531544Seschrock spa_async_suspend(spa); 9541544Seschrock mutex_enter(&spa_namespace_lock); 9551544Seschrock spa_close(spa, FTAG); 9561544Seschrock 9571544Seschrock /* 958789Sahrens * The pool will be in core if it's openable, 959789Sahrens * in which case we can modify its state. 960789Sahrens */ 961789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 962789Sahrens /* 963789Sahrens * Objsets may be open only because they're dirty, so we 964789Sahrens * have to force it to sync before checking spa_refcnt. 965789Sahrens */ 966789Sahrens spa_scrub_suspend(spa); 967789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 968789Sahrens 9691544Seschrock /* 9701544Seschrock * A pool cannot be exported or destroyed if there are active 9711544Seschrock * references. If we are resetting a pool, allow references by 9721544Seschrock * fault injection handlers. 9731544Seschrock */ 9741544Seschrock if (!spa_refcount_zero(spa) || 9751544Seschrock (spa->spa_inject_ref != 0 && 9761544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 977789Sahrens spa_scrub_resume(spa); 9781544Seschrock spa_async_resume(spa); 979789Sahrens mutex_exit(&spa_namespace_lock); 980789Sahrens return (EBUSY); 981789Sahrens } 982789Sahrens 983789Sahrens spa_scrub_resume(spa); 984789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 985789Sahrens 986789Sahrens /* 987789Sahrens * We want this to be reflected on every label, 988789Sahrens * so mark them all dirty. spa_unload() will do the 989789Sahrens * final sync that pushes these changes out. 990789Sahrens */ 9911544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 9921601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 9931544Seschrock spa->spa_state = new_state; 994*1635Sbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 9951544Seschrock vdev_config_dirty(spa->spa_root_vdev); 9961601Sbonwick spa_config_exit(spa, FTAG); 9971544Seschrock } 998789Sahrens } 999789Sahrens 1000789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1001789Sahrens spa_unload(spa); 1002789Sahrens spa_deactivate(spa); 1003789Sahrens } 1004789Sahrens 10051544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 10061544Seschrock spa_remove(spa); 10071544Seschrock spa_config_sync(); 10081544Seschrock } 1009789Sahrens mutex_exit(&spa_namespace_lock); 1010789Sahrens 1011789Sahrens return (0); 1012789Sahrens } 1013789Sahrens 1014789Sahrens /* 1015789Sahrens * Destroy a storage pool. 1016789Sahrens */ 1017789Sahrens int 1018789Sahrens spa_destroy(char *pool) 1019789Sahrens { 1020789Sahrens return (spa_export_common(pool, POOL_STATE_DESTROYED)); 1021789Sahrens } 1022789Sahrens 1023789Sahrens /* 1024789Sahrens * Export a storage pool. 1025789Sahrens */ 1026789Sahrens int 1027789Sahrens spa_export(char *pool) 1028789Sahrens { 1029789Sahrens return (spa_export_common(pool, POOL_STATE_EXPORTED)); 1030789Sahrens } 1031789Sahrens 1032789Sahrens /* 10331544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 10341544Seschrock * from the namespace in any way. 10351544Seschrock */ 10361544Seschrock int 10371544Seschrock spa_reset(char *pool) 10381544Seschrock { 10391544Seschrock return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); 10401544Seschrock } 10411544Seschrock 10421544Seschrock 10431544Seschrock /* 1044789Sahrens * ========================================================================== 1045789Sahrens * Device manipulation 1046789Sahrens * ========================================================================== 1047789Sahrens */ 1048789Sahrens 1049789Sahrens /* 1050789Sahrens * Add capacity to a storage pool. 1051789Sahrens */ 1052789Sahrens int 1053789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1054789Sahrens { 1055789Sahrens uint64_t txg; 1056*1635Sbonwick int c, error; 1057789Sahrens vdev_t *rvd = spa->spa_root_vdev; 10581585Sbonwick vdev_t *vd, *tvd; 1059789Sahrens 1060789Sahrens txg = spa_vdev_enter(spa); 1061789Sahrens 1062789Sahrens vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1063789Sahrens 1064789Sahrens if (vd == NULL) 1065789Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1066789Sahrens 1067789Sahrens if ((error = vdev_create(vd, txg)) != 0) 1068789Sahrens return (spa_vdev_exit(spa, vd, txg, error)); 1069789Sahrens 1070789Sahrens /* 10711585Sbonwick * Transfer each new top-level vdev from vd to rvd. 1072789Sahrens */ 1073*1635Sbonwick for (c = 0; c < vd->vdev_children; c++) { 10741585Sbonwick tvd = vd->vdev_child[c]; 1075*1635Sbonwick vdev_remove_child(vd, tvd); 1076*1635Sbonwick tvd->vdev_id = rvd->vdev_children; 1077*1635Sbonwick vdev_add_child(rvd, tvd); 1078789Sahrens vdev_config_dirty(tvd); 1079789Sahrens } 1080789Sahrens 1081789Sahrens /* 10821585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 10831585Sbonwick * If other threads start allocating from these vdevs before we 10841585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 10851585Sbonwick * fail to open the pool because there are DVAs that the config cache 10861585Sbonwick * can't translate. Therefore, we first add the vdevs without 10871585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 1088*1635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 10891585Sbonwick * 10901585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 10911585Sbonwick * if we lose power at any point in this sequence, the remaining 10921585Sbonwick * steps will be completed the next time we load the pool. 1093789Sahrens */ 1094*1635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 10951585Sbonwick 1096*1635Sbonwick mutex_enter(&spa_namespace_lock); 1097*1635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1098*1635Sbonwick mutex_exit(&spa_namespace_lock); 1099789Sahrens 1100*1635Sbonwick return (0); 1101789Sahrens } 1102789Sahrens 1103789Sahrens /* 1104789Sahrens * Attach a device to a mirror. The arguments are the path to any device 1105789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 1106789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 1107789Sahrens * 1108789Sahrens * If 'replacing' is specified, the new device is intended to replace the 1109789Sahrens * existing device; in this case the two devices are made into their own 1110789Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 1111789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 1112789Sahrens * extra rules: you can't attach to it after it's been created, and upon 1113789Sahrens * completion of resilvering, the first disk (the one being replaced) 1114789Sahrens * is automatically detached. 1115789Sahrens */ 1116789Sahrens int 11171544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1118789Sahrens { 1119789Sahrens uint64_t txg, open_txg; 1120789Sahrens int error; 1121789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1122789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1123789Sahrens vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 1124789Sahrens 1125789Sahrens txg = spa_vdev_enter(spa); 1126789Sahrens 11271544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1128789Sahrens 1129789Sahrens if (oldvd == NULL) 1130789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1131789Sahrens 11321585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 11331585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 11341585Sbonwick 1135789Sahrens pvd = oldvd->vdev_parent; 1136789Sahrens 1137789Sahrens /* 1138789Sahrens * The parent must be a mirror or the root, unless we're replacing; 1139789Sahrens * in that case, the parent can be anything but another replacing vdev. 1140789Sahrens */ 1141789Sahrens if (pvd->vdev_ops != &vdev_mirror_ops && 1142789Sahrens pvd->vdev_ops != &vdev_root_ops && 1143789Sahrens (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 1144789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1145789Sahrens 1146789Sahrens newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1147789Sahrens 1148789Sahrens if (newrootvd == NULL || newrootvd->vdev_children != 1) 1149789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1150789Sahrens 1151789Sahrens newvd = newrootvd->vdev_child[0]; 1152789Sahrens 1153789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1154789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1155789Sahrens 1156789Sahrens if ((error = vdev_create(newrootvd, txg)) != 0) 1157789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1158789Sahrens 11591175Slling /* 11601175Slling * Compare the new device size with the replaceable/attachable 11611175Slling * device size. 11621175Slling */ 11631175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1164789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1165789Sahrens 1166789Sahrens if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 1167789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1168789Sahrens 1169789Sahrens /* 1170789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1171789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1172789Sahrens */ 1173789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1174789Sahrens spa_strfree(oldvd->vdev_path); 1175789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1176789Sahrens KM_SLEEP); 1177789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1178789Sahrens newvd->vdev_path, "old"); 1179789Sahrens if (oldvd->vdev_devid != NULL) { 1180789Sahrens spa_strfree(oldvd->vdev_devid); 1181789Sahrens oldvd->vdev_devid = NULL; 1182789Sahrens } 1183789Sahrens } 1184789Sahrens 1185789Sahrens /* 1186789Sahrens * If the parent is not a mirror, or if we're replacing, 1187789Sahrens * insert the new mirror/replacing vdev above oldvd. 1188789Sahrens */ 1189789Sahrens if (pvd->vdev_ops != pvops) 1190789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1191789Sahrens 1192789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1193789Sahrens ASSERT(pvd->vdev_ops == pvops); 1194789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1195789Sahrens 1196789Sahrens /* 1197789Sahrens * Extract the new device from its root and add it to pvd. 1198789Sahrens */ 1199789Sahrens vdev_remove_child(newrootvd, newvd); 1200789Sahrens newvd->vdev_id = pvd->vdev_children; 1201789Sahrens vdev_add_child(pvd, newvd); 1202789Sahrens 12031544Seschrock /* 12041544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 12051544Seschrock * the addition of newvd may have decreased our parent's asize. 12061544Seschrock */ 12071544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 12081544Seschrock 1209789Sahrens tvd = newvd->vdev_top; 1210789Sahrens ASSERT(pvd->vdev_top == tvd); 1211789Sahrens ASSERT(tvd->vdev_parent == rvd); 1212789Sahrens 1213789Sahrens vdev_config_dirty(tvd); 1214789Sahrens 1215789Sahrens /* 1216789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1217789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1218789Sahrens */ 1219789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1220789Sahrens 1221789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1222789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1223789Sahrens open_txg - TXG_INITIAL + 1); 1224789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1225789Sahrens 12261544Seschrock dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 12271544Seschrock 1228789Sahrens /* 1229789Sahrens * Mark newvd's DTL dirty in this txg. 1230789Sahrens */ 1231789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1232789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 1233789Sahrens 1234789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1235789Sahrens 1236789Sahrens /* 1237789Sahrens * Kick off a resilver to update newvd. 1238789Sahrens */ 1239789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1240789Sahrens 1241789Sahrens return (0); 1242789Sahrens } 1243789Sahrens 1244789Sahrens /* 1245789Sahrens * Detach a device from a mirror or replacing vdev. 1246789Sahrens * If 'replace_done' is specified, only detach if the parent 1247789Sahrens * is a replacing vdev. 1248789Sahrens */ 1249789Sahrens int 12501544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1251789Sahrens { 1252789Sahrens uint64_t txg; 1253789Sahrens int c, t, error; 1254789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1255789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1256789Sahrens 1257789Sahrens txg = spa_vdev_enter(spa); 1258789Sahrens 12591544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1260789Sahrens 1261789Sahrens if (vd == NULL) 1262789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1263789Sahrens 12641585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 12651585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 12661585Sbonwick 1267789Sahrens pvd = vd->vdev_parent; 1268789Sahrens 1269789Sahrens /* 1270789Sahrens * If replace_done is specified, only remove this device if it's 1271789Sahrens * the first child of a replacing vdev. 1272789Sahrens */ 1273789Sahrens if (replace_done && 1274789Sahrens (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1275789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1276789Sahrens 1277789Sahrens /* 1278789Sahrens * Only mirror and replacing vdevs support detach. 1279789Sahrens */ 1280789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1281789Sahrens pvd->vdev_ops != &vdev_mirror_ops) 1282789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1283789Sahrens 1284789Sahrens /* 1285789Sahrens * If there's only one replica, you can't detach it. 1286789Sahrens */ 1287789Sahrens if (pvd->vdev_children <= 1) 1288789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1289789Sahrens 1290789Sahrens /* 1291789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1292789Sahrens * valid copy of the data, which means we cannot safely detach it. 1293789Sahrens * 1294789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1295789Sahrens * precise DTL check. 1296789Sahrens */ 1297789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1298789Sahrens uint64_t dirty; 1299789Sahrens 1300789Sahrens cvd = pvd->vdev_child[c]; 1301789Sahrens if (cvd == vd) 1302789Sahrens continue; 1303789Sahrens if (vdev_is_dead(cvd)) 1304789Sahrens continue; 1305789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1306789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1307789Sahrens cvd->vdev_dtl_scrub.sm_space; 1308789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1309789Sahrens if (!dirty) 1310789Sahrens break; 1311789Sahrens } 1312789Sahrens if (c == pvd->vdev_children) 1313789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1314789Sahrens 1315789Sahrens /* 1316789Sahrens * Erase the disk labels so the disk can be used for other things. 1317789Sahrens * This must be done after all other error cases are handled, 1318789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1319789Sahrens * But if we can't do it, don't treat the error as fatal -- 1320789Sahrens * it may be that the unwritability of the disk is the reason 1321789Sahrens * it's being detached! 1322789Sahrens */ 1323789Sahrens error = vdev_label_init(vd, 0); 1324789Sahrens if (error) 1325789Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1326789Sahrens 1327789Sahrens /* 1328789Sahrens * Remove vd from its parent and compact the parent's children. 1329789Sahrens */ 1330789Sahrens vdev_remove_child(pvd, vd); 1331789Sahrens vdev_compact_children(pvd); 1332789Sahrens 1333789Sahrens /* 1334789Sahrens * Remember one of the remaining children so we can get tvd below. 1335789Sahrens */ 1336789Sahrens cvd = pvd->vdev_child[0]; 1337789Sahrens 1338789Sahrens /* 1339789Sahrens * If the parent mirror/replacing vdev only has one child, 1340789Sahrens * the parent is no longer needed. Remove it from the tree. 1341789Sahrens */ 1342789Sahrens if (pvd->vdev_children == 1) 1343789Sahrens vdev_remove_parent(cvd); 1344789Sahrens 1345789Sahrens /* 1346789Sahrens * We don't set tvd until now because the parent we just removed 1347789Sahrens * may have been the previous top-level vdev. 1348789Sahrens */ 1349789Sahrens tvd = cvd->vdev_top; 1350789Sahrens ASSERT(tvd->vdev_parent == rvd); 1351789Sahrens 1352789Sahrens /* 1353789Sahrens * Reopen this top-level vdev to reassess health after detach. 1354789Sahrens */ 13551544Seschrock vdev_reopen(tvd); 1356789Sahrens 1357789Sahrens /* 1358789Sahrens * If the device we just detached was smaller than the others, 13591544Seschrock * it may be possible to add metaslabs (i.e. grow the pool). We ignore 13601544Seschrock * the error here because the detach still succeeded - we just weren't 13611544Seschrock * able to reinitialize the metaslabs. This pool is in for a world of 13621544Seschrock * hurt, in any case. 1363789Sahrens */ 13641544Seschrock (void) vdev_metaslab_init(tvd, txg); 1365789Sahrens 1366789Sahrens vdev_config_dirty(tvd); 1367789Sahrens 1368789Sahrens /* 1369789Sahrens * Mark vd's DTL as dirty in this txg. 1370789Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1371789Sahrens * and free vd's DTL object in syncing context. 1372789Sahrens * But first make sure we're not on any *other* txg's DTL list, 1373789Sahrens * to prevent vd from being accessed after it's freed. 1374789Sahrens */ 1375789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1376789Sahrens vd->vdev_detached = B_TRUE; 1377789Sahrens for (t = 0; t < TXG_SIZE; t++) 1378789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1379789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1380789Sahrens 13811544Seschrock dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1382789Sahrens 1383789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 1384789Sahrens } 1385789Sahrens 1386789Sahrens /* 13871544Seschrock * Find any device that's done replacing, so we can detach it. 1388789Sahrens */ 13891544Seschrock static vdev_t * 13901544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 1391789Sahrens { 13921544Seschrock vdev_t *newvd, *oldvd; 1393789Sahrens int c; 1394789Sahrens 13951544Seschrock for (c = 0; c < vd->vdev_children; c++) { 13961544Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 13971544Seschrock if (oldvd != NULL) 13981544Seschrock return (oldvd); 13991544Seschrock } 1400789Sahrens 1401789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 14021544Seschrock oldvd = vd->vdev_child[0]; 14031544Seschrock newvd = vd->vdev_child[1]; 1404789Sahrens 14051544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 14061544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 14071544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 14081544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 14091544Seschrock return (oldvd); 14101544Seschrock } 14111544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 14121544Seschrock } 1413789Sahrens 14141544Seschrock return (NULL); 1415789Sahrens } 1416789Sahrens 14171544Seschrock static void 1418789Sahrens spa_vdev_replace_done(spa_t *spa) 1419789Sahrens { 14201544Seschrock vdev_t *vd; 14211544Seschrock uint64_t guid; 1422789Sahrens 14231544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1424789Sahrens 14251544Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 14261544Seschrock guid = vd->vdev_guid; 14271544Seschrock spa_config_exit(spa, FTAG); 14281544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 14291544Seschrock return; 14301544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1431789Sahrens } 1432789Sahrens 14331544Seschrock spa_config_exit(spa, FTAG); 1434789Sahrens } 1435789Sahrens 1436789Sahrens /* 14371354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 14381354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 14391354Seschrock */ 14401354Seschrock int 14411354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 14421354Seschrock { 14431354Seschrock vdev_t *rvd, *vd; 14441354Seschrock uint64_t txg; 14451354Seschrock 14461354Seschrock rvd = spa->spa_root_vdev; 14471354Seschrock 14481354Seschrock txg = spa_vdev_enter(spa); 14491354Seschrock 14501354Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 14511354Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 14521354Seschrock 14531585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 14541585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 14551585Sbonwick 14561354Seschrock spa_strfree(vd->vdev_path); 14571354Seschrock vd->vdev_path = spa_strdup(newpath); 14581354Seschrock 14591354Seschrock vdev_config_dirty(vd->vdev_top); 14601354Seschrock 14611354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 14621354Seschrock } 14631354Seschrock 14641354Seschrock /* 1465789Sahrens * ========================================================================== 1466789Sahrens * SPA Scrubbing 1467789Sahrens * ========================================================================== 1468789Sahrens */ 1469789Sahrens 14701544Seschrock void 14711544Seschrock spa_scrub_throttle(spa_t *spa, int direction) 14721544Seschrock { 14731544Seschrock mutex_enter(&spa->spa_scrub_lock); 14741544Seschrock spa->spa_scrub_throttled += direction; 14751544Seschrock ASSERT(spa->spa_scrub_throttled >= 0); 14761544Seschrock if (spa->spa_scrub_throttled == 0) 14771544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 14781544Seschrock mutex_exit(&spa->spa_scrub_lock); 14791544Seschrock } 1480789Sahrens 1481789Sahrens static void 1482789Sahrens spa_scrub_io_done(zio_t *zio) 1483789Sahrens { 1484789Sahrens spa_t *spa = zio->io_spa; 1485789Sahrens 1486789Sahrens zio_buf_free(zio->io_data, zio->io_size); 1487789Sahrens 1488789Sahrens mutex_enter(&spa->spa_scrub_lock); 14891544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 14901544Seschrock vdev_t *vd = zio->io_vd; 1491789Sahrens spa->spa_scrub_errors++; 1492789Sahrens mutex_enter(&vd->vdev_stat_lock); 1493789Sahrens vd->vdev_stat.vs_scrub_errors++; 1494789Sahrens mutex_exit(&vd->vdev_stat_lock); 1495789Sahrens } 14961544Seschrock if (--spa->spa_scrub_inflight == 0) { 14971544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 14981544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 14991544Seschrock } 15001544Seschrock mutex_exit(&spa->spa_scrub_lock); 1501789Sahrens } 1502789Sahrens 1503789Sahrens static void 15041544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 15051544Seschrock zbookmark_t *zb) 1506789Sahrens { 1507789Sahrens size_t size = BP_GET_LSIZE(bp); 1508789Sahrens void *data = zio_buf_alloc(size); 1509789Sahrens 1510789Sahrens mutex_enter(&spa->spa_scrub_lock); 1511789Sahrens spa->spa_scrub_inflight++; 1512789Sahrens mutex_exit(&spa->spa_scrub_lock); 1513789Sahrens 15141544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 15151544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 15161544Seschrock 15171544Seschrock flags |= ZIO_FLAG_CANFAIL; 15181544Seschrock 1519789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 15201544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 1521789Sahrens } 1522789Sahrens 1523789Sahrens /* ARGSUSED */ 1524789Sahrens static int 1525789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1526789Sahrens { 1527789Sahrens blkptr_t *bp = &bc->bc_blkptr; 1528789Sahrens vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1529789Sahrens 1530789Sahrens if (bc->bc_errno || vd == NULL) { 1531789Sahrens /* 1532789Sahrens * We can't scrub this block, but we can continue to scrub 1533789Sahrens * the rest of the pool. Note the error and move along. 1534789Sahrens */ 1535789Sahrens mutex_enter(&spa->spa_scrub_lock); 1536789Sahrens spa->spa_scrub_errors++; 1537789Sahrens mutex_exit(&spa->spa_scrub_lock); 1538789Sahrens 1539789Sahrens if (vd != NULL) { 1540789Sahrens mutex_enter(&vd->vdev_stat_lock); 1541789Sahrens vd->vdev_stat.vs_scrub_errors++; 1542789Sahrens mutex_exit(&vd->vdev_stat_lock); 1543789Sahrens } 1544789Sahrens 1545789Sahrens return (ERESTART); 1546789Sahrens } 1547789Sahrens 1548789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1549789Sahrens 1550789Sahrens /* 1551789Sahrens * Keep track of how much data we've examined so that 1552789Sahrens * zpool(1M) status can make useful progress reports. 1553789Sahrens */ 1554789Sahrens mutex_enter(&vd->vdev_stat_lock); 1555789Sahrens vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1556789Sahrens mutex_exit(&vd->vdev_stat_lock); 1557789Sahrens 1558789Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1559789Sahrens if (DVA_GET_GANG(&bp->blk_dva[0])) { 1560789Sahrens /* 1561789Sahrens * Gang members may be spread across multiple vdevs, 1562789Sahrens * so the best we can do is look at the pool-wide DTL. 1563789Sahrens * XXX -- it would be better to change our allocation 1564789Sahrens * policy to ensure that this can't happen. 1565789Sahrens */ 1566789Sahrens vd = spa->spa_root_vdev; 1567789Sahrens } 1568789Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1569789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 15701544Seschrock ZIO_FLAG_RESILVER, &bc->bc_bookmark); 1571789Sahrens } 1572789Sahrens } else { 1573789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 15741544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 1575789Sahrens } 1576789Sahrens 1577789Sahrens return (0); 1578789Sahrens } 1579789Sahrens 1580789Sahrens static void 1581789Sahrens spa_scrub_thread(spa_t *spa) 1582789Sahrens { 1583789Sahrens callb_cpr_t cprinfo; 1584789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 1585789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1586789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1587789Sahrens int error = 0; 1588789Sahrens boolean_t complete; 1589789Sahrens 1590789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1591789Sahrens 1592797Sbonwick /* 1593797Sbonwick * If we're restarting due to a snapshot create/delete, 1594797Sbonwick * wait for that to complete. 1595797Sbonwick */ 1596797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 1597797Sbonwick 15981544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 15991544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 16001544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 16011544Seschrock 16021544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 16031544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 1604789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 1605789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 16061544Seschrock spa_config_exit(spa, FTAG); 1607789Sahrens 1608789Sahrens mutex_enter(&spa->spa_scrub_lock); 1609789Sahrens spa->spa_scrub_errors = 0; 1610789Sahrens spa->spa_scrub_active = 1; 16111544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 16121544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 1613789Sahrens 1614789Sahrens while (!spa->spa_scrub_stop) { 1615789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 16161544Seschrock while (spa->spa_scrub_suspended) { 1617789Sahrens spa->spa_scrub_active = 0; 1618789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1619789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1620789Sahrens spa->spa_scrub_active = 1; 1621789Sahrens } 1622789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1623789Sahrens 1624789Sahrens if (spa->spa_scrub_restart_txg != 0) 1625789Sahrens break; 1626789Sahrens 1627789Sahrens mutex_exit(&spa->spa_scrub_lock); 1628789Sahrens error = traverse_more(th); 1629789Sahrens mutex_enter(&spa->spa_scrub_lock); 1630789Sahrens if (error != EAGAIN) 1631789Sahrens break; 16321544Seschrock 16331544Seschrock while (spa->spa_scrub_throttled > 0) 16341544Seschrock cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1635789Sahrens } 1636789Sahrens 1637789Sahrens while (spa->spa_scrub_inflight) 1638789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1639789Sahrens 16401601Sbonwick spa->spa_scrub_active = 0; 16411601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 16421601Sbonwick 16431601Sbonwick mutex_exit(&spa->spa_scrub_lock); 16441601Sbonwick 16451601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 16461601Sbonwick 16471601Sbonwick mutex_enter(&spa->spa_scrub_lock); 16481601Sbonwick 16491601Sbonwick /* 16501601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 16511601Sbonwick * AND the spa config lock to synchronize with any config changes 16521601Sbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 16531601Sbonwick */ 1654789Sahrens if (spa->spa_scrub_restart_txg != 0) 1655789Sahrens error = ERESTART; 1656789Sahrens 16571544Seschrock if (spa->spa_scrub_stop) 16581544Seschrock error = EINTR; 16591544Seschrock 1660789Sahrens /* 16611544Seschrock * Even if there were uncorrectable errors, we consider the scrub 16621544Seschrock * completed. The downside is that if there is a transient error during 16631544Seschrock * a resilver, we won't resilver the data properly to the target. But 16641544Seschrock * if the damage is permanent (more likely) we will resilver forever, 16651544Seschrock * which isn't really acceptable. Since there is enough information for 16661544Seschrock * the user to know what has failed and why, this seems like a more 16671544Seschrock * tractable approach. 1668789Sahrens */ 16691544Seschrock complete = (error == 0); 1670789Sahrens 16711544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 16721544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 1673789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1674789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1675789Sahrens 1676789Sahrens mutex_exit(&spa->spa_scrub_lock); 1677789Sahrens 1678789Sahrens /* 1679789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 1680789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 1681789Sahrens */ 1682789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1683789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1684789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 16851544Seschrock spa_errlog_rotate(spa); 16861601Sbonwick 16871544Seschrock spa_config_exit(spa, FTAG); 1688789Sahrens 1689789Sahrens mutex_enter(&spa->spa_scrub_lock); 1690789Sahrens 16911544Seschrock /* 16921544Seschrock * We may have finished replacing a device. 16931544Seschrock * Let the async thread assess this and handle the detach. 16941544Seschrock */ 16951544Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 1696789Sahrens 1697789Sahrens /* 1698789Sahrens * If we were told to restart, our final act is to start a new scrub. 1699789Sahrens */ 1700789Sahrens if (error == ERESTART) 17011544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 17021544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 1703789Sahrens 17041544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 17051544Seschrock spa->spa_scrub_active = 0; 17061544Seschrock spa->spa_scrub_thread = NULL; 17071544Seschrock cv_broadcast(&spa->spa_scrub_cv); 1708789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1709789Sahrens thread_exit(); 1710789Sahrens } 1711789Sahrens 1712789Sahrens void 1713789Sahrens spa_scrub_suspend(spa_t *spa) 1714789Sahrens { 1715789Sahrens mutex_enter(&spa->spa_scrub_lock); 17161544Seschrock spa->spa_scrub_suspended++; 1717789Sahrens while (spa->spa_scrub_active) { 1718789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1719789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1720789Sahrens } 1721789Sahrens while (spa->spa_scrub_inflight) 1722789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1723789Sahrens mutex_exit(&spa->spa_scrub_lock); 1724789Sahrens } 1725789Sahrens 1726789Sahrens void 1727789Sahrens spa_scrub_resume(spa_t *spa) 1728789Sahrens { 1729789Sahrens mutex_enter(&spa->spa_scrub_lock); 17301544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 17311544Seschrock if (--spa->spa_scrub_suspended == 0) 1732789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1733789Sahrens mutex_exit(&spa->spa_scrub_lock); 1734789Sahrens } 1735789Sahrens 1736789Sahrens void 1737789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 1738789Sahrens { 1739789Sahrens /* 1740789Sahrens * Something happened (e.g. snapshot create/delete) that means 1741789Sahrens * we must restart any in-progress scrubs. The itinerary will 1742789Sahrens * fix this properly. 1743789Sahrens */ 1744789Sahrens mutex_enter(&spa->spa_scrub_lock); 1745789Sahrens spa->spa_scrub_restart_txg = txg; 1746789Sahrens mutex_exit(&spa->spa_scrub_lock); 1747789Sahrens } 1748789Sahrens 17491544Seschrock int 17501544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1751789Sahrens { 1752789Sahrens space_seg_t *ss; 1753789Sahrens uint64_t mintxg, maxtxg; 1754789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1755789Sahrens 1756789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 1757789Sahrens return (ENOTSUP); 1758789Sahrens 17591544Seschrock mutex_enter(&spa->spa_scrub_lock); 17601544Seschrock 1761789Sahrens /* 1762789Sahrens * If there's a scrub or resilver already in progress, stop it. 1763789Sahrens */ 1764789Sahrens while (spa->spa_scrub_thread != NULL) { 1765789Sahrens /* 1766789Sahrens * Don't stop a resilver unless forced. 1767789Sahrens */ 17681544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 17691544Seschrock mutex_exit(&spa->spa_scrub_lock); 1770789Sahrens return (EBUSY); 17711544Seschrock } 1772789Sahrens spa->spa_scrub_stop = 1; 1773789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1774789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1775789Sahrens } 1776789Sahrens 1777789Sahrens /* 1778789Sahrens * Terminate the previous traverse. 1779789Sahrens */ 1780789Sahrens if (spa->spa_scrub_th != NULL) { 1781789Sahrens traverse_fini(spa->spa_scrub_th); 1782789Sahrens spa->spa_scrub_th = NULL; 1783789Sahrens } 1784789Sahrens 17851544Seschrock if (rvd == NULL) { 17861544Seschrock ASSERT(spa->spa_scrub_stop == 0); 17871544Seschrock ASSERT(spa->spa_scrub_type == type); 17881544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 17891544Seschrock mutex_exit(&spa->spa_scrub_lock); 17901544Seschrock return (0); 17911544Seschrock } 1792789Sahrens 1793789Sahrens mintxg = TXG_INITIAL - 1; 1794789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 1795789Sahrens 17961544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 1797789Sahrens 17981544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 17991544Seschrock /* 18001544Seschrock * The pool-wide DTL is empty. 18011544Seschrock * If this is a resilver, there's nothing to do. 18021544Seschrock */ 18031544Seschrock if (type == POOL_SCRUB_RESILVER) 18041544Seschrock type = POOL_SCRUB_NONE; 18051544Seschrock } else { 18061544Seschrock /* 18071544Seschrock * The pool-wide DTL is non-empty. 18081544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 18091544Seschrock */ 18101544Seschrock if (type == POOL_SCRUB_EVERYTHING) 18111544Seschrock type = POOL_SCRUB_RESILVER; 18121544Seschrock } 1813789Sahrens 18141544Seschrock if (type == POOL_SCRUB_RESILVER) { 1815789Sahrens /* 1816789Sahrens * Determine the resilvering boundaries. 1817789Sahrens * 1818789Sahrens * Note: (mintxg, maxtxg) is an open interval, 1819789Sahrens * i.e. mintxg and maxtxg themselves are not included. 1820789Sahrens * 1821789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1822789Sahrens * so we don't claim to resilver a txg that's still changing. 1823789Sahrens */ 1824789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 18251544Seschrock mintxg = ss->ss_start - 1; 1826789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 18271544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 1828789Sahrens } 1829789Sahrens 18301544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 18311544Seschrock 18321544Seschrock spa->spa_scrub_stop = 0; 18331544Seschrock spa->spa_scrub_type = type; 18341544Seschrock spa->spa_scrub_restart_txg = 0; 18351544Seschrock 18361544Seschrock if (type != POOL_SCRUB_NONE) { 18371544Seschrock spa->spa_scrub_mintxg = mintxg; 1838789Sahrens spa->spa_scrub_maxtxg = maxtxg; 1839789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1840*1635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 1841*1635Sbonwick ZIO_FLAG_CANFAIL); 1842789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1843789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 1844789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1845789Sahrens } 1846789Sahrens 18471544Seschrock mutex_exit(&spa->spa_scrub_lock); 18481544Seschrock 1849789Sahrens return (0); 1850789Sahrens } 1851789Sahrens 18521544Seschrock /* 18531544Seschrock * ========================================================================== 18541544Seschrock * SPA async task processing 18551544Seschrock * ========================================================================== 18561544Seschrock */ 18571544Seschrock 18581544Seschrock static void 18591544Seschrock spa_async_reopen(spa_t *spa) 1860789Sahrens { 18611544Seschrock vdev_t *rvd = spa->spa_root_vdev; 18621544Seschrock vdev_t *tvd; 18631544Seschrock int c; 18641544Seschrock 18651544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 18661544Seschrock 18671544Seschrock for (c = 0; c < rvd->vdev_children; c++) { 18681544Seschrock tvd = rvd->vdev_child[c]; 18691544Seschrock if (tvd->vdev_reopen_wanted) { 18701544Seschrock tvd->vdev_reopen_wanted = 0; 18711544Seschrock vdev_reopen(tvd); 18721544Seschrock } 18731544Seschrock } 1874789Sahrens 18751544Seschrock spa_config_exit(spa, FTAG); 18761544Seschrock } 18771544Seschrock 18781544Seschrock static void 18791544Seschrock spa_async_thread(spa_t *spa) 18801544Seschrock { 18811544Seschrock int tasks; 18821544Seschrock 18831544Seschrock ASSERT(spa->spa_sync_on); 1884789Sahrens 18851544Seschrock mutex_enter(&spa->spa_async_lock); 18861544Seschrock tasks = spa->spa_async_tasks; 18871544Seschrock spa->spa_async_tasks = 0; 18881544Seschrock mutex_exit(&spa->spa_async_lock); 18891544Seschrock 18901544Seschrock /* 1891*1635Sbonwick * See if the config needs to be updated. 1892*1635Sbonwick */ 1893*1635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 1894*1635Sbonwick mutex_enter(&spa_namespace_lock); 1895*1635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 1896*1635Sbonwick mutex_exit(&spa_namespace_lock); 1897*1635Sbonwick } 1898*1635Sbonwick 1899*1635Sbonwick /* 19001544Seschrock * See if any devices need to be reopened. 19011544Seschrock */ 19021544Seschrock if (tasks & SPA_ASYNC_REOPEN) 19031544Seschrock spa_async_reopen(spa); 19041544Seschrock 19051544Seschrock /* 19061544Seschrock * If any devices are done replacing, detach them. 19071544Seschrock */ 19081544Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 1909789Sahrens spa_vdev_replace_done(spa); 1910789Sahrens 19111544Seschrock /* 19121544Seschrock * Kick off a scrub. 19131544Seschrock */ 19141544Seschrock if (tasks & SPA_ASYNC_SCRUB) 19151544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 19161544Seschrock 19171544Seschrock /* 19181544Seschrock * Kick off a resilver. 19191544Seschrock */ 19201544Seschrock if (tasks & SPA_ASYNC_RESILVER) 19211544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 19221544Seschrock 19231544Seschrock /* 19241544Seschrock * Let the world know that we're done. 19251544Seschrock */ 19261544Seschrock mutex_enter(&spa->spa_async_lock); 19271544Seschrock spa->spa_async_thread = NULL; 19281544Seschrock cv_broadcast(&spa->spa_async_cv); 19291544Seschrock mutex_exit(&spa->spa_async_lock); 19301544Seschrock thread_exit(); 19311544Seschrock } 19321544Seschrock 19331544Seschrock void 19341544Seschrock spa_async_suspend(spa_t *spa) 19351544Seschrock { 19361544Seschrock mutex_enter(&spa->spa_async_lock); 19371544Seschrock spa->spa_async_suspended++; 19381544Seschrock while (spa->spa_async_thread != NULL) 19391544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 19401544Seschrock mutex_exit(&spa->spa_async_lock); 19411544Seschrock } 19421544Seschrock 19431544Seschrock void 19441544Seschrock spa_async_resume(spa_t *spa) 19451544Seschrock { 19461544Seschrock mutex_enter(&spa->spa_async_lock); 19471544Seschrock ASSERT(spa->spa_async_suspended != 0); 19481544Seschrock spa->spa_async_suspended--; 19491544Seschrock mutex_exit(&spa->spa_async_lock); 19501544Seschrock } 19511544Seschrock 19521544Seschrock static void 19531544Seschrock spa_async_dispatch(spa_t *spa) 19541544Seschrock { 19551544Seschrock mutex_enter(&spa->spa_async_lock); 19561544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 1957*1635Sbonwick spa->spa_async_thread == NULL && 1958*1635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 19591544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 19601544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 19611544Seschrock mutex_exit(&spa->spa_async_lock); 19621544Seschrock } 19631544Seschrock 19641544Seschrock void 19651544Seschrock spa_async_request(spa_t *spa, int task) 19661544Seschrock { 19671544Seschrock mutex_enter(&spa->spa_async_lock); 19681544Seschrock spa->spa_async_tasks |= task; 19691544Seschrock mutex_exit(&spa->spa_async_lock); 1970789Sahrens } 1971789Sahrens 1972789Sahrens /* 1973789Sahrens * ========================================================================== 1974789Sahrens * SPA syncing routines 1975789Sahrens * ========================================================================== 1976789Sahrens */ 1977789Sahrens 1978789Sahrens static void 1979789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1980789Sahrens { 1981789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1982789Sahrens dmu_tx_t *tx; 1983789Sahrens blkptr_t blk; 1984789Sahrens uint64_t itor = 0; 1985789Sahrens zio_t *zio; 1986789Sahrens int error; 1987789Sahrens uint8_t c = 1; 1988789Sahrens 1989789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1990789Sahrens 1991789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 1992789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 1993789Sahrens 1994789Sahrens error = zio_wait(zio); 1995789Sahrens ASSERT3U(error, ==, 0); 1996789Sahrens 1997789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1998789Sahrens bplist_vacate(bpl, tx); 1999789Sahrens 2000789Sahrens /* 2001789Sahrens * Pre-dirty the first block so we sync to convergence faster. 2002789Sahrens * (Usually only the first block is needed.) 2003789Sahrens */ 2004789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2005789Sahrens dmu_tx_commit(tx); 2006789Sahrens } 2007789Sahrens 2008789Sahrens static void 2009789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2010789Sahrens { 2011789Sahrens nvlist_t *config; 2012789Sahrens char *packed = NULL; 2013789Sahrens size_t nvsize = 0; 2014789Sahrens dmu_buf_t *db; 2015789Sahrens 2016789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2017789Sahrens return; 2018789Sahrens 2019789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2020789Sahrens 2021*1635Sbonwick if (spa->spa_config_syncing) 2022*1635Sbonwick nvlist_free(spa->spa_config_syncing); 2023*1635Sbonwick spa->spa_config_syncing = config; 2024789Sahrens 2025789Sahrens VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 2026789Sahrens 2027789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 2028789Sahrens 20291544Seschrock VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 20301544Seschrock KM_SLEEP) == 0); 2031789Sahrens 2032789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 2033789Sahrens packed, tx); 2034789Sahrens 2035789Sahrens kmem_free(packed, nvsize); 2036789Sahrens 20371544Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 20381544Seschrock spa->spa_config_object, FTAG, &db)); 2039789Sahrens dmu_buf_will_dirty(db, tx); 2040789Sahrens *(uint64_t *)db->db_data = nvsize; 20411544Seschrock dmu_buf_rele(db, FTAG); 2042789Sahrens } 2043789Sahrens 2044789Sahrens /* 2045789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2046789Sahrens * part of the process, so we iterate until it converges. 2047789Sahrens */ 2048789Sahrens void 2049789Sahrens spa_sync(spa_t *spa, uint64_t txg) 2050789Sahrens { 2051789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2052789Sahrens objset_t *mos = spa->spa_meta_objset; 2053789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2054*1635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 2055789Sahrens vdev_t *vd; 2056789Sahrens dmu_tx_t *tx; 2057789Sahrens int dirty_vdevs; 2058789Sahrens 2059789Sahrens /* 2060789Sahrens * Lock out configuration changes. 2061789Sahrens */ 20621544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2063789Sahrens 2064789Sahrens spa->spa_syncing_txg = txg; 2065789Sahrens spa->spa_sync_pass = 0; 2066789Sahrens 20671544Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2068789Sahrens 2069789Sahrens /* 2070789Sahrens * If anything has changed in this txg, push the deferred frees 2071789Sahrens * from the previous txg. If not, leave them alone so that we 2072789Sahrens * don't generate work on an otherwise idle system. 2073789Sahrens */ 2074789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 2075789Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 2076789Sahrens spa_sync_deferred_frees(spa, txg); 2077789Sahrens 2078789Sahrens /* 2079789Sahrens * Iterate to convergence. 2080789Sahrens */ 2081789Sahrens do { 2082789Sahrens spa->spa_sync_pass++; 2083789Sahrens 2084789Sahrens tx = dmu_tx_create_assigned(dp, txg); 2085789Sahrens spa_sync_config_object(spa, tx); 2086789Sahrens dmu_tx_commit(tx); 2087789Sahrens 20881544Seschrock spa_errlog_sync(spa, txg); 20891544Seschrock 2090789Sahrens dsl_pool_sync(dp, txg); 2091789Sahrens 2092789Sahrens dirty_vdevs = 0; 2093789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2094789Sahrens vdev_sync(vd, txg); 2095789Sahrens dirty_vdevs++; 2096789Sahrens } 2097789Sahrens 2098789Sahrens tx = dmu_tx_create_assigned(dp, txg); 2099789Sahrens bplist_sync(bpl, tx); 2100789Sahrens dmu_tx_commit(tx); 2101789Sahrens 2102789Sahrens } while (dirty_vdevs); 2103789Sahrens 2104789Sahrens bplist_close(bpl); 2105789Sahrens 2106789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2107789Sahrens 2108789Sahrens /* 2109789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 2110789Sahrens * to commit the transaction group. 2111*1635Sbonwick * 2112*1635Sbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs. 2113*1635Sbonwick * Otherwise, pick a random top-level vdev that's known to be 2114*1635Sbonwick * visible in the config cache (see spa_vdev_add() for details). 2115*1635Sbonwick * If the write fails, try the next vdev until we're tried them all. 2116789Sahrens */ 2117*1635Sbonwick if (!list_is_empty(&spa->spa_dirty_list)) { 2118*1635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 2119*1635Sbonwick } else { 2120*1635Sbonwick int children = rvd->vdev_children; 2121*1635Sbonwick int c0 = spa_get_random(children); 2122*1635Sbonwick int c; 2123*1635Sbonwick 2124*1635Sbonwick for (c = 0; c < children; c++) { 2125*1635Sbonwick vd = rvd->vdev_child[(c0 + c) % children]; 2126*1635Sbonwick if (vd->vdev_ms_array == 0) 2127*1635Sbonwick continue; 2128*1635Sbonwick if (vdev_config_sync(vd, txg) == 0) 2129*1635Sbonwick break; 2130*1635Sbonwick } 2131*1635Sbonwick if (c == children) 2132*1635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 2133*1635Sbonwick } 2134*1635Sbonwick 2135*1635Sbonwick /* 2136*1635Sbonwick * Clear the dirty config list. 2137*1635Sbonwick */ 2138*1635Sbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 2139*1635Sbonwick vdev_config_clean(vd); 2140*1635Sbonwick 2141*1635Sbonwick /* 2142*1635Sbonwick * Now that the new config has synced transactionally, 2143*1635Sbonwick * let it become visible to the config cache. 2144*1635Sbonwick */ 2145*1635Sbonwick if (spa->spa_config_syncing != NULL) { 2146*1635Sbonwick spa_config_set(spa, spa->spa_config_syncing); 2147*1635Sbonwick spa->spa_config_txg = txg; 2148*1635Sbonwick spa->spa_config_syncing = NULL; 2149*1635Sbonwick } 2150789Sahrens 2151789Sahrens /* 2152789Sahrens * Make a stable copy of the fully synced uberblock. 2153789Sahrens * We use this as the root for pool traversals. 2154789Sahrens */ 2155789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2156789Sahrens 2157789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2158789Sahrens 2159789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2160789Sahrens spa->spa_traverse_wanted = 0; 2161789Sahrens spa->spa_ubsync = spa->spa_uberblock; 2162789Sahrens rw_exit(&spa->spa_traverse_lock); 2163789Sahrens 2164789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2165789Sahrens 2166789Sahrens /* 2167789Sahrens * Clean up the ZIL records for the synced txg. 2168789Sahrens */ 2169789Sahrens dsl_pool_zil_clean(dp); 2170789Sahrens 2171789Sahrens /* 2172789Sahrens * Update usable space statistics. 2173789Sahrens */ 2174789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2175789Sahrens vdev_sync_done(vd, txg); 2176789Sahrens 2177789Sahrens /* 2178789Sahrens * It had better be the case that we didn't dirty anything 2179789Sahrens * since spa_sync_labels(). 2180789Sahrens */ 2181789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2182789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2183789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2184789Sahrens ASSERT(bpl->bpl_queue == NULL); 2185789Sahrens 21861544Seschrock spa_config_exit(spa, FTAG); 21871544Seschrock 21881544Seschrock /* 21891544Seschrock * If any async tasks have been requested, kick them off. 21901544Seschrock */ 21911544Seschrock spa_async_dispatch(spa); 2192789Sahrens } 2193789Sahrens 2194789Sahrens /* 2195789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 2196789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 2197789Sahrens * sync. 2198789Sahrens */ 2199789Sahrens void 2200789Sahrens spa_sync_allpools(void) 2201789Sahrens { 2202789Sahrens spa_t *spa = NULL; 2203789Sahrens mutex_enter(&spa_namespace_lock); 2204789Sahrens while ((spa = spa_next(spa)) != NULL) { 2205789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 2206789Sahrens continue; 2207789Sahrens spa_open_ref(spa, FTAG); 2208789Sahrens mutex_exit(&spa_namespace_lock); 2209789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 2210789Sahrens mutex_enter(&spa_namespace_lock); 2211789Sahrens spa_close(spa, FTAG); 2212789Sahrens } 2213789Sahrens mutex_exit(&spa_namespace_lock); 2214789Sahrens } 2215789Sahrens 2216789Sahrens /* 2217789Sahrens * ========================================================================== 2218789Sahrens * Miscellaneous routines 2219789Sahrens * ========================================================================== 2220789Sahrens */ 2221789Sahrens 2222789Sahrens /* 2223789Sahrens * Remove all pools in the system. 2224789Sahrens */ 2225789Sahrens void 2226789Sahrens spa_evict_all(void) 2227789Sahrens { 2228789Sahrens spa_t *spa; 2229789Sahrens 2230789Sahrens /* 2231789Sahrens * Remove all cached state. All pools should be closed now, 2232789Sahrens * so every spa in the AVL tree should be unreferenced. 2233789Sahrens */ 2234789Sahrens mutex_enter(&spa_namespace_lock); 2235789Sahrens while ((spa = spa_next(NULL)) != NULL) { 2236789Sahrens /* 22371544Seschrock * Stop async tasks. The async thread may need to detach 22381544Seschrock * a device that's been replaced, which requires grabbing 22391544Seschrock * spa_namespace_lock, so we must drop it here. 2240789Sahrens */ 2241789Sahrens spa_open_ref(spa, FTAG); 2242789Sahrens mutex_exit(&spa_namespace_lock); 22431544Seschrock spa_async_suspend(spa); 2244789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2245789Sahrens mutex_enter(&spa_namespace_lock); 2246789Sahrens spa_close(spa, FTAG); 2247789Sahrens 2248789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2249789Sahrens spa_unload(spa); 2250789Sahrens spa_deactivate(spa); 2251789Sahrens } 2252789Sahrens spa_remove(spa); 2253789Sahrens } 2254789Sahrens mutex_exit(&spa_namespace_lock); 2255789Sahrens } 22561544Seschrock 22571544Seschrock vdev_t * 22581544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 22591544Seschrock { 22601544Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 22611544Seschrock } 2262