/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}
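/*
 * Note that bcmp() imposes an arbitrary but consistent total order on
 * bookmarks, which is all the error-list AVL trees below require: they
 * are only ever searched for exact matches, never traversed in a
 * semantically meaningful order.
 */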
/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
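/*
 * Note: each of the ZIO_TYPES I/O types gets its own pair of taskqs
 * above -- one for the issue stage and one for interrupt-time
 * completions -- so a backlog in one type (e.g. a burst of writes)
 * does not stall dispatch or completion of the others.  The sizing
 * (8 threads, prepopulated) is simply a tuning choice.
 */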
/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}
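/*
 * For reference, the nvlist consumed above mirrors the vdev tree; a
 * two-way mirror pool looks roughly like this (abbreviated):
 *
 *	vdev_tree: {
 *		type: "root"
 *		children: [
 *			{ type: "mirror", children: [
 *				{ type: "disk", path: "/dev/dsk/..." },
 *				{ type: "disk", path: "/dev/dsk/..." } ] }
 *		]
 *	}
 *
 * Interior vdevs recurse through their ZPOOL_CONFIG_CHILDREN arrays;
 * leaves return immediately.
 */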
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	spa->spa_async_suspended = 0;
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (rvd == NULL) {
		error = EINVAL;
		goto out;
	}

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}
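	/*
	 * Note: vdev_uberblock_load() above examines every label on every
	 * leaf vdev and keeps the best uberblock it finds -- essentially
	 * the checksum-valid one with the highest txg -- so ub_txg == 0
	 * means no valid uberblock exists anywhere in the pool.
	 */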
	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object, FTAG, &db));
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db, FTAG);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}
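	/*
	 * Note that the recursive call above terminates: the second pass
	 * runs with mosconfig == B_TRUE, since the config we just read back
	 * out of the MOS is by definition the trusted one, so the !mosconfig
	 * branch is not re-entered.
	 */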
	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
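		/*
		 * Note: claiming marks each dataset's intent-log blocks as
		 * allocated so they cannot be reused before log replay;
		 * waiting for the claims to sync ensures that state is on
		 * disk before we go any further.
		 */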
		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again.  The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs' labels
			 * indicates that the pool has been exported or
			 * destroyed.  If this is the case, the config cache
			 * is out of sync and we should remove the pool from
			 * the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}
		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}
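/*
 * Typical usage of the interface above, for reference (error handling
 * elided); the tag identifies the reference holder and must match the
 * one later given to spa_close():
 *
 *	spa_t *spa;
 *	if (spa_open("tank", &spa, FTAG) == 0) {
 *		... use spa ...
 *		spa_close(spa, FTAG);
 *	}
 */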
/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL)
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(spa->spa_root_vdev == rvd);

	if (rvd == NULL) {
		error = EINVAL;
	} else {
		if ((error = vdev_create(rvd, txg)) == 0) {
			for (c = 0; c < rvd->vdev_children; c++)
				vdev_init(rvd->vdev_child[c], txg);
			vdev_config_dirty(rvd);
		}
	}

	spa_config_exit(spa, FTAG);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}
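	/*
	 * Note: DMU_POOL_DIRECTORY_OBJECT is the well-known root object of
	 * the MOS; the config object, sync bplist, and error logs are all
	 * found by name through this directory, which is why failing to add
	 * an entry here is fatal.
	 */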
	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}
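/*
 * For reference, a userland import typically proceeds in two steps:
 * spa_tryimport() below is used first to validate the on-disk config
 * and report the pool's state, and spa_import() above then does the
 * real work with that same config.
 */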
/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
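	/*
	 * Note on the lock dance above: the async thread may itself need
	 * spa_namespace_lock (e.g. for config cache updates), so we cannot
	 * hold it across spa_async_suspend(); the temporary open ref keeps
	 * the spa_t from being removed while the lock is dropped.
	 */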
	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}
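/*
 * The three entry points below map onto spa_export_common() as follows:
 *
 *	spa_destroy()	POOL_STATE_DESTROYED	 pool is gone for good
 *	spa_export()	POOL_STATE_EXPORTED	 pool may be re-imported
 *	spa_reset()	POOL_STATE_UNINITIALIZED unload only; the spa_t
 *						 stays in the namespace
 */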
/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = rvd->vdev_children;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_guid(rvd, guid);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Compare the new device size with the replaceable/attachable
	 * device size.
	 */
	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
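	/*
	 * From here on, the transformation performed is, schematically:
	 *
	 *	before:	pvd -> oldvd
	 *	after:	pvd -> mirror/replacing -> { oldvd, newvd }
	 *
	 * (or, if pvd is already the right kind of vdev, newvd simply
	 * becomes another of its children).
	 */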
	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	/*
	 * If newvd is smaller than oldvd, but larger than its rsize,
	 * the addition of newvd may have decreased our parent's asize.
	 */
	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}
/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_guid(rvd, guid);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);
	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 * vdev_metaslab_init() can't fail because the existing metaslabs
	 * are already in core, so there's nothing to read from disk.
	 */
	VERIFY(vdev_metaslab_init(tvd, txg) == 0);

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);

	return (spa_vdev_exit(spa, vd, txg, 0));
}
/*
 * Find any device that's done replacing, so we can detach it.
 */
static vdev_t *
spa_vdev_replace_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		oldvd = vd->vdev_child[0];
		newvd = vd->vdev_child[1];

		mutex_enter(&newvd->vdev_dtl_lock);
		if (newvd->vdev_dtl_map.sm_space == 0 &&
		    newvd->vdev_dtl_scrub.sm_space == 0) {
			mutex_exit(&newvd->vdev_dtl_lock);
			return (oldvd);
		}
		mutex_exit(&newvd->vdev_dtl_lock);
	}

	return (NULL);
}

static void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_t *vd;
	uint64_t guid;

	spa_config_enter(spa, RW_READER, FTAG);

	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
		guid = vd->vdev_guid;
		spa_config_exit(spa, FTAG);
		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, RW_READER, FTAG);
	}

	spa_config_exit(spa, FTAG);
}
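/*
 * Note on the loop above: spa_vdev_detach() goes through
 * spa_vdev_enter(), which takes the config lock as writer, so we must
 * drop our reader hold around the call.  We remember only the guid --
 * not the vdev_t itself -- because the tree may change while the lock
 * is dropped; the detach re-looks the device up by guid.
 */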
/*
 * Update the stored path for this vdev.  Dirty the vdev configuration,
 * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
 */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	rvd = spa->spa_root_vdev;

	txg = spa_vdev_enter(spa);

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	spa_strfree(vd->vdev_path);
	vd->vdev_path = spa_strdup(newpath);

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

void
spa_scrub_throttle(spa_t *spa, int direction)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_throttled += direction;
	ASSERT(spa->spa_scrub_throttled >= 0);
	if (spa->spa_scrub_throttled == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}
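/*
 * A minimal userland sketch of the same throttle idiom (editorial
 * illustration only, assuming POSIX threads rather than the kernel
 * cv_* interface):
 *
 *	#include <pthread.h>
 *
 *	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
 *	static int throttled;
 *
 *	static void
 *	throttle(int direction)
 *	{
 *		pthread_mutex_lock(&lock);
 *		throttled += direction;		(+1 throttle, -1 unthrottle)
 *		if (throttled == 0)
 *			pthread_cond_broadcast(&cv);
 *		pthread_mutex_unlock(&lock);
 *	}
 *
 *	static void
 *	wait_until_unthrottled(void)
 *	{
 *		pthread_mutex_lock(&lock);
 *		while (throttled > 0)
 *			pthread_cond_wait(&cv, &lock);
 *		pthread_mutex_unlock(&lock);
 *	}
 *
 * The scrub thread plays the wait_until_unthrottled() role: it naps on
 * spa_scrub_io_cv whenever spa_scrub_throttled is nonzero.
 */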
static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
		spa->spa_scrub_errors++;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
	if (--spa->spa_scrub_inflight == 0) {
		cv_broadcast(&spa->spa_scrub_io_cv);
		ASSERT(spa->spa_scrub_throttled == 0);
	}
	mutex_exit(&spa->spa_scrub_lock);
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
    zbookmark_t *zb)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */

	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags, zb));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);
	ASSERT(spa->spa_scrub_throttled == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;

		while (spa->spa_scrub_throttled > 0)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
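		/*
		 * Worked example (editorial illustration, not from the
		 * original source): if the first missing txg recorded in
		 * the DTL is 100, mintxg becomes 99; if the last DTL
		 * segment ends at 300 but spa_last_synced_txg() is 150,
		 * maxtxg is clamped to MIN(300, 151) = 151.  The open
		 * interval (99, 151) then covers exactly txgs 100 through
		 * 150 inclusive.
		 */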
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}

static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
	    spa->spa_config_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
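	/*
	 * Schematically (editorial illustration, not from the original
	 * source), the loop below is a fixed-point iteration:
	 *
	 *	do {
	 *		pass++;
	 *		sync all state dirtied so far in this txg;
	 *		(the act of syncing may dirty new vdevs)
	 *	} while (any vdevs were dirtied this pass);
	 *
	 * Each pass generally produces less new work than the last, so
	 * the loop converges after a small number of passes.
	 */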
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		spa_errlog_sync(spa, txg);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since we synced the vdev configuration above.
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * pool with a future on-disk version would not have been openable
	 * in the first place, the version should never exceed ZFS_VERSION.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);
}
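/*
 * Illustrative appendix (editorial addition, not part of the original
 * source): the SPA async section above follows a common "pending-task
 * bitmask plus at most one worker thread" idiom.  A minimal userland
 * sketch, assuming POSIX threads:
 *
 *	#include <pthread.h>
 *
 *	#define	TASK_SCRUB	0x1
 *	#define	TASK_RESILVER	0x2
 *
 *	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 *	static int tasks;		(bitmask of requested work)
 *	static int running;		(nonzero while a worker exists)
 *
 *	static void *
 *	worker(void *arg)
 *	{
 *		int t;
 *
 *		pthread_mutex_lock(&lock);
 *		t = tasks;
 *		tasks = 0;
 *		pthread_mutex_unlock(&lock);
 *
 *		(handle each bit of t without holding the lock)
 *
 *		pthread_mutex_lock(&lock);
 *		running = 0;
 *		pthread_mutex_unlock(&lock);
 *		return (NULL);
 *	}
 *
 *	static void
 *	request(int task)
 *	{
 *		pthread_mutex_lock(&lock);
 *		tasks |= task;
 *		pthread_mutex_unlock(&lock);
 *	}
 *
 *	static void
 *	dispatch(void)
 *	{
 *		pthread_t tid;
 *
 *		pthread_mutex_lock(&lock);
 *		if (tasks != 0 && !running &&
 *		    pthread_create(&tid, NULL, worker, NULL) == 0) {
 *			(void) pthread_detach(tid);
 *			running = 1;
 *		}
 *		pthread_mutex_unlock(&lock);
 *	}
 *
 * As in spa_async_dispatch(), tasks requested while a worker is running
 * simply accumulate in the bitmask until the next dispatch() call.
 */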