/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
        spa_error_entry_t *sa = (spa_error_entry_t *)a;
        spa_error_entry_t *sb = (spa_error_entry_t *)b;
        int ret;

        ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
            sizeof (zbookmark_t));

        if (ret < 0)
                return (-1);
        else if (ret > 0)
                return (1);
        else
                return (0);
}
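
/*
 * Illustrative note (not part of the original logic): the comparator above
 * is what gives the error-list AVL trees their ordering. A lookup keyed by
 * bookmark would follow the usual Solaris AVL pattern, sketched here under
 * the assumption that the caller holds spa_errlist_lock:
 *
 *        spa_error_entry_t search, *found;
 *        avl_index_t where;
 *
 *        search.se_bookmark = *zb;
 *        found = avl_find(&spa->spa_errlist_last, &search, &where);
 *
 * This mirrors how the SPA error-log code elsewhere consumes these trees.
 */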

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
        ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

        bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
        bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
        int t;

        ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

        spa->spa_state = POOL_STATE_ACTIVE;

        spa->spa_normal_class = metaslab_class_create();

        for (t = 0; t < ZIO_TYPES; t++) {
                spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
                    8, maxclsyspri, 50, INT_MAX,
                    TASKQ_PREPOPULATE);
                spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
                    8, maxclsyspri, 50, INT_MAX,
                    TASKQ_PREPOPULATE);
        }

        rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

        list_create(&spa->spa_dirty_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_dirty_node));

        txg_list_create(&spa->spa_vdev_txg_list,
            offsetof(struct vdev, vdev_txg_node));

        avl_create(&spa->spa_errlist_scrub,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
        avl_create(&spa->spa_errlist_last,
            spa_error_entry_compare, sizeof (spa_error_entry_t),
            offsetof(spa_error_entry_t, se_avl));
}
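
/*
 * A minimal sketch of the activate/deactivate lifecycle, as exercised by
 * spa_open_common() later in this file:
 *
 *        spa_activate(spa);
 *        error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
 *        if (error != 0) {
 *                spa_unload(spa);
 *                spa_deactivate(spa);
 *        }
 *
 * spa_deactivate() below must only run after spa_unload() has torn down
 * the vdev tree and DSL pool, as its ASSERTs enforce.
 */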

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
        int t;

        ASSERT(spa->spa_sync_on == B_FALSE);
        ASSERT(spa->spa_dsl_pool == NULL);
        ASSERT(spa->spa_root_vdev == NULL);

        ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

        txg_list_destroy(&spa->spa_vdev_txg_list);

        list_destroy(&spa->spa_dirty_list);

        rw_destroy(&spa->spa_traverse_lock);

        for (t = 0; t < ZIO_TYPES; t++) {
                taskq_destroy(spa->spa_zio_issue_taskq[t]);
                taskq_destroy(spa->spa_zio_intr_taskq[t]);
                spa->spa_zio_issue_taskq[t] = NULL;
                spa->spa_zio_intr_taskq[t] = NULL;
        }

        metaslab_class_destroy(spa->spa_normal_class);
        spa->spa_normal_class = NULL;

        /*
         * If this was part of an import or the open otherwise failed, we may
         * still have errors left in the queues. Empty them just in case.
         */
        spa_errlog_drain(spa);

        avl_destroy(&spa->spa_errlist_scrub);
        avl_destroy(&spa->spa_errlist_last);

        spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
        nvlist_t **child;
        uint_t c, children;
        int error;

        if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
                return (error);

        if ((*vdp)->vdev_ops->vdev_op_leaf)
                return (0);

        if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
            &child, &children) != 0) {
                vdev_free(*vdp);
                *vdp = NULL;
                return (EINVAL);
        }

        for (c = 0; c < children; c++) {
                vdev_t *vd;
                if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
                    atype)) != 0) {
                        vdev_free(*vdp);
                        *vdp = NULL;
                        return (error);
                }
        }

        ASSERT(*vdp != NULL);

        return (0);
}
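
/*
 * For reference, the nvlist that spa_config_parse() walks mirrors the pool
 * configuration as generated by userland or by spa_config_generate(). A
 * hypothetical two-way mirror would look roughly like this (keys are the
 * ZPOOL_CONFIG_* names from sys/fs/zfs.h):
 *
 *        vdev_tree:
 *                type='root'
 *                children[0]:
 *                        type='mirror'
 *                        children[0]: type='disk' path='/dev/dsk/...'
 *                        children[1]: type='disk' path='/dev/dsk/...'
 *
 * The recursion descends ZPOOL_CONFIG_CHILDREN until it reaches leaf
 * vdevs, which carry no children array.
 */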

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
        int i;

        /*
         * Stop async tasks.
         */
        spa_async_suspend(spa);

        /*
         * Stop syncing.
         */
        if (spa->spa_sync_on) {
                txg_sync_stop(spa->spa_dsl_pool);
                spa->spa_sync_on = B_FALSE;
        }

        /*
         * Wait for any outstanding prefetch I/O to complete.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);
        spa_config_exit(spa, FTAG);

        /*
         * Close the dsl pool.
         */
        if (spa->spa_dsl_pool) {
                dsl_pool_close(spa->spa_dsl_pool);
                spa->spa_dsl_pool = NULL;
        }

        /*
         * Close all vdevs.
         */
        if (spa->spa_root_vdev)
                vdev_free(spa->spa_root_vdev);
        ASSERT(spa->spa_root_vdev == NULL);

        for (i = 0; i < spa->spa_nspares; i++)
                vdev_free(spa->spa_spares[i]);
        if (spa->spa_spares) {
                kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
                spa->spa_spares = NULL;
        }
        if (spa->spa_sparelist) {
                nvlist_free(spa->spa_sparelist);
                spa->spa_sparelist = NULL;
        }

        spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
        nvlist_t **spares;
        uint_t nspares;
        int i;

        /*
         * First, close and free any existing spare vdevs.
         */
        for (i = 0; i < spa->spa_nspares; i++) {
                vdev_close(spa->spa_spares[i]);
                vdev_free(spa->spa_spares[i]);
        }
        if (spa->spa_spares)
                kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

        if (spa->spa_sparelist == NULL)
                nspares = 0;
        else
                VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

        spa->spa_nspares = (int)nspares;
        spa->spa_spares = NULL;

        if (nspares == 0)
                return;

        /*
         * Construct the array of vdevs, opening them to get status in the
         * process.
         */
        spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
        for (i = 0; i < spa->spa_nspares; i++) {
                vdev_t *vd;

                VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    VDEV_ALLOC_SPARE) == 0);
                ASSERT(vd != NULL);

                spa->spa_spares[i] = vd;

                if (vdev_open(vd) != 0)
                        continue;

                vd->vdev_top = vd;
                (void) vdev_validate_spare(vd);
        }

        /*
         * Recompute the stashed list of spares, with status information
         * this time.
         */
        VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            DATA_TYPE_NVLIST_ARRAY) == 0);

        spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
        for (i = 0; i < spa->spa_nspares; i++)
                spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
                    B_TRUE, B_TRUE);
        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
            spares, spa->spa_nspares) == 0);
        for (i = 0; i < spa->spa_nspares; i++)
                nvlist_free(spares[i]);
        kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
        dmu_buf_t *db;
        char *packed = NULL;
        size_t nvsize = 0;
        int error;
        *value = NULL;

        VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
        nvsize = *(uint64_t *)db->db_data;
        dmu_buf_rele(db, FTAG);

        packed = kmem_alloc(nvsize, KM_SLEEP);
        error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
        if (error == 0)
                error = nvlist_unpack(packed, nvsize, value, 0);
        kmem_free(packed, nvsize);

        return (error);
}
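
/*
 * On-disk layout assumed by load_nvlist(): the object's bonus buffer holds
 * a single uint64_t giving the size of the packed nvlist, and the object's
 * data blocks hold the packed bytes themselves. The write side, handled in
 * the sync path, is the mirror image; a rough sketch (NV_ENCODE_XDR is an
 * assumption here):
 *
 *        size_t nvsize = 0;
 *        char *packed;
 *
 *        VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
 *        packed = kmem_alloc(nvsize, KM_SLEEP);
 *        VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 *            KM_SLEEP) == 0);
 *        dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
 *        kmem_free(packed, nvsize);
 *
 * followed by an update of the bonus buffer with the new size.
 */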

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
        int error = 0;
        nvlist_t *nvroot = NULL;
        vdev_t *rvd;
        uberblock_t *ub = &spa->spa_uberblock;
        uint64_t config_cache_txg = spa->spa_config_txg;
        uint64_t pool_guid;
        uint64_t version;
        zio_t *zio;

        spa->spa_load_state = state;

        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
            nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
                error = EINVAL;
                goto out;
        }

        /*
         * Versioning wasn't explicitly added to the label until later, so if
         * it's not present treat it as the initial version.
         */
        if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
                version = ZFS_VERSION_INITIAL;

        (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
            &spa->spa_config_txg);

        if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
            spa_guid_exists(pool_guid, 0)) {
                error = EEXIST;
                goto out;
        }

        /*
         * Parse the configuration into a vdev tree. We explicitly set the
         * value that will be returned by spa_version() since parsing the
         * configuration requires knowing the version number.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);
        spa->spa_ubsync.ub_version = version;
        error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
        spa_config_exit(spa, FTAG);

        if (error != 0)
                goto out;

        ASSERT(spa->spa_root_vdev == rvd);
        ASSERT(spa_guid(spa) == pool_guid);

        /*
         * Try to open all vdevs, loading each label in the process.
         */
        if (vdev_open(rvd) != 0) {
                error = ENXIO;
                goto out;
        }

        /*
         * Validate the labels for all leaf vdevs. We need to grab the config
         * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
         * flag.
         */
        spa_config_enter(spa, RW_READER, FTAG);
        error = vdev_validate(rvd);
        spa_config_exit(spa, FTAG);

        if (error != 0) {
                error = EBADF;
                goto out;
        }

        if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
                error = ENXIO;
                goto out;
        }

        /*
         * Find the best uberblock.
         */
        bzero(ub, sizeof (uberblock_t));

        zio = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
        vdev_uberblock_load(zio, rvd, ub);
        error = zio_wait(zio);

        /*
         * If we weren't able to find a single valid uberblock, return failure.
         */
        if (ub->ub_txg == 0) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = ENXIO;
                goto out;
        }

        /*
         * If the pool is newer than the code, we can't open it.
         */
        if (ub->ub_version > ZFS_VERSION) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_VERSION_NEWER);
                error = ENOTSUP;
                goto out;
        }

        /*
         * If the vdev guid sum doesn't match the uberblock, we have an
         * incomplete configuration.
         */
        if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_BAD_GUID_SUM);
                error = ENXIO;
                goto out;
        }

        /*
         * Initialize internal SPA structures.
         */
        spa->spa_state = POOL_STATE_ACTIVE;
        spa->spa_ubsync = spa->spa_uberblock;
        spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
        error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
        if (error) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                goto out;
        }
        spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

        if (zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
            sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        if (!mosconfig) {
                nvlist_t *newconfig;

                if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
                        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        error = EIO;
                        goto out;
                }

                spa_config_set(spa, newconfig);
                spa_unload(spa);
                spa_deactivate(spa);
                spa_activate(spa);

                return (spa_load(spa, newconfig, state, B_TRUE));
        }
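
        /*
         * Past this point we are guaranteed to be operating on the config
         * stored in the MOS: the !mosconfig branch above re-invoked
         * spa_load() with the trusted on-disk copy. Opening a pool from the
         * cache file is thus effectively a two-pass operation -- first with
         * the (possibly stale) cached config to find the MOS, then with the
         * authoritative config read back out of it.
         */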

        if (zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
            sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load the bit that tells us to use the new accounting function
         * (raid-z deflation). If we have an older pool, this will not
         * be present.
         */
        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
            sizeof (uint64_t), 1, &spa->spa_deflate);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load the persistent error log. If we have an older pool, this will
         * not be present.
         */
        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
            sizeof (uint64_t), 1, &spa->spa_errlog_last);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        error = zap_lookup(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
            sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }

        /*
         * Load any hot spares for this pool.
         */
        error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
        if (error != 0 && error != ENOENT) {
                vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                    VDEV_AUX_CORRUPT_DATA);
                error = EIO;
                goto out;
        }
        if (error == 0) {
                ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
                if (load_nvlist(spa, spa->spa_spares_object,
                    &spa->spa_sparelist) != 0) {
                        vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
                        error = EIO;
                        goto out;
                }

                spa_config_enter(spa, RW_WRITER, FTAG);
                spa_load_spares(spa);
                spa_config_exit(spa, FTAG);
        }

        /*
         * Load the vdev state for all toplevel vdevs.
         */
        vdev_load(rvd);

        /*
         * Propagate the leaf DTLs we just loaded all the way up the tree.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);
        vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
        spa_config_exit(spa, FTAG);

        /*
         * Check the state of the root vdev. If it can't be opened, it
         * indicates one or more toplevel vdevs are faulted.
         */
        if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
                error = ENXIO;
                goto out;
        }

        if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
                dmu_tx_t *tx;
                int need_update = B_FALSE;
                int c;

                /*
                 * Claim log blocks that haven't been committed yet.
                 * This must all happen in a single txg.
                 */
                tx = dmu_tx_create_assigned(spa_get_dsl(spa),
                    spa_first_txg(spa));
                dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
                dmu_tx_commit(tx);

                spa->spa_sync_on = B_TRUE;
                txg_sync_start(spa->spa_dsl_pool);

                /*
                 * Wait for all claims to sync.
                 */
                txg_wait_synced(spa->spa_dsl_pool, 0);

                /*
                 * If the config cache is stale, or we have uninitialized
                 * metaslabs (see spa_vdev_add()), then update the config.
                 */
                if (config_cache_txg != spa->spa_config_txg ||
                    state == SPA_LOAD_IMPORT)
                        need_update = B_TRUE;

                for (c = 0; c < rvd->vdev_children; c++)
                        if (rvd->vdev_child[c]->vdev_ms_array == 0)
                                need_update = B_TRUE;

                /*
                 * Update the config cache asynchronously in case we're the
                 * root pool, in which case the config cache isn't writable yet.
                 */
                if (need_update)
                        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
        }

        error = 0;
out:
        if (error && error != EBADF)
                zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
        spa->spa_load_state = SPA_LOAD_NONE;
        spa->spa_ena = 0;

        return (error);
}
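
/*
 * Note on callers: spa_load() is reached through three paths in this file,
 * distinguished by the spa_load_state_t argument --
 *
 *        SPA_LOAD_OPEN         spa_open_common(), config from the cache
 *        SPA_LOAD_IMPORT       spa_import(), config supplied by userland
 *        SPA_LOAD_TRYIMPORT    spa_tryimport(), read-only probe of a config
 *
 * TRYIMPORT additionally skips the log claiming and syncing above, since
 * the pool is never opened for write in that case.
 */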

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is sent
 * down from userland, instead of grabbed from the configuration cache. For the
 * case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics at
 * the same time as opening the pool, without having to keep around the spa_t in
 * some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
        spa_t *spa;
        int error;
        int loaded = B_FALSE;
        int locked = B_FALSE;

        *spapp = NULL;

        /*
         * As disgusting as this is, we need to support recursive calls to this
         * function because dsl_dir_open() is called during spa_load(), and ends
         * up calling spa_open() again. The real fix is to figure out how to
         * avoid dsl_dir_open() calling this in the first place.
         */
        if (mutex_owner(&spa_namespace_lock) != curthread) {
                mutex_enter(&spa_namespace_lock);
                locked = B_TRUE;
        }

        if ((spa = spa_lookup(pool)) == NULL) {
                if (locked)
                        mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }
        if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

                spa_activate(spa);

                error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

                if (error == EBADF) {
                        /*
                         * If vdev_validate() returns failure (indicated by
                         * EBADF), it means that one of the vdevs indicates
                         * that the pool has been exported or destroyed. If
                         * this is the case, the config cache is out of sync and
                         * we should remove the pool from the namespace.
                         */
                        zfs_post_ok(spa, NULL);
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa_remove(spa);
                        spa_config_sync();
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        return (ENOENT);
                }

                if (error) {
                        /*
                         * We can't open the pool, but we still have useful
                         * information: the state of each vdev after the
                         * attempted vdev_open(). Return this to the user.
                         */
                        if (config != NULL && spa->spa_root_vdev != NULL) {
                                spa_config_enter(spa, RW_READER, FTAG);
                                *config = spa_config_generate(spa, NULL, -1ULL,
                                    B_TRUE);
                                spa_config_exit(spa, FTAG);
                        }
                        spa_unload(spa);
                        spa_deactivate(spa);
                        spa->spa_last_open_failed = B_TRUE;
                        if (locked)
                                mutex_exit(&spa_namespace_lock);
                        *spapp = NULL;
                        return (error);
                } else {
                        zfs_post_ok(spa, NULL);
                        spa->spa_last_open_failed = B_FALSE;
                }

                loaded = B_TRUE;
        }

        spa_open_ref(spa, tag);
        if (locked)
                mutex_exit(&spa_namespace_lock);

        *spapp = spa;

        if (config != NULL) {
                spa_config_enter(spa, RW_READER, FTAG);
                *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
                spa_config_exit(spa, FTAG);
        }

        /*
         * If we just loaded the pool, resilver anything that's out of date.
         */
        if (loaded && (spa_mode & FWRITE))
                VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
        return (spa_open_common(name, spapp, tag, NULL));
}
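
/*
 * A minimal sketch of the consumer-side contract (illustrative only):
 *
 *        spa_t *spa;
 *        int error;
 *
 *        if ((error = spa_open("tank", &spa, FTAG)) != 0)
 *                return (error);
 *        ... operate on the pool ...
 *        spa_close(spa, FTAG);
 *
 * The tag passed to spa_open() must match the one later given to
 * spa_close(); it exists to identify the holder of the reference.
 */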

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
        spa_t *spa;

        mutex_enter(&spa_namespace_lock);
        if ((spa = spa_lookup(name)) == NULL) {
                mutex_exit(&spa_namespace_lock);
                return (NULL);
        }
        spa->spa_inject_ref++;
        mutex_exit(&spa_namespace_lock);

        return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
        mutex_enter(&spa_namespace_lock);
        spa->spa_inject_ref--;
        mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
        nvlist_t **spares;
        uint_t i, nspares;
        nvlist_t *nvroot;
        uint64_t guid;
        vdev_stat_t *vs;
        uint_t vsc;

        if (spa->spa_nspares == 0)
                return;

        VERIFY(nvlist_lookup_nvlist(config,
            ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
        VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
        if (nspares != 0) {
                VERIFY(nvlist_add_nvlist_array(nvroot,
                    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                VERIFY(nvlist_lookup_nvlist_array(nvroot,
                    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

                /*
                 * Go through and find any spares which have since been
                 * repurposed as an active spare. If this is the case, update
                 * their status appropriately.
                 */
                for (i = 0; i < nspares; i++) {
                        VERIFY(nvlist_lookup_uint64(spares[i],
                            ZPOOL_CONFIG_GUID, &guid) == 0);
                        if (spa_spare_inuse(guid)) {
                                VERIFY(nvlist_lookup_uint64_array(
                                    spares[i], ZPOOL_CONFIG_STATS,
                                    (uint64_t **)&vs, &vsc) == 0);
                                vs->vs_state = VDEV_STATE_CANT_OPEN;
                                vs->vs_aux = VDEV_AUX_SPARED;
                        }
                }
        }
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
        int error;
        spa_t *spa;

        *config = NULL;
        error = spa_open_common(name, &spa, FTAG, config);

        if (spa && *config != NULL) {
                VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
                    spa_get_errlog_size(spa)) == 0);

                spa_add_spares(spa, *config);
        }

        /*
         * We want to get the alternate root even for faulted pools, so we cheat
         * and call spa_lookup() directly.
         */
        if (altroot) {
                if (spa == NULL) {
                        mutex_enter(&spa_namespace_lock);
                        spa = spa_lookup(name);
                        if (spa)
                                spa_altroot(spa, altroot, buflen);
                        else
                                altroot[0] = '\0';
                        spa = NULL;
                        mutex_exit(&spa_namespace_lock);
                } else {
                        spa_altroot(spa, altroot, buflen);
                }
        }

        if (spa != NULL)
                spa_close(spa, FTAG);

        return (error);
}

/*
 * Validate that the 'spares' array is well formed. We must have an array of
 * nvlists, each of which describes a valid leaf vdev.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
        nvlist_t **spares;
        uint_t i, nspares;
        vdev_t *vd;
        int error;

        /*
         * It's acceptable to have no spares specified.
         */
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) != 0)
                return (0);

        if (nspares == 0)
                return (EINVAL);

        /*
         * Make sure the pool is formatted with a version that supports hot
         * spares.
         */
        if (spa_version(spa) < ZFS_VERSION_SPARES)
                return (ENOTSUP);

        for (i = 0; i < nspares; i++) {
                if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
                    mode)) != 0)
                        return (error);

                if (!vd->vdev_ops->vdev_op_leaf) {
                        vdev_free(vd);
                        return (EINVAL);
                }

                if ((error = vdev_open(vd)) != 0) {
                        vdev_free(vd);
                        return (error);
                }

                vd->vdev_top = vd;
                if ((error = vdev_label_spare(vd, crtxg)) != 0) {
                        vdev_free(vd);
                        return (error);
                }

                VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
                    vd->vdev_guid) == 0);

                vdev_free(vd);
        }

        return (0);
}
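
/*
 * For quick reference, spa_validate_spares() distinguishes these outcomes:
 *
 *        0        no spares specified, or every spare parsed, opened, and
 *                 labeled successfully
 *        EINVAL   a ZPOOL_CONFIG_SPARES array was present but empty, or a
 *                 listed spare is not a leaf vdev
 *        ENOTSUP  the pool predates ZFS_VERSION_SPARES
 *        other    whatever spa_config_parse(), vdev_open(), or
 *                 vdev_label_spare() returned for a particular spare
 */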

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
        spa_t *spa;
        vdev_t *rvd;
        dsl_pool_t *dp;
        dmu_tx_t *tx;
        int c, error = 0;
        uint64_t txg = TXG_INITIAL;
        nvlist_t **spares;
        uint_t nspares;

        /*
         * If this pool already exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
        if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }

        /*
         * Allocate a new spa_t structure.
         */
        spa = spa_add(pool, altroot);
        spa_activate(spa);

        spa->spa_uberblock.ub_txg = txg - 1;
        spa->spa_uberblock.ub_version = ZFS_VERSION;
        spa->spa_ubsync = spa->spa_uberblock;

        /*
         * Create the root vdev.
         */
        spa_config_enter(spa, RW_WRITER, FTAG);

        error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

        ASSERT(error != 0 || rvd != NULL);
        ASSERT(error != 0 || spa->spa_root_vdev == rvd);

        if (error == 0 && rvd->vdev_children == 0)
                error = EINVAL;

        if (error == 0 &&
            (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
            (error = spa_validate_spares(spa, nvroot, txg,
            VDEV_ALLOC_ADD)) == 0) {
                for (c = 0; c < rvd->vdev_children; c++)
                        vdev_init(rvd->vdev_child[c], txg);
                vdev_config_dirty(rvd);
        }

        spa_config_exit(spa, FTAG);

        if (error != 0) {
                spa_unload(spa);
                spa_deactivate(spa);
                spa_remove(spa);
                mutex_exit(&spa_namespace_lock);
                return (error);
        }

        /*
         * Get the list of spares, if specified.
         */
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) == 0) {
                VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
                    KM_SLEEP) == 0);
                VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                spa_config_enter(spa, RW_WRITER, FTAG);
                spa_load_spares(spa);
                spa_config_exit(spa, FTAG);
                spa->spa_sync_spares = B_TRUE;
        }

        spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
        spa->spa_meta_objset = dp->dp_meta_objset;

        tx = dmu_tx_create_assigned(dp, txg);

        /*
         * Create the pool config object.
         */
        spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
            DMU_OT_PACKED_NVLIST, 1 << 14,
            DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
            sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
                cmn_err(CE_PANIC, "failed to add pool config");
        }

        /* Newly created pools are always deflated. */
        spa->spa_deflate = TRUE;
        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
            sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
                cmn_err(CE_PANIC, "failed to add deflate");
        }

        /*
         * Create the deferred-free bplist object. Turn off compression
         * because sync-to-convergence takes longer if the blocksize
         * keeps changing.
         */
        spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
            1 << 14, tx);
        dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
            ZIO_COMPRESS_OFF, tx);

        if (zap_add(spa->spa_meta_objset,
            DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
            sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
                cmn_err(CE_PANIC, "failed to add bplist");
        }

        dmu_tx_commit(tx);

        spa->spa_sync_on = B_TRUE;
        txg_sync_start(spa->spa_dsl_pool);

        /*
         * We explicitly wait for the first transaction to complete so that our
         * bean counters are appropriately updated.
         */
        txg_wait_synced(spa->spa_dsl_pool, txg);

        spa_config_sync();

        mutex_exit(&spa_namespace_lock);

        return (0);
}
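
/*
 * The nvroot handed to spa_create() is normally assembled by userland and
 * passed down through the ioctl layer. A hypothetical, minimal single-disk
 * tree built by hand would look something like this (the device path is
 * purely illustrative):
 *
 *        nvlist_t *root, *disk;
 *
 *        VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *        VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
 *            VDEV_TYPE_DISK) == 0);
 *        VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
 *            "/dev/dsk/c0t0d0s0") == 0);
 *
 *        VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *        VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 *            VDEV_TYPE_ROOT) == 0);
 *        VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 *            &disk, 1) == 0);
 *
 *        error = spa_create("tank", root, NULL);
 *
 * spa_config_parse() only cares about the structural keys.
 */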

/*
 * Import the given pool into the system. We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
        spa_t *spa;
        int error;
        nvlist_t *nvroot;
        nvlist_t **spares;
        uint_t nspares;

        if (!(spa_mode & FWRITE))
                return (EROFS);

        /*
         * If a pool with this name exists, return failure.
         */
        mutex_enter(&spa_namespace_lock);
        if (spa_lookup(pool) != NULL) {
                mutex_exit(&spa_namespace_lock);
                return (EEXIST);
        }

        /*
         * Create and initialize the spa structure.
         */
        spa = spa_add(pool, altroot);
        spa_activate(spa);

        /*
         * Pass off the heavy lifting to spa_load().
         * Pass TRUE for mosconfig because the user-supplied config
         * is actually the one to trust when doing an import.
         */
        error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

        spa_config_enter(spa, RW_WRITER, FTAG);
        /*
         * Toss any existing sparelist, as it doesn't have any validity anymore,
         * and conflicts with spa_has_spare().
         */
        if (spa->spa_sparelist) {
                nvlist_free(spa->spa_sparelist);
                spa->spa_sparelist = NULL;
                spa_load_spares(spa);
        }

        VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
            &nvroot) == 0);
        if (error == 0)
                error = spa_validate_spares(spa, nvroot, -1ULL,
                    VDEV_ALLOC_SPARE);
        spa_config_exit(spa, FTAG);

        if (error != 0) {
                spa_unload(spa);
                spa_deactivate(spa);
                spa_remove(spa);
                mutex_exit(&spa_namespace_lock);
                return (error);
        }

        /*
         * Override any spares as specified by the user, as these may have
         * correct device names/devids, etc.
         */
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) == 0) {
                if (spa->spa_sparelist)
                        VERIFY(nvlist_remove(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
                else
                        VERIFY(nvlist_alloc(&spa->spa_sparelist,
                            NV_UNIQUE_NAME, KM_SLEEP) == 0);
                VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                spa_config_enter(spa, RW_WRITER, FTAG);
                spa_load_spares(spa);
                spa_config_exit(spa, FTAG);
                spa->spa_sync_spares = B_TRUE;
        }

        /*
         * Update the config cache to include the newly-imported pool.
         */
        spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

        mutex_exit(&spa_namespace_lock);

        /*
         * Resilver anything that's out of date.
         */
        if (spa_mode & FWRITE)
                VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

        return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME "$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
        nvlist_t *config = NULL;
        char *poolname;
        spa_t *spa;
        uint64_t state;

        if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
                return (NULL);

        if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
                return (NULL);

        /*
         * Create and initialize the spa structure.
         */
        mutex_enter(&spa_namespace_lock);
        spa = spa_add(TRYIMPORT_NAME, NULL);
        spa_activate(spa);

        /*
         * Pass off the heavy lifting to spa_load().
         * Pass TRUE for mosconfig because the user-supplied config
         * is actually the one to trust when doing an import.
         */
        (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

        /*
         * If 'tryconfig' was at least parsable, return the current config.
         */
        if (spa->spa_root_vdev != NULL) {
                spa_config_enter(spa, RW_READER, FTAG);
                config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
                spa_config_exit(spa, FTAG);
                VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
                    poolname) == 0);
                VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
                    state) == 0);

                /*
                 * Add the list of hot spares.
                 */
                spa_add_spares(spa, config);
        }

        spa_unload(spa);
        spa_deactivate(spa);
        spa_remove(spa);
        mutex_exit(&spa_namespace_lock);

        return (config);
}
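
/*
 * In effect, spa_tryimport() is a dry run: the pool is loaded under the
 * reserved name "$import", its config (with the caller's original pool
 * name and state patched back in, plus any hot spares) is generated, and
 * the spa_t is immediately torn down again. No transaction groups are
 * started, since spa_load() skips the write-side work for
 * SPA_LOAD_TRYIMPORT.
 */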

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
        spa_t *spa;

        if (oldconfig)
                *oldconfig = NULL;

        if (!(spa_mode & FWRITE))
                return (EROFS);

        mutex_enter(&spa_namespace_lock);
        if ((spa = spa_lookup(pool)) == NULL) {
                mutex_exit(&spa_namespace_lock);
                return (ENOENT);
        }

        /*
         * Put a hold on the pool, drop the namespace lock, stop async tasks,
         * reacquire the namespace lock, and see if we can export.
         */
        spa_open_ref(spa, FTAG);
        mutex_exit(&spa_namespace_lock);
        spa_async_suspend(spa);
        mutex_enter(&spa_namespace_lock);
        spa_close(spa, FTAG);

        /*
         * The pool will be in core if it's openable,
         * in which case we can modify its state.
         */
        if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
                /*
                 * Objsets may be open only because they're dirty, so we
                 * have to force it to sync before checking spa_refcnt.
                 */
                spa_scrub_suspend(spa);
                txg_wait_synced(spa->spa_dsl_pool, 0);

                /*
                 * A pool cannot be exported or destroyed if there are active
                 * references. If we are resetting a pool, allow references by
                 * fault injection handlers.
                 */
                if (!spa_refcount_zero(spa) ||
                    (spa->spa_inject_ref != 0 &&
                    new_state != POOL_STATE_UNINITIALIZED)) {
                        spa_scrub_resume(spa);
                        spa_async_resume(spa);
                        mutex_exit(&spa_namespace_lock);
                        return (EBUSY);
                }

                spa_scrub_resume(spa);
                VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

                /*
                 * We want this to be reflected on every label,
                 * so mark them all dirty. spa_unload() will do the
                 * final sync that pushes these changes out.
                 */
                if (new_state != POOL_STATE_UNINITIALIZED) {
                        spa_config_enter(spa, RW_WRITER, FTAG);
                        spa->spa_state = new_state;
                        spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
                        vdev_config_dirty(spa->spa_root_vdev);
                        spa_config_exit(spa, FTAG);
                }
        }

        if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
                spa_unload(spa);
                spa_deactivate(spa);
        }

        if (oldconfig && spa->spa_config)
                VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

        if (new_state != POOL_STATE_UNINITIALIZED) {
                spa_remove(spa);
                spa_config_sync();
        }
        mutex_exit(&spa_namespace_lock);

        return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
        return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
        return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
        return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}
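
/*
 * To summarize the three wrappers above, the new_state argument to
 * spa_export_common() controls how far the teardown goes:
 *
 *        POOL_STATE_DESTROYED        destroy: labels updated, spa removed
 *        POOL_STATE_EXPORTED         export: labels updated, spa removed
 *        POOL_STATE_UNINITIALIZED    reset: unload only; the spa stays in
 *                                    the namespace and config cache
 */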

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
        uint64_t txg;
        int c, error;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd, *tvd;
        nvlist_t **spares;
        uint_t i, nspares;

        txg = spa_vdev_enter(spa);

        if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
            VDEV_ALLOC_ADD)) != 0)
                return (spa_vdev_exit(spa, NULL, txg, error));

        if ((error = spa_validate_spares(spa, nvroot, txg,
            VDEV_ALLOC_ADD)) != 0)
                return (spa_vdev_exit(spa, vd, txg, error));

        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
            &spares, &nspares) != 0)
                nspares = 0;

        if (vd->vdev_children == 0 && nspares == 0)
                return (spa_vdev_exit(spa, vd, txg, EINVAL));

        if (vd->vdev_children != 0) {
                if ((error = vdev_create(vd, txg, B_FALSE)) != 0)
                        return (spa_vdev_exit(spa, vd, txg, error));

                /*
                 * Transfer each new top-level vdev from vd to rvd.
                 */
                for (c = 0; c < vd->vdev_children; c++) {
                        tvd = vd->vdev_child[c];
                        vdev_remove_child(vd, tvd);
                        tvd->vdev_id = rvd->vdev_children;
                        vdev_add_child(rvd, tvd);
                        vdev_config_dirty(tvd);
                }
        }

        if (nspares != 0) {
                if (spa->spa_sparelist != NULL) {
                        nvlist_t **oldspares;
                        uint_t oldnspares;
                        nvlist_t **newspares;

                        VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);

                        newspares = kmem_alloc(sizeof (void *) *
                            (nspares + oldnspares), KM_SLEEP);
                        for (i = 0; i < oldnspares; i++)
                                VERIFY(nvlist_dup(oldspares[i],
                                    &newspares[i], KM_SLEEP) == 0);
                        for (i = 0; i < nspares; i++)
                                VERIFY(nvlist_dup(spares[i],
                                    &newspares[i + oldnspares],
                                    KM_SLEEP) == 0);

                        VERIFY(nvlist_remove(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);

                        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, newspares,
                            nspares + oldnspares) == 0);
                        for (i = 0; i < oldnspares + nspares; i++)
                                nvlist_free(newspares[i]);
                        kmem_free(newspares, (oldnspares + nspares) *
                            sizeof (void *));
                } else {
                        VERIFY(nvlist_alloc(&spa->spa_sparelist,
                            NV_UNIQUE_NAME, KM_SLEEP) == 0);
                        VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
                            ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
                }

                spa_load_spares(spa);
		spa->spa_sync_spares = B_TRUE;
1490789Sahrens 	}
1491789Sahrens 
1492789Sahrens 	/*
14931585Sbonwick 	 * We have to be careful when adding new vdevs to an existing pool.
14941585Sbonwick 	 * If other threads start allocating from these vdevs before we
14951585Sbonwick 	 * sync the config cache, and we lose power, then upon reboot we may
14961585Sbonwick 	 * fail to open the pool because there are DVAs that the config cache
14971585Sbonwick 	 * can't translate.  Therefore, we first add the vdevs without
14981585Sbonwick 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
14991635Sbonwick 	 * and then let spa_config_update() initialize the new metaslabs.
15001585Sbonwick 	 *
15011585Sbonwick 	 * spa_load() checks for added-but-not-initialized vdevs, so that
15021585Sbonwick 	 * if we lose power at any point in this sequence, the remaining
15031585Sbonwick 	 * steps will be completed the next time we load the pool.
1504789Sahrens 	 */
15051635Sbonwick 	(void) spa_vdev_exit(spa, vd, txg, 0);
15061585Sbonwick 
15071635Sbonwick 	mutex_enter(&spa_namespace_lock);
15081635Sbonwick 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
15091635Sbonwick 	mutex_exit(&spa_namespace_lock);
1510789Sahrens 
15111635Sbonwick 	return (0);
1512789Sahrens }
1513789Sahrens 
1514789Sahrens /*
1515789Sahrens  * Attach a device to a mirror.  The arguments are the path to any device
1516789Sahrens  * in the mirror, and the nvroot for the new device.  If the path specifies
1517789Sahrens  * a device that is not mirrored, we automatically insert the mirror vdev.
1518789Sahrens  *
1519789Sahrens  * If 'replacing' is specified, the new device is intended to replace the
1520789Sahrens  * existing device; in this case the two devices are made into their own
1521789Sahrens  * mirror using the 'replacing' vdev, which is functionally identical to
1522789Sahrens  * the mirror vdev (it actually reuses all the same ops) but has a few
1523789Sahrens  * extra rules: you can't attach to it after it's been created, and upon
1524789Sahrens  * completion of resilvering, the first disk (the one being replaced)
1525789Sahrens  * is automatically detached.
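 *
 * Schematically, replacing leaf A with new device B transforms the
 * tree as follows (a sketch; the 'replacing' vdev reuses the mirror
 * ops, as noted above):
 *
 *	    parent                        parent
 *	      |                             |
 *	      A            ==>          replacing
 *	                                 /      \
 *	                                A        B
 *
 * where A is detached automatically once B's resilver completes.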
1526789Sahrens */ 1527789Sahrens int 15281544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1529789Sahrens { 1530789Sahrens uint64_t txg, open_txg; 1531789Sahrens int error; 1532789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1533789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1534*2082Seschrock vdev_ops_t *pvops; 1535789Sahrens 1536789Sahrens txg = spa_vdev_enter(spa); 1537789Sahrens 15381544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1539789Sahrens 1540789Sahrens if (oldvd == NULL) 1541789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1542789Sahrens 15431585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 15441585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 15451585Sbonwick 1546789Sahrens pvd = oldvd->vdev_parent; 1547789Sahrens 1548*2082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 1549*2082Seschrock VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1550789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1551789Sahrens 1552789Sahrens newvd = newrootvd->vdev_child[0]; 1553789Sahrens 1554789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1555789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1556789Sahrens 1557*2082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1558789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1559789Sahrens 1560*2082Seschrock if (!replacing) { 1561*2082Seschrock /* 1562*2082Seschrock * For attach, the only allowable parent is a mirror or the root 1563*2082Seschrock * vdev. 1564*2082Seschrock */ 1565*2082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 1566*2082Seschrock pvd->vdev_ops != &vdev_root_ops) 1567*2082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1568*2082Seschrock 1569*2082Seschrock pvops = &vdev_mirror_ops; 1570*2082Seschrock } else { 1571*2082Seschrock /* 1572*2082Seschrock * Active hot spares can only be replaced by inactive hot 1573*2082Seschrock * spares. 1574*2082Seschrock */ 1575*2082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 1576*2082Seschrock pvd->vdev_child[1] == oldvd && 1577*2082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 1578*2082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1579*2082Seschrock 1580*2082Seschrock /* 1581*2082Seschrock * If the source is a hot spare, and the parent isn't already a 1582*2082Seschrock * spare, then we want to create a new hot spare. Otherwise, we 1583*2082Seschrock * want to create a replacing vdev. 1584*2082Seschrock */ 1585*2082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 1586*2082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 1587*2082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 1588*2082Seschrock newvd->vdev_isspare) 1589*2082Seschrock pvops = &vdev_spare_ops; 1590*2082Seschrock else 1591*2082Seschrock pvops = &vdev_replacing_ops; 1592*2082Seschrock } 1593*2082Seschrock 15941175Slling /* 15951175Slling * Compare the new device size with the replaceable/attachable 15961175Slling * device size. 15971175Slling */ 15981175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1599789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1600789Sahrens 16011732Sbonwick /* 16021732Sbonwick * The new device cannot have a higher alignment requirement 16031732Sbonwick * than the top-level vdev. 
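 * For example, a device that requires 4K-aligned I/O (ashift 12)
 * cannot be attached to a top-level vdev made of 512-byte-sector
 * devices (ashift 9); the check below fails such an attach with EDOM.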
16041732Sbonwick */ 16051732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1606789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1607789Sahrens 1608789Sahrens /* 1609789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1610789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1611789Sahrens */ 1612789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1613789Sahrens spa_strfree(oldvd->vdev_path); 1614789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1615789Sahrens KM_SLEEP); 1616789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1617789Sahrens newvd->vdev_path, "old"); 1618789Sahrens if (oldvd->vdev_devid != NULL) { 1619789Sahrens spa_strfree(oldvd->vdev_devid); 1620789Sahrens oldvd->vdev_devid = NULL; 1621789Sahrens } 1622789Sahrens } 1623789Sahrens 1624789Sahrens /* 1625*2082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 1626*2082Seschrock * mirror/replacing/spare vdev above oldvd. 1627789Sahrens */ 1628789Sahrens if (pvd->vdev_ops != pvops) 1629789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1630789Sahrens 1631789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1632789Sahrens ASSERT(pvd->vdev_ops == pvops); 1633789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1634789Sahrens 1635789Sahrens /* 1636789Sahrens * Extract the new device from its root and add it to pvd. 1637789Sahrens */ 1638789Sahrens vdev_remove_child(newrootvd, newvd); 1639789Sahrens newvd->vdev_id = pvd->vdev_children; 1640789Sahrens vdev_add_child(pvd, newvd); 1641789Sahrens 16421544Seschrock /* 16431544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 16441544Seschrock * the addition of newvd may have decreased our parent's asize. 16451544Seschrock */ 16461544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 16471544Seschrock 1648789Sahrens tvd = newvd->vdev_top; 1649789Sahrens ASSERT(pvd->vdev_top == tvd); 1650789Sahrens ASSERT(tvd->vdev_parent == rvd); 1651789Sahrens 1652789Sahrens vdev_config_dirty(tvd); 1653789Sahrens 1654789Sahrens /* 1655789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1656789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1657789Sahrens */ 1658789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1659789Sahrens 1660789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1661789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1662789Sahrens open_txg - TXG_INITIAL + 1); 1663789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1664789Sahrens 16651544Seschrock dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 16661544Seschrock 1667789Sahrens /* 1668789Sahrens * Mark newvd's DTL dirty in this txg. 1669789Sahrens */ 16701732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1671789Sahrens 1672789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1673789Sahrens 1674789Sahrens /* 1675789Sahrens * Kick off a resilver to update newvd. 1676789Sahrens */ 1677789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1678789Sahrens 1679789Sahrens return (0); 1680789Sahrens } 1681789Sahrens 1682789Sahrens /* 1683789Sahrens * Detach a device from a mirror or replacing vdev. 1684789Sahrens * If 'replace_done' is specified, only detach if the parent 1685789Sahrens * is a replacing vdev. 
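 * In that case only the original disk (child 0) of the replacing vdev,
 * or either disk of a 'spare' vdev, may be detached.  The replace-done
 * path below passes replace_done == B_TRUE; an explicit 'zpool detach'
 * ultimately arrives here with B_FALSE.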
1686789Sahrens */ 1687789Sahrens int 16881544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1689789Sahrens { 1690789Sahrens uint64_t txg; 1691789Sahrens int c, t, error; 1692789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1693789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1694*2082Seschrock boolean_t unspare = B_FALSE; 1695*2082Seschrock uint64_t unspare_guid; 1696789Sahrens 1697789Sahrens txg = spa_vdev_enter(spa); 1698789Sahrens 16991544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1700789Sahrens 1701789Sahrens if (vd == NULL) 1702789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1703789Sahrens 17041585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 17051585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 17061585Sbonwick 1707789Sahrens pvd = vd->vdev_parent; 1708789Sahrens 1709789Sahrens /* 1710789Sahrens * If replace_done is specified, only remove this device if it's 1711*2082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 1712*2082Seschrock * disk can be removed. 1713789Sahrens */ 1714*2082Seschrock if (replace_done) { 1715*2082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 1716*2082Seschrock if (vd->vdev_id != 0) 1717*2082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1718*2082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 1719*2082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1720*2082Seschrock } 1721*2082Seschrock } 1722*2082Seschrock 1723*2082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 1724*2082Seschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1725789Sahrens 1726789Sahrens /* 1727*2082Seschrock * Only mirror, replacing, and spare vdevs support detach. 1728789Sahrens */ 1729789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1730*2082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 1731*2082Seschrock pvd->vdev_ops != &vdev_spare_ops) 1732789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1733789Sahrens 1734789Sahrens /* 1735789Sahrens * If there's only one replica, you can't detach it. 1736789Sahrens */ 1737789Sahrens if (pvd->vdev_children <= 1) 1738789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1739789Sahrens 1740789Sahrens /* 1741789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1742789Sahrens * valid copy of the data, which means we cannot safely detach it. 1743789Sahrens * 1744789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1745789Sahrens * precise DTL check. 1746789Sahrens */ 1747789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1748789Sahrens uint64_t dirty; 1749789Sahrens 1750789Sahrens cvd = pvd->vdev_child[c]; 1751789Sahrens if (cvd == vd) 1752789Sahrens continue; 1753789Sahrens if (vdev_is_dead(cvd)) 1754789Sahrens continue; 1755789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1756789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1757789Sahrens cvd->vdev_dtl_scrub.sm_space; 1758789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1759789Sahrens if (!dirty) 1760789Sahrens break; 1761789Sahrens } 1762*2082Seschrock 1763*2082Seschrock /* 1764*2082Seschrock * If we are a replacing or spare vdev, then we can always detach the 1765*2082Seschrock * latter child, as that is how one cancels the operation. 
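 * For example, detaching the new half of an in-progress replacement
 * cancels the replacement and keeps the original disk, even though
 * the new half's DTL is still non-empty at that point.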
1766*2082Seschrock */ 1767*2082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 1768*2082Seschrock c == pvd->vdev_children) 1769789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1770789Sahrens 1771789Sahrens /* 1772*2082Seschrock * If we are detaching the original disk from a spare, then it implies 1773*2082Seschrock * that the spare should become a real disk, and be removed from the 1774*2082Seschrock * active spare list for the pool. 1775*2082Seschrock */ 1776*2082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 1777*2082Seschrock vd->vdev_id == 0) 1778*2082Seschrock unspare = B_TRUE; 1779*2082Seschrock 1780*2082Seschrock /* 1781789Sahrens * Erase the disk labels so the disk can be used for other things. 1782789Sahrens * This must be done after all other error cases are handled, 1783789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1784789Sahrens * But if we can't do it, don't treat the error as fatal -- 1785789Sahrens * it may be that the unwritability of the disk is the reason 1786789Sahrens * it's being detached! 1787789Sahrens */ 1788*2082Seschrock error = vdev_label_init(vd, 0, B_FALSE); 1789789Sahrens if (error) 1790789Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1791789Sahrens 1792789Sahrens /* 1793789Sahrens * Remove vd from its parent and compact the parent's children. 1794789Sahrens */ 1795789Sahrens vdev_remove_child(pvd, vd); 1796789Sahrens vdev_compact_children(pvd); 1797789Sahrens 1798789Sahrens /* 1799789Sahrens * Remember one of the remaining children so we can get tvd below. 1800789Sahrens */ 1801789Sahrens cvd = pvd->vdev_child[0]; 1802789Sahrens 1803789Sahrens /* 1804*2082Seschrock * If we need to remove the remaining child from the list of hot spares, 1805*2082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 1806*2082Seschrock * must do this before vdev_remove_parent(), because that can change the 1807*2082Seschrock * GUID if it creates a new toplevel GUID. 1808*2082Seschrock */ 1809*2082Seschrock if (unspare) { 1810*2082Seschrock ASSERT(cvd->vdev_isspare); 1811*2082Seschrock spa_spare_remove(cvd->vdev_guid); 1812*2082Seschrock cvd->vdev_isspare = B_FALSE; 1813*2082Seschrock unspare_guid = cvd->vdev_guid; 1814*2082Seschrock } 1815*2082Seschrock 1816*2082Seschrock /* 1817789Sahrens * If the parent mirror/replacing vdev only has one child, 1818789Sahrens * the parent is no longer needed. Remove it from the tree. 1819789Sahrens */ 1820789Sahrens if (pvd->vdev_children == 1) 1821789Sahrens vdev_remove_parent(cvd); 1822789Sahrens 1823789Sahrens /* 1824789Sahrens * We don't set tvd until now because the parent we just removed 1825789Sahrens * may have been the previous top-level vdev. 1826789Sahrens */ 1827789Sahrens tvd = cvd->vdev_top; 1828789Sahrens ASSERT(tvd->vdev_parent == rvd); 1829789Sahrens 1830789Sahrens /* 1831789Sahrens * Reopen this top-level vdev to reassess health after detach. 1832789Sahrens */ 18331544Seschrock vdev_reopen(tvd); 1834789Sahrens 1835789Sahrens /* 1836789Sahrens * If the device we just detached was smaller than the others, 18371732Sbonwick * it may be possible to add metaslabs (i.e. grow the pool). 18381732Sbonwick * vdev_metaslab_init() can't fail because the existing metaslabs 18391732Sbonwick * are already in core, so there's nothing to read from disk. 
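 * For example, after detaching a 100GB disk from a mirror whose
 * surviving half is 200GB, the top-level vdev is no longer limited by
 * the smaller disk, so vdev_metaslab_init() can append metaslabs that
 * cover the newly usable range.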
1840789Sahrens */ 18411732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1842789Sahrens 1843789Sahrens vdev_config_dirty(tvd); 1844789Sahrens 1845789Sahrens /* 1846789Sahrens * Mark vd's DTL as dirty in this txg. 1847789Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1848789Sahrens * and free vd's DTL object in syncing context. 1849789Sahrens * But first make sure we're not on any *other* txg's DTL list, 1850789Sahrens * to prevent vd from being accessed after it's freed. 1851789Sahrens */ 1852789Sahrens for (t = 0; t < TXG_SIZE; t++) 1853789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 18541732Sbonwick vd->vdev_detached = B_TRUE; 18551732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 1856789Sahrens 18571544Seschrock dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1858789Sahrens 1859*2082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 1860*2082Seschrock 1861*2082Seschrock /* 1862*2082Seschrock * If we are supposed to remove the given vdev from the list of spares, 1863*2082Seschrock * iterate over all pools in the system and replace it if it's present. 1864*2082Seschrock */ 1865*2082Seschrock if (unspare) { 1866*2082Seschrock spa = NULL; 1867*2082Seschrock mutex_enter(&spa_namespace_lock); 1868*2082Seschrock while ((spa = spa_next(spa)) != NULL) { 1869*2082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 1870*2082Seschrock continue; 1871*2082Seschrock 1872*2082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 1873*2082Seschrock } 1874*2082Seschrock mutex_exit(&spa_namespace_lock); 1875*2082Seschrock } 1876*2082Seschrock 1877*2082Seschrock return (error); 1878*2082Seschrock } 1879*2082Seschrock 1880*2082Seschrock /* 1881*2082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 1882*2082Seschrock * spares. 1883*2082Seschrock */ 1884*2082Seschrock int 1885*2082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 1886*2082Seschrock { 1887*2082Seschrock vdev_t *vd; 1888*2082Seschrock nvlist_t **spares, *nv, **newspares; 1889*2082Seschrock uint_t i, j, nspares; 1890*2082Seschrock int ret = 0; 1891*2082Seschrock 1892*2082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 1893*2082Seschrock 1894*2082Seschrock vd = spa_lookup_by_guid(spa, guid); 1895*2082Seschrock 1896*2082Seschrock nv = NULL; 1897*2082Seschrock if (spa->spa_spares != NULL && 1898*2082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 1899*2082Seschrock &spares, &nspares) == 0) { 1900*2082Seschrock for (i = 0; i < nspares; i++) { 1901*2082Seschrock uint64_t theguid; 1902*2082Seschrock 1903*2082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 1904*2082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 1905*2082Seschrock if (theguid == guid) { 1906*2082Seschrock nv = spares[i]; 1907*2082Seschrock break; 1908*2082Seschrock } 1909*2082Seschrock } 1910*2082Seschrock } 1911*2082Seschrock 1912*2082Seschrock /* 1913*2082Seschrock * We only support removing a hot spare, and only if it's not currently 1914*2082Seschrock * in use in this pool. 
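 *
 * Concretely, the (nv, vd) pair computed above encodes four cases:
 *
 *	nv == NULL, vd == NULL:  no such guid anywhere      -> ENOENT
 *	nv == NULL, vd != NULL:  a normal (non-spare) vdev  -> ENOTSUP
 *	nv != NULL, vd != NULL:  a spare currently in use   -> EBUSY,
 *	                         unless the detach path set 'unspare'
 *	nv != NULL, vd == NULL:  an inactive spare          -> removed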
1915*2082Seschrock 	 */
1916*2082Seschrock 	if (nv == NULL && vd == NULL) {
1917*2082Seschrock 		ret = ENOENT;
1918*2082Seschrock 		goto out;
1919*2082Seschrock 	}
1920*2082Seschrock 
1921*2082Seschrock 	if (nv == NULL && vd != NULL) {
1922*2082Seschrock 		ret = ENOTSUP;
1923*2082Seschrock 		goto out;
1924*2082Seschrock 	}
1925*2082Seschrock 
1926*2082Seschrock 	if (!unspare && nv != NULL && vd != NULL) {
1927*2082Seschrock 		ret = EBUSY;
1928*2082Seschrock 		goto out;
1929*2082Seschrock 	}
1930*2082Seschrock 
1931*2082Seschrock 	if (nspares == 1) {
1932*2082Seschrock 		newspares = NULL;
1933*2082Seschrock 	} else {
1934*2082Seschrock 		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
1935*2082Seschrock 		    KM_SLEEP);
1936*2082Seschrock 		for (i = 0, j = 0; i < nspares; i++) {
1937*2082Seschrock 			if (spares[i] != nv)
1938*2082Seschrock 				VERIFY(nvlist_dup(spares[i],
1939*2082Seschrock 				    &newspares[j++], KM_SLEEP) == 0);
1940*2082Seschrock 		}
1941*2082Seschrock 	}
1942*2082Seschrock 
1943*2082Seschrock 	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
1944*2082Seschrock 	    DATA_TYPE_NVLIST_ARRAY) == 0);
1945*2082Seschrock 	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
1946*2082Seschrock 	    newspares, nspares - 1) == 0);
1947*2082Seschrock 	for (i = 0; i < nspares - 1; i++)
1948*2082Seschrock 		nvlist_free(newspares[i]);
1949*2082Seschrock 	kmem_free(newspares, (nspares - 1) * sizeof (void *));
1950*2082Seschrock 	spa_load_spares(spa);
1951*2082Seschrock 	spa->spa_sync_spares = B_TRUE;
1952*2082Seschrock 
1953*2082Seschrock out:
1954*2082Seschrock 	spa_config_exit(spa, FTAG);
1955*2082Seschrock 
1956*2082Seschrock 	return (ret);
1957789Sahrens }
1958789Sahrens 
1959789Sahrens /*
19601544Seschrock  * Find any device that's done replacing, so we can detach it.
1961789Sahrens  */
19621544Seschrock static vdev_t *
19631544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd)
1964789Sahrens {
19651544Seschrock 	vdev_t *newvd, *oldvd;
1966789Sahrens 	int c;
1967789Sahrens 
19681544Seschrock 	for (c = 0; c < vd->vdev_children; c++) {
19691544Seschrock 		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
19701544Seschrock 		if (oldvd != NULL)
19711544Seschrock 			return (oldvd);
19721544Seschrock 	}
1973789Sahrens 
1974789Sahrens 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
19751544Seschrock 		oldvd = vd->vdev_child[0];
19761544Seschrock 		newvd = vd->vdev_child[1];
1977789Sahrens 
19781544Seschrock 		mutex_enter(&newvd->vdev_dtl_lock);
19791544Seschrock 		if (newvd->vdev_dtl_map.sm_space == 0 &&
19801544Seschrock 		    newvd->vdev_dtl_scrub.sm_space == 0) {
19811544Seschrock 			mutex_exit(&newvd->vdev_dtl_lock);
19821544Seschrock 			return (oldvd);
19831544Seschrock 		}
19841544Seschrock 		mutex_exit(&newvd->vdev_dtl_lock);
19851544Seschrock 	}
1986789Sahrens 
19871544Seschrock 	return (NULL);
1988789Sahrens }
1989789Sahrens 
19901544Seschrock static void
1991789Sahrens spa_vdev_replace_done(spa_t *spa)
1992789Sahrens {
19931544Seschrock 	vdev_t *vd;
1994*2082Seschrock 	vdev_t *pvd;
19951544Seschrock 	uint64_t guid;
1996*2082Seschrock 	uint64_t pguid = 0;
1997789Sahrens 
19981544Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
1999789Sahrens 
20001544Seschrock 	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
20011544Seschrock 		guid = vd->vdev_guid;
2002*2082Seschrock 		/*
2003*2082Seschrock 		 * If we have just finished replacing a hot spared device, then
2004*2082Seschrock 		 * we need to detach the spare vdev's other child (the original
2005*2082Seschrock 		 * hot spare) as well.
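		 * The tree at that point looks like this (a sketch; vd is the
		 * device the hunt above returned, and the child indices match
		 * the checks below):
		 *
		 *	spare
		 *	  replacing          <- pvd, child 0 of the spare
		 *	    olddisk          <- vd, detached first via guid
		 *	    newdisk          <- its resilver just completed
		 *	  sparedisk          <- child 1, detached via pguid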
2006*2082Seschrock */ 2007*2082Seschrock pvd = vd->vdev_parent; 2008*2082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 2009*2082Seschrock pvd->vdev_id == 0) { 2010*2082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 2011*2082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 2012*2082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 2013*2082Seschrock } 20141544Seschrock spa_config_exit(spa, FTAG); 20151544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 20161544Seschrock return; 2017*2082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 2018*2082Seschrock return; 20191544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2020789Sahrens } 2021789Sahrens 20221544Seschrock spa_config_exit(spa, FTAG); 2023789Sahrens } 2024789Sahrens 2025789Sahrens /* 20261354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 20271354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 20281354Seschrock */ 20291354Seschrock int 20301354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 20311354Seschrock { 20321354Seschrock vdev_t *rvd, *vd; 20331354Seschrock uint64_t txg; 20341354Seschrock 20351354Seschrock rvd = spa->spa_root_vdev; 20361354Seschrock 20371354Seschrock txg = spa_vdev_enter(spa); 20381354Seschrock 2039*2082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 2040*2082Seschrock /* 2041*2082Seschrock * Determine if this is a reference to a hot spare. In that 2042*2082Seschrock * case, update the path as stored in the spare list. 2043*2082Seschrock */ 2044*2082Seschrock nvlist_t **spares; 2045*2082Seschrock uint_t i, nspares; 2046*2082Seschrock if (spa->spa_sparelist != NULL) { 2047*2082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 2048*2082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2049*2082Seschrock for (i = 0; i < nspares; i++) { 2050*2082Seschrock uint64_t theguid; 2051*2082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 2052*2082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 2053*2082Seschrock if (theguid == guid) 2054*2082Seschrock break; 2055*2082Seschrock } 2056*2082Seschrock 2057*2082Seschrock if (i == nspares) 2058*2082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2059*2082Seschrock 2060*2082Seschrock VERIFY(nvlist_add_string(spares[i], 2061*2082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 2062*2082Seschrock spa_load_spares(spa); 2063*2082Seschrock spa->spa_sync_spares = B_TRUE; 2064*2082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 2065*2082Seschrock } else { 2066*2082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 2067*2082Seschrock } 2068*2082Seschrock } 20691354Seschrock 20701585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 20711585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 20721585Sbonwick 20731354Seschrock spa_strfree(vd->vdev_path); 20741354Seschrock vd->vdev_path = spa_strdup(newpath); 20751354Seschrock 20761354Seschrock vdev_config_dirty(vd->vdev_top); 20771354Seschrock 20781354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 20791354Seschrock } 20801354Seschrock 20811354Seschrock /* 2082789Sahrens * ========================================================================== 2083789Sahrens * SPA Scrubbing 2084789Sahrens * ========================================================================== 2085789Sahrens */ 2086789Sahrens 20871544Seschrock void 20881544Seschrock spa_scrub_throttle(spa_t *spa, int direction) 20891544Seschrock { 20901544Seschrock 
mutex_enter(&spa->spa_scrub_lock); 20911544Seschrock spa->spa_scrub_throttled += direction; 20921544Seschrock ASSERT(spa->spa_scrub_throttled >= 0); 20931544Seschrock if (spa->spa_scrub_throttled == 0) 20941544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 20951544Seschrock mutex_exit(&spa->spa_scrub_lock); 20961544Seschrock } 2097789Sahrens 2098789Sahrens static void 2099789Sahrens spa_scrub_io_done(zio_t *zio) 2100789Sahrens { 2101789Sahrens spa_t *spa = zio->io_spa; 2102789Sahrens 2103789Sahrens zio_buf_free(zio->io_data, zio->io_size); 2104789Sahrens 2105789Sahrens mutex_enter(&spa->spa_scrub_lock); 21061544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 21071775Sbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2108789Sahrens spa->spa_scrub_errors++; 2109789Sahrens mutex_enter(&vd->vdev_stat_lock); 2110789Sahrens vd->vdev_stat.vs_scrub_errors++; 2111789Sahrens mutex_exit(&vd->vdev_stat_lock); 2112789Sahrens } 21131544Seschrock if (--spa->spa_scrub_inflight == 0) { 21141544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 21151544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 21161544Seschrock } 21171544Seschrock mutex_exit(&spa->spa_scrub_lock); 2118789Sahrens } 2119789Sahrens 2120789Sahrens static void 21211544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 21221544Seschrock zbookmark_t *zb) 2123789Sahrens { 2124789Sahrens size_t size = BP_GET_LSIZE(bp); 2125789Sahrens void *data = zio_buf_alloc(size); 2126789Sahrens 2127789Sahrens mutex_enter(&spa->spa_scrub_lock); 2128789Sahrens spa->spa_scrub_inflight++; 2129789Sahrens mutex_exit(&spa->spa_scrub_lock); 2130789Sahrens 21311544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 21321544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 21331544Seschrock 21341807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 21351544Seschrock 2136789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 21371544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2138789Sahrens } 2139789Sahrens 2140789Sahrens /* ARGSUSED */ 2141789Sahrens static int 2142789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2143789Sahrens { 2144789Sahrens blkptr_t *bp = &bc->bc_blkptr; 21451775Sbillm vdev_t *vd = spa->spa_root_vdev; 21461775Sbillm dva_t *dva = bp->blk_dva; 21471775Sbillm int needs_resilver = B_FALSE; 21481775Sbillm int d; 2149789Sahrens 21501775Sbillm if (bc->bc_errno) { 2151789Sahrens /* 2152789Sahrens * We can't scrub this block, but we can continue to scrub 2153789Sahrens * the rest of the pool. Note the error and move along. 2154789Sahrens */ 2155789Sahrens mutex_enter(&spa->spa_scrub_lock); 2156789Sahrens spa->spa_scrub_errors++; 2157789Sahrens mutex_exit(&spa->spa_scrub_lock); 2158789Sahrens 21591775Sbillm mutex_enter(&vd->vdev_stat_lock); 21601775Sbillm vd->vdev_stat.vs_scrub_errors++; 21611775Sbillm mutex_exit(&vd->vdev_stat_lock); 2162789Sahrens 2163789Sahrens return (ERESTART); 2164789Sahrens } 2165789Sahrens 2166789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2167789Sahrens 21681775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 21691775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 21701775Sbillm 21711775Sbillm ASSERT(vd != NULL); 21721775Sbillm 21731775Sbillm /* 21741775Sbillm * Keep track of how much data we've examined so that 21751775Sbillm * zpool(1M) status can make useful progress reports. 
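 * Each DVA's allocated size is charged to the vdev it lives on, so a
 * block with two DVAs (e.g. ditto copies of metadata) counts toward
 * the examined totals of both vdevs.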
21761775Sbillm */ 21771775Sbillm mutex_enter(&vd->vdev_stat_lock); 21781775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 21791775Sbillm mutex_exit(&vd->vdev_stat_lock); 2180789Sahrens 21811775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 21821775Sbillm if (DVA_GET_GANG(&dva[d])) { 21831775Sbillm /* 21841775Sbillm * Gang members may be spread across multiple 21851775Sbillm * vdevs, so the best we can do is look at the 21861775Sbillm * pool-wide DTL. 21871775Sbillm * XXX -- it would be better to change our 21881775Sbillm * allocation policy to ensure that this can't 21891775Sbillm * happen. 21901775Sbillm */ 21911775Sbillm vd = spa->spa_root_vdev; 21921775Sbillm } 21931775Sbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 21941775Sbillm bp->blk_birth, 1)) 21951775Sbillm needs_resilver = B_TRUE; 2196789Sahrens } 21971775Sbillm } 21981775Sbillm 21991775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2200789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 22011544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 22021775Sbillm else if (needs_resilver) 22031775Sbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 22041775Sbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2205789Sahrens 2206789Sahrens return (0); 2207789Sahrens } 2208789Sahrens 2209789Sahrens static void 2210789Sahrens spa_scrub_thread(spa_t *spa) 2211789Sahrens { 2212789Sahrens callb_cpr_t cprinfo; 2213789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2214789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2215789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2216789Sahrens int error = 0; 2217789Sahrens boolean_t complete; 2218789Sahrens 2219789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2220789Sahrens 2221797Sbonwick /* 2222797Sbonwick * If we're restarting due to a snapshot create/delete, 2223797Sbonwick * wait for that to complete. 2224797Sbonwick */ 2225797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2226797Sbonwick 22271544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 22281544Seschrock scrub_type == POOL_SCRUB_RESILVER ? 
"resilver" : "scrub", 22291544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 22301544Seschrock 22311544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 22321544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2233789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2234789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 22351544Seschrock spa_config_exit(spa, FTAG); 2236789Sahrens 2237789Sahrens mutex_enter(&spa->spa_scrub_lock); 2238789Sahrens spa->spa_scrub_errors = 0; 2239789Sahrens spa->spa_scrub_active = 1; 22401544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 22411544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 2242789Sahrens 2243789Sahrens while (!spa->spa_scrub_stop) { 2244789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 22451544Seschrock while (spa->spa_scrub_suspended) { 2246789Sahrens spa->spa_scrub_active = 0; 2247789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2248789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2249789Sahrens spa->spa_scrub_active = 1; 2250789Sahrens } 2251789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2252789Sahrens 2253789Sahrens if (spa->spa_scrub_restart_txg != 0) 2254789Sahrens break; 2255789Sahrens 2256789Sahrens mutex_exit(&spa->spa_scrub_lock); 2257789Sahrens error = traverse_more(th); 2258789Sahrens mutex_enter(&spa->spa_scrub_lock); 2259789Sahrens if (error != EAGAIN) 2260789Sahrens break; 22611544Seschrock 22621544Seschrock while (spa->spa_scrub_throttled > 0) 22631544Seschrock cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2264789Sahrens } 2265789Sahrens 2266789Sahrens while (spa->spa_scrub_inflight) 2267789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2268789Sahrens 22691601Sbonwick spa->spa_scrub_active = 0; 22701601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 22711601Sbonwick 22721601Sbonwick mutex_exit(&spa->spa_scrub_lock); 22731601Sbonwick 22741601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 22751601Sbonwick 22761601Sbonwick mutex_enter(&spa->spa_scrub_lock); 22771601Sbonwick 22781601Sbonwick /* 22791601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 22801601Sbonwick * AND the spa config lock to synchronize with any config changes 22811601Sbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 22821601Sbonwick */ 2283789Sahrens if (spa->spa_scrub_restart_txg != 0) 2284789Sahrens error = ERESTART; 2285789Sahrens 22861544Seschrock if (spa->spa_scrub_stop) 22871544Seschrock error = EINTR; 22881544Seschrock 2289789Sahrens /* 22901544Seschrock * Even if there were uncorrectable errors, we consider the scrub 22911544Seschrock * completed. The downside is that if there is a transient error during 22921544Seschrock * a resilver, we won't resilver the data properly to the target. But 22931544Seschrock * if the damage is permanent (more likely) we will resilver forever, 22941544Seschrock * which isn't really acceptable. Since there is enough information for 22951544Seschrock * the user to know what has failed and why, this seems like a more 22961544Seschrock * tractable approach. 2297789Sahrens */ 22981544Seschrock complete = (error == 0); 2299789Sahrens 23001544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 23011544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2302789Sahrens spa->spa_scrub_maxtxg, complete ? 
"done" : "FAILED", 2303789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2304789Sahrens 2305789Sahrens mutex_exit(&spa->spa_scrub_lock); 2306789Sahrens 2307789Sahrens /* 2308789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2309789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2310789Sahrens */ 2311789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2312789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2313789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 23141544Seschrock spa_errlog_rotate(spa); 23151601Sbonwick 23161544Seschrock spa_config_exit(spa, FTAG); 2317789Sahrens 2318789Sahrens mutex_enter(&spa->spa_scrub_lock); 2319789Sahrens 23201544Seschrock /* 23211544Seschrock * We may have finished replacing a device. 23221544Seschrock * Let the async thread assess this and handle the detach. 23231544Seschrock */ 23241544Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2325789Sahrens 2326789Sahrens /* 2327789Sahrens * If we were told to restart, our final act is to start a new scrub. 2328789Sahrens */ 2329789Sahrens if (error == ERESTART) 23301544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 23311544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2332789Sahrens 23331544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 23341544Seschrock spa->spa_scrub_active = 0; 23351544Seschrock spa->spa_scrub_thread = NULL; 23361544Seschrock cv_broadcast(&spa->spa_scrub_cv); 2337789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2338789Sahrens thread_exit(); 2339789Sahrens } 2340789Sahrens 2341789Sahrens void 2342789Sahrens spa_scrub_suspend(spa_t *spa) 2343789Sahrens { 2344789Sahrens mutex_enter(&spa->spa_scrub_lock); 23451544Seschrock spa->spa_scrub_suspended++; 2346789Sahrens while (spa->spa_scrub_active) { 2347789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2348789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2349789Sahrens } 2350789Sahrens while (spa->spa_scrub_inflight) 2351789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2352789Sahrens mutex_exit(&spa->spa_scrub_lock); 2353789Sahrens } 2354789Sahrens 2355789Sahrens void 2356789Sahrens spa_scrub_resume(spa_t *spa) 2357789Sahrens { 2358789Sahrens mutex_enter(&spa->spa_scrub_lock); 23591544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 23601544Seschrock if (--spa->spa_scrub_suspended == 0) 2361789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2362789Sahrens mutex_exit(&spa->spa_scrub_lock); 2363789Sahrens } 2364789Sahrens 2365789Sahrens void 2366789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 2367789Sahrens { 2368789Sahrens /* 2369789Sahrens * Something happened (e.g. snapshot create/delete) that means 2370789Sahrens * we must restart any in-progress scrubs. The itinerary will 2371789Sahrens * fix this properly. 
2372789Sahrens */ 2373789Sahrens mutex_enter(&spa->spa_scrub_lock); 2374789Sahrens spa->spa_scrub_restart_txg = txg; 2375789Sahrens mutex_exit(&spa->spa_scrub_lock); 2376789Sahrens } 2377789Sahrens 23781544Seschrock int 23791544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2380789Sahrens { 2381789Sahrens space_seg_t *ss; 2382789Sahrens uint64_t mintxg, maxtxg; 2383789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2384789Sahrens 2385789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 2386789Sahrens return (ENOTSUP); 2387789Sahrens 23881544Seschrock mutex_enter(&spa->spa_scrub_lock); 23891544Seschrock 2390789Sahrens /* 2391789Sahrens * If there's a scrub or resilver already in progress, stop it. 2392789Sahrens */ 2393789Sahrens while (spa->spa_scrub_thread != NULL) { 2394789Sahrens /* 2395789Sahrens * Don't stop a resilver unless forced. 2396789Sahrens */ 23971544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 23981544Seschrock mutex_exit(&spa->spa_scrub_lock); 2399789Sahrens return (EBUSY); 24001544Seschrock } 2401789Sahrens spa->spa_scrub_stop = 1; 2402789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2403789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2404789Sahrens } 2405789Sahrens 2406789Sahrens /* 2407789Sahrens * Terminate the previous traverse. 2408789Sahrens */ 2409789Sahrens if (spa->spa_scrub_th != NULL) { 2410789Sahrens traverse_fini(spa->spa_scrub_th); 2411789Sahrens spa->spa_scrub_th = NULL; 2412789Sahrens } 2413789Sahrens 24141544Seschrock if (rvd == NULL) { 24151544Seschrock ASSERT(spa->spa_scrub_stop == 0); 24161544Seschrock ASSERT(spa->spa_scrub_type == type); 24171544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 24181544Seschrock mutex_exit(&spa->spa_scrub_lock); 24191544Seschrock return (0); 24201544Seschrock } 2421789Sahrens 2422789Sahrens mintxg = TXG_INITIAL - 1; 2423789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 2424789Sahrens 24251544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 2426789Sahrens 24271544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 24281544Seschrock /* 24291544Seschrock * The pool-wide DTL is empty. 24301732Sbonwick * If this is a resilver, there's nothing to do except 24311732Sbonwick * check whether any in-progress replacements have completed. 24321544Seschrock */ 24331732Sbonwick if (type == POOL_SCRUB_RESILVER) { 24341544Seschrock type = POOL_SCRUB_NONE; 24351732Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 24361732Sbonwick } 24371544Seschrock } else { 24381544Seschrock /* 24391544Seschrock * The pool-wide DTL is non-empty. 24401544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 24411544Seschrock */ 24421544Seschrock if (type == POOL_SCRUB_EVERYTHING) 24431544Seschrock type = POOL_SCRUB_RESILVER; 24441544Seschrock } 2445789Sahrens 24461544Seschrock if (type == POOL_SCRUB_RESILVER) { 2447789Sahrens /* 2448789Sahrens * Determine the resilvering boundaries. 2449789Sahrens * 2450789Sahrens * Note: (mintxg, maxtxg) is an open interval, 2451789Sahrens * i.e. mintxg and maxtxg themselves are not included. 2452789Sahrens * 2453789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2454789Sahrens * so we don't claim to resilver a txg that's still changing. 
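 *
 * For example, if the pool-wide DTL records missing txgs 50 through
 * 60 (held as the segment [50, 61)), then mintxg = 49, maxtxg is
 * MIN(61, spa_last_synced_txg() + 1), and the open interval (49, 61)
 * makes the traversal visit exactly txgs 50..60.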
2455789Sahrens */ 2456789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 24571544Seschrock mintxg = ss->ss_start - 1; 2458789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 24591544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 2460789Sahrens } 2461789Sahrens 24621544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 24631544Seschrock 24641544Seschrock spa->spa_scrub_stop = 0; 24651544Seschrock spa->spa_scrub_type = type; 24661544Seschrock spa->spa_scrub_restart_txg = 0; 24671544Seschrock 24681544Seschrock if (type != POOL_SCRUB_NONE) { 24691544Seschrock spa->spa_scrub_mintxg = mintxg; 2470789Sahrens spa->spa_scrub_maxtxg = maxtxg; 2471789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 24721635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 24731635Sbonwick ZIO_FLAG_CANFAIL); 2474789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2475789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 2476789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2477789Sahrens } 2478789Sahrens 24791544Seschrock mutex_exit(&spa->spa_scrub_lock); 24801544Seschrock 2481789Sahrens return (0); 2482789Sahrens } 2483789Sahrens 24841544Seschrock /* 24851544Seschrock * ========================================================================== 24861544Seschrock * SPA async task processing 24871544Seschrock * ========================================================================== 24881544Seschrock */ 24891544Seschrock 24901544Seschrock static void 24911544Seschrock spa_async_reopen(spa_t *spa) 2492789Sahrens { 24931544Seschrock vdev_t *rvd = spa->spa_root_vdev; 24941544Seschrock vdev_t *tvd; 24951544Seschrock int c; 24961544Seschrock 24971544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 24981544Seschrock 24991544Seschrock for (c = 0; c < rvd->vdev_children; c++) { 25001544Seschrock tvd = rvd->vdev_child[c]; 25011544Seschrock if (tvd->vdev_reopen_wanted) { 25021544Seschrock tvd->vdev_reopen_wanted = 0; 25031544Seschrock vdev_reopen(tvd); 25041544Seschrock } 25051544Seschrock } 2506789Sahrens 25071544Seschrock spa_config_exit(spa, FTAG); 25081544Seschrock } 25091544Seschrock 25101544Seschrock static void 25111544Seschrock spa_async_thread(spa_t *spa) 25121544Seschrock { 25131544Seschrock int tasks; 25141544Seschrock 25151544Seschrock ASSERT(spa->spa_sync_on); 2516789Sahrens 25171544Seschrock mutex_enter(&spa->spa_async_lock); 25181544Seschrock tasks = spa->spa_async_tasks; 25191544Seschrock spa->spa_async_tasks = 0; 25201544Seschrock mutex_exit(&spa->spa_async_lock); 25211544Seschrock 25221544Seschrock /* 25231635Sbonwick * See if the config needs to be updated. 25241635Sbonwick */ 25251635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 25261635Sbonwick mutex_enter(&spa_namespace_lock); 25271635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 25281635Sbonwick mutex_exit(&spa_namespace_lock); 25291635Sbonwick } 25301635Sbonwick 25311635Sbonwick /* 25321544Seschrock * See if any devices need to be reopened. 25331544Seschrock */ 25341544Seschrock if (tasks & SPA_ASYNC_REOPEN) 25351544Seschrock spa_async_reopen(spa); 25361544Seschrock 25371544Seschrock /* 25381544Seschrock * If any devices are done replacing, detach them. 25391544Seschrock */ 25401544Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 2541789Sahrens spa_vdev_replace_done(spa); 2542789Sahrens 25431544Seschrock /* 25441544Seschrock * Kick off a scrub. 
25451544Seschrock */ 25461544Seschrock if (tasks & SPA_ASYNC_SCRUB) 25471544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 25481544Seschrock 25491544Seschrock /* 25501544Seschrock * Kick off a resilver. 25511544Seschrock */ 25521544Seschrock if (tasks & SPA_ASYNC_RESILVER) 25531544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 25541544Seschrock 25551544Seschrock /* 25561544Seschrock * Let the world know that we're done. 25571544Seschrock */ 25581544Seschrock mutex_enter(&spa->spa_async_lock); 25591544Seschrock spa->spa_async_thread = NULL; 25601544Seschrock cv_broadcast(&spa->spa_async_cv); 25611544Seschrock mutex_exit(&spa->spa_async_lock); 25621544Seschrock thread_exit(); 25631544Seschrock } 25641544Seschrock 25651544Seschrock void 25661544Seschrock spa_async_suspend(spa_t *spa) 25671544Seschrock { 25681544Seschrock mutex_enter(&spa->spa_async_lock); 25691544Seschrock spa->spa_async_suspended++; 25701544Seschrock while (spa->spa_async_thread != NULL) 25711544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 25721544Seschrock mutex_exit(&spa->spa_async_lock); 25731544Seschrock } 25741544Seschrock 25751544Seschrock void 25761544Seschrock spa_async_resume(spa_t *spa) 25771544Seschrock { 25781544Seschrock mutex_enter(&spa->spa_async_lock); 25791544Seschrock ASSERT(spa->spa_async_suspended != 0); 25801544Seschrock spa->spa_async_suspended--; 25811544Seschrock mutex_exit(&spa->spa_async_lock); 25821544Seschrock } 25831544Seschrock 25841544Seschrock static void 25851544Seschrock spa_async_dispatch(spa_t *spa) 25861544Seschrock { 25871544Seschrock mutex_enter(&spa->spa_async_lock); 25881544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 25891635Sbonwick spa->spa_async_thread == NULL && 25901635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 25911544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 25921544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 25931544Seschrock mutex_exit(&spa->spa_async_lock); 25941544Seschrock } 25951544Seschrock 25961544Seschrock void 25971544Seschrock spa_async_request(spa_t *spa, int task) 25981544Seschrock { 25991544Seschrock mutex_enter(&spa->spa_async_lock); 26001544Seschrock spa->spa_async_tasks |= task; 26011544Seschrock mutex_exit(&spa->spa_async_lock); 2602789Sahrens } 2603789Sahrens 2604789Sahrens /* 2605789Sahrens * ========================================================================== 2606789Sahrens * SPA syncing routines 2607789Sahrens * ========================================================================== 2608789Sahrens */ 2609789Sahrens 2610789Sahrens static void 2611789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2612789Sahrens { 2613789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2614789Sahrens dmu_tx_t *tx; 2615789Sahrens blkptr_t blk; 2616789Sahrens uint64_t itor = 0; 2617789Sahrens zio_t *zio; 2618789Sahrens int error; 2619789Sahrens uint8_t c = 1; 2620789Sahrens 2621789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2622789Sahrens 2623789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2624789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2625789Sahrens 2626789Sahrens error = zio_wait(zio); 2627789Sahrens ASSERT3U(error, ==, 0); 2628789Sahrens 2629789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2630789Sahrens bplist_vacate(bpl, tx); 2631789Sahrens 2632789Sahrens /* 2633789Sahrens * Pre-dirty the first block so we sync to convergence faster. 
2634789Sahrens * (Usually only the first block is needed.) 2635789Sahrens */ 2636789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2637789Sahrens dmu_tx_commit(tx); 2638789Sahrens } 2639789Sahrens 2640789Sahrens static void 2641*2082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 2642*2082Seschrock { 2643*2082Seschrock char *packed = NULL; 2644*2082Seschrock size_t nvsize = 0; 2645*2082Seschrock dmu_buf_t *db; 2646*2082Seschrock 2647*2082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 2648*2082Seschrock 2649*2082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 2650*2082Seschrock 2651*2082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 2652*2082Seschrock KM_SLEEP) == 0); 2653*2082Seschrock 2654*2082Seschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 2655*2082Seschrock 2656*2082Seschrock kmem_free(packed, nvsize); 2657*2082Seschrock 2658*2082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 2659*2082Seschrock dmu_buf_will_dirty(db, tx); 2660*2082Seschrock *(uint64_t *)db->db_data = nvsize; 2661*2082Seschrock dmu_buf_rele(db, FTAG); 2662*2082Seschrock } 2663*2082Seschrock 2664*2082Seschrock static void 2665*2082Seschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 2666*2082Seschrock { 2667*2082Seschrock nvlist_t *nvroot; 2668*2082Seschrock nvlist_t **spares; 2669*2082Seschrock int i; 2670*2082Seschrock 2671*2082Seschrock if (!spa->spa_sync_spares) 2672*2082Seschrock return; 2673*2082Seschrock 2674*2082Seschrock /* 2675*2082Seschrock * Update the MOS nvlist describing the list of available spares. 2676*2082Seschrock * spa_validate_spares() will have already made sure this nvlist is 2677*2082Seschrock * valid and the vdevs are labelled appropriately. 
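 * The list lives in a single DMU_OT_PACKED_NVLIST object: the nvlist
 * is XDR-encoded by spa_sync_nvlist() above, which also stores the
 * packed size in the object's bonus buffer so the load path knows how
 * many bytes to read back.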
2678*2082Seschrock */ 2679*2082Seschrock if (spa->spa_spares_object == 0) { 2680*2082Seschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 2681*2082Seschrock DMU_OT_PACKED_NVLIST, 1 << 14, 2682*2082Seschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2683*2082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 2684*2082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 2685*2082Seschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 2686*2082Seschrock } 2687*2082Seschrock 2688*2082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2689*2082Seschrock if (spa->spa_nspares == 0) { 2690*2082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2691*2082Seschrock NULL, 0) == 0); 2692*2082Seschrock } else { 2693*2082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 2694*2082Seschrock KM_SLEEP); 2695*2082Seschrock for (i = 0; i < spa->spa_nspares; i++) 2696*2082Seschrock spares[i] = vdev_config_generate(spa, 2697*2082Seschrock spa->spa_spares[i], B_FALSE, B_TRUE); 2698*2082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2699*2082Seschrock spares, spa->spa_nspares) == 0); 2700*2082Seschrock for (i = 0; i < spa->spa_nspares; i++) 2701*2082Seschrock nvlist_free(spares[i]); 2702*2082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 2703*2082Seschrock } 2704*2082Seschrock 2705*2082Seschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 2706*2082Seschrock 2707*2082Seschrock spa->spa_sync_spares = B_FALSE; 2708*2082Seschrock } 2709*2082Seschrock 2710*2082Seschrock static void 2711789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2712789Sahrens { 2713789Sahrens nvlist_t *config; 2714789Sahrens 2715789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2716789Sahrens return; 2717789Sahrens 2718789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2719789Sahrens 27201635Sbonwick if (spa->spa_config_syncing) 27211635Sbonwick nvlist_free(spa->spa_config_syncing); 27221635Sbonwick spa->spa_config_syncing = config; 2723789Sahrens 2724*2082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2725789Sahrens } 2726789Sahrens 2727789Sahrens /* 2728789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2729789Sahrens * part of the process, so we iterate until it converges. 2730789Sahrens */ 2731789Sahrens void 2732789Sahrens spa_sync(spa_t *spa, uint64_t txg) 2733789Sahrens { 2734789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2735789Sahrens objset_t *mos = spa->spa_meta_objset; 2736789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 27371635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 2738789Sahrens vdev_t *vd; 2739789Sahrens dmu_tx_t *tx; 2740789Sahrens int dirty_vdevs; 2741789Sahrens 2742789Sahrens /* 2743789Sahrens * Lock out configuration changes. 2744789Sahrens */ 27451544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2746789Sahrens 2747789Sahrens spa->spa_syncing_txg = txg; 2748789Sahrens spa->spa_sync_pass = 0; 2749789Sahrens 27501544Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2751789Sahrens 2752*2082Seschrock tx = dmu_tx_create_assigned(dp, txg); 2753*2082Seschrock 2754*2082Seschrock /* 2755*2082Seschrock * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 2756*2082Seschrock * set spa_deflate if we have no raid-z vdevs. 
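 * (The test below relies on vdev_deflate_ratio: only raid-z top-level
 * vdevs should report a ratio other than SPA_MINBLOCKSIZE, since their
 * parity makes allocated size exceed logical size.)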
2757*2082Seschrock 	 */
2758*2082Seschrock 	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
2759*2082Seschrock 	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
2760*2082Seschrock 		int i;
2761*2082Seschrock 
2762*2082Seschrock 		for (i = 0; i < rvd->vdev_children; i++) {
2763*2082Seschrock 			vd = rvd->vdev_child[i];
2764*2082Seschrock 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
2765*2082Seschrock 				break;
2766*2082Seschrock 		}
2767*2082Seschrock 		if (i == rvd->vdev_children) {
2768*2082Seschrock 			spa->spa_deflate = TRUE;
2769*2082Seschrock 			VERIFY(0 == zap_add(spa->spa_meta_objset,
2770*2082Seschrock 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2771*2082Seschrock 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
2772*2082Seschrock 		}
2773*2082Seschrock 	}
2774*2082Seschrock 
2775789Sahrens 	/*
2776789Sahrens 	 * If anything has changed in this txg, push the deferred frees
2777789Sahrens 	 * from the previous txg.  If not, leave them alone so that we
2778789Sahrens 	 * don't generate work on an otherwise idle system.
2779789Sahrens 	 */
2780789Sahrens 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
2781789Sahrens 	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
2782789Sahrens 		spa_sync_deferred_frees(spa, txg);
2783789Sahrens 
2784789Sahrens 	/*
2785789Sahrens 	 * Iterate to convergence.
2786789Sahrens 	 */
2787789Sahrens 	do {
2788789Sahrens 		spa->spa_sync_pass++;
2789789Sahrens 
2790789Sahrens 		spa_sync_config_object(spa, tx);
2791*2082Seschrock 		spa_sync_spares(spa, tx);
27921544Seschrock 		spa_errlog_sync(spa, txg);
2793789Sahrens 		dsl_pool_sync(dp, txg);
2794789Sahrens 
2795789Sahrens 		dirty_vdevs = 0;
2796789Sahrens 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2797789Sahrens 			vdev_sync(vd, txg);
2798789Sahrens 			dirty_vdevs++;
2799789Sahrens 		}
2800789Sahrens 
2801789Sahrens 		bplist_sync(bpl, tx);
2802789Sahrens 	} while (dirty_vdevs);
2803789Sahrens 
2804789Sahrens 	bplist_close(bpl);
2805789Sahrens 
2806789Sahrens 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2807789Sahrens 
2808789Sahrens 	/*
2809789Sahrens 	 * Rewrite the vdev configuration (which includes the uberblock)
2810789Sahrens 	 * to commit the transaction group.
28111635Sbonwick 	 *
28121635Sbonwick 	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
28131635Sbonwick 	 * Otherwise, pick a random top-level vdev that's known to be
28141635Sbonwick 	 * visible in the config cache (see spa_vdev_add() for details).
28151635Sbonwick 	 * If the write fails, try the next vdev until we've tried them all.
2816789Sahrens 	 */
28171635Sbonwick 	if (!list_is_empty(&spa->spa_dirty_list)) {
28181635Sbonwick 		VERIFY(vdev_config_sync(rvd, txg) == 0);
28191635Sbonwick 	} else {
28201635Sbonwick 		int children = rvd->vdev_children;
28211635Sbonwick 		int c0 = spa_get_random(children);
28221635Sbonwick 		int c;
28231635Sbonwick 
28241635Sbonwick 		for (c = 0; c < children; c++) {
28251635Sbonwick 			vd = rvd->vdev_child[(c0 + c) % children];
28261635Sbonwick 			if (vd->vdev_ms_array == 0)
28271635Sbonwick 				continue;
28281635Sbonwick 			if (vdev_config_sync(vd, txg) == 0)
28291635Sbonwick 				break;
28301635Sbonwick 		}
28311635Sbonwick 		if (c == children)
28321635Sbonwick 			VERIFY(vdev_config_sync(rvd, txg) == 0);
28331635Sbonwick 	}
28341635Sbonwick 
2835*2082Seschrock 	dmu_tx_commit(tx);
2836*2082Seschrock 
28371635Sbonwick 	/*
28381635Sbonwick 	 * Clear the dirty config list.
28391635Sbonwick */ 28401635Sbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 28411635Sbonwick vdev_config_clean(vd); 28421635Sbonwick 28431635Sbonwick /* 28441635Sbonwick * Now that the new config has synced transactionally, 28451635Sbonwick * let it become visible to the config cache. 28461635Sbonwick */ 28471635Sbonwick if (spa->spa_config_syncing != NULL) { 28481635Sbonwick spa_config_set(spa, spa->spa_config_syncing); 28491635Sbonwick spa->spa_config_txg = txg; 28501635Sbonwick spa->spa_config_syncing = NULL; 28511635Sbonwick } 2852789Sahrens 2853789Sahrens /* 2854789Sahrens * Make a stable copy of the fully synced uberblock. 2855789Sahrens * We use this as the root for pool traversals. 2856789Sahrens */ 2857789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 2858789Sahrens 2859789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 2860789Sahrens 2861789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 2862789Sahrens spa->spa_traverse_wanted = 0; 2863789Sahrens spa->spa_ubsync = spa->spa_uberblock; 2864789Sahrens rw_exit(&spa->spa_traverse_lock); 2865789Sahrens 2866789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 2867789Sahrens 2868789Sahrens /* 2869789Sahrens * Clean up the ZIL records for the synced txg. 2870789Sahrens */ 2871789Sahrens dsl_pool_zil_clean(dp); 2872789Sahrens 2873789Sahrens /* 2874789Sahrens * Update usable space statistics. 2875789Sahrens */ 2876789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 2877789Sahrens vdev_sync_done(vd, txg); 2878789Sahrens 2879789Sahrens /* 2880789Sahrens * It had better be the case that we didn't dirty anything 2881*2082Seschrock * since vdev_config_sync(). 2882789Sahrens */ 2883789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 2884789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 2885789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 2886789Sahrens ASSERT(bpl->bpl_queue == NULL); 2887789Sahrens 28881544Seschrock spa_config_exit(spa, FTAG); 28891544Seschrock 28901544Seschrock /* 28911544Seschrock * If any async tasks have been requested, kick them off. 28921544Seschrock */ 28931544Seschrock spa_async_dispatch(spa); 2894789Sahrens } 2895789Sahrens 2896789Sahrens /* 2897789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 2898789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 2899789Sahrens * sync. 2900789Sahrens */ 2901789Sahrens void 2902789Sahrens spa_sync_allpools(void) 2903789Sahrens { 2904789Sahrens spa_t *spa = NULL; 2905789Sahrens mutex_enter(&spa_namespace_lock); 2906789Sahrens while ((spa = spa_next(spa)) != NULL) { 2907789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 2908789Sahrens continue; 2909789Sahrens spa_open_ref(spa, FTAG); 2910789Sahrens mutex_exit(&spa_namespace_lock); 2911789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 2912789Sahrens mutex_enter(&spa_namespace_lock); 2913789Sahrens spa_close(spa, FTAG); 2914789Sahrens } 2915789Sahrens mutex_exit(&spa_namespace_lock); 2916789Sahrens } 2917789Sahrens 2918789Sahrens /* 2919789Sahrens * ========================================================================== 2920789Sahrens * Miscellaneous routines 2921789Sahrens * ========================================================================== 2922789Sahrens */ 2923789Sahrens 2924789Sahrens /* 2925789Sahrens * Remove all pools in the system. 
2926789Sahrens */ 2927789Sahrens void 2928789Sahrens spa_evict_all(void) 2929789Sahrens { 2930789Sahrens spa_t *spa; 2931789Sahrens 2932789Sahrens /* 2933789Sahrens * Remove all cached state. All pools should be closed now, 2934789Sahrens * so every spa in the AVL tree should be unreferenced. 2935789Sahrens */ 2936789Sahrens mutex_enter(&spa_namespace_lock); 2937789Sahrens while ((spa = spa_next(NULL)) != NULL) { 2938789Sahrens /* 29391544Seschrock * Stop async tasks. The async thread may need to detach 29401544Seschrock * a device that's been replaced, which requires grabbing 29411544Seschrock * spa_namespace_lock, so we must drop it here. 2942789Sahrens */ 2943789Sahrens spa_open_ref(spa, FTAG); 2944789Sahrens mutex_exit(&spa_namespace_lock); 29451544Seschrock spa_async_suspend(spa); 2946789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 2947789Sahrens mutex_enter(&spa_namespace_lock); 2948789Sahrens spa_close(spa, FTAG); 2949789Sahrens 2950789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 2951789Sahrens spa_unload(spa); 2952789Sahrens spa_deactivate(spa); 2953789Sahrens } 2954789Sahrens spa_remove(spa); 2955789Sahrens } 2956789Sahrens mutex_exit(&spa_namespace_lock); 2957789Sahrens } 29581544Seschrock 29591544Seschrock vdev_t * 29601544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 29611544Seschrock { 29621544Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 29631544Seschrock } 29641760Seschrock 29651760Seschrock void 29661760Seschrock spa_upgrade(spa_t *spa) 29671760Seschrock { 29681760Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 29691760Seschrock 29701760Seschrock /* 29711760Seschrock * This should only be called for a non-faulted pool, and since a 29721760Seschrock * future version would result in an unopenable pool, this shouldn't be 29731760Seschrock * possible. 29741760Seschrock */ 29751760Seschrock ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 29761760Seschrock 29771760Seschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 29781760Seschrock vdev_config_dirty(spa->spa_root_vdev); 29791760Seschrock 29801760Seschrock spa_config_exit(spa, FTAG); 2981*2082Seschrock 2982*2082Seschrock txg_wait_synced(spa_get_dsl(spa), 0); 29831760Seschrock } 2984*2082Seschrock 2985*2082Seschrock boolean_t 2986*2082Seschrock spa_has_spare(spa_t *spa, uint64_t guid) 2987*2082Seschrock { 2988*2082Seschrock int i; 2989*2082Seschrock 2990*2082Seschrock for (i = 0; i < spa->spa_nspares; i++) 2991*2082Seschrock if (spa->spa_spares[i]->vdev_guid == guid) 2992*2082Seschrock return (B_TRUE); 2993*2082Seschrock 2994*2082Seschrock return (B_FALSE); 2995*2082Seschrock } 2996