/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */
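/*
 * Comparator for the per-pool error-log AVL trees.  bcmp() imposes an
 * arbitrary but consistent total order on zbookmark_t values, which is all
 * the AVL code needs to keep the entries unique and searchable.
 */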
static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}
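/*
 * For orientation, the lifecycle implemented by this file (an informal
 * sketch): spa_add() -> spa_activate() -> spa_load() or spa_create() ->
 * ... pool in use ... -> spa_unload() -> spa_deactivate() -> spa_remove().
 * spa_activate() and spa_deactivate() are deliberately symmetric: every
 * taskq, lock, list, and AVL tree set up above is torn down below.
 */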
/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
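/*
 * Note on the contract above: spa_config_parse() either succeeds and hands
 * back the fully constructed subtree in *vdp, or fails and leaves *vdp set
 * to NULL with every partially constructed vdev already freed.  Callers
 * therefore never need to clean up on error.
 */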
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}
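/*
 * A note on the teardown order above: async tasks are suspended before the
 * syncing thread is stopped so nothing can queue new work against a
 * half-torn-down pool, and the empty config_enter/config_exit pair acts
 * purely as a barrier for in-flight prefetch I/O.
 */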
/*
 * Load (or re-load) the current list of vdevs describing the active spares
 * for this pool.  When this is called, we have some form of basic information
 * in 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different
	 * vdev_t structures associated with it: one in the list of spares
	 * (used only for basic validation purposes) and one in the active
	 * vdev configuration (if it's spared in).  During this phase we open
	 * and validate each vdev on the spare list.  If the vdev also exists
	 * in the active configuration, then we also mark this vdev as an
	 * active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were
			 * successfully able to load the vdev.  Otherwise,
			 * importing a pool with a bad active spare would
			 * result in strange behavior, because multiple pools
			 * would think the spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra
			 * complexity it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}
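/*
 * Read a packed nvlist out of the MOS.  The bonus buffer of 'obj' holds the
 * packed size; the object body holds the packed bytes themselves.
 */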
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		error = EBADF;
		goto out;
	}

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}
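	/*
	 * A note on EBADF above: it is reserved to mean "the labels say this
	 * pool was exported or destroyed".  spa_open_common() keys off this
	 * error to purge stale entries from the config cache, and the out:
	 * path below deliberately skips the FMA ereport for it.
	 */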
	/*
	 * Find the best uberblock: the one with the highest txg, with ties
	 * broken by timestamp.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}
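	/*
	 * Note that the guid sum is only enforced once 'mosconfig' is set,
	 * i.e. once we are running with the trusted copy of the config from
	 * the MOS.  On the first (untrusted) pass the sums may legitimately
	 * disagree, and we rely on the reload below to recheck them.
	 */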
	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
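	/*
	 * The next few lookups follow the same pattern as the deflate bit
	 * above: ENOENT simply means the pool predates the feature and is
	 * tolerated, while any other failure means the MOS itself is damaged
	 * and the load is aborted with EIO.
	 */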
	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}
	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable
		 * yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
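/*
 * A minimal caller sketch (illustrative only; FTAG is the usual tag macro):
 *
 *	spa_t *spa;
 *
 *	if ((error = spa_open(name, &spa, FTAG)) != 0)
 *		return (error);
 *	... use the pool ...
 *	spa_close(spa, FTAG);
 *
 * The tag passed to spa_open() must match the one later given to
 * spa_close(), since it identifies the reference being released.
 */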
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again.  The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() fails with EBADF, it means that
			 * one of the vdevs has a label indicating the pool
			 * was exported or destroyed.  If this is the case,
			 * the config cache is out of sync and we should
			 * remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}
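/*
 * Fault-injection sessions bracket their work with the pair below
 * (illustrative):
 *
 *	if ((spa = spa_inject_addref(name)) == NULL)
 *		return (ENOENT);
 *	... inject faults ...
 *	spa_inject_delref(spa);
 *
 * While spa_inject_ref is nonzero, spa_export_common() refuses to export
 * or destroy the pool.
 */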
/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as active spares.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
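/*
 * spa_get_stats() returns the pool config decorated with extra runtime
 * state: the persistent error count (ZPOOL_CONFIG_ERRCOUNT) and, via
 * spa_add_spares() above, the current status of any hot spares.
 */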
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}
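/*
 * For reference, the nvroot layout that the creation/addition paths below
 * consume (an informal sketch, not an exhaustive schema):
 *
 *	nvroot
 *	    ZPOOL_CONFIG_CHILDREN	nvlist array of top-level vdevs
 *	    ZPOOL_CONFIG_SPARES		nvlist array of leaf spare vdevs
 *
 * Each spare entry is an ordinary leaf vdev description; a GUID is stamped
 * into it by vdev_label_init() during validation below.
 */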
/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;
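	/*
	 * Note: the uberblock is stamped txg - 1 above so that TXG_INITIAL
	 * itself becomes the pool's first unsynced txg; everything created
	 * below is assigned to, and synced out in, that txg.
	 */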
	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}
	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}
/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in
 * order to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"
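/*
 * The leading '$' guarantees the name can never collide with a real pool,
 * since user-visible pool names must begin with a letter.
 */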
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}
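/*
 * spa_export_common() below is shared by three entry points that differ only
 * in the final pool state they request:
 *
 *	spa_destroy()	-> POOL_STATE_DESTROYED
 *	spa_export()	-> POOL_STATE_EXPORTED
 *	spa_reset()	-> POOL_STATE_UNINITIALIZED (unload only)
 */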
/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_scrub_resume(spa);
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED) {
			spa_config_enter(spa, RW_WRITER, FTAG);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, FTAG);
		}
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		spa_remove(spa);
		spa_config_sync();
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing
 * it from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
}


/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */
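/*
 * The routines in this section share a locking idiom (sketched here from the
 * way it is used below, not a formal spec): spa_vdev_enter() serializes the
 * change and returns the txg in which it will commit, and spa_vdev_exit()
 * dirties the config, releases the locks, waits for that txg to sync, and
 * returns the supplied error.  As used here it also disposes of the
 * temporary parse tree 'vd' when one is passed in.
 */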
1487789Sahrens */ 1488789Sahrens int 1489789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1490789Sahrens { 1491789Sahrens uint64_t txg; 14921635Sbonwick int c, error; 1493789Sahrens vdev_t *rvd = spa->spa_root_vdev; 14941585Sbonwick vdev_t *vd, *tvd; 14952082Seschrock nvlist_t **spares; 14962082Seschrock uint_t i, nspares; 1497789Sahrens 1498789Sahrens txg = spa_vdev_enter(spa); 1499789Sahrens 15002082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 15012082Seschrock VDEV_ALLOC_ADD)) != 0) 15022082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 15032082Seschrock 15043377Seschrock spa->spa_pending_vdev = vd; 1505789Sahrens 15062082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 15072082Seschrock &spares, &nspares) != 0) 15082082Seschrock nspares = 0; 15092082Seschrock 15103377Seschrock if (vd->vdev_children == 0 && nspares == 0) { 15113377Seschrock spa->spa_pending_vdev = NULL; 15122082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 15133377Seschrock } 15142082Seschrock 15152082Seschrock if (vd->vdev_children != 0) { 15163377Seschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 15173377Seschrock spa->spa_pending_vdev = NULL; 15182082Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15192082Seschrock } 15202082Seschrock } 15212082Seschrock 15223377Seschrock /* 15233377Seschrock * We must validate the spares after checking the children. Otherwise, 15243377Seschrock * vdev_inuse() will blindly overwrite the spare. 15253377Seschrock */ 15263377Seschrock if ((error = spa_validate_spares(spa, nvroot, txg, 15273377Seschrock VDEV_ALLOC_ADD)) != 0) { 15283377Seschrock spa->spa_pending_vdev = NULL; 15293377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15303377Seschrock } 15313377Seschrock 15323377Seschrock spa->spa_pending_vdev = NULL; 15333377Seschrock 15343377Seschrock /* 15353377Seschrock * Transfer each new top-level vdev from vd to rvd. 
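 * Each child is re-parented under the root vdev, takes the next free
 * slot in rvd's child array as its new vdev_id, and is marked dirty so
 * the label sync picks it up.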
15363377Seschrock */ 15373377Seschrock for (c = 0; c < vd->vdev_children; c++) { 15383377Seschrock tvd = vd->vdev_child[c]; 15393377Seschrock vdev_remove_child(vd, tvd); 15403377Seschrock tvd->vdev_id = rvd->vdev_children; 15413377Seschrock vdev_add_child(rvd, tvd); 15423377Seschrock vdev_config_dirty(tvd); 15433377Seschrock } 15443377Seschrock 15452082Seschrock if (nspares != 0) { 15462082Seschrock if (spa->spa_sparelist != NULL) { 15472082Seschrock nvlist_t **oldspares; 15482082Seschrock uint_t oldnspares; 15492082Seschrock nvlist_t **newspares; 15502082Seschrock 15512082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 15522082Seschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 15532082Seschrock 15542082Seschrock newspares = kmem_alloc(sizeof (void *) * 15552082Seschrock (nspares + oldnspares), KM_SLEEP); 15562082Seschrock for (i = 0; i < oldnspares; i++) 15572082Seschrock VERIFY(nvlist_dup(oldspares[i], 15582082Seschrock &newspares[i], KM_SLEEP) == 0); 15592082Seschrock for (i = 0; i < nspares; i++) 15602082Seschrock VERIFY(nvlist_dup(spares[i], 15612082Seschrock &newspares[i + oldnspares], 15622082Seschrock KM_SLEEP) == 0); 15632082Seschrock 15642082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 15652082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 15662082Seschrock 15672082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 15682082Seschrock ZPOOL_CONFIG_SPARES, newspares, 15692082Seschrock nspares + oldnspares) == 0); 15702082Seschrock for (i = 0; i < oldnspares + nspares; i++) 15712082Seschrock nvlist_free(newspares[i]); 15722082Seschrock kmem_free(newspares, (oldnspares + nspares) * 15732082Seschrock sizeof (void *)); 15742082Seschrock } else { 15752082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 15762082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 15772082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 15782082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 15792082Seschrock } 15802082Seschrock 15812082Seschrock spa_load_spares(spa); 15822082Seschrock spa->spa_sync_spares = B_TRUE; 1583789Sahrens } 1584789Sahrens 1585789Sahrens /* 15861585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 15871585Sbonwick * If other threads start allocating from these vdevs before we 15881585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 15891585Sbonwick * fail to open the pool because there are DVAs that the config cache 15901585Sbonwick * can't translate. Therefore, we first add the vdevs without 15911585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 15921635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 15931585Sbonwick * 15941585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 15951585Sbonwick * if we lose power at any point in this sequence, the remaining 15961585Sbonwick * steps will be completed the next time we load the pool. 1597789Sahrens */ 15981635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 15991585Sbonwick 16001635Sbonwick mutex_enter(&spa_namespace_lock); 16011635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 16021635Sbonwick mutex_exit(&spa_namespace_lock); 1603789Sahrens 16041635Sbonwick return (0); 1605789Sahrens } 1606789Sahrens 1607789Sahrens /* 1608789Sahrens * Attach a device to a mirror. The arguments are the path to any device 1609789Sahrens * in the mirror, and the nvroot for the new device. 
If the path specifies
1610789Sahrens  * a device that is not mirrored, we automatically insert the mirror vdev.
1611789Sahrens  *
1612789Sahrens  * If 'replacing' is specified, the new device is intended to replace the
1613789Sahrens  * existing device; in this case the two devices are made into their own
1614789Sahrens  * mirror using the 'replacing' vdev, which is functionally identical to
1615789Sahrens  * the mirror vdev (it actually reuses all the same ops) but has a few
1616789Sahrens  * extra rules: you can't attach to it after it's been created, and upon
1617789Sahrens  * completion of resilvering, the first disk (the one being replaced)
1618789Sahrens  * is automatically detached.
1619789Sahrens  */
1620789Sahrens int
16211544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1622789Sahrens {
1623789Sahrens 	uint64_t txg, open_txg;
1624789Sahrens 	int error;
1625789Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1626789Sahrens 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
16272082Seschrock 	vdev_ops_t *pvops;
1628789Sahrens 
1629789Sahrens 	txg = spa_vdev_enter(spa);
1630789Sahrens 
16311544Seschrock 	oldvd = vdev_lookup_by_guid(rvd, guid);
1632789Sahrens 
1633789Sahrens 	if (oldvd == NULL)
1634789Sahrens 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1635789Sahrens 
16361585Sbonwick 	if (!oldvd->vdev_ops->vdev_op_leaf)
16371585Sbonwick 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
16381585Sbonwick 
1639789Sahrens 	pvd = oldvd->vdev_parent;
1640789Sahrens 
16412082Seschrock 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
16422082Seschrock 	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1643789Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1644789Sahrens 
1645789Sahrens 	newvd = newrootvd->vdev_child[0];
1646789Sahrens 
1647789Sahrens 	if (!newvd->vdev_ops->vdev_op_leaf)
1648789Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1649789Sahrens 
16502082Seschrock 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1651789Sahrens 		return (spa_vdev_exit(spa, newrootvd, txg, error));
1652789Sahrens 
16532082Seschrock 	if (!replacing) {
16542082Seschrock 		/*
16552082Seschrock 		 * For attach, the only allowable parent is a mirror or the root
16562082Seschrock 		 * vdev.
16572082Seschrock 		 */
16582082Seschrock 		if (pvd->vdev_ops != &vdev_mirror_ops &&
16592082Seschrock 		    pvd->vdev_ops != &vdev_root_ops)
16602082Seschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
16612082Seschrock 
16622082Seschrock 		pvops = &vdev_mirror_ops;
16632082Seschrock 	} else {
16642082Seschrock 		/*
16652082Seschrock 		 * Active hot spares can only be replaced by inactive hot
16662082Seschrock 		 * spares.
16672082Seschrock 		 */
16682082Seschrock 		if (pvd->vdev_ops == &vdev_spare_ops &&
16692082Seschrock 		    pvd->vdev_child[1] == oldvd &&
16702082Seschrock 		    !spa_has_spare(spa, newvd->vdev_guid))
16712082Seschrock 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
16722082Seschrock 
16732082Seschrock 		/*
16742082Seschrock 		 * If the source is a hot spare, and the parent isn't already a
16752082Seschrock 		 * spare, then we want to create a new hot spare.  Otherwise, we
16763377Seschrock 		 * want to create a replacing vdev.  The user is not allowed to
16773377Seschrock 		 * attach to a spared vdev child unless the 'isspare' state is
16783377Seschrock 		 * the same (spare replaces spare, non-spare replaces
16793377Seschrock 		 * non-spare).
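 *
 * Usage sketch (hypothetical caller; 'guid' names any leaf vdev in the
 * pool and 'nvroot' describes exactly one new leaf device):
 *
 *	error = spa_vdev_attach(spa, guid, nvroot, B_FALSE);	(attach)
 *	error = spa_vdev_attach(spa, guid, nvroot, B_TRUE);	(replace)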
16802082Seschrock */ 16812082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 16822082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 16833377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 16843377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 16853377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 16862082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 16872082Seschrock newvd->vdev_isspare) 16882082Seschrock pvops = &vdev_spare_ops; 16892082Seschrock else 16902082Seschrock pvops = &vdev_replacing_ops; 16912082Seschrock } 16922082Seschrock 16931175Slling /* 16941175Slling * Compare the new device size with the replaceable/attachable 16951175Slling * device size. 16961175Slling */ 16971175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1698789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1699789Sahrens 17001732Sbonwick /* 17011732Sbonwick * The new device cannot have a higher alignment requirement 17021732Sbonwick * than the top-level vdev. 17031732Sbonwick */ 17041732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1705789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1706789Sahrens 1707789Sahrens /* 1708789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1709789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1710789Sahrens */ 1711789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1712789Sahrens spa_strfree(oldvd->vdev_path); 1713789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1714789Sahrens KM_SLEEP); 1715789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1716789Sahrens newvd->vdev_path, "old"); 1717789Sahrens if (oldvd->vdev_devid != NULL) { 1718789Sahrens spa_strfree(oldvd->vdev_devid); 1719789Sahrens oldvd->vdev_devid = NULL; 1720789Sahrens } 1721789Sahrens } 1722789Sahrens 1723789Sahrens /* 17242082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 17252082Seschrock * mirror/replacing/spare vdev above oldvd. 1726789Sahrens */ 1727789Sahrens if (pvd->vdev_ops != pvops) 1728789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1729789Sahrens 1730789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1731789Sahrens ASSERT(pvd->vdev_ops == pvops); 1732789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1733789Sahrens 1734789Sahrens /* 1735789Sahrens * Extract the new device from its root and add it to pvd. 1736789Sahrens */ 1737789Sahrens vdev_remove_child(newrootvd, newvd); 1738789Sahrens newvd->vdev_id = pvd->vdev_children; 1739789Sahrens vdev_add_child(pvd, newvd); 1740789Sahrens 17411544Seschrock /* 17421544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 17431544Seschrock * the addition of newvd may have decreased our parent's asize. 17441544Seschrock */ 17451544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 17461544Seschrock 1747789Sahrens tvd = newvd->vdev_top; 1748789Sahrens ASSERT(pvd->vdev_top == tvd); 1749789Sahrens ASSERT(tvd->vdev_parent == rvd); 1750789Sahrens 1751789Sahrens vdev_config_dirty(tvd); 1752789Sahrens 1753789Sahrens /* 1754789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1755789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 
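 *
 * (Assuming the usual three concurrent txg states, open_txg is simply
 * txg + 2; the space_map_add() below then covers every txg, from
 * TXG_INITIAL through open_txg, in which the new device could have
 * missed writes.)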
1756789Sahrens */ 1757789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1758789Sahrens 1759789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1760789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1761789Sahrens open_txg - TXG_INITIAL + 1); 1762789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1763789Sahrens 17643377Seschrock if (newvd->vdev_isspare) 17653377Seschrock spa_spare_activate(newvd); 17661544Seschrock 1767789Sahrens /* 1768789Sahrens * Mark newvd's DTL dirty in this txg. 1769789Sahrens */ 17701732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1771789Sahrens 1772789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1773789Sahrens 1774789Sahrens /* 1775789Sahrens * Kick off a resilver to update newvd. 1776789Sahrens */ 1777789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1778789Sahrens 1779789Sahrens return (0); 1780789Sahrens } 1781789Sahrens 1782789Sahrens /* 1783789Sahrens * Detach a device from a mirror or replacing vdev. 1784789Sahrens * If 'replace_done' is specified, only detach if the parent 1785789Sahrens * is a replacing vdev. 1786789Sahrens */ 1787789Sahrens int 17881544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1789789Sahrens { 1790789Sahrens uint64_t txg; 1791789Sahrens int c, t, error; 1792789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1793789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 17942082Seschrock boolean_t unspare = B_FALSE; 17952082Seschrock uint64_t unspare_guid; 1796789Sahrens 1797789Sahrens txg = spa_vdev_enter(spa); 1798789Sahrens 17991544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1800789Sahrens 1801789Sahrens if (vd == NULL) 1802789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1803789Sahrens 18041585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 18051585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18061585Sbonwick 1807789Sahrens pvd = vd->vdev_parent; 1808789Sahrens 1809789Sahrens /* 1810789Sahrens * If replace_done is specified, only remove this device if it's 18112082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 18122082Seschrock * disk can be removed. 1813789Sahrens */ 18142082Seschrock if (replace_done) { 18152082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 18162082Seschrock if (vd->vdev_id != 0) 18172082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18182082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 18192082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18202082Seschrock } 18212082Seschrock } 18222082Seschrock 18232082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 18242082Seschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1825789Sahrens 1826789Sahrens /* 18272082Seschrock * Only mirror, replacing, and spare vdevs support detach. 1828789Sahrens */ 1829789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 18302082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 18312082Seschrock pvd->vdev_ops != &vdev_spare_ops) 1832789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1833789Sahrens 1834789Sahrens /* 1835789Sahrens * If there's only one replica, you can't detach it. 1836789Sahrens */ 1837789Sahrens if (pvd->vdev_children <= 1) 1838789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1839789Sahrens 1840789Sahrens /* 1841789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1842789Sahrens * valid copy of the data, which means we cannot safely detach it. 
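 * (The loop below breaks out as soon as it finds one live sibling with
 * an empty DTL; reaching the end of the loop means no such sibling
 * exists and, except for the cancellation case noted below, the detach
 * is refused.)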
1843789Sahrens * 1844789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1845789Sahrens * precise DTL check. 1846789Sahrens */ 1847789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1848789Sahrens uint64_t dirty; 1849789Sahrens 1850789Sahrens cvd = pvd->vdev_child[c]; 1851789Sahrens if (cvd == vd) 1852789Sahrens continue; 1853789Sahrens if (vdev_is_dead(cvd)) 1854789Sahrens continue; 1855789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1856789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1857789Sahrens cvd->vdev_dtl_scrub.sm_space; 1858789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1859789Sahrens if (!dirty) 1860789Sahrens break; 1861789Sahrens } 18622082Seschrock 18632082Seschrock /* 18642082Seschrock * If we are a replacing or spare vdev, then we can always detach the 18652082Seschrock * latter child, as that is how one cancels the operation. 18662082Seschrock */ 18672082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 18682082Seschrock c == pvd->vdev_children) 1869789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1870789Sahrens 1871789Sahrens /* 18722082Seschrock * If we are detaching the original disk from a spare, then it implies 18732082Seschrock * that the spare should become a real disk, and be removed from the 18742082Seschrock * active spare list for the pool. 18752082Seschrock */ 18762082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 18772082Seschrock vd->vdev_id == 0) 18782082Seschrock unspare = B_TRUE; 18792082Seschrock 18802082Seschrock /* 1881789Sahrens * Erase the disk labels so the disk can be used for other things. 1882789Sahrens * This must be done after all other error cases are handled, 1883789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1884789Sahrens * But if we can't do it, don't treat the error as fatal -- 1885789Sahrens * it may be that the unwritability of the disk is the reason 1886789Sahrens * it's being detached! 1887789Sahrens */ 18883377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1889789Sahrens 1890789Sahrens /* 1891789Sahrens * Remove vd from its parent and compact the parent's children. 1892789Sahrens */ 1893789Sahrens vdev_remove_child(pvd, vd); 1894789Sahrens vdev_compact_children(pvd); 1895789Sahrens 1896789Sahrens /* 1897789Sahrens * Remember one of the remaining children so we can get tvd below. 1898789Sahrens */ 1899789Sahrens cvd = pvd->vdev_child[0]; 1900789Sahrens 1901789Sahrens /* 19022082Seschrock * If we need to remove the remaining child from the list of hot spares, 19032082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 19042082Seschrock * must do this before vdev_remove_parent(), because that can change the 19052082Seschrock * GUID if it creates a new toplevel GUID. 19062082Seschrock */ 19072082Seschrock if (unspare) { 19082082Seschrock ASSERT(cvd->vdev_isspare); 19093377Seschrock spa_spare_remove(cvd); 19102082Seschrock unspare_guid = cvd->vdev_guid; 19112082Seschrock } 19122082Seschrock 19132082Seschrock /* 1914789Sahrens * If the parent mirror/replacing vdev only has one child, 1915789Sahrens * the parent is no longer needed. Remove it from the tree. 1916789Sahrens */ 1917789Sahrens if (pvd->vdev_children == 1) 1918789Sahrens vdev_remove_parent(cvd); 1919789Sahrens 1920789Sahrens /* 1921789Sahrens * We don't set tvd until now because the parent we just removed 1922789Sahrens * may have been the previous top-level vdev. 
1923789Sahrens */ 1924789Sahrens tvd = cvd->vdev_top; 1925789Sahrens ASSERT(tvd->vdev_parent == rvd); 1926789Sahrens 1927789Sahrens /* 19283377Seschrock * Reevaluate the parent vdev state. 1929789Sahrens */ 19303377Seschrock vdev_propagate_state(cvd->vdev_parent); 1931789Sahrens 1932789Sahrens /* 19333377Seschrock * If the device we just detached was smaller than the others, it may be 19343377Seschrock * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 19353377Seschrock * can't fail because the existing metaslabs are already in core, so 19363377Seschrock * there's nothing to read from disk. 1937789Sahrens */ 19381732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1939789Sahrens 1940789Sahrens vdev_config_dirty(tvd); 1941789Sahrens 1942789Sahrens /* 19433377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 19443377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 19453377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 19463377Seschrock * prevent vd from being accessed after it's freed. 1947789Sahrens */ 1948789Sahrens for (t = 0; t < TXG_SIZE; t++) 1949789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 19501732Sbonwick vd->vdev_detached = B_TRUE; 19511732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 1952789Sahrens 19532082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 19542082Seschrock 19552082Seschrock /* 19563377Seschrock * If this was the removal of the original device in a hot spare vdev, 19573377Seschrock * then we want to go through and remove the device from the hot spare 19583377Seschrock * list of every other pool. 19592082Seschrock */ 19602082Seschrock if (unspare) { 19612082Seschrock spa = NULL; 19622082Seschrock mutex_enter(&spa_namespace_lock); 19632082Seschrock while ((spa = spa_next(spa)) != NULL) { 19642082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 19652082Seschrock continue; 19662082Seschrock 19672082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 19682082Seschrock } 19692082Seschrock mutex_exit(&spa_namespace_lock); 19702082Seschrock } 19712082Seschrock 19722082Seschrock return (error); 19732082Seschrock } 19742082Seschrock 19752082Seschrock /* 19762082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 19772082Seschrock * spares. 
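 *
 * Usage sketch (hypothetical caller):
 *
 *	error = spa_vdev_remove(spa, guid, B_FALSE);
 *
 * Per the checks below, this returns ENOENT if 'guid' is unknown,
 * ENOTSUP if it names a vdev that is not in the spares list, and EBUSY
 * if the spare is currently in use and 'unspare' is not set.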
19782082Seschrock */ 19792082Seschrock int 19802082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 19812082Seschrock { 19822082Seschrock vdev_t *vd; 19832082Seschrock nvlist_t **spares, *nv, **newspares; 19842082Seschrock uint_t i, j, nspares; 19852082Seschrock int ret = 0; 19862082Seschrock 19872082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 19882082Seschrock 19892082Seschrock vd = spa_lookup_by_guid(spa, guid); 19902082Seschrock 19912082Seschrock nv = NULL; 19922082Seschrock if (spa->spa_spares != NULL && 19932082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 19942082Seschrock &spares, &nspares) == 0) { 19952082Seschrock for (i = 0; i < nspares; i++) { 19962082Seschrock uint64_t theguid; 19972082Seschrock 19982082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 19992082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 20002082Seschrock if (theguid == guid) { 20012082Seschrock nv = spares[i]; 20022082Seschrock break; 20032082Seschrock } 20042082Seschrock } 20052082Seschrock } 20062082Seschrock 20072082Seschrock /* 20082082Seschrock * We only support removing a hot spare, and only if it's not currently 20092082Seschrock * in use in this pool. 20102082Seschrock */ 20112082Seschrock if (nv == NULL && vd == NULL) { 20122082Seschrock ret = ENOENT; 20132082Seschrock goto out; 20142082Seschrock } 20152082Seschrock 20162082Seschrock if (nv == NULL && vd != NULL) { 20172082Seschrock ret = ENOTSUP; 20182082Seschrock goto out; 20192082Seschrock } 20202082Seschrock 20212082Seschrock if (!unspare && nv != NULL && vd != NULL) { 20222082Seschrock ret = EBUSY; 20232082Seschrock goto out; 20242082Seschrock } 20252082Seschrock 20262082Seschrock if (nspares == 1) { 20272082Seschrock newspares = NULL; 20282082Seschrock } else { 20292082Seschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 20302082Seschrock KM_SLEEP); 20312082Seschrock for (i = 0, j = 0; i < nspares; i++) { 20322082Seschrock if (spares[i] != nv) 20332082Seschrock VERIFY(nvlist_dup(spares[i], 20342082Seschrock &newspares[j++], KM_SLEEP) == 0); 20352082Seschrock } 20362082Seschrock } 20372082Seschrock 20382082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20392082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 20402082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20412082Seschrock newspares, nspares - 1) == 0); 20422082Seschrock for (i = 0; i < nspares - 1; i++) 20432082Seschrock nvlist_free(newspares[i]); 20442082Seschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 20452082Seschrock spa_load_spares(spa); 20462082Seschrock spa->spa_sync_spares = B_TRUE; 20472082Seschrock 20482082Seschrock out: 20492082Seschrock spa_config_exit(spa, FTAG); 20502082Seschrock 20512082Seschrock return (ret); 2052789Sahrens } 2053789Sahrens 2054789Sahrens /* 20551544Seschrock * Find any device that's done replacing, so we can detach it. 
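 *
 * The hunt below is a depth-first walk of the vdev tree: a 'replacing'
 * vdev whose second child (the new device) has an empty DTL and empty
 * scrub DTL is fully resilvered, so its first child (the old device)
 * is returned for detach.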
2056789Sahrens */ 20571544Seschrock static vdev_t * 20581544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 2059789Sahrens { 20601544Seschrock vdev_t *newvd, *oldvd; 2061789Sahrens int c; 2062789Sahrens 20631544Seschrock for (c = 0; c < vd->vdev_children; c++) { 20641544Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 20651544Seschrock if (oldvd != NULL) 20661544Seschrock return (oldvd); 20671544Seschrock } 2068789Sahrens 2069789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 20701544Seschrock oldvd = vd->vdev_child[0]; 20711544Seschrock newvd = vd->vdev_child[1]; 2072789Sahrens 20731544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 20741544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 20751544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 20761544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 20771544Seschrock return (oldvd); 20781544Seschrock } 20791544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 20801544Seschrock } 2081789Sahrens 20821544Seschrock return (NULL); 2083789Sahrens } 2084789Sahrens 20851544Seschrock static void 2086789Sahrens spa_vdev_replace_done(spa_t *spa) 2087789Sahrens { 20881544Seschrock vdev_t *vd; 20892082Seschrock vdev_t *pvd; 20901544Seschrock uint64_t guid; 20912082Seschrock uint64_t pguid = 0; 2092789Sahrens 20931544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2094789Sahrens 20951544Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 20961544Seschrock guid = vd->vdev_guid; 20972082Seschrock /* 20982082Seschrock * If we have just finished replacing a hot spared device, then 20992082Seschrock * we need to detach the parent's first child (the original hot 21002082Seschrock * spare) as well. 21012082Seschrock */ 21022082Seschrock pvd = vd->vdev_parent; 21032082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 21042082Seschrock pvd->vdev_id == 0) { 21052082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 21062082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 21072082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 21082082Seschrock } 21091544Seschrock spa_config_exit(spa, FTAG); 21101544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 21111544Seschrock return; 21122082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 21132082Seschrock return; 21141544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2115789Sahrens } 2116789Sahrens 21171544Seschrock spa_config_exit(spa, FTAG); 2118789Sahrens } 2119789Sahrens 2120789Sahrens /* 21211354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 21221354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 21231354Seschrock */ 21241354Seschrock int 21251354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 21261354Seschrock { 21271354Seschrock vdev_t *rvd, *vd; 21281354Seschrock uint64_t txg; 21291354Seschrock 21301354Seschrock rvd = spa->spa_root_vdev; 21311354Seschrock 21321354Seschrock txg = spa_vdev_enter(spa); 21331354Seschrock 21342082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 21352082Seschrock /* 21362082Seschrock * Determine if this is a reference to a hot spare. In that 21372082Seschrock * case, update the path as stored in the spare list. 
21382082Seschrock */ 21392082Seschrock nvlist_t **spares; 21402082Seschrock uint_t i, nspares; 21412082Seschrock if (spa->spa_sparelist != NULL) { 21422082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 21432082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 21442082Seschrock for (i = 0; i < nspares; i++) { 21452082Seschrock uint64_t theguid; 21462082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 21472082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 21482082Seschrock if (theguid == guid) 21492082Seschrock break; 21502082Seschrock } 21512082Seschrock 21522082Seschrock if (i == nspares) 21532082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 21542082Seschrock 21552082Seschrock VERIFY(nvlist_add_string(spares[i], 21562082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 21572082Seschrock spa_load_spares(spa); 21582082Seschrock spa->spa_sync_spares = B_TRUE; 21592082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 21602082Seschrock } else { 21612082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 21622082Seschrock } 21632082Seschrock } 21641354Seschrock 21651585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 21661585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 21671585Sbonwick 21681354Seschrock spa_strfree(vd->vdev_path); 21691354Seschrock vd->vdev_path = spa_strdup(newpath); 21701354Seschrock 21711354Seschrock vdev_config_dirty(vd->vdev_top); 21721354Seschrock 21731354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 21741354Seschrock } 21751354Seschrock 21761354Seschrock /* 2177789Sahrens * ========================================================================== 2178789Sahrens * SPA Scrubbing 2179789Sahrens * ========================================================================== 2180789Sahrens */ 2181789Sahrens 2182789Sahrens static void 2183789Sahrens spa_scrub_io_done(zio_t *zio) 2184789Sahrens { 2185789Sahrens spa_t *spa = zio->io_spa; 2186789Sahrens 21873290Sjohansen zio_data_buf_free(zio->io_data, zio->io_size); 2188789Sahrens 2189789Sahrens mutex_enter(&spa->spa_scrub_lock); 21901544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 21911775Sbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2192789Sahrens spa->spa_scrub_errors++; 2193789Sahrens mutex_enter(&vd->vdev_stat_lock); 2194789Sahrens vd->vdev_stat.vs_scrub_errors++; 2195789Sahrens mutex_exit(&vd->vdev_stat_lock); 2196789Sahrens } 2197*3697Smishra 2198*3697Smishra if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 21991544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 2200*3697Smishra 2201*3697Smishra ASSERT(spa->spa_scrub_inflight >= 0); 2202*3697Smishra 22031544Seschrock mutex_exit(&spa->spa_scrub_lock); 2204789Sahrens } 2205789Sahrens 2206789Sahrens static void 22071544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 22081544Seschrock zbookmark_t *zb) 2209789Sahrens { 2210789Sahrens size_t size = BP_GET_LSIZE(bp); 2211*3697Smishra void *data; 2212789Sahrens 2213789Sahrens mutex_enter(&spa->spa_scrub_lock); 2214*3697Smishra /* 2215*3697Smishra * Do not give too much work to vdev(s). 
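 *
 * (We block while spa_scrub_inflight is at spa_scrub_maxinflight;
 * spa_scrub_io_done() issues the matching cv_broadcast() once the
 * count drops back under the limit.)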
2216*3697Smishra */ 2217*3697Smishra while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 2218*3697Smishra cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2219*3697Smishra } 2220789Sahrens spa->spa_scrub_inflight++; 2221789Sahrens mutex_exit(&spa->spa_scrub_lock); 2222789Sahrens 2223*3697Smishra data = zio_data_buf_alloc(size); 2224*3697Smishra 22251544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 22261544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 22271544Seschrock 22281807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 22291544Seschrock 2230789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 22311544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2232789Sahrens } 2233789Sahrens 2234789Sahrens /* ARGSUSED */ 2235789Sahrens static int 2236789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2237789Sahrens { 2238789Sahrens blkptr_t *bp = &bc->bc_blkptr; 22391775Sbillm vdev_t *vd = spa->spa_root_vdev; 22401775Sbillm dva_t *dva = bp->blk_dva; 22411775Sbillm int needs_resilver = B_FALSE; 22421775Sbillm int d; 2243789Sahrens 22441775Sbillm if (bc->bc_errno) { 2245789Sahrens /* 2246789Sahrens * We can't scrub this block, but we can continue to scrub 2247789Sahrens * the rest of the pool. Note the error and move along. 2248789Sahrens */ 2249789Sahrens mutex_enter(&spa->spa_scrub_lock); 2250789Sahrens spa->spa_scrub_errors++; 2251789Sahrens mutex_exit(&spa->spa_scrub_lock); 2252789Sahrens 22531775Sbillm mutex_enter(&vd->vdev_stat_lock); 22541775Sbillm vd->vdev_stat.vs_scrub_errors++; 22551775Sbillm mutex_exit(&vd->vdev_stat_lock); 2256789Sahrens 2257789Sahrens return (ERESTART); 2258789Sahrens } 2259789Sahrens 2260789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2261789Sahrens 22621775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 22631775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 22641775Sbillm 22651775Sbillm ASSERT(vd != NULL); 22661775Sbillm 22671775Sbillm /* 22681775Sbillm * Keep track of how much data we've examined so that 22691775Sbillm * zpool(1M) status can make useful progress reports. 22701775Sbillm */ 22711775Sbillm mutex_enter(&vd->vdev_stat_lock); 22721775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 22731775Sbillm mutex_exit(&vd->vdev_stat_lock); 2274789Sahrens 22751775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 22761775Sbillm if (DVA_GET_GANG(&dva[d])) { 22771775Sbillm /* 22781775Sbillm * Gang members may be spread across multiple 22791775Sbillm * vdevs, so the best we can do is look at the 22801775Sbillm * pool-wide DTL. 22811775Sbillm * XXX -- it would be better to change our 22821775Sbillm * allocation policy to ensure that this can't 22831775Sbillm * happen. 
22841775Sbillm */ 22851775Sbillm vd = spa->spa_root_vdev; 22861775Sbillm } 22871775Sbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 22881775Sbillm bp->blk_birth, 1)) 22891775Sbillm needs_resilver = B_TRUE; 2290789Sahrens } 22911775Sbillm } 22921775Sbillm 22931775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2294789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 22951544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 22961775Sbillm else if (needs_resilver) 22971775Sbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 22981775Sbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2299789Sahrens 2300789Sahrens return (0); 2301789Sahrens } 2302789Sahrens 2303789Sahrens static void 2304789Sahrens spa_scrub_thread(spa_t *spa) 2305789Sahrens { 2306789Sahrens callb_cpr_t cprinfo; 2307789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2308789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2309789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2310789Sahrens int error = 0; 2311789Sahrens boolean_t complete; 2312789Sahrens 2313789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2314789Sahrens 2315797Sbonwick /* 2316797Sbonwick * If we're restarting due to a snapshot create/delete, 2317797Sbonwick * wait for that to complete. 2318797Sbonwick */ 2319797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2320797Sbonwick 23211544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 23221544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 23231544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 23241544Seschrock 23251544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 23261544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2327789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2328789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 23291544Seschrock spa_config_exit(spa, FTAG); 2330789Sahrens 2331789Sahrens mutex_enter(&spa->spa_scrub_lock); 2332789Sahrens spa->spa_scrub_errors = 0; 2333789Sahrens spa->spa_scrub_active = 1; 23341544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 2335789Sahrens 2336789Sahrens while (!spa->spa_scrub_stop) { 2337789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 23381544Seschrock while (spa->spa_scrub_suspended) { 2339789Sahrens spa->spa_scrub_active = 0; 2340789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2341789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2342789Sahrens spa->spa_scrub_active = 1; 2343789Sahrens } 2344789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2345789Sahrens 2346789Sahrens if (spa->spa_scrub_restart_txg != 0) 2347789Sahrens break; 2348789Sahrens 2349789Sahrens mutex_exit(&spa->spa_scrub_lock); 2350789Sahrens error = traverse_more(th); 2351789Sahrens mutex_enter(&spa->spa_scrub_lock); 2352789Sahrens if (error != EAGAIN) 2353789Sahrens break; 2354789Sahrens } 2355789Sahrens 2356789Sahrens while (spa->spa_scrub_inflight) 2357789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2358789Sahrens 23591601Sbonwick spa->spa_scrub_active = 0; 23601601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 23611601Sbonwick 23621601Sbonwick mutex_exit(&spa->spa_scrub_lock); 23631601Sbonwick 23641601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 23651601Sbonwick 23661601Sbonwick mutex_enter(&spa->spa_scrub_lock); 23671601Sbonwick 23681601Sbonwick /* 23691601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 23701601Sbonwick * AND the spa config lock to synchronize with any config changes 23711601Sbonwick * that revise the 
DTLs under spa_vdev_enter() / spa_vdev_exit(). 23721601Sbonwick */ 2373789Sahrens if (spa->spa_scrub_restart_txg != 0) 2374789Sahrens error = ERESTART; 2375789Sahrens 23761544Seschrock if (spa->spa_scrub_stop) 23771544Seschrock error = EINTR; 23781544Seschrock 2379789Sahrens /* 23801544Seschrock * Even if there were uncorrectable errors, we consider the scrub 23811544Seschrock * completed. The downside is that if there is a transient error during 23821544Seschrock * a resilver, we won't resilver the data properly to the target. But 23831544Seschrock * if the damage is permanent (more likely) we will resilver forever, 23841544Seschrock * which isn't really acceptable. Since there is enough information for 23851544Seschrock * the user to know what has failed and why, this seems like a more 23861544Seschrock * tractable approach. 2387789Sahrens */ 23881544Seschrock complete = (error == 0); 2389789Sahrens 23901544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 23911544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2392789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2393789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2394789Sahrens 2395789Sahrens mutex_exit(&spa->spa_scrub_lock); 2396789Sahrens 2397789Sahrens /* 2398789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2399789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2400789Sahrens */ 2401789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2402789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2403789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 24041544Seschrock spa_errlog_rotate(spa); 24051601Sbonwick 24061544Seschrock spa_config_exit(spa, FTAG); 2407789Sahrens 2408789Sahrens mutex_enter(&spa->spa_scrub_lock); 2409789Sahrens 24101544Seschrock /* 24111544Seschrock * We may have finished replacing a device. 24121544Seschrock * Let the async thread assess this and handle the detach. 24131544Seschrock */ 24141544Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2415789Sahrens 2416789Sahrens /* 2417789Sahrens * If we were told to restart, our final act is to start a new scrub. 2418789Sahrens */ 2419789Sahrens if (error == ERESTART) 24201544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
24211544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2422789Sahrens 24231544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 24241544Seschrock spa->spa_scrub_active = 0; 24251544Seschrock spa->spa_scrub_thread = NULL; 24261544Seschrock cv_broadcast(&spa->spa_scrub_cv); 2427789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2428789Sahrens thread_exit(); 2429789Sahrens } 2430789Sahrens 2431789Sahrens void 2432789Sahrens spa_scrub_suspend(spa_t *spa) 2433789Sahrens { 2434789Sahrens mutex_enter(&spa->spa_scrub_lock); 24351544Seschrock spa->spa_scrub_suspended++; 2436789Sahrens while (spa->spa_scrub_active) { 2437789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2438789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2439789Sahrens } 2440789Sahrens while (spa->spa_scrub_inflight) 2441789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2442789Sahrens mutex_exit(&spa->spa_scrub_lock); 2443789Sahrens } 2444789Sahrens 2445789Sahrens void 2446789Sahrens spa_scrub_resume(spa_t *spa) 2447789Sahrens { 2448789Sahrens mutex_enter(&spa->spa_scrub_lock); 24491544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 24501544Seschrock if (--spa->spa_scrub_suspended == 0) 2451789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2452789Sahrens mutex_exit(&spa->spa_scrub_lock); 2453789Sahrens } 2454789Sahrens 2455789Sahrens void 2456789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 2457789Sahrens { 2458789Sahrens /* 2459789Sahrens * Something happened (e.g. snapshot create/delete) that means 2460789Sahrens * we must restart any in-progress scrubs. The itinerary will 2461789Sahrens * fix this properly. 2462789Sahrens */ 2463789Sahrens mutex_enter(&spa->spa_scrub_lock); 2464789Sahrens spa->spa_scrub_restart_txg = txg; 2465789Sahrens mutex_exit(&spa->spa_scrub_lock); 2466789Sahrens } 2467789Sahrens 24681544Seschrock int 24691544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2470789Sahrens { 2471789Sahrens space_seg_t *ss; 2472789Sahrens uint64_t mintxg, maxtxg; 2473789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2474789Sahrens 2475789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 2476789Sahrens return (ENOTSUP); 2477789Sahrens 24781544Seschrock mutex_enter(&spa->spa_scrub_lock); 24791544Seschrock 2480789Sahrens /* 2481789Sahrens * If there's a scrub or resilver already in progress, stop it. 2482789Sahrens */ 2483789Sahrens while (spa->spa_scrub_thread != NULL) { 2484789Sahrens /* 2485789Sahrens * Don't stop a resilver unless forced. 2486789Sahrens */ 24871544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 24881544Seschrock mutex_exit(&spa->spa_scrub_lock); 2489789Sahrens return (EBUSY); 24901544Seschrock } 2491789Sahrens spa->spa_scrub_stop = 1; 2492789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2493789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2494789Sahrens } 2495789Sahrens 2496789Sahrens /* 2497789Sahrens * Terminate the previous traverse. 
2498789Sahrens */ 2499789Sahrens if (spa->spa_scrub_th != NULL) { 2500789Sahrens traverse_fini(spa->spa_scrub_th); 2501789Sahrens spa->spa_scrub_th = NULL; 2502789Sahrens } 2503789Sahrens 25041544Seschrock if (rvd == NULL) { 25051544Seschrock ASSERT(spa->spa_scrub_stop == 0); 25061544Seschrock ASSERT(spa->spa_scrub_type == type); 25071544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 25081544Seschrock mutex_exit(&spa->spa_scrub_lock); 25091544Seschrock return (0); 25101544Seschrock } 2511789Sahrens 2512789Sahrens mintxg = TXG_INITIAL - 1; 2513789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 2514789Sahrens 25151544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 2516789Sahrens 25171544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 25181544Seschrock /* 25191544Seschrock * The pool-wide DTL is empty. 25201732Sbonwick * If this is a resilver, there's nothing to do except 25211732Sbonwick * check whether any in-progress replacements have completed. 25221544Seschrock */ 25231732Sbonwick if (type == POOL_SCRUB_RESILVER) { 25241544Seschrock type = POOL_SCRUB_NONE; 25251732Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 25261732Sbonwick } 25271544Seschrock } else { 25281544Seschrock /* 25291544Seschrock * The pool-wide DTL is non-empty. 25301544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 25311544Seschrock */ 25321544Seschrock if (type == POOL_SCRUB_EVERYTHING) 25331544Seschrock type = POOL_SCRUB_RESILVER; 25341544Seschrock } 2535789Sahrens 25361544Seschrock if (type == POOL_SCRUB_RESILVER) { 2537789Sahrens /* 2538789Sahrens * Determine the resilvering boundaries. 2539789Sahrens * 2540789Sahrens * Note: (mintxg, maxtxg) is an open interval, 2541789Sahrens * i.e. mintxg and maxtxg themselves are not included. 2542789Sahrens * 2543789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2544789Sahrens * so we don't claim to resilver a txg that's still changing. 
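 *
 * (Concretely: mintxg is set to one txg before the first missing txg
 * in the pool-wide DTL, and maxtxg is capped at
 * spa_last_synced_txg() + 1, so the open interval covers exactly the
 * txgs the device may have missed.)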
2545789Sahrens */ 2546789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 25471544Seschrock mintxg = ss->ss_start - 1; 2548789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 25491544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 2550789Sahrens } 2551789Sahrens 25521544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 25531544Seschrock 25541544Seschrock spa->spa_scrub_stop = 0; 25551544Seschrock spa->spa_scrub_type = type; 25561544Seschrock spa->spa_scrub_restart_txg = 0; 25571544Seschrock 25581544Seschrock if (type != POOL_SCRUB_NONE) { 25591544Seschrock spa->spa_scrub_mintxg = mintxg; 2560789Sahrens spa->spa_scrub_maxtxg = maxtxg; 2561789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 25621635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 25631635Sbonwick ZIO_FLAG_CANFAIL); 2564789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2565789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 2566789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2567789Sahrens } 2568789Sahrens 25691544Seschrock mutex_exit(&spa->spa_scrub_lock); 25701544Seschrock 2571789Sahrens return (0); 2572789Sahrens } 2573789Sahrens 25741544Seschrock /* 25751544Seschrock * ========================================================================== 25761544Seschrock * SPA async task processing 25771544Seschrock * ========================================================================== 25781544Seschrock */ 25791544Seschrock 25801544Seschrock static void 25811544Seschrock spa_async_reopen(spa_t *spa) 2582789Sahrens { 25831544Seschrock vdev_t *rvd = spa->spa_root_vdev; 25841544Seschrock vdev_t *tvd; 25851544Seschrock int c; 25861544Seschrock 25871544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 25881544Seschrock 25891544Seschrock for (c = 0; c < rvd->vdev_children; c++) { 25901544Seschrock tvd = rvd->vdev_child[c]; 25911544Seschrock if (tvd->vdev_reopen_wanted) { 25921544Seschrock tvd->vdev_reopen_wanted = 0; 25931544Seschrock vdev_reopen(tvd); 25941544Seschrock } 25951544Seschrock } 2596789Sahrens 25971544Seschrock spa_config_exit(spa, FTAG); 25981544Seschrock } 25991544Seschrock 26001544Seschrock static void 26011544Seschrock spa_async_thread(spa_t *spa) 26021544Seschrock { 26031544Seschrock int tasks; 26041544Seschrock 26051544Seschrock ASSERT(spa->spa_sync_on); 2606789Sahrens 26071544Seschrock mutex_enter(&spa->spa_async_lock); 26081544Seschrock tasks = spa->spa_async_tasks; 26091544Seschrock spa->spa_async_tasks = 0; 26101544Seschrock mutex_exit(&spa->spa_async_lock); 26111544Seschrock 26121544Seschrock /* 26131635Sbonwick * See if the config needs to be updated. 26141635Sbonwick */ 26151635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 26161635Sbonwick mutex_enter(&spa_namespace_lock); 26171635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 26181635Sbonwick mutex_exit(&spa_namespace_lock); 26191635Sbonwick } 26201635Sbonwick 26211635Sbonwick /* 26221544Seschrock * See if any devices need to be reopened. 26231544Seschrock */ 26241544Seschrock if (tasks & SPA_ASYNC_REOPEN) 26251544Seschrock spa_async_reopen(spa); 26261544Seschrock 26271544Seschrock /* 26281544Seschrock * If any devices are done replacing, detach them. 26291544Seschrock */ 26301544Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 2631789Sahrens spa_vdev_replace_done(spa); 2632789Sahrens 26331544Seschrock /* 26341544Seschrock * Kick off a scrub. 
26351544Seschrock */ 26361544Seschrock if (tasks & SPA_ASYNC_SCRUB) 26371544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 26381544Seschrock 26391544Seschrock /* 26401544Seschrock * Kick off a resilver. 26411544Seschrock */ 26421544Seschrock if (tasks & SPA_ASYNC_RESILVER) 26431544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 26441544Seschrock 26451544Seschrock /* 26461544Seschrock * Let the world know that we're done. 26471544Seschrock */ 26481544Seschrock mutex_enter(&spa->spa_async_lock); 26491544Seschrock spa->spa_async_thread = NULL; 26501544Seschrock cv_broadcast(&spa->spa_async_cv); 26511544Seschrock mutex_exit(&spa->spa_async_lock); 26521544Seschrock thread_exit(); 26531544Seschrock } 26541544Seschrock 26551544Seschrock void 26561544Seschrock spa_async_suspend(spa_t *spa) 26571544Seschrock { 26581544Seschrock mutex_enter(&spa->spa_async_lock); 26591544Seschrock spa->spa_async_suspended++; 26601544Seschrock while (spa->spa_async_thread != NULL) 26611544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 26621544Seschrock mutex_exit(&spa->spa_async_lock); 26631544Seschrock } 26641544Seschrock 26651544Seschrock void 26661544Seschrock spa_async_resume(spa_t *spa) 26671544Seschrock { 26681544Seschrock mutex_enter(&spa->spa_async_lock); 26691544Seschrock ASSERT(spa->spa_async_suspended != 0); 26701544Seschrock spa->spa_async_suspended--; 26711544Seschrock mutex_exit(&spa->spa_async_lock); 26721544Seschrock } 26731544Seschrock 26741544Seschrock static void 26751544Seschrock spa_async_dispatch(spa_t *spa) 26761544Seschrock { 26771544Seschrock mutex_enter(&spa->spa_async_lock); 26781544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 26791635Sbonwick spa->spa_async_thread == NULL && 26801635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 26811544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 26821544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 26831544Seschrock mutex_exit(&spa->spa_async_lock); 26841544Seschrock } 26851544Seschrock 26861544Seschrock void 26871544Seschrock spa_async_request(spa_t *spa, int task) 26881544Seschrock { 26891544Seschrock mutex_enter(&spa->spa_async_lock); 26901544Seschrock spa->spa_async_tasks |= task; 26911544Seschrock mutex_exit(&spa->spa_async_lock); 2692789Sahrens } 2693789Sahrens 2694789Sahrens /* 2695789Sahrens * ========================================================================== 2696789Sahrens * SPA syncing routines 2697789Sahrens * ========================================================================== 2698789Sahrens */ 2699789Sahrens 2700789Sahrens static void 2701789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2702789Sahrens { 2703789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2704789Sahrens dmu_tx_t *tx; 2705789Sahrens blkptr_t blk; 2706789Sahrens uint64_t itor = 0; 2707789Sahrens zio_t *zio; 2708789Sahrens int error; 2709789Sahrens uint8_t c = 1; 2710789Sahrens 2711789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2712789Sahrens 2713789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2714789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2715789Sahrens 2716789Sahrens error = zio_wait(zio); 2717789Sahrens ASSERT3U(error, ==, 0); 2718789Sahrens 2719789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2720789Sahrens bplist_vacate(bpl, tx); 2721789Sahrens 2722789Sahrens /* 2723789Sahrens * Pre-dirty the first block so we sync to convergence faster. 
2724789Sahrens * (Usually only the first block is needed.) 2725789Sahrens */ 2726789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2727789Sahrens dmu_tx_commit(tx); 2728789Sahrens } 2729789Sahrens 2730789Sahrens static void 27312082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 27322082Seschrock { 27332082Seschrock char *packed = NULL; 27342082Seschrock size_t nvsize = 0; 27352082Seschrock dmu_buf_t *db; 27362082Seschrock 27372082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 27382082Seschrock 27392082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 27402082Seschrock 27412082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 27422082Seschrock KM_SLEEP) == 0); 27432082Seschrock 27442082Seschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 27452082Seschrock 27462082Seschrock kmem_free(packed, nvsize); 27472082Seschrock 27482082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 27492082Seschrock dmu_buf_will_dirty(db, tx); 27502082Seschrock *(uint64_t *)db->db_data = nvsize; 27512082Seschrock dmu_buf_rele(db, FTAG); 27522082Seschrock } 27532082Seschrock 27542082Seschrock static void 27552082Seschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 27562082Seschrock { 27572082Seschrock nvlist_t *nvroot; 27582082Seschrock nvlist_t **spares; 27592082Seschrock int i; 27602082Seschrock 27612082Seschrock if (!spa->spa_sync_spares) 27622082Seschrock return; 27632082Seschrock 27642082Seschrock /* 27652082Seschrock * Update the MOS nvlist describing the list of available spares. 27662082Seschrock * spa_validate_spares() will have already made sure this nvlist is 27672082Seschrock * valid and the vdevs are labelled appropriately. 
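 *
 * (The list lives in the MOS as a packed XDR nvlist: spa_sync_nvlist()
 * above writes the packed bytes into a DMU_OT_PACKED_NVLIST object and
 * records the packed size in its bonus buffer.)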
27682082Seschrock */ 27692082Seschrock if (spa->spa_spares_object == 0) { 27702082Seschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 27712082Seschrock DMU_OT_PACKED_NVLIST, 1 << 14, 27722082Seschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 27732082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 27742082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 27752082Seschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 27762082Seschrock } 27772082Seschrock 27782082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 27792082Seschrock if (spa->spa_nspares == 0) { 27802082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 27812082Seschrock NULL, 0) == 0); 27822082Seschrock } else { 27832082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 27842082Seschrock KM_SLEEP); 27852082Seschrock for (i = 0; i < spa->spa_nspares; i++) 27862082Seschrock spares[i] = vdev_config_generate(spa, 27872082Seschrock spa->spa_spares[i], B_FALSE, B_TRUE); 27882082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 27892082Seschrock spares, spa->spa_nspares) == 0); 27902082Seschrock for (i = 0; i < spa->spa_nspares; i++) 27912082Seschrock nvlist_free(spares[i]); 27922082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 27932082Seschrock } 27942082Seschrock 27952082Seschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 27962926Sek110237 nvlist_free(nvroot); 27972082Seschrock 27982082Seschrock spa->spa_sync_spares = B_FALSE; 27992082Seschrock } 28002082Seschrock 28012082Seschrock static void 2802789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2803789Sahrens { 2804789Sahrens nvlist_t *config; 2805789Sahrens 2806789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2807789Sahrens return; 2808789Sahrens 2809789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2810789Sahrens 28111635Sbonwick if (spa->spa_config_syncing) 28121635Sbonwick nvlist_free(spa->spa_config_syncing); 28131635Sbonwick spa->spa_config_syncing = config; 2814789Sahrens 28152082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2816789Sahrens } 2817789Sahrens 2818789Sahrens /* 2819789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2820789Sahrens * part of the process, so we iterate until it converges. 2821789Sahrens */ 2822789Sahrens void 2823789Sahrens spa_sync(spa_t *spa, uint64_t txg) 2824789Sahrens { 2825789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2826789Sahrens objset_t *mos = spa->spa_meta_objset; 2827789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 28281635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 2829789Sahrens vdev_t *vd; 2830789Sahrens dmu_tx_t *tx; 2831789Sahrens int dirty_vdevs; 2832789Sahrens 2833789Sahrens /* 2834789Sahrens * Lock out configuration changes. 2835789Sahrens */ 28361544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2837789Sahrens 2838789Sahrens spa->spa_syncing_txg = txg; 2839789Sahrens spa->spa_sync_pass = 0; 2840789Sahrens 28411544Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2842789Sahrens 28432082Seschrock tx = dmu_tx_create_assigned(dp, txg); 28442082Seschrock 28452082Seschrock /* 28462082Seschrock * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 28472082Seschrock * set spa_deflate if we have no raid-z vdevs. 
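 *
 * (The scan below treats any top-level vdev whose vdev_deflate_ratio
 * differs from SPA_MINBLOCKSIZE as raid-z; only if none is found do we
 * enable deflated space accounting and record DMU_POOL_DEFLATE in the
 * MOS directory.)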
28482082Seschrock */ 28492082Seschrock if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 28502082Seschrock spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 28512082Seschrock int i; 28522082Seschrock 28532082Seschrock for (i = 0; i < rvd->vdev_children; i++) { 28542082Seschrock vd = rvd->vdev_child[i]; 28552082Seschrock if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 28562082Seschrock break; 28572082Seschrock } 28582082Seschrock if (i == rvd->vdev_children) { 28592082Seschrock spa->spa_deflate = TRUE; 28602082Seschrock VERIFY(0 == zap_add(spa->spa_meta_objset, 28612082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 28622082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 28632082Seschrock } 28642082Seschrock } 28652082Seschrock 2866789Sahrens /* 2867789Sahrens * If anything has changed in this txg, push the deferred frees 2868789Sahrens * from the previous txg. If not, leave them alone so that we 2869789Sahrens * don't generate work on an otherwise idle system. 2870789Sahrens */ 2871789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 28722329Sek110237 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 28732329Sek110237 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2874789Sahrens spa_sync_deferred_frees(spa, txg); 2875789Sahrens 2876789Sahrens /* 2877789Sahrens * Iterate to convergence. 2878789Sahrens */ 2879789Sahrens do { 2880789Sahrens spa->spa_sync_pass++; 2881789Sahrens 2882789Sahrens spa_sync_config_object(spa, tx); 28832082Seschrock spa_sync_spares(spa, tx); 28841544Seschrock spa_errlog_sync(spa, txg); 2885789Sahrens dsl_pool_sync(dp, txg); 2886789Sahrens 2887789Sahrens dirty_vdevs = 0; 2888789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2889789Sahrens vdev_sync(vd, txg); 2890789Sahrens dirty_vdevs++; 2891789Sahrens } 2892789Sahrens 2893789Sahrens bplist_sync(bpl, tx); 2894789Sahrens } while (dirty_vdevs); 2895789Sahrens 2896789Sahrens bplist_close(bpl); 2897789Sahrens 2898789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2899789Sahrens 2900789Sahrens /* 2901789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 2902789Sahrens * to commit the transaction group. 29031635Sbonwick * 29041635Sbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs. 29051635Sbonwick * Otherwise, pick a random top-level vdev that's known to be 29061635Sbonwick * visible in the config cache (see spa_vdev_add() for details). 29071635Sbonwick * If the write fails, try the next vdev until we're tried them all. 2908789Sahrens */ 29091635Sbonwick if (!list_is_empty(&spa->spa_dirty_list)) { 29101635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 29111635Sbonwick } else { 29121635Sbonwick int children = rvd->vdev_children; 29131635Sbonwick int c0 = spa_get_random(children); 29141635Sbonwick int c; 29151635Sbonwick 29161635Sbonwick for (c = 0; c < children; c++) { 29171635Sbonwick vd = rvd->vdev_child[(c0 + c) % children]; 29181635Sbonwick if (vd->vdev_ms_array == 0) 29191635Sbonwick continue; 29201635Sbonwick if (vdev_config_sync(vd, txg) == 0) 29211635Sbonwick break; 29221635Sbonwick } 29231635Sbonwick if (c == children) 29241635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 29251635Sbonwick } 29261635Sbonwick 29272082Seschrock dmu_tx_commit(tx); 29282082Seschrock 29291635Sbonwick /* 29301635Sbonwick * Clear the dirty config list. 
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}
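/*
 * The ref-then-drop-lock pattern in spa_sync_allpools() generalizes to
 * any per-pool operation that must not hold spa_namespace_lock.  A
 * hedged sketch follows; example_op and example_foreach_pool() are
 * invented for illustration only.
 */
static void
example_foreach_pool(void (*example_op)(spa_t *))
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		spa_open_ref(spa, FTAG);	/* keep the spa_t alive */
		mutex_exit(&spa_namespace_lock);
		example_op(spa);		/* safe: lock not held */
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);		/* drop our reference */
	}
	mutex_exit(&spa_namespace_lock);
}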
/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Find the vdev with the given guid anywhere in the pool's vdev tree.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

/*
 * Upgrade the pool's on-disk version to the current ZFS_VERSION and
 * push the change out to disk.
 */
void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * pool written by a future version would have been unopenable,
	 * the on-disk version should never exceed ZFS_VERSION here.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Determine whether the given guid refers to a device that is either an
 * active hot spare or a spare whose addition to the pool is pending.
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
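/*
 * Hedged usage sketch (hypothetical caller, not part of this file): an
 * ioctl-level handler might combine the routines above, rejecting a
 * device that is already a spare before bringing the pool up to the
 * current on-disk version.  example_prepare_device() is invented for
 * illustration only.
 */
static int
example_prepare_device(spa_t *spa, uint64_t guid)
{
	/* Refuse a device that is already an active or pending spare. */
	if (spa_has_spare(spa, guid))
		return (EBUSY);

	/* Bring the pool format up to date; waits for the txg to sync. */
	spa_upgrade(spa);

	return (0);
}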