1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 233377Seschrock * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens /* 30789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32789Sahrens * pool. 33789Sahrens */ 34789Sahrens 35789Sahrens #include <sys/zfs_context.h> 361544Seschrock #include <sys/fm/fs/zfs.h> 37789Sahrens #include <sys/spa_impl.h> 38789Sahrens #include <sys/zio.h> 39789Sahrens #include <sys/zio_checksum.h> 40789Sahrens #include <sys/zio_compress.h> 41789Sahrens #include <sys/dmu.h> 42789Sahrens #include <sys/dmu_tx.h> 43789Sahrens #include <sys/zap.h> 44789Sahrens #include <sys/zil.h> 45789Sahrens #include <sys/vdev_impl.h> 46789Sahrens #include <sys/metaslab.h> 47789Sahrens #include <sys/uberblock_impl.h> 48789Sahrens #include <sys/txg.h> 49789Sahrens #include <sys/avl.h> 50789Sahrens #include <sys/dmu_traverse.h> 513912Slling #include <sys/dmu_objset.h> 52789Sahrens #include <sys/unique.h> 53789Sahrens #include <sys/dsl_pool.h> 543912Slling #include <sys/dsl_dataset.h> 55789Sahrens #include <sys/dsl_dir.h> 56789Sahrens #include <sys/dsl_prop.h> 573912Slling #include <sys/dsl_synctask.h> 58789Sahrens #include <sys/fs/zfs.h> 59789Sahrens #include <sys/callb.h> 60*3975Sek110237 #include <sys/systeminfo.h> 61*3975Sek110237 #include <sys/sunddi.h> 62789Sahrens 632986Sek110237 int zio_taskq_threads = 8; 642986Sek110237 65789Sahrens /* 66789Sahrens * ========================================================================== 67789Sahrens * SPA state manipulation (open/create/destroy/import/export) 68789Sahrens * ========================================================================== 69789Sahrens */ 70789Sahrens 711544Seschrock static int 721544Seschrock spa_error_entry_compare(const void *a, const void *b) 731544Seschrock { 741544Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 751544Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 761544Seschrock int ret; 771544Seschrock 781544Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 791544Seschrock sizeof (zbookmark_t)); 801544Seschrock 811544Seschrock if (ret < 0) 821544Seschrock return (-1); 831544Seschrock else if (ret > 0) 841544Seschrock return (1); 851544Seschrock else 861544Seschrock return (0); 871544Seschrock } 881544Seschrock 891544Seschrock /* 901544Seschrock * Utility function which retrieves copies of the current logs and 911544Seschrock * re-initializes them in the process. 921544Seschrock */ 931544Seschrock void 941544Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 951544Seschrock { 961544Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 971544Seschrock 981544Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 991544Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 1001544Seschrock 1011544Seschrock avl_create(&spa->spa_errlist_scrub, 1021544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1031544Seschrock offsetof(spa_error_entry_t, se_avl)); 1041544Seschrock avl_create(&spa->spa_errlist_last, 1051544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1061544Seschrock offsetof(spa_error_entry_t, se_avl)); 1071544Seschrock } 1081544Seschrock 109789Sahrens /* 110789Sahrens * Activate an uninitialized pool. 111789Sahrens */ 112789Sahrens static void 113789Sahrens spa_activate(spa_t *spa) 114789Sahrens { 115789Sahrens int t; 116789Sahrens 117789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118789Sahrens 119789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 120789Sahrens 121789Sahrens spa->spa_normal_class = metaslab_class_create(); 122789Sahrens 123789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 124789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 1252986Sek110237 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 126789Sahrens TASKQ_PREPOPULATE); 127789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 1282986Sek110237 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 129789Sahrens TASKQ_PREPOPULATE); 130789Sahrens } 131789Sahrens 132789Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 133789Sahrens 1342856Snd150628 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 1352856Snd150628 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 1362856Snd150628 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 1372856Snd150628 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 1382856Snd150628 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 1392856Snd150628 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 1402856Snd150628 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 1412926Sek110237 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 1423912Slling mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 1432856Snd150628 144789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 145789Sahrens offsetof(vdev_t, vdev_dirty_node)); 146789Sahrens 147789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 148789Sahrens offsetof(struct vdev, vdev_txg_node)); 1491544Seschrock 1501544Seschrock avl_create(&spa->spa_errlist_scrub, 1511544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1521544Seschrock offsetof(spa_error_entry_t, se_avl)); 1531544Seschrock avl_create(&spa->spa_errlist_last, 1541544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1551544Seschrock offsetof(spa_error_entry_t, se_avl)); 156789Sahrens } 157789Sahrens 158789Sahrens /* 159789Sahrens * Opposite of spa_activate(). 160789Sahrens */ 161789Sahrens static void 162789Sahrens spa_deactivate(spa_t *spa) 163789Sahrens { 164789Sahrens int t; 165789Sahrens 166789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 167789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 168789Sahrens ASSERT(spa->spa_root_vdev == NULL); 169789Sahrens 170789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 171789Sahrens 172789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 173789Sahrens 174789Sahrens list_destroy(&spa->spa_dirty_list); 175789Sahrens 176789Sahrens rw_destroy(&spa->spa_traverse_lock); 177789Sahrens 178789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 179789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 180789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 181789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 182789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 183789Sahrens } 184789Sahrens 185789Sahrens metaslab_class_destroy(spa->spa_normal_class); 186789Sahrens spa->spa_normal_class = NULL; 187789Sahrens 1881544Seschrock /* 1891544Seschrock * If this was part of an import or the open otherwise failed, we may 1901544Seschrock * still have errors left in the queues. Empty them just in case. 1911544Seschrock */ 1921544Seschrock spa_errlog_drain(spa); 1931544Seschrock 1941544Seschrock avl_destroy(&spa->spa_errlist_scrub); 1951544Seschrock avl_destroy(&spa->spa_errlist_last); 1961544Seschrock 197789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 198789Sahrens } 199789Sahrens 200789Sahrens /* 201789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 202789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 203789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 204789Sahrens * All vdev validation is done by the vdev_alloc() routine. 205789Sahrens */ 2062082Seschrock static int 2072082Seschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 2082082Seschrock uint_t id, int atype) 209789Sahrens { 210789Sahrens nvlist_t **child; 211789Sahrens uint_t c, children; 2122082Seschrock int error; 2132082Seschrock 2142082Seschrock if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 2152082Seschrock return (error); 2162082Seschrock 2172082Seschrock if ((*vdp)->vdev_ops->vdev_op_leaf) 2182082Seschrock return (0); 219789Sahrens 220789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 221789Sahrens &child, &children) != 0) { 2222082Seschrock vdev_free(*vdp); 2232082Seschrock *vdp = NULL; 2242082Seschrock return (EINVAL); 225789Sahrens } 226789Sahrens 227789Sahrens for (c = 0; c < children; c++) { 2282082Seschrock vdev_t *vd; 2292082Seschrock if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 2302082Seschrock atype)) != 0) { 2312082Seschrock vdev_free(*vdp); 2322082Seschrock *vdp = NULL; 2332082Seschrock return (error); 234789Sahrens } 235789Sahrens } 236789Sahrens 2372082Seschrock ASSERT(*vdp != NULL); 2382082Seschrock 2392082Seschrock return (0); 240789Sahrens } 241789Sahrens 242789Sahrens /* 243789Sahrens * Opposite of spa_load(). 244789Sahrens */ 245789Sahrens static void 246789Sahrens spa_unload(spa_t *spa) 247789Sahrens { 2482082Seschrock int i; 2492082Seschrock 250789Sahrens /* 2511544Seschrock * Stop async tasks. 2521544Seschrock */ 2531544Seschrock spa_async_suspend(spa); 2541544Seschrock 2551544Seschrock /* 256789Sahrens * Stop syncing. 257789Sahrens */ 258789Sahrens if (spa->spa_sync_on) { 259789Sahrens txg_sync_stop(spa->spa_dsl_pool); 260789Sahrens spa->spa_sync_on = B_FALSE; 261789Sahrens } 262789Sahrens 263789Sahrens /* 264789Sahrens * Wait for any outstanding prefetch I/O to complete. 265789Sahrens */ 2661544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2671544Seschrock spa_config_exit(spa, FTAG); 268789Sahrens 269789Sahrens /* 270789Sahrens * Close the dsl pool. 271789Sahrens */ 272789Sahrens if (spa->spa_dsl_pool) { 273789Sahrens dsl_pool_close(spa->spa_dsl_pool); 274789Sahrens spa->spa_dsl_pool = NULL; 275789Sahrens } 276789Sahrens 277789Sahrens /* 278789Sahrens * Close all vdevs. 279789Sahrens */ 2801585Sbonwick if (spa->spa_root_vdev) 281789Sahrens vdev_free(spa->spa_root_vdev); 2821585Sbonwick ASSERT(spa->spa_root_vdev == NULL); 2831544Seschrock 2842082Seschrock for (i = 0; i < spa->spa_nspares; i++) 2852082Seschrock vdev_free(spa->spa_spares[i]); 2862082Seschrock if (spa->spa_spares) { 2872082Seschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 2882082Seschrock spa->spa_spares = NULL; 2892082Seschrock } 2902082Seschrock if (spa->spa_sparelist) { 2912082Seschrock nvlist_free(spa->spa_sparelist); 2922082Seschrock spa->spa_sparelist = NULL; 2932082Seschrock } 2942082Seschrock 2951544Seschrock spa->spa_async_suspended = 0; 296789Sahrens } 297789Sahrens 298789Sahrens /* 2992082Seschrock * Load (or re-load) the current list of vdevs describing the active spares for 3002082Seschrock * this pool. When this is called, we have some form of basic information in 3012082Seschrock * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 3022082Seschrock * re-generate a more complete list including status information. 3032082Seschrock */ 3042082Seschrock static void 3052082Seschrock spa_load_spares(spa_t *spa) 3062082Seschrock { 3072082Seschrock nvlist_t **spares; 3082082Seschrock uint_t nspares; 3092082Seschrock int i; 3103377Seschrock vdev_t *vd, *tvd; 3112082Seschrock 3122082Seschrock /* 3132082Seschrock * First, close and free any existing spare vdevs. 3142082Seschrock */ 3152082Seschrock for (i = 0; i < spa->spa_nspares; i++) { 3163377Seschrock vd = spa->spa_spares[i]; 3173377Seschrock 3183377Seschrock /* Undo the call to spa_activate() below */ 3193377Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 3203377Seschrock tvd->vdev_isspare) 3213377Seschrock spa_spare_remove(tvd); 3223377Seschrock vdev_close(vd); 3233377Seschrock vdev_free(vd); 3242082Seschrock } 3253377Seschrock 3262082Seschrock if (spa->spa_spares) 3272082Seschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 3282082Seschrock 3292082Seschrock if (spa->spa_sparelist == NULL) 3302082Seschrock nspares = 0; 3312082Seschrock else 3322082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 3332082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3342082Seschrock 3352082Seschrock spa->spa_nspares = (int)nspares; 3362082Seschrock spa->spa_spares = NULL; 3372082Seschrock 3382082Seschrock if (nspares == 0) 3392082Seschrock return; 3402082Seschrock 3412082Seschrock /* 3422082Seschrock * Construct the array of vdevs, opening them to get status in the 3433377Seschrock * process. For each spare, there is potentially two different vdev_t 3443377Seschrock * structures associated with it: one in the list of spares (used only 3453377Seschrock * for basic validation purposes) and one in the active vdev 3463377Seschrock * configuration (if it's spared in). During this phase we open and 3473377Seschrock * validate each vdev on the spare list. If the vdev also exists in the 3483377Seschrock * active configuration, then we also mark this vdev as an active spare. 3492082Seschrock */ 3502082Seschrock spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 3512082Seschrock for (i = 0; i < spa->spa_nspares; i++) { 3522082Seschrock VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 3532082Seschrock VDEV_ALLOC_SPARE) == 0); 3542082Seschrock ASSERT(vd != NULL); 3552082Seschrock 3562082Seschrock spa->spa_spares[i] = vd; 3572082Seschrock 3583377Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 3593377Seschrock if (!tvd->vdev_isspare) 3603377Seschrock spa_spare_add(tvd); 3613377Seschrock 3623377Seschrock /* 3633377Seschrock * We only mark the spare active if we were successfully 3643377Seschrock * able to load the vdev. Otherwise, importing a pool 3653377Seschrock * with a bad active spare would result in strange 3663377Seschrock * behavior, because multiple pool would think the spare 3673377Seschrock * is actively in use. 3683377Seschrock * 3693377Seschrock * There is a vulnerability here to an equally bizarre 3703377Seschrock * circumstance, where a dead active spare is later 3713377Seschrock * brought back to life (onlined or otherwise). Given 3723377Seschrock * the rarity of this scenario, and the extra complexity 3733377Seschrock * it adds, we ignore the possibility. 3743377Seschrock */ 3753377Seschrock if (!vdev_is_dead(tvd)) 3763377Seschrock spa_spare_activate(tvd); 3773377Seschrock } 3783377Seschrock 3792082Seschrock if (vdev_open(vd) != 0) 3802082Seschrock continue; 3812082Seschrock 3822082Seschrock vd->vdev_top = vd; 3832082Seschrock (void) vdev_validate_spare(vd); 3842082Seschrock } 3852082Seschrock 3862082Seschrock /* 3872082Seschrock * Recompute the stashed list of spares, with status information 3882082Seschrock * this time. 3892082Seschrock */ 3902082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 3912082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 3922082Seschrock 3932082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 3942082Seschrock for (i = 0; i < spa->spa_nspares; i++) 3952082Seschrock spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 3962082Seschrock B_TRUE, B_TRUE); 3972082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 3982082Seschrock spares, spa->spa_nspares) == 0); 3992082Seschrock for (i = 0; i < spa->spa_nspares; i++) 4002082Seschrock nvlist_free(spares[i]); 4012082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 4022082Seschrock } 4032082Seschrock 4042082Seschrock static int 4052082Seschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 4062082Seschrock { 4072082Seschrock dmu_buf_t *db; 4082082Seschrock char *packed = NULL; 4092082Seschrock size_t nvsize = 0; 4102082Seschrock int error; 4112082Seschrock *value = NULL; 4122082Seschrock 4132082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4142082Seschrock nvsize = *(uint64_t *)db->db_data; 4152082Seschrock dmu_buf_rele(db, FTAG); 4162082Seschrock 4172082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 4182082Seschrock error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 4192082Seschrock if (error == 0) 4202082Seschrock error = nvlist_unpack(packed, nvsize, value, 0); 4212082Seschrock kmem_free(packed, nvsize); 4222082Seschrock 4232082Seschrock return (error); 4242082Seschrock } 4252082Seschrock 4262082Seschrock /* 427789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 4281544Seschrock * source of configuration information. 429789Sahrens */ 430789Sahrens static int 4311544Seschrock spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 432789Sahrens { 433789Sahrens int error = 0; 434789Sahrens nvlist_t *nvroot = NULL; 435789Sahrens vdev_t *rvd; 436789Sahrens uberblock_t *ub = &spa->spa_uberblock; 4371635Sbonwick uint64_t config_cache_txg = spa->spa_config_txg; 438789Sahrens uint64_t pool_guid; 4392082Seschrock uint64_t version; 440789Sahrens zio_t *zio; 441789Sahrens 4421544Seschrock spa->spa_load_state = state; 4431635Sbonwick 444789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 4451733Sbonwick nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 4461544Seschrock error = EINVAL; 4471544Seschrock goto out; 4481544Seschrock } 449789Sahrens 4502082Seschrock /* 4512082Seschrock * Versioning wasn't explicitly added to the label until later, so if 4522082Seschrock * it's not present treat it as the initial version. 4532082Seschrock */ 4542082Seschrock if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 4552082Seschrock version = ZFS_VERSION_INITIAL; 4562082Seschrock 4571733Sbonwick (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4581733Sbonwick &spa->spa_config_txg); 4591733Sbonwick 4601635Sbonwick if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 4611544Seschrock spa_guid_exists(pool_guid, 0)) { 4621544Seschrock error = EEXIST; 4631544Seschrock goto out; 4641544Seschrock } 465789Sahrens 4662174Seschrock spa->spa_load_guid = pool_guid; 4672174Seschrock 468789Sahrens /* 4692082Seschrock * Parse the configuration into a vdev tree. We explicitly set the 4702082Seschrock * value that will be returned by spa_version() since parsing the 4712082Seschrock * configuration requires knowing the version number. 472789Sahrens */ 4731544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 4742082Seschrock spa->spa_ubsync.ub_version = version; 4752082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 4761544Seschrock spa_config_exit(spa, FTAG); 477789Sahrens 4782082Seschrock if (error != 0) 4791544Seschrock goto out; 480789Sahrens 4811585Sbonwick ASSERT(spa->spa_root_vdev == rvd); 482789Sahrens ASSERT(spa_guid(spa) == pool_guid); 483789Sahrens 484789Sahrens /* 485789Sahrens * Try to open all vdevs, loading each label in the process. 486789Sahrens */ 4871544Seschrock if (vdev_open(rvd) != 0) { 4881544Seschrock error = ENXIO; 4891544Seschrock goto out; 4901544Seschrock } 491789Sahrens 492789Sahrens /* 4931986Seschrock * Validate the labels for all leaf vdevs. We need to grab the config 4941986Seschrock * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 4951986Seschrock * flag. 4961986Seschrock */ 4971986Seschrock spa_config_enter(spa, RW_READER, FTAG); 4981986Seschrock error = vdev_validate(rvd); 4991986Seschrock spa_config_exit(spa, FTAG); 5001986Seschrock 5011986Seschrock if (error != 0) { 5021986Seschrock error = EBADF; 5031986Seschrock goto out; 5041986Seschrock } 5051986Seschrock 5061986Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 5071986Seschrock error = ENXIO; 5081986Seschrock goto out; 5091986Seschrock } 5101986Seschrock 5111986Seschrock /* 512789Sahrens * Find the best uberblock. 513789Sahrens */ 514789Sahrens bzero(ub, sizeof (uberblock_t)); 515789Sahrens 516789Sahrens zio = zio_root(spa, NULL, NULL, 517789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 518789Sahrens vdev_uberblock_load(zio, rvd, ub); 519789Sahrens error = zio_wait(zio); 520789Sahrens 521789Sahrens /* 522789Sahrens * If we weren't able to find a single valid uberblock, return failure. 523789Sahrens */ 524789Sahrens if (ub->ub_txg == 0) { 5251760Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5261760Seschrock VDEV_AUX_CORRUPT_DATA); 5271544Seschrock error = ENXIO; 5281544Seschrock goto out; 5291544Seschrock } 5301544Seschrock 5311544Seschrock /* 5321544Seschrock * If the pool is newer than the code, we can't open it. 5331544Seschrock */ 5341760Seschrock if (ub->ub_version > ZFS_VERSION) { 5351760Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5361760Seschrock VDEV_AUX_VERSION_NEWER); 5371544Seschrock error = ENOTSUP; 5381544Seschrock goto out; 539789Sahrens } 540789Sahrens 541789Sahrens /* 542789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 543789Sahrens * incomplete configuration. 544789Sahrens */ 5451732Sbonwick if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 5461544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5471544Seschrock VDEV_AUX_BAD_GUID_SUM); 5481544Seschrock error = ENXIO; 5491544Seschrock goto out; 550789Sahrens } 551789Sahrens 552789Sahrens /* 553789Sahrens * Initialize internal SPA structures. 554789Sahrens */ 555789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 556789Sahrens spa->spa_ubsync = spa->spa_uberblock; 557789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 5581544Seschrock error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 5591544Seschrock if (error) { 5601544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5611544Seschrock VDEV_AUX_CORRUPT_DATA); 5621544Seschrock goto out; 5631544Seschrock } 564789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 565789Sahrens 5661544Seschrock if (zap_lookup(spa->spa_meta_objset, 567789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 5681544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 5691544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5701544Seschrock VDEV_AUX_CORRUPT_DATA); 5711544Seschrock error = EIO; 5721544Seschrock goto out; 5731544Seschrock } 574789Sahrens 575789Sahrens if (!mosconfig) { 5762082Seschrock nvlist_t *newconfig; 577*3975Sek110237 uint64_t hostid; 5782082Seschrock 5792082Seschrock if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 5801544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5811544Seschrock VDEV_AUX_CORRUPT_DATA); 5821544Seschrock error = EIO; 5831544Seschrock goto out; 5841544Seschrock } 585789Sahrens 586*3975Sek110237 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 587*3975Sek110237 &hostid) == 0) { 588*3975Sek110237 char *hostname; 589*3975Sek110237 unsigned long myhostid = 0; 590*3975Sek110237 591*3975Sek110237 VERIFY(nvlist_lookup_string(newconfig, 592*3975Sek110237 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 593*3975Sek110237 594*3975Sek110237 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 595*3975Sek110237 if ((unsigned long)hostid != myhostid) { 596*3975Sek110237 cmn_err(CE_WARN, "pool '%s' could not be " 597*3975Sek110237 "loaded as it was last accessed by " 598*3975Sek110237 "another system (host: %s hostid: 0x%lx). " 599*3975Sek110237 "See: http://www.sun.com/msg/ZFS-8000-EY", 600*3975Sek110237 spa->spa_name, hostname, 601*3975Sek110237 (unsigned long)hostid); 602*3975Sek110237 error = EBADF; 603*3975Sek110237 goto out; 604*3975Sek110237 } 605*3975Sek110237 } 606*3975Sek110237 607789Sahrens spa_config_set(spa, newconfig); 608789Sahrens spa_unload(spa); 609789Sahrens spa_deactivate(spa); 610789Sahrens spa_activate(spa); 611789Sahrens 6121544Seschrock return (spa_load(spa, newconfig, state, B_TRUE)); 6131544Seschrock } 6141544Seschrock 6151544Seschrock if (zap_lookup(spa->spa_meta_objset, 6161544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 6171544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 6181544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6191544Seschrock VDEV_AUX_CORRUPT_DATA); 6201544Seschrock error = EIO; 6211544Seschrock goto out; 622789Sahrens } 623789Sahrens 6241544Seschrock /* 6252082Seschrock * Load the bit that tells us to use the new accounting function 6262082Seschrock * (raid-z deflation). If we have an older pool, this will not 6272082Seschrock * be present. 6282082Seschrock */ 6292082Seschrock error = zap_lookup(spa->spa_meta_objset, 6302082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6312082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate); 6322082Seschrock if (error != 0 && error != ENOENT) { 6332082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6342082Seschrock VDEV_AUX_CORRUPT_DATA); 6352082Seschrock error = EIO; 6362082Seschrock goto out; 6372082Seschrock } 6382082Seschrock 6392082Seschrock /* 6401544Seschrock * Load the persistent error log. If we have an older pool, this will 6411544Seschrock * not be present. 6421544Seschrock */ 6431544Seschrock error = zap_lookup(spa->spa_meta_objset, 6441544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 6451544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_last); 6461807Sbonwick if (error != 0 && error != ENOENT) { 6471544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6481544Seschrock VDEV_AUX_CORRUPT_DATA); 6491544Seschrock error = EIO; 6501544Seschrock goto out; 6511544Seschrock } 6521544Seschrock 6531544Seschrock error = zap_lookup(spa->spa_meta_objset, 6541544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 6551544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 6561544Seschrock if (error != 0 && error != ENOENT) { 6571544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6581544Seschrock VDEV_AUX_CORRUPT_DATA); 6591544Seschrock error = EIO; 6601544Seschrock goto out; 6611544Seschrock } 662789Sahrens 663789Sahrens /* 6642926Sek110237 * Load the history object. If we have an older pool, this 6652926Sek110237 * will not be present. 6662926Sek110237 */ 6672926Sek110237 error = zap_lookup(spa->spa_meta_objset, 6682926Sek110237 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 6692926Sek110237 sizeof (uint64_t), 1, &spa->spa_history); 6702926Sek110237 if (error != 0 && error != ENOENT) { 6712926Sek110237 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6722926Sek110237 VDEV_AUX_CORRUPT_DATA); 6732926Sek110237 error = EIO; 6742926Sek110237 goto out; 6752926Sek110237 } 6762926Sek110237 6772926Sek110237 /* 6782082Seschrock * Load any hot spares for this pool. 6792082Seschrock */ 6802082Seschrock error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6812082Seschrock DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 6822082Seschrock if (error != 0 && error != ENOENT) { 6832082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6842082Seschrock VDEV_AUX_CORRUPT_DATA); 6852082Seschrock error = EIO; 6862082Seschrock goto out; 6872082Seschrock } 6882082Seschrock if (error == 0) { 6892082Seschrock ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 6902082Seschrock if (load_nvlist(spa, spa->spa_spares_object, 6912082Seschrock &spa->spa_sparelist) != 0) { 6922082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6932082Seschrock VDEV_AUX_CORRUPT_DATA); 6942082Seschrock error = EIO; 6952082Seschrock goto out; 6962082Seschrock } 6972082Seschrock 6982082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 6992082Seschrock spa_load_spares(spa); 7002082Seschrock spa_config_exit(spa, FTAG); 7012082Seschrock } 7022082Seschrock 7033912Slling error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 7043912Slling DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 7053912Slling 7063912Slling if (error && error != ENOENT) { 7073912Slling vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 7083912Slling VDEV_AUX_CORRUPT_DATA); 7093912Slling error = EIO; 7103912Slling goto out; 7113912Slling } 7123912Slling 7133912Slling if (error == 0) { 7143912Slling (void) zap_lookup(spa->spa_meta_objset, 7153912Slling spa->spa_pool_props_object, 7163912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), 7173912Slling sizeof (uint64_t), 1, &spa->spa_bootfs); 7183912Slling } 7193912Slling 7202082Seschrock /* 7211986Seschrock * Load the vdev state for all toplevel vdevs. 722789Sahrens */ 7231986Seschrock vdev_load(rvd); 724789Sahrens 725789Sahrens /* 726789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 727789Sahrens */ 7281544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 729789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 7301544Seschrock spa_config_exit(spa, FTAG); 731789Sahrens 732789Sahrens /* 733789Sahrens * Check the state of the root vdev. If it can't be opened, it 734789Sahrens * indicates one or more toplevel vdevs are faulted. 735789Sahrens */ 7361544Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 7371544Seschrock error = ENXIO; 7381544Seschrock goto out; 7391544Seschrock } 740789Sahrens 7411544Seschrock if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 7421635Sbonwick dmu_tx_t *tx; 7431635Sbonwick int need_update = B_FALSE; 7441585Sbonwick int c; 7451601Sbonwick 7461635Sbonwick /* 7471635Sbonwick * Claim log blocks that haven't been committed yet. 7481635Sbonwick * This must all happen in a single txg. 7491635Sbonwick */ 7501601Sbonwick tx = dmu_tx_create_assigned(spa_get_dsl(spa), 751789Sahrens spa_first_txg(spa)); 7522417Sahrens (void) dmu_objset_find(spa->spa_name, 7532417Sahrens zil_claim, tx, DS_FIND_CHILDREN); 754789Sahrens dmu_tx_commit(tx); 755789Sahrens 756789Sahrens spa->spa_sync_on = B_TRUE; 757789Sahrens txg_sync_start(spa->spa_dsl_pool); 758789Sahrens 759789Sahrens /* 760789Sahrens * Wait for all claims to sync. 761789Sahrens */ 762789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 7631585Sbonwick 7641585Sbonwick /* 7651635Sbonwick * If the config cache is stale, or we have uninitialized 7661635Sbonwick * metaslabs (see spa_vdev_add()), then update the config. 7671585Sbonwick */ 7681635Sbonwick if (config_cache_txg != spa->spa_config_txg || 7691635Sbonwick state == SPA_LOAD_IMPORT) 7701635Sbonwick need_update = B_TRUE; 7711635Sbonwick 7721635Sbonwick for (c = 0; c < rvd->vdev_children; c++) 7731635Sbonwick if (rvd->vdev_child[c]->vdev_ms_array == 0) 7741635Sbonwick need_update = B_TRUE; 7751585Sbonwick 7761585Sbonwick /* 7771635Sbonwick * Update the config cache asychronously in case we're the 7781635Sbonwick * root pool, in which case the config cache isn't writable yet. 7791585Sbonwick */ 7801635Sbonwick if (need_update) 7811635Sbonwick spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 782789Sahrens } 783789Sahrens 7841544Seschrock error = 0; 7851544Seschrock out: 7862082Seschrock if (error && error != EBADF) 7871544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 7881544Seschrock spa->spa_load_state = SPA_LOAD_NONE; 7891544Seschrock spa->spa_ena = 0; 7901544Seschrock 7911544Seschrock return (error); 792789Sahrens } 793789Sahrens 794789Sahrens /* 795789Sahrens * Pool Open/Import 796789Sahrens * 797789Sahrens * The import case is identical to an open except that the configuration is sent 798789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 799789Sahrens * case of an open, the pool configuration will exist in the 800789Sahrens * POOL_STATE_UNITIALIZED state. 801789Sahrens * 802789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 803789Sahrens * the same time open the pool, without having to keep around the spa_t in some 804789Sahrens * ambiguous state. 805789Sahrens */ 806789Sahrens static int 807789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 808789Sahrens { 809789Sahrens spa_t *spa; 810789Sahrens int error; 811789Sahrens int loaded = B_FALSE; 812789Sahrens int locked = B_FALSE; 813789Sahrens 814789Sahrens *spapp = NULL; 815789Sahrens 816789Sahrens /* 817789Sahrens * As disgusting as this is, we need to support recursive calls to this 818789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 819789Sahrens * up calling spa_open() again. The real fix is to figure out how to 820789Sahrens * avoid dsl_dir_open() calling this in the first place. 821789Sahrens */ 822789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 823789Sahrens mutex_enter(&spa_namespace_lock); 824789Sahrens locked = B_TRUE; 825789Sahrens } 826789Sahrens 827789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 828789Sahrens if (locked) 829789Sahrens mutex_exit(&spa_namespace_lock); 830789Sahrens return (ENOENT); 831789Sahrens } 832789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 833789Sahrens 834789Sahrens spa_activate(spa); 835789Sahrens 8361635Sbonwick error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 837789Sahrens 838789Sahrens if (error == EBADF) { 839789Sahrens /* 8401986Seschrock * If vdev_validate() returns failure (indicated by 8411986Seschrock * EBADF), it indicates that one of the vdevs indicates 8421986Seschrock * that the pool has been exported or destroyed. If 8431986Seschrock * this is the case, the config cache is out of sync and 8441986Seschrock * we should remove the pool from the namespace. 845789Sahrens */ 8462082Seschrock zfs_post_ok(spa, NULL); 847789Sahrens spa_unload(spa); 848789Sahrens spa_deactivate(spa); 849789Sahrens spa_remove(spa); 850789Sahrens spa_config_sync(); 851789Sahrens if (locked) 852789Sahrens mutex_exit(&spa_namespace_lock); 853789Sahrens return (ENOENT); 8541544Seschrock } 8551544Seschrock 8561544Seschrock if (error) { 857789Sahrens /* 858789Sahrens * We can't open the pool, but we still have useful 859789Sahrens * information: the state of each vdev after the 860789Sahrens * attempted vdev_open(). Return this to the user. 861789Sahrens */ 8621635Sbonwick if (config != NULL && spa->spa_root_vdev != NULL) { 8631635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 864789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 865789Sahrens B_TRUE); 8661635Sbonwick spa_config_exit(spa, FTAG); 8671635Sbonwick } 868789Sahrens spa_unload(spa); 869789Sahrens spa_deactivate(spa); 8701544Seschrock spa->spa_last_open_failed = B_TRUE; 871789Sahrens if (locked) 872789Sahrens mutex_exit(&spa_namespace_lock); 873789Sahrens *spapp = NULL; 874789Sahrens return (error); 8751544Seschrock } else { 8761544Seschrock zfs_post_ok(spa, NULL); 8771544Seschrock spa->spa_last_open_failed = B_FALSE; 878789Sahrens } 879789Sahrens 880789Sahrens loaded = B_TRUE; 881789Sahrens } 882789Sahrens 883789Sahrens spa_open_ref(spa, tag); 884789Sahrens if (locked) 885789Sahrens mutex_exit(&spa_namespace_lock); 886789Sahrens 887789Sahrens *spapp = spa; 888789Sahrens 889789Sahrens if (config != NULL) { 8901544Seschrock spa_config_enter(spa, RW_READER, FTAG); 891789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 8921544Seschrock spa_config_exit(spa, FTAG); 893789Sahrens } 894789Sahrens 895789Sahrens /* 896789Sahrens * If we just loaded the pool, resilver anything that's out of date. 897789Sahrens */ 898789Sahrens if (loaded && (spa_mode & FWRITE)) 899789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 900789Sahrens 901789Sahrens return (0); 902789Sahrens } 903789Sahrens 904789Sahrens int 905789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 906789Sahrens { 907789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 908789Sahrens } 909789Sahrens 9101544Seschrock /* 9111544Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 9121544Seschrock * preventing it from being exported or destroyed. 9131544Seschrock */ 9141544Seschrock spa_t * 9151544Seschrock spa_inject_addref(char *name) 9161544Seschrock { 9171544Seschrock spa_t *spa; 9181544Seschrock 9191544Seschrock mutex_enter(&spa_namespace_lock); 9201544Seschrock if ((spa = spa_lookup(name)) == NULL) { 9211544Seschrock mutex_exit(&spa_namespace_lock); 9221544Seschrock return (NULL); 9231544Seschrock } 9241544Seschrock spa->spa_inject_ref++; 9251544Seschrock mutex_exit(&spa_namespace_lock); 9261544Seschrock 9271544Seschrock return (spa); 9281544Seschrock } 9291544Seschrock 9301544Seschrock void 9311544Seschrock spa_inject_delref(spa_t *spa) 9321544Seschrock { 9331544Seschrock mutex_enter(&spa_namespace_lock); 9341544Seschrock spa->spa_inject_ref--; 9351544Seschrock mutex_exit(&spa_namespace_lock); 9361544Seschrock } 9371544Seschrock 9382082Seschrock static void 9392082Seschrock spa_add_spares(spa_t *spa, nvlist_t *config) 9402082Seschrock { 9412082Seschrock nvlist_t **spares; 9422082Seschrock uint_t i, nspares; 9432082Seschrock nvlist_t *nvroot; 9442082Seschrock uint64_t guid; 9452082Seschrock vdev_stat_t *vs; 9462082Seschrock uint_t vsc; 9473377Seschrock uint64_t pool; 9482082Seschrock 9492082Seschrock if (spa->spa_nspares == 0) 9502082Seschrock return; 9512082Seschrock 9522082Seschrock VERIFY(nvlist_lookup_nvlist(config, 9532082Seschrock ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 9542082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 9552082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 9562082Seschrock if (nspares != 0) { 9572082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, 9582082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 9592082Seschrock VERIFY(nvlist_lookup_nvlist_array(nvroot, 9602082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 9612082Seschrock 9622082Seschrock /* 9632082Seschrock * Go through and find any spares which have since been 9642082Seschrock * repurposed as an active spare. If this is the case, update 9652082Seschrock * their status appropriately. 9662082Seschrock */ 9672082Seschrock for (i = 0; i < nspares; i++) { 9682082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 9692082Seschrock ZPOOL_CONFIG_GUID, &guid) == 0); 9703377Seschrock if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 9712082Seschrock VERIFY(nvlist_lookup_uint64_array( 9722082Seschrock spares[i], ZPOOL_CONFIG_STATS, 9732082Seschrock (uint64_t **)&vs, &vsc) == 0); 9742082Seschrock vs->vs_state = VDEV_STATE_CANT_OPEN; 9752082Seschrock vs->vs_aux = VDEV_AUX_SPARED; 9762082Seschrock } 9772082Seschrock } 9782082Seschrock } 9792082Seschrock } 9802082Seschrock 981789Sahrens int 9821544Seschrock spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 983789Sahrens { 984789Sahrens int error; 985789Sahrens spa_t *spa; 986789Sahrens 987789Sahrens *config = NULL; 988789Sahrens error = spa_open_common(name, &spa, FTAG, config); 989789Sahrens 9902082Seschrock if (spa && *config != NULL) { 9911544Seschrock VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 9921544Seschrock spa_get_errlog_size(spa)) == 0); 9931544Seschrock 9942082Seschrock spa_add_spares(spa, *config); 9952082Seschrock } 9962082Seschrock 9971544Seschrock /* 9981544Seschrock * We want to get the alternate root even for faulted pools, so we cheat 9991544Seschrock * and call spa_lookup() directly. 10001544Seschrock */ 10011544Seschrock if (altroot) { 10021544Seschrock if (spa == NULL) { 10031544Seschrock mutex_enter(&spa_namespace_lock); 10041544Seschrock spa = spa_lookup(name); 10051544Seschrock if (spa) 10061544Seschrock spa_altroot(spa, altroot, buflen); 10071544Seschrock else 10081544Seschrock altroot[0] = '\0'; 10091544Seschrock spa = NULL; 10101544Seschrock mutex_exit(&spa_namespace_lock); 10111544Seschrock } else { 10121544Seschrock spa_altroot(spa, altroot, buflen); 10131544Seschrock } 10141544Seschrock } 10151544Seschrock 1016789Sahrens if (spa != NULL) 1017789Sahrens spa_close(spa, FTAG); 1018789Sahrens 1019789Sahrens return (error); 1020789Sahrens } 1021789Sahrens 1022789Sahrens /* 10232082Seschrock * Validate that the 'spares' array is well formed. We must have an array of 10243377Seschrock * nvlists, each which describes a valid leaf vdev. If this is an import (mode 10253377Seschrock * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 10263377Seschrock * as they are well-formed. 10272082Seschrock */ 10282082Seschrock static int 10292082Seschrock spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 10302082Seschrock { 10312082Seschrock nvlist_t **spares; 10322082Seschrock uint_t i, nspares; 10332082Seschrock vdev_t *vd; 10342082Seschrock int error; 10352082Seschrock 10362082Seschrock /* 10372082Seschrock * It's acceptable to have no spares specified. 10382082Seschrock */ 10392082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 10402082Seschrock &spares, &nspares) != 0) 10412082Seschrock return (0); 10422082Seschrock 10432082Seschrock if (nspares == 0) 10442082Seschrock return (EINVAL); 10452082Seschrock 10462082Seschrock /* 10472082Seschrock * Make sure the pool is formatted with a version that supports hot 10482082Seschrock * spares. 10492082Seschrock */ 10502082Seschrock if (spa_version(spa) < ZFS_VERSION_SPARES) 10512082Seschrock return (ENOTSUP); 10522082Seschrock 10533377Seschrock /* 10543377Seschrock * Set the pending spare list so we correctly handle device in-use 10553377Seschrock * checking. 10563377Seschrock */ 10573377Seschrock spa->spa_pending_spares = spares; 10583377Seschrock spa->spa_pending_nspares = nspares; 10593377Seschrock 10602082Seschrock for (i = 0; i < nspares; i++) { 10612082Seschrock if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 10622082Seschrock mode)) != 0) 10633377Seschrock goto out; 10642082Seschrock 10652082Seschrock if (!vd->vdev_ops->vdev_op_leaf) { 10662082Seschrock vdev_free(vd); 10673377Seschrock error = EINVAL; 10683377Seschrock goto out; 10692082Seschrock } 10702082Seschrock 10712082Seschrock vd->vdev_top = vd; 10723377Seschrock 10733377Seschrock if ((error = vdev_open(vd)) == 0 && 10743377Seschrock (error = vdev_label_init(vd, crtxg, 10753377Seschrock VDEV_LABEL_SPARE)) == 0) { 10763377Seschrock VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 10773377Seschrock vd->vdev_guid) == 0); 10782082Seschrock } 10792082Seschrock 10802082Seschrock vdev_free(vd); 10813377Seschrock 10823377Seschrock if (error && mode != VDEV_ALLOC_SPARE) 10833377Seschrock goto out; 10843377Seschrock else 10853377Seschrock error = 0; 10862082Seschrock } 10872082Seschrock 10883377Seschrock out: 10893377Seschrock spa->spa_pending_spares = NULL; 10903377Seschrock spa->spa_pending_nspares = 0; 10913377Seschrock return (error); 10922082Seschrock } 10932082Seschrock 10942082Seschrock /* 1095789Sahrens * Pool Creation 1096789Sahrens */ 1097789Sahrens int 10981635Sbonwick spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1099789Sahrens { 1100789Sahrens spa_t *spa; 11011635Sbonwick vdev_t *rvd; 1102789Sahrens dsl_pool_t *dp; 1103789Sahrens dmu_tx_t *tx; 11042082Seschrock int c, error = 0; 1105789Sahrens uint64_t txg = TXG_INITIAL; 11062082Seschrock nvlist_t **spares; 11072082Seschrock uint_t nspares; 1108789Sahrens 1109789Sahrens /* 1110789Sahrens * If this pool already exists, return failure. 1111789Sahrens */ 1112789Sahrens mutex_enter(&spa_namespace_lock); 1113789Sahrens if (spa_lookup(pool) != NULL) { 1114789Sahrens mutex_exit(&spa_namespace_lock); 1115789Sahrens return (EEXIST); 1116789Sahrens } 1117789Sahrens 1118789Sahrens /* 1119789Sahrens * Allocate a new spa_t structure. 1120789Sahrens */ 11211635Sbonwick spa = spa_add(pool, altroot); 1122789Sahrens spa_activate(spa); 1123789Sahrens 1124789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 11251760Seschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 1126789Sahrens spa->spa_ubsync = spa->spa_uberblock; 1127789Sahrens 11281635Sbonwick /* 11291635Sbonwick * Create the root vdev. 11301635Sbonwick */ 11311635Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 11321635Sbonwick 11332082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 11342082Seschrock 11352082Seschrock ASSERT(error != 0 || rvd != NULL); 11362082Seschrock ASSERT(error != 0 || spa->spa_root_vdev == rvd); 11372082Seschrock 11382082Seschrock if (error == 0 && rvd->vdev_children == 0) 11391635Sbonwick error = EINVAL; 11402082Seschrock 11412082Seschrock if (error == 0 && 11422082Seschrock (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 11432082Seschrock (error = spa_validate_spares(spa, nvroot, txg, 11442082Seschrock VDEV_ALLOC_ADD)) == 0) { 11452082Seschrock for (c = 0; c < rvd->vdev_children; c++) 11462082Seschrock vdev_init(rvd->vdev_child[c], txg); 11472082Seschrock vdev_config_dirty(rvd); 11481635Sbonwick } 11491635Sbonwick 11501635Sbonwick spa_config_exit(spa, FTAG); 1151789Sahrens 11522082Seschrock if (error != 0) { 1153789Sahrens spa_unload(spa); 1154789Sahrens spa_deactivate(spa); 1155789Sahrens spa_remove(spa); 1156789Sahrens mutex_exit(&spa_namespace_lock); 1157789Sahrens return (error); 1158789Sahrens } 1159789Sahrens 11602082Seschrock /* 11612082Seschrock * Get the list of spares, if specified. 11622082Seschrock */ 11632082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 11642082Seschrock &spares, &nspares) == 0) { 11652082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 11662082Seschrock KM_SLEEP) == 0); 11672082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 11682082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 11692082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 11702082Seschrock spa_load_spares(spa); 11712082Seschrock spa_config_exit(spa, FTAG); 11722082Seschrock spa->spa_sync_spares = B_TRUE; 11732082Seschrock } 11742082Seschrock 1175789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1176789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 1177789Sahrens 1178789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1179789Sahrens 1180789Sahrens /* 1181789Sahrens * Create the pool config object. 1182789Sahrens */ 1183789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1184789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 1185789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1186789Sahrens 11871544Seschrock if (zap_add(spa->spa_meta_objset, 1188789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 11891544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 11901544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 11911544Seschrock } 1192789Sahrens 11932082Seschrock /* Newly created pools are always deflated. */ 11942082Seschrock spa->spa_deflate = TRUE; 11952082Seschrock if (zap_add(spa->spa_meta_objset, 11962082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 11972082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 11982082Seschrock cmn_err(CE_PANIC, "failed to add deflate"); 11992082Seschrock } 12002082Seschrock 1201789Sahrens /* 1202789Sahrens * Create the deferred-free bplist object. Turn off compression 1203789Sahrens * because sync-to-convergence takes longer if the blocksize 1204789Sahrens * keeps changing. 1205789Sahrens */ 1206789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1207789Sahrens 1 << 14, tx); 1208789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1209789Sahrens ZIO_COMPRESS_OFF, tx); 1210789Sahrens 12111544Seschrock if (zap_add(spa->spa_meta_objset, 1212789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 12131544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 12141544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 12151544Seschrock } 1216789Sahrens 12172926Sek110237 /* 12182926Sek110237 * Create the pool's history object. 12192926Sek110237 */ 12202926Sek110237 spa_history_create_obj(spa, tx); 12212926Sek110237 1222789Sahrens dmu_tx_commit(tx); 1223789Sahrens 12243912Slling spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1225789Sahrens spa->spa_sync_on = B_TRUE; 1226789Sahrens txg_sync_start(spa->spa_dsl_pool); 1227789Sahrens 1228789Sahrens /* 1229789Sahrens * We explicitly wait for the first transaction to complete so that our 1230789Sahrens * bean counters are appropriately updated. 1231789Sahrens */ 1232789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 1233789Sahrens 1234789Sahrens spa_config_sync(); 1235789Sahrens 1236789Sahrens mutex_exit(&spa_namespace_lock); 1237789Sahrens 1238789Sahrens return (0); 1239789Sahrens } 1240789Sahrens 1241789Sahrens /* 1242789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 1243789Sahrens * then call spa_load() to do the dirty work. 1244789Sahrens */ 1245789Sahrens int 12461635Sbonwick spa_import(const char *pool, nvlist_t *config, const char *altroot) 1247789Sahrens { 1248789Sahrens spa_t *spa; 1249789Sahrens int error; 12502082Seschrock nvlist_t *nvroot; 12512082Seschrock nvlist_t **spares; 12522082Seschrock uint_t nspares; 1253789Sahrens 1254789Sahrens if (!(spa_mode & FWRITE)) 1255789Sahrens return (EROFS); 1256789Sahrens 1257789Sahrens /* 1258789Sahrens * If a pool with this name exists, return failure. 1259789Sahrens */ 1260789Sahrens mutex_enter(&spa_namespace_lock); 1261789Sahrens if (spa_lookup(pool) != NULL) { 1262789Sahrens mutex_exit(&spa_namespace_lock); 1263789Sahrens return (EEXIST); 1264789Sahrens } 1265789Sahrens 1266789Sahrens /* 12671635Sbonwick * Create and initialize the spa structure. 1268789Sahrens */ 12691635Sbonwick spa = spa_add(pool, altroot); 1270789Sahrens spa_activate(spa); 1271789Sahrens 1272789Sahrens /* 12731635Sbonwick * Pass off the heavy lifting to spa_load(). 12741732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 12751732Sbonwick * is actually the one to trust when doing an import. 12761601Sbonwick */ 12771732Sbonwick error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1278789Sahrens 12792082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 12802082Seschrock /* 12812082Seschrock * Toss any existing sparelist, as it doesn't have any validity anymore, 12822082Seschrock * and conflicts with spa_has_spare(). 12832082Seschrock */ 12842082Seschrock if (spa->spa_sparelist) { 12852082Seschrock nvlist_free(spa->spa_sparelist); 12862082Seschrock spa->spa_sparelist = NULL; 12872082Seschrock spa_load_spares(spa); 12882082Seschrock } 12892082Seschrock 12902082Seschrock VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 12912082Seschrock &nvroot) == 0); 12922082Seschrock if (error == 0) 12932082Seschrock error = spa_validate_spares(spa, nvroot, -1ULL, 12942082Seschrock VDEV_ALLOC_SPARE); 12952082Seschrock spa_config_exit(spa, FTAG); 12962082Seschrock 12972082Seschrock if (error != 0) { 1298789Sahrens spa_unload(spa); 1299789Sahrens spa_deactivate(spa); 1300789Sahrens spa_remove(spa); 1301789Sahrens mutex_exit(&spa_namespace_lock); 1302789Sahrens return (error); 1303789Sahrens } 1304789Sahrens 13051635Sbonwick /* 13062082Seschrock * Override any spares as specified by the user, as these may have 13072082Seschrock * correct device names/devids, etc. 13082082Seschrock */ 13092082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 13102082Seschrock &spares, &nspares) == 0) { 13112082Seschrock if (spa->spa_sparelist) 13122082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 13132082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 13142082Seschrock else 13152082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 13162082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 13172082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 13182082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 13192082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 13202082Seschrock spa_load_spares(spa); 13212082Seschrock spa_config_exit(spa, FTAG); 13222082Seschrock spa->spa_sync_spares = B_TRUE; 13232082Seschrock } 13242082Seschrock 13252082Seschrock /* 13261635Sbonwick * Update the config cache to include the newly-imported pool. 13271635Sbonwick */ 13281635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 13291635Sbonwick 1330789Sahrens mutex_exit(&spa_namespace_lock); 1331789Sahrens 1332789Sahrens /* 1333789Sahrens * Resilver anything that's out of date. 1334789Sahrens */ 1335789Sahrens if (spa_mode & FWRITE) 1336789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1337789Sahrens 1338789Sahrens return (0); 1339789Sahrens } 1340789Sahrens 1341789Sahrens /* 1342789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 1343789Sahrens * to get the vdev stats associated with the imported devices. 1344789Sahrens */ 1345789Sahrens #define TRYIMPORT_NAME "$import" 1346789Sahrens 1347789Sahrens nvlist_t * 1348789Sahrens spa_tryimport(nvlist_t *tryconfig) 1349789Sahrens { 1350789Sahrens nvlist_t *config = NULL; 1351789Sahrens char *poolname; 1352789Sahrens spa_t *spa; 1353789Sahrens uint64_t state; 1354789Sahrens 1355789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1356789Sahrens return (NULL); 1357789Sahrens 1358789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1359789Sahrens return (NULL); 1360789Sahrens 13611635Sbonwick /* 13621635Sbonwick * Create and initialize the spa structure. 13631635Sbonwick */ 1364789Sahrens mutex_enter(&spa_namespace_lock); 13651635Sbonwick spa = spa_add(TRYIMPORT_NAME, NULL); 1366789Sahrens spa_activate(spa); 1367789Sahrens 1368789Sahrens /* 13691635Sbonwick * Pass off the heavy lifting to spa_load(). 13701732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 13711732Sbonwick * is actually the one to trust when doing an import. 1372789Sahrens */ 13731732Sbonwick (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1374789Sahrens 1375789Sahrens /* 1376789Sahrens * If 'tryconfig' was at least parsable, return the current config. 1377789Sahrens */ 1378789Sahrens if (spa->spa_root_vdev != NULL) { 13791635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 1380789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 13811635Sbonwick spa_config_exit(spa, FTAG); 1382789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1383789Sahrens poolname) == 0); 1384789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1385789Sahrens state) == 0); 1386*3975Sek110237 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 1387*3975Sek110237 spa->spa_uberblock.ub_timestamp) == 0); 13882082Seschrock 13892082Seschrock /* 13902082Seschrock * Add the list of hot spares. 13912082Seschrock */ 13922082Seschrock spa_add_spares(spa, config); 1393789Sahrens } 1394789Sahrens 1395789Sahrens spa_unload(spa); 1396789Sahrens spa_deactivate(spa); 1397789Sahrens spa_remove(spa); 1398789Sahrens mutex_exit(&spa_namespace_lock); 1399789Sahrens 1400789Sahrens return (config); 1401789Sahrens } 1402789Sahrens 1403789Sahrens /* 1404789Sahrens * Pool export/destroy 1405789Sahrens * 1406789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 1407789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 1408789Sahrens * update the pool state and sync all the labels to disk, removing the 1409789Sahrens * configuration from the cache afterwards. 1410789Sahrens */ 1411789Sahrens static int 14121775Sbillm spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1413789Sahrens { 1414789Sahrens spa_t *spa; 1415789Sahrens 14161775Sbillm if (oldconfig) 14171775Sbillm *oldconfig = NULL; 14181775Sbillm 1419789Sahrens if (!(spa_mode & FWRITE)) 1420789Sahrens return (EROFS); 1421789Sahrens 1422789Sahrens mutex_enter(&spa_namespace_lock); 1423789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 1424789Sahrens mutex_exit(&spa_namespace_lock); 1425789Sahrens return (ENOENT); 1426789Sahrens } 1427789Sahrens 1428789Sahrens /* 14291544Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 14301544Seschrock * reacquire the namespace lock, and see if we can export. 14311544Seschrock */ 14321544Seschrock spa_open_ref(spa, FTAG); 14331544Seschrock mutex_exit(&spa_namespace_lock); 14341544Seschrock spa_async_suspend(spa); 14351544Seschrock mutex_enter(&spa_namespace_lock); 14361544Seschrock spa_close(spa, FTAG); 14371544Seschrock 14381544Seschrock /* 1439789Sahrens * The pool will be in core if it's openable, 1440789Sahrens * in which case we can modify its state. 1441789Sahrens */ 1442789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1443789Sahrens /* 1444789Sahrens * Objsets may be open only because they're dirty, so we 1445789Sahrens * have to force it to sync before checking spa_refcnt. 1446789Sahrens */ 1447789Sahrens spa_scrub_suspend(spa); 1448789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 1449789Sahrens 14501544Seschrock /* 14511544Seschrock * A pool cannot be exported or destroyed if there are active 14521544Seschrock * references. If we are resetting a pool, allow references by 14531544Seschrock * fault injection handlers. 14541544Seschrock */ 14551544Seschrock if (!spa_refcount_zero(spa) || 14561544Seschrock (spa->spa_inject_ref != 0 && 14571544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1458789Sahrens spa_scrub_resume(spa); 14591544Seschrock spa_async_resume(spa); 1460789Sahrens mutex_exit(&spa_namespace_lock); 1461789Sahrens return (EBUSY); 1462789Sahrens } 1463789Sahrens 1464789Sahrens spa_scrub_resume(spa); 1465789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1466789Sahrens 1467789Sahrens /* 1468789Sahrens * We want this to be reflected on every label, 1469789Sahrens * so mark them all dirty. spa_unload() will do the 1470789Sahrens * final sync that pushes these changes out. 1471789Sahrens */ 14721544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 14731601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 14741544Seschrock spa->spa_state = new_state; 14751635Sbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 14761544Seschrock vdev_config_dirty(spa->spa_root_vdev); 14771601Sbonwick spa_config_exit(spa, FTAG); 14781544Seschrock } 1479789Sahrens } 1480789Sahrens 1481789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1482789Sahrens spa_unload(spa); 1483789Sahrens spa_deactivate(spa); 1484789Sahrens } 1485789Sahrens 14861775Sbillm if (oldconfig && spa->spa_config) 14871775Sbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 14881775Sbillm 14891544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 14901544Seschrock spa_remove(spa); 14911544Seschrock spa_config_sync(); 14921544Seschrock } 1493789Sahrens mutex_exit(&spa_namespace_lock); 1494789Sahrens 1495789Sahrens return (0); 1496789Sahrens } 1497789Sahrens 1498789Sahrens /* 1499789Sahrens * Destroy a storage pool. 1500789Sahrens */ 1501789Sahrens int 1502789Sahrens spa_destroy(char *pool) 1503789Sahrens { 15041775Sbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1505789Sahrens } 1506789Sahrens 1507789Sahrens /* 1508789Sahrens * Export a storage pool. 1509789Sahrens */ 1510789Sahrens int 15111775Sbillm spa_export(char *pool, nvlist_t **oldconfig) 1512789Sahrens { 15131775Sbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1514789Sahrens } 1515789Sahrens 1516789Sahrens /* 15171544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 15181544Seschrock * from the namespace in any way. 15191544Seschrock */ 15201544Seschrock int 15211544Seschrock spa_reset(char *pool) 15221544Seschrock { 15231775Sbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 15241544Seschrock } 15251544Seschrock 15261544Seschrock 15271544Seschrock /* 1528789Sahrens * ========================================================================== 1529789Sahrens * Device manipulation 1530789Sahrens * ========================================================================== 1531789Sahrens */ 1532789Sahrens 1533789Sahrens /* 1534789Sahrens * Add capacity to a storage pool. 1535789Sahrens */ 1536789Sahrens int 1537789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1538789Sahrens { 1539789Sahrens uint64_t txg; 15401635Sbonwick int c, error; 1541789Sahrens vdev_t *rvd = spa->spa_root_vdev; 15421585Sbonwick vdev_t *vd, *tvd; 15432082Seschrock nvlist_t **spares; 15442082Seschrock uint_t i, nspares; 1545789Sahrens 1546789Sahrens txg = spa_vdev_enter(spa); 1547789Sahrens 15482082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 15492082Seschrock VDEV_ALLOC_ADD)) != 0) 15502082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 15512082Seschrock 15523377Seschrock spa->spa_pending_vdev = vd; 1553789Sahrens 15542082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 15552082Seschrock &spares, &nspares) != 0) 15562082Seschrock nspares = 0; 15572082Seschrock 15583377Seschrock if (vd->vdev_children == 0 && nspares == 0) { 15593377Seschrock spa->spa_pending_vdev = NULL; 15602082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 15613377Seschrock } 15622082Seschrock 15632082Seschrock if (vd->vdev_children != 0) { 15643377Seschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 15653377Seschrock spa->spa_pending_vdev = NULL; 15662082Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15672082Seschrock } 15682082Seschrock } 15692082Seschrock 15703377Seschrock /* 15713377Seschrock * We must validate the spares after checking the children. Otherwise, 15723377Seschrock * vdev_inuse() will blindly overwrite the spare. 15733377Seschrock */ 15743377Seschrock if ((error = spa_validate_spares(spa, nvroot, txg, 15753377Seschrock VDEV_ALLOC_ADD)) != 0) { 15763377Seschrock spa->spa_pending_vdev = NULL; 15773377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15783377Seschrock } 15793377Seschrock 15803377Seschrock spa->spa_pending_vdev = NULL; 15813377Seschrock 15823377Seschrock /* 15833377Seschrock * Transfer each new top-level vdev from vd to rvd. 15843377Seschrock */ 15853377Seschrock for (c = 0; c < vd->vdev_children; c++) { 15863377Seschrock tvd = vd->vdev_child[c]; 15873377Seschrock vdev_remove_child(vd, tvd); 15883377Seschrock tvd->vdev_id = rvd->vdev_children; 15893377Seschrock vdev_add_child(rvd, tvd); 15903377Seschrock vdev_config_dirty(tvd); 15913377Seschrock } 15923377Seschrock 15932082Seschrock if (nspares != 0) { 15942082Seschrock if (spa->spa_sparelist != NULL) { 15952082Seschrock nvlist_t **oldspares; 15962082Seschrock uint_t oldnspares; 15972082Seschrock nvlist_t **newspares; 15982082Seschrock 15992082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 16002082Seschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 16012082Seschrock 16022082Seschrock newspares = kmem_alloc(sizeof (void *) * 16032082Seschrock (nspares + oldnspares), KM_SLEEP); 16042082Seschrock for (i = 0; i < oldnspares; i++) 16052082Seschrock VERIFY(nvlist_dup(oldspares[i], 16062082Seschrock &newspares[i], KM_SLEEP) == 0); 16072082Seschrock for (i = 0; i < nspares; i++) 16082082Seschrock VERIFY(nvlist_dup(spares[i], 16092082Seschrock &newspares[i + oldnspares], 16102082Seschrock KM_SLEEP) == 0); 16112082Seschrock 16122082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 16132082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 16142082Seschrock 16152082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16162082Seschrock ZPOOL_CONFIG_SPARES, newspares, 16172082Seschrock nspares + oldnspares) == 0); 16182082Seschrock for (i = 0; i < oldnspares + nspares; i++) 16192082Seschrock nvlist_free(newspares[i]); 16202082Seschrock kmem_free(newspares, (oldnspares + nspares) * 16212082Seschrock sizeof (void *)); 16222082Seschrock } else { 16232082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 16242082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 16252082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16262082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 16272082Seschrock } 16282082Seschrock 16292082Seschrock spa_load_spares(spa); 16302082Seschrock spa->spa_sync_spares = B_TRUE; 1631789Sahrens } 1632789Sahrens 1633789Sahrens /* 16341585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 16351585Sbonwick * If other threads start allocating from these vdevs before we 16361585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 16371585Sbonwick * fail to open the pool because there are DVAs that the config cache 16381585Sbonwick * can't translate. Therefore, we first add the vdevs without 16391585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 16401635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 16411585Sbonwick * 16421585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 16431585Sbonwick * if we lose power at any point in this sequence, the remaining 16441585Sbonwick * steps will be completed the next time we load the pool. 1645789Sahrens */ 16461635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 16471585Sbonwick 16481635Sbonwick mutex_enter(&spa_namespace_lock); 16491635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 16501635Sbonwick mutex_exit(&spa_namespace_lock); 1651789Sahrens 16521635Sbonwick return (0); 1653789Sahrens } 1654789Sahrens 1655789Sahrens /* 1656789Sahrens * Attach a device to a mirror. The arguments are the path to any device 1657789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 1658789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 1659789Sahrens * 1660789Sahrens * If 'replacing' is specified, the new device is intended to replace the 1661789Sahrens * existing device; in this case the two devices are made into their own 1662789Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 1663789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 1664789Sahrens * extra rules: you can't attach to it after it's been created, and upon 1665789Sahrens * completion of resilvering, the first disk (the one being replaced) 1666789Sahrens * is automatically detached. 1667789Sahrens */ 1668789Sahrens int 16691544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1670789Sahrens { 1671789Sahrens uint64_t txg, open_txg; 1672789Sahrens int error; 1673789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1674789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 16752082Seschrock vdev_ops_t *pvops; 1676789Sahrens 1677789Sahrens txg = spa_vdev_enter(spa); 1678789Sahrens 16791544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1680789Sahrens 1681789Sahrens if (oldvd == NULL) 1682789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1683789Sahrens 16841585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 16851585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 16861585Sbonwick 1687789Sahrens pvd = oldvd->vdev_parent; 1688789Sahrens 16892082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 16902082Seschrock VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1691789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1692789Sahrens 1693789Sahrens newvd = newrootvd->vdev_child[0]; 1694789Sahrens 1695789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1696789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1697789Sahrens 16982082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1699789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1700789Sahrens 17012082Seschrock if (!replacing) { 17022082Seschrock /* 17032082Seschrock * For attach, the only allowable parent is a mirror or the root 17042082Seschrock * vdev. 17052082Seschrock */ 17062082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 17072082Seschrock pvd->vdev_ops != &vdev_root_ops) 17082082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17092082Seschrock 17102082Seschrock pvops = &vdev_mirror_ops; 17112082Seschrock } else { 17122082Seschrock /* 17132082Seschrock * Active hot spares can only be replaced by inactive hot 17142082Seschrock * spares. 17152082Seschrock */ 17162082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 17172082Seschrock pvd->vdev_child[1] == oldvd && 17182082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 17192082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17202082Seschrock 17212082Seschrock /* 17222082Seschrock * If the source is a hot spare, and the parent isn't already a 17232082Seschrock * spare, then we want to create a new hot spare. Otherwise, we 17243377Seschrock * want to create a replacing vdev. The user is not allowed to 17253377Seschrock * attach to a spared vdev child unless the 'isspare' state is 17263377Seschrock * the same (spare replaces spare, non-spare replaces 17273377Seschrock * non-spare). 17282082Seschrock */ 17292082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 17302082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17313377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 17323377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 17333377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17342082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 17352082Seschrock newvd->vdev_isspare) 17362082Seschrock pvops = &vdev_spare_ops; 17372082Seschrock else 17382082Seschrock pvops = &vdev_replacing_ops; 17392082Seschrock } 17402082Seschrock 17411175Slling /* 17421175Slling * Compare the new device size with the replaceable/attachable 17431175Slling * device size. 17441175Slling */ 17451175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1746789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1747789Sahrens 17481732Sbonwick /* 17491732Sbonwick * The new device cannot have a higher alignment requirement 17501732Sbonwick * than the top-level vdev. 17511732Sbonwick */ 17521732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1753789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1754789Sahrens 1755789Sahrens /* 1756789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1757789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1758789Sahrens */ 1759789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1760789Sahrens spa_strfree(oldvd->vdev_path); 1761789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1762789Sahrens KM_SLEEP); 1763789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1764789Sahrens newvd->vdev_path, "old"); 1765789Sahrens if (oldvd->vdev_devid != NULL) { 1766789Sahrens spa_strfree(oldvd->vdev_devid); 1767789Sahrens oldvd->vdev_devid = NULL; 1768789Sahrens } 1769789Sahrens } 1770789Sahrens 1771789Sahrens /* 17722082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 17732082Seschrock * mirror/replacing/spare vdev above oldvd. 1774789Sahrens */ 1775789Sahrens if (pvd->vdev_ops != pvops) 1776789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1777789Sahrens 1778789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1779789Sahrens ASSERT(pvd->vdev_ops == pvops); 1780789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1781789Sahrens 1782789Sahrens /* 1783789Sahrens * Extract the new device from its root and add it to pvd. 1784789Sahrens */ 1785789Sahrens vdev_remove_child(newrootvd, newvd); 1786789Sahrens newvd->vdev_id = pvd->vdev_children; 1787789Sahrens vdev_add_child(pvd, newvd); 1788789Sahrens 17891544Seschrock /* 17901544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 17911544Seschrock * the addition of newvd may have decreased our parent's asize. 17921544Seschrock */ 17931544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 17941544Seschrock 1795789Sahrens tvd = newvd->vdev_top; 1796789Sahrens ASSERT(pvd->vdev_top == tvd); 1797789Sahrens ASSERT(tvd->vdev_parent == rvd); 1798789Sahrens 1799789Sahrens vdev_config_dirty(tvd); 1800789Sahrens 1801789Sahrens /* 1802789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1803789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1804789Sahrens */ 1805789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1806789Sahrens 1807789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1808789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1809789Sahrens open_txg - TXG_INITIAL + 1); 1810789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1811789Sahrens 18123377Seschrock if (newvd->vdev_isspare) 18133377Seschrock spa_spare_activate(newvd); 18141544Seschrock 1815789Sahrens /* 1816789Sahrens * Mark newvd's DTL dirty in this txg. 1817789Sahrens */ 18181732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1819789Sahrens 1820789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1821789Sahrens 1822789Sahrens /* 1823789Sahrens * Kick off a resilver to update newvd. 1824789Sahrens */ 1825789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1826789Sahrens 1827789Sahrens return (0); 1828789Sahrens } 1829789Sahrens 1830789Sahrens /* 1831789Sahrens * Detach a device from a mirror or replacing vdev. 1832789Sahrens * If 'replace_done' is specified, only detach if the parent 1833789Sahrens * is a replacing vdev. 1834789Sahrens */ 1835789Sahrens int 18361544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1837789Sahrens { 1838789Sahrens uint64_t txg; 1839789Sahrens int c, t, error; 1840789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1841789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 18422082Seschrock boolean_t unspare = B_FALSE; 18432082Seschrock uint64_t unspare_guid; 1844789Sahrens 1845789Sahrens txg = spa_vdev_enter(spa); 1846789Sahrens 18471544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1848789Sahrens 1849789Sahrens if (vd == NULL) 1850789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1851789Sahrens 18521585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 18531585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18541585Sbonwick 1855789Sahrens pvd = vd->vdev_parent; 1856789Sahrens 1857789Sahrens /* 1858789Sahrens * If replace_done is specified, only remove this device if it's 18592082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 18602082Seschrock * disk can be removed. 1861789Sahrens */ 18622082Seschrock if (replace_done) { 18632082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 18642082Seschrock if (vd->vdev_id != 0) 18652082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18662082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 18672082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18682082Seschrock } 18692082Seschrock } 18702082Seschrock 18712082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 18722082Seschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1873789Sahrens 1874789Sahrens /* 18752082Seschrock * Only mirror, replacing, and spare vdevs support detach. 1876789Sahrens */ 1877789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 18782082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 18792082Seschrock pvd->vdev_ops != &vdev_spare_ops) 1880789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1881789Sahrens 1882789Sahrens /* 1883789Sahrens * If there's only one replica, you can't detach it. 1884789Sahrens */ 1885789Sahrens if (pvd->vdev_children <= 1) 1886789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1887789Sahrens 1888789Sahrens /* 1889789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1890789Sahrens * valid copy of the data, which means we cannot safely detach it. 1891789Sahrens * 1892789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1893789Sahrens * precise DTL check. 1894789Sahrens */ 1895789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1896789Sahrens uint64_t dirty; 1897789Sahrens 1898789Sahrens cvd = pvd->vdev_child[c]; 1899789Sahrens if (cvd == vd) 1900789Sahrens continue; 1901789Sahrens if (vdev_is_dead(cvd)) 1902789Sahrens continue; 1903789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1904789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1905789Sahrens cvd->vdev_dtl_scrub.sm_space; 1906789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1907789Sahrens if (!dirty) 1908789Sahrens break; 1909789Sahrens } 19102082Seschrock 19112082Seschrock /* 19122082Seschrock * If we are a replacing or spare vdev, then we can always detach the 19132082Seschrock * latter child, as that is how one cancels the operation. 19142082Seschrock */ 19152082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 19162082Seschrock c == pvd->vdev_children) 1917789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1918789Sahrens 1919789Sahrens /* 19202082Seschrock * If we are detaching the original disk from a spare, then it implies 19212082Seschrock * that the spare should become a real disk, and be removed from the 19222082Seschrock * active spare list for the pool. 19232082Seschrock */ 19242082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 19252082Seschrock vd->vdev_id == 0) 19262082Seschrock unspare = B_TRUE; 19272082Seschrock 19282082Seschrock /* 1929789Sahrens * Erase the disk labels so the disk can be used for other things. 1930789Sahrens * This must be done after all other error cases are handled, 1931789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1932789Sahrens * But if we can't do it, don't treat the error as fatal -- 1933789Sahrens * it may be that the unwritability of the disk is the reason 1934789Sahrens * it's being detached! 1935789Sahrens */ 19363377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1937789Sahrens 1938789Sahrens /* 1939789Sahrens * Remove vd from its parent and compact the parent's children. 1940789Sahrens */ 1941789Sahrens vdev_remove_child(pvd, vd); 1942789Sahrens vdev_compact_children(pvd); 1943789Sahrens 1944789Sahrens /* 1945789Sahrens * Remember one of the remaining children so we can get tvd below. 1946789Sahrens */ 1947789Sahrens cvd = pvd->vdev_child[0]; 1948789Sahrens 1949789Sahrens /* 19502082Seschrock * If we need to remove the remaining child from the list of hot spares, 19512082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 19522082Seschrock * must do this before vdev_remove_parent(), because that can change the 19532082Seschrock * GUID if it creates a new toplevel GUID. 19542082Seschrock */ 19552082Seschrock if (unspare) { 19562082Seschrock ASSERT(cvd->vdev_isspare); 19573377Seschrock spa_spare_remove(cvd); 19582082Seschrock unspare_guid = cvd->vdev_guid; 19592082Seschrock } 19602082Seschrock 19612082Seschrock /* 1962789Sahrens * If the parent mirror/replacing vdev only has one child, 1963789Sahrens * the parent is no longer needed. Remove it from the tree. 1964789Sahrens */ 1965789Sahrens if (pvd->vdev_children == 1) 1966789Sahrens vdev_remove_parent(cvd); 1967789Sahrens 1968789Sahrens /* 1969789Sahrens * We don't set tvd until now because the parent we just removed 1970789Sahrens * may have been the previous top-level vdev. 1971789Sahrens */ 1972789Sahrens tvd = cvd->vdev_top; 1973789Sahrens ASSERT(tvd->vdev_parent == rvd); 1974789Sahrens 1975789Sahrens /* 19763377Seschrock * Reevaluate the parent vdev state. 1977789Sahrens */ 19783377Seschrock vdev_propagate_state(cvd->vdev_parent); 1979789Sahrens 1980789Sahrens /* 19813377Seschrock * If the device we just detached was smaller than the others, it may be 19823377Seschrock * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 19833377Seschrock * can't fail because the existing metaslabs are already in core, so 19843377Seschrock * there's nothing to read from disk. 1985789Sahrens */ 19861732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1987789Sahrens 1988789Sahrens vdev_config_dirty(tvd); 1989789Sahrens 1990789Sahrens /* 19913377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 19923377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 19933377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 19943377Seschrock * prevent vd from being accessed after it's freed. 1995789Sahrens */ 1996789Sahrens for (t = 0; t < TXG_SIZE; t++) 1997789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 19981732Sbonwick vd->vdev_detached = B_TRUE; 19991732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 2000789Sahrens 20012082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 20022082Seschrock 20032082Seschrock /* 20043377Seschrock * If this was the removal of the original device in a hot spare vdev, 20053377Seschrock * then we want to go through and remove the device from the hot spare 20063377Seschrock * list of every other pool. 20072082Seschrock */ 20082082Seschrock if (unspare) { 20092082Seschrock spa = NULL; 20102082Seschrock mutex_enter(&spa_namespace_lock); 20112082Seschrock while ((spa = spa_next(spa)) != NULL) { 20122082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 20132082Seschrock continue; 20142082Seschrock 20152082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 20162082Seschrock } 20172082Seschrock mutex_exit(&spa_namespace_lock); 20182082Seschrock } 20192082Seschrock 20202082Seschrock return (error); 20212082Seschrock } 20222082Seschrock 20232082Seschrock /* 20242082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 20252082Seschrock * spares. 20262082Seschrock */ 20272082Seschrock int 20282082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 20292082Seschrock { 20302082Seschrock vdev_t *vd; 20312082Seschrock nvlist_t **spares, *nv, **newspares; 20322082Seschrock uint_t i, j, nspares; 20332082Seschrock int ret = 0; 20342082Seschrock 20352082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 20362082Seschrock 20372082Seschrock vd = spa_lookup_by_guid(spa, guid); 20382082Seschrock 20392082Seschrock nv = NULL; 20402082Seschrock if (spa->spa_spares != NULL && 20412082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20422082Seschrock &spares, &nspares) == 0) { 20432082Seschrock for (i = 0; i < nspares; i++) { 20442082Seschrock uint64_t theguid; 20452082Seschrock 20462082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 20472082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 20482082Seschrock if (theguid == guid) { 20492082Seschrock nv = spares[i]; 20502082Seschrock break; 20512082Seschrock } 20522082Seschrock } 20532082Seschrock } 20542082Seschrock 20552082Seschrock /* 20562082Seschrock * We only support removing a hot spare, and only if it's not currently 20572082Seschrock * in use in this pool. 20582082Seschrock */ 20592082Seschrock if (nv == NULL && vd == NULL) { 20602082Seschrock ret = ENOENT; 20612082Seschrock goto out; 20622082Seschrock } 20632082Seschrock 20642082Seschrock if (nv == NULL && vd != NULL) { 20652082Seschrock ret = ENOTSUP; 20662082Seschrock goto out; 20672082Seschrock } 20682082Seschrock 20692082Seschrock if (!unspare && nv != NULL && vd != NULL) { 20702082Seschrock ret = EBUSY; 20712082Seschrock goto out; 20722082Seschrock } 20732082Seschrock 20742082Seschrock if (nspares == 1) { 20752082Seschrock newspares = NULL; 20762082Seschrock } else { 20772082Seschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 20782082Seschrock KM_SLEEP); 20792082Seschrock for (i = 0, j = 0; i < nspares; i++) { 20802082Seschrock if (spares[i] != nv) 20812082Seschrock VERIFY(nvlist_dup(spares[i], 20822082Seschrock &newspares[j++], KM_SLEEP) == 0); 20832082Seschrock } 20842082Seschrock } 20852082Seschrock 20862082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20872082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 20882082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20892082Seschrock newspares, nspares - 1) == 0); 20902082Seschrock for (i = 0; i < nspares - 1; i++) 20912082Seschrock nvlist_free(newspares[i]); 20922082Seschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 20932082Seschrock spa_load_spares(spa); 20942082Seschrock spa->spa_sync_spares = B_TRUE; 20952082Seschrock 20962082Seschrock out: 20972082Seschrock spa_config_exit(spa, FTAG); 20982082Seschrock 20992082Seschrock return (ret); 2100789Sahrens } 2101789Sahrens 2102789Sahrens /* 21031544Seschrock * Find any device that's done replacing, so we can detach it. 2104789Sahrens */ 21051544Seschrock static vdev_t * 21061544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 2107789Sahrens { 21081544Seschrock vdev_t *newvd, *oldvd; 2109789Sahrens int c; 2110789Sahrens 21111544Seschrock for (c = 0; c < vd->vdev_children; c++) { 21121544Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 21131544Seschrock if (oldvd != NULL) 21141544Seschrock return (oldvd); 21151544Seschrock } 2116789Sahrens 2117789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 21181544Seschrock oldvd = vd->vdev_child[0]; 21191544Seschrock newvd = vd->vdev_child[1]; 2120789Sahrens 21211544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 21221544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 21231544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 21241544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 21251544Seschrock return (oldvd); 21261544Seschrock } 21271544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 21281544Seschrock } 2129789Sahrens 21301544Seschrock return (NULL); 2131789Sahrens } 2132789Sahrens 21331544Seschrock static void 2134789Sahrens spa_vdev_replace_done(spa_t *spa) 2135789Sahrens { 21361544Seschrock vdev_t *vd; 21372082Seschrock vdev_t *pvd; 21381544Seschrock uint64_t guid; 21392082Seschrock uint64_t pguid = 0; 2140789Sahrens 21411544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2142789Sahrens 21431544Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 21441544Seschrock guid = vd->vdev_guid; 21452082Seschrock /* 21462082Seschrock * If we have just finished replacing a hot spared device, then 21472082Seschrock * we need to detach the parent's first child (the original hot 21482082Seschrock * spare) as well. 21492082Seschrock */ 21502082Seschrock pvd = vd->vdev_parent; 21512082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 21522082Seschrock pvd->vdev_id == 0) { 21532082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 21542082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 21552082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 21562082Seschrock } 21571544Seschrock spa_config_exit(spa, FTAG); 21581544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 21591544Seschrock return; 21602082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 21612082Seschrock return; 21621544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2163789Sahrens } 2164789Sahrens 21651544Seschrock spa_config_exit(spa, FTAG); 2166789Sahrens } 2167789Sahrens 2168789Sahrens /* 21691354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 21701354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 21711354Seschrock */ 21721354Seschrock int 21731354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 21741354Seschrock { 21751354Seschrock vdev_t *rvd, *vd; 21761354Seschrock uint64_t txg; 21771354Seschrock 21781354Seschrock rvd = spa->spa_root_vdev; 21791354Seschrock 21801354Seschrock txg = spa_vdev_enter(spa); 21811354Seschrock 21822082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 21832082Seschrock /* 21842082Seschrock * Determine if this is a reference to a hot spare. In that 21852082Seschrock * case, update the path as stored in the spare list. 21862082Seschrock */ 21872082Seschrock nvlist_t **spares; 21882082Seschrock uint_t i, nspares; 21892082Seschrock if (spa->spa_sparelist != NULL) { 21902082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 21912082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 21922082Seschrock for (i = 0; i < nspares; i++) { 21932082Seschrock uint64_t theguid; 21942082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 21952082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 21962082Seschrock if (theguid == guid) 21972082Seschrock break; 21982082Seschrock } 21992082Seschrock 22002082Seschrock if (i == nspares) 22012082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 22022082Seschrock 22032082Seschrock VERIFY(nvlist_add_string(spares[i], 22042082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 22052082Seschrock spa_load_spares(spa); 22062082Seschrock spa->spa_sync_spares = B_TRUE; 22072082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 22082082Seschrock } else { 22092082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 22102082Seschrock } 22112082Seschrock } 22121354Seschrock 22131585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 22141585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 22151585Sbonwick 22161354Seschrock spa_strfree(vd->vdev_path); 22171354Seschrock vd->vdev_path = spa_strdup(newpath); 22181354Seschrock 22191354Seschrock vdev_config_dirty(vd->vdev_top); 22201354Seschrock 22211354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 22221354Seschrock } 22231354Seschrock 22241354Seschrock /* 2225789Sahrens * ========================================================================== 2226789Sahrens * SPA Scrubbing 2227789Sahrens * ========================================================================== 2228789Sahrens */ 2229789Sahrens 2230789Sahrens static void 2231789Sahrens spa_scrub_io_done(zio_t *zio) 2232789Sahrens { 2233789Sahrens spa_t *spa = zio->io_spa; 2234789Sahrens 22353290Sjohansen zio_data_buf_free(zio->io_data, zio->io_size); 2236789Sahrens 2237789Sahrens mutex_enter(&spa->spa_scrub_lock); 22381544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 22391775Sbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2240789Sahrens spa->spa_scrub_errors++; 2241789Sahrens mutex_enter(&vd->vdev_stat_lock); 2242789Sahrens vd->vdev_stat.vs_scrub_errors++; 2243789Sahrens mutex_exit(&vd->vdev_stat_lock); 2244789Sahrens } 22453697Smishra 22463697Smishra if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 22471544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 22483697Smishra 22493697Smishra ASSERT(spa->spa_scrub_inflight >= 0); 22503697Smishra 22511544Seschrock mutex_exit(&spa->spa_scrub_lock); 2252789Sahrens } 2253789Sahrens 2254789Sahrens static void 22551544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 22561544Seschrock zbookmark_t *zb) 2257789Sahrens { 2258789Sahrens size_t size = BP_GET_LSIZE(bp); 22593697Smishra void *data; 2260789Sahrens 2261789Sahrens mutex_enter(&spa->spa_scrub_lock); 22623697Smishra /* 22633697Smishra * Do not give too much work to vdev(s). 22643697Smishra */ 22653697Smishra while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 22663697Smishra cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 22673697Smishra } 2268789Sahrens spa->spa_scrub_inflight++; 2269789Sahrens mutex_exit(&spa->spa_scrub_lock); 2270789Sahrens 22713697Smishra data = zio_data_buf_alloc(size); 22723697Smishra 22731544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 22741544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 22751544Seschrock 22761807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 22771544Seschrock 2278789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 22791544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2280789Sahrens } 2281789Sahrens 2282789Sahrens /* ARGSUSED */ 2283789Sahrens static int 2284789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2285789Sahrens { 2286789Sahrens blkptr_t *bp = &bc->bc_blkptr; 22871775Sbillm vdev_t *vd = spa->spa_root_vdev; 22881775Sbillm dva_t *dva = bp->blk_dva; 22891775Sbillm int needs_resilver = B_FALSE; 22901775Sbillm int d; 2291789Sahrens 22921775Sbillm if (bc->bc_errno) { 2293789Sahrens /* 2294789Sahrens * We can't scrub this block, but we can continue to scrub 2295789Sahrens * the rest of the pool. Note the error and move along. 2296789Sahrens */ 2297789Sahrens mutex_enter(&spa->spa_scrub_lock); 2298789Sahrens spa->spa_scrub_errors++; 2299789Sahrens mutex_exit(&spa->spa_scrub_lock); 2300789Sahrens 23011775Sbillm mutex_enter(&vd->vdev_stat_lock); 23021775Sbillm vd->vdev_stat.vs_scrub_errors++; 23031775Sbillm mutex_exit(&vd->vdev_stat_lock); 2304789Sahrens 2305789Sahrens return (ERESTART); 2306789Sahrens } 2307789Sahrens 2308789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2309789Sahrens 23101775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 23111775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 23121775Sbillm 23131775Sbillm ASSERT(vd != NULL); 23141775Sbillm 23151775Sbillm /* 23161775Sbillm * Keep track of how much data we've examined so that 23171775Sbillm * zpool(1M) status can make useful progress reports. 23181775Sbillm */ 23191775Sbillm mutex_enter(&vd->vdev_stat_lock); 23201775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 23211775Sbillm mutex_exit(&vd->vdev_stat_lock); 2322789Sahrens 23231775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 23241775Sbillm if (DVA_GET_GANG(&dva[d])) { 23251775Sbillm /* 23261775Sbillm * Gang members may be spread across multiple 23271775Sbillm * vdevs, so the best we can do is look at the 23281775Sbillm * pool-wide DTL. 23291775Sbillm * XXX -- it would be better to change our 23301775Sbillm * allocation policy to ensure that this can't 23311775Sbillm * happen. 23321775Sbillm */ 23331775Sbillm vd = spa->spa_root_vdev; 23341775Sbillm } 23351775Sbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 23361775Sbillm bp->blk_birth, 1)) 23371775Sbillm needs_resilver = B_TRUE; 2338789Sahrens } 23391775Sbillm } 23401775Sbillm 23411775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2342789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 23431544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 23441775Sbillm else if (needs_resilver) 23451775Sbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 23461775Sbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2347789Sahrens 2348789Sahrens return (0); 2349789Sahrens } 2350789Sahrens 2351789Sahrens static void 2352789Sahrens spa_scrub_thread(spa_t *spa) 2353789Sahrens { 2354789Sahrens callb_cpr_t cprinfo; 2355789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2356789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2357789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2358789Sahrens int error = 0; 2359789Sahrens boolean_t complete; 2360789Sahrens 2361789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2362789Sahrens 2363797Sbonwick /* 2364797Sbonwick * If we're restarting due to a snapshot create/delete, 2365797Sbonwick * wait for that to complete. 2366797Sbonwick */ 2367797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2368797Sbonwick 23691544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 23701544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 23711544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 23721544Seschrock 23731544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 23741544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2375789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2376789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 23771544Seschrock spa_config_exit(spa, FTAG); 2378789Sahrens 2379789Sahrens mutex_enter(&spa->spa_scrub_lock); 2380789Sahrens spa->spa_scrub_errors = 0; 2381789Sahrens spa->spa_scrub_active = 1; 23821544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 2383789Sahrens 2384789Sahrens while (!spa->spa_scrub_stop) { 2385789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 23861544Seschrock while (spa->spa_scrub_suspended) { 2387789Sahrens spa->spa_scrub_active = 0; 2388789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2389789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2390789Sahrens spa->spa_scrub_active = 1; 2391789Sahrens } 2392789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2393789Sahrens 2394789Sahrens if (spa->spa_scrub_restart_txg != 0) 2395789Sahrens break; 2396789Sahrens 2397789Sahrens mutex_exit(&spa->spa_scrub_lock); 2398789Sahrens error = traverse_more(th); 2399789Sahrens mutex_enter(&spa->spa_scrub_lock); 2400789Sahrens if (error != EAGAIN) 2401789Sahrens break; 2402789Sahrens } 2403789Sahrens 2404789Sahrens while (spa->spa_scrub_inflight) 2405789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2406789Sahrens 24071601Sbonwick spa->spa_scrub_active = 0; 24081601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 24091601Sbonwick 24101601Sbonwick mutex_exit(&spa->spa_scrub_lock); 24111601Sbonwick 24121601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 24131601Sbonwick 24141601Sbonwick mutex_enter(&spa->spa_scrub_lock); 24151601Sbonwick 24161601Sbonwick /* 24171601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 24181601Sbonwick * AND the spa config lock to synchronize with any config changes 24191601Sbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 24201601Sbonwick */ 2421789Sahrens if (spa->spa_scrub_restart_txg != 0) 2422789Sahrens error = ERESTART; 2423789Sahrens 24241544Seschrock if (spa->spa_scrub_stop) 24251544Seschrock error = EINTR; 24261544Seschrock 2427789Sahrens /* 24281544Seschrock * Even if there were uncorrectable errors, we consider the scrub 24291544Seschrock * completed. The downside is that if there is a transient error during 24301544Seschrock * a resilver, we won't resilver the data properly to the target. But 24311544Seschrock * if the damage is permanent (more likely) we will resilver forever, 24321544Seschrock * which isn't really acceptable. Since there is enough information for 24331544Seschrock * the user to know what has failed and why, this seems like a more 24341544Seschrock * tractable approach. 2435789Sahrens */ 24361544Seschrock complete = (error == 0); 2437789Sahrens 24381544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 24391544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2440789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2441789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2442789Sahrens 2443789Sahrens mutex_exit(&spa->spa_scrub_lock); 2444789Sahrens 2445789Sahrens /* 2446789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2447789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2448789Sahrens */ 2449789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2450789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2451789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 24521544Seschrock spa_errlog_rotate(spa); 24531601Sbonwick 24541544Seschrock spa_config_exit(spa, FTAG); 2455789Sahrens 2456789Sahrens mutex_enter(&spa->spa_scrub_lock); 2457789Sahrens 24581544Seschrock /* 24591544Seschrock * We may have finished replacing a device. 24601544Seschrock * Let the async thread assess this and handle the detach. 24611544Seschrock */ 24621544Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2463789Sahrens 2464789Sahrens /* 2465789Sahrens * If we were told to restart, our final act is to start a new scrub. 2466789Sahrens */ 2467789Sahrens if (error == ERESTART) 24681544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 24691544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2470789Sahrens 24711544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 24721544Seschrock spa->spa_scrub_active = 0; 24731544Seschrock spa->spa_scrub_thread = NULL; 24741544Seschrock cv_broadcast(&spa->spa_scrub_cv); 2475789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2476789Sahrens thread_exit(); 2477789Sahrens } 2478789Sahrens 2479789Sahrens void 2480789Sahrens spa_scrub_suspend(spa_t *spa) 2481789Sahrens { 2482789Sahrens mutex_enter(&spa->spa_scrub_lock); 24831544Seschrock spa->spa_scrub_suspended++; 2484789Sahrens while (spa->spa_scrub_active) { 2485789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2486789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2487789Sahrens } 2488789Sahrens while (spa->spa_scrub_inflight) 2489789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2490789Sahrens mutex_exit(&spa->spa_scrub_lock); 2491789Sahrens } 2492789Sahrens 2493789Sahrens void 2494789Sahrens spa_scrub_resume(spa_t *spa) 2495789Sahrens { 2496789Sahrens mutex_enter(&spa->spa_scrub_lock); 24971544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 24981544Seschrock if (--spa->spa_scrub_suspended == 0) 2499789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2500789Sahrens mutex_exit(&spa->spa_scrub_lock); 2501789Sahrens } 2502789Sahrens 2503789Sahrens void 2504789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 2505789Sahrens { 2506789Sahrens /* 2507789Sahrens * Something happened (e.g. snapshot create/delete) that means 2508789Sahrens * we must restart any in-progress scrubs. The itinerary will 2509789Sahrens * fix this properly. 2510789Sahrens */ 2511789Sahrens mutex_enter(&spa->spa_scrub_lock); 2512789Sahrens spa->spa_scrub_restart_txg = txg; 2513789Sahrens mutex_exit(&spa->spa_scrub_lock); 2514789Sahrens } 2515789Sahrens 25161544Seschrock int 25171544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2518789Sahrens { 2519789Sahrens space_seg_t *ss; 2520789Sahrens uint64_t mintxg, maxtxg; 2521789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2522789Sahrens 2523789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 2524789Sahrens return (ENOTSUP); 2525789Sahrens 25261544Seschrock mutex_enter(&spa->spa_scrub_lock); 25271544Seschrock 2528789Sahrens /* 2529789Sahrens * If there's a scrub or resilver already in progress, stop it. 2530789Sahrens */ 2531789Sahrens while (spa->spa_scrub_thread != NULL) { 2532789Sahrens /* 2533789Sahrens * Don't stop a resilver unless forced. 2534789Sahrens */ 25351544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 25361544Seschrock mutex_exit(&spa->spa_scrub_lock); 2537789Sahrens return (EBUSY); 25381544Seschrock } 2539789Sahrens spa->spa_scrub_stop = 1; 2540789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2541789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2542789Sahrens } 2543789Sahrens 2544789Sahrens /* 2545789Sahrens * Terminate the previous traverse. 2546789Sahrens */ 2547789Sahrens if (spa->spa_scrub_th != NULL) { 2548789Sahrens traverse_fini(spa->spa_scrub_th); 2549789Sahrens spa->spa_scrub_th = NULL; 2550789Sahrens } 2551789Sahrens 25521544Seschrock if (rvd == NULL) { 25531544Seschrock ASSERT(spa->spa_scrub_stop == 0); 25541544Seschrock ASSERT(spa->spa_scrub_type == type); 25551544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 25561544Seschrock mutex_exit(&spa->spa_scrub_lock); 25571544Seschrock return (0); 25581544Seschrock } 2559789Sahrens 2560789Sahrens mintxg = TXG_INITIAL - 1; 2561789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 2562789Sahrens 25631544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 2564789Sahrens 25651544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 25661544Seschrock /* 25671544Seschrock * The pool-wide DTL is empty. 25681732Sbonwick * If this is a resilver, there's nothing to do except 25691732Sbonwick * check whether any in-progress replacements have completed. 25701544Seschrock */ 25711732Sbonwick if (type == POOL_SCRUB_RESILVER) { 25721544Seschrock type = POOL_SCRUB_NONE; 25731732Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 25741732Sbonwick } 25751544Seschrock } else { 25761544Seschrock /* 25771544Seschrock * The pool-wide DTL is non-empty. 25781544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 25791544Seschrock */ 25801544Seschrock if (type == POOL_SCRUB_EVERYTHING) 25811544Seschrock type = POOL_SCRUB_RESILVER; 25821544Seschrock } 2583789Sahrens 25841544Seschrock if (type == POOL_SCRUB_RESILVER) { 2585789Sahrens /* 2586789Sahrens * Determine the resilvering boundaries. 2587789Sahrens * 2588789Sahrens * Note: (mintxg, maxtxg) is an open interval, 2589789Sahrens * i.e. mintxg and maxtxg themselves are not included. 2590789Sahrens * 2591789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2592789Sahrens * so we don't claim to resilver a txg that's still changing. 2593789Sahrens */ 2594789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 25951544Seschrock mintxg = ss->ss_start - 1; 2596789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 25971544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 2598789Sahrens } 2599789Sahrens 26001544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 26011544Seschrock 26021544Seschrock spa->spa_scrub_stop = 0; 26031544Seschrock spa->spa_scrub_type = type; 26041544Seschrock spa->spa_scrub_restart_txg = 0; 26051544Seschrock 26061544Seschrock if (type != POOL_SCRUB_NONE) { 26071544Seschrock spa->spa_scrub_mintxg = mintxg; 2608789Sahrens spa->spa_scrub_maxtxg = maxtxg; 2609789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 26101635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 26111635Sbonwick ZIO_FLAG_CANFAIL); 2612789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2613789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 2614789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2615789Sahrens } 2616789Sahrens 26171544Seschrock mutex_exit(&spa->spa_scrub_lock); 26181544Seschrock 2619789Sahrens return (0); 2620789Sahrens } 2621789Sahrens 26221544Seschrock /* 26231544Seschrock * ========================================================================== 26241544Seschrock * SPA async task processing 26251544Seschrock * ========================================================================== 26261544Seschrock */ 26271544Seschrock 26281544Seschrock static void 26291544Seschrock spa_async_reopen(spa_t *spa) 2630789Sahrens { 26311544Seschrock vdev_t *rvd = spa->spa_root_vdev; 26321544Seschrock vdev_t *tvd; 26331544Seschrock int c; 26341544Seschrock 26351544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 26361544Seschrock 26371544Seschrock for (c = 0; c < rvd->vdev_children; c++) { 26381544Seschrock tvd = rvd->vdev_child[c]; 26391544Seschrock if (tvd->vdev_reopen_wanted) { 26401544Seschrock tvd->vdev_reopen_wanted = 0; 26411544Seschrock vdev_reopen(tvd); 26421544Seschrock } 26431544Seschrock } 2644789Sahrens 26451544Seschrock spa_config_exit(spa, FTAG); 26461544Seschrock } 26471544Seschrock 26481544Seschrock static void 26491544Seschrock spa_async_thread(spa_t *spa) 26501544Seschrock { 26511544Seschrock int tasks; 26521544Seschrock 26531544Seschrock ASSERT(spa->spa_sync_on); 2654789Sahrens 26551544Seschrock mutex_enter(&spa->spa_async_lock); 26561544Seschrock tasks = spa->spa_async_tasks; 26571544Seschrock spa->spa_async_tasks = 0; 26581544Seschrock mutex_exit(&spa->spa_async_lock); 26591544Seschrock 26601544Seschrock /* 26611635Sbonwick * See if the config needs to be updated. 26621635Sbonwick */ 26631635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 26641635Sbonwick mutex_enter(&spa_namespace_lock); 26651635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 26661635Sbonwick mutex_exit(&spa_namespace_lock); 26671635Sbonwick } 26681635Sbonwick 26691635Sbonwick /* 26701544Seschrock * See if any devices need to be reopened. 26711544Seschrock */ 26721544Seschrock if (tasks & SPA_ASYNC_REOPEN) 26731544Seschrock spa_async_reopen(spa); 26741544Seschrock 26751544Seschrock /* 26761544Seschrock * If any devices are done replacing, detach them. 26771544Seschrock */ 26781544Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 2679789Sahrens spa_vdev_replace_done(spa); 2680789Sahrens 26811544Seschrock /* 26821544Seschrock * Kick off a scrub. 26831544Seschrock */ 26841544Seschrock if (tasks & SPA_ASYNC_SCRUB) 26851544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 26861544Seschrock 26871544Seschrock /* 26881544Seschrock * Kick off a resilver. 26891544Seschrock */ 26901544Seschrock if (tasks & SPA_ASYNC_RESILVER) 26911544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 26921544Seschrock 26931544Seschrock /* 26941544Seschrock * Let the world know that we're done. 26951544Seschrock */ 26961544Seschrock mutex_enter(&spa->spa_async_lock); 26971544Seschrock spa->spa_async_thread = NULL; 26981544Seschrock cv_broadcast(&spa->spa_async_cv); 26991544Seschrock mutex_exit(&spa->spa_async_lock); 27001544Seschrock thread_exit(); 27011544Seschrock } 27021544Seschrock 27031544Seschrock void 27041544Seschrock spa_async_suspend(spa_t *spa) 27051544Seschrock { 27061544Seschrock mutex_enter(&spa->spa_async_lock); 27071544Seschrock spa->spa_async_suspended++; 27081544Seschrock while (spa->spa_async_thread != NULL) 27091544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 27101544Seschrock mutex_exit(&spa->spa_async_lock); 27111544Seschrock } 27121544Seschrock 27131544Seschrock void 27141544Seschrock spa_async_resume(spa_t *spa) 27151544Seschrock { 27161544Seschrock mutex_enter(&spa->spa_async_lock); 27171544Seschrock ASSERT(spa->spa_async_suspended != 0); 27181544Seschrock spa->spa_async_suspended--; 27191544Seschrock mutex_exit(&spa->spa_async_lock); 27201544Seschrock } 27211544Seschrock 27221544Seschrock static void 27231544Seschrock spa_async_dispatch(spa_t *spa) 27241544Seschrock { 27251544Seschrock mutex_enter(&spa->spa_async_lock); 27261544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 27271635Sbonwick spa->spa_async_thread == NULL && 27281635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 27291544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 27301544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 27311544Seschrock mutex_exit(&spa->spa_async_lock); 27321544Seschrock } 27331544Seschrock 27341544Seschrock void 27351544Seschrock spa_async_request(spa_t *spa, int task) 27361544Seschrock { 27371544Seschrock mutex_enter(&spa->spa_async_lock); 27381544Seschrock spa->spa_async_tasks |= task; 27391544Seschrock mutex_exit(&spa->spa_async_lock); 2740789Sahrens } 2741789Sahrens 2742789Sahrens /* 2743789Sahrens * ========================================================================== 2744789Sahrens * SPA syncing routines 2745789Sahrens * ========================================================================== 2746789Sahrens */ 2747789Sahrens 2748789Sahrens static void 2749789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2750789Sahrens { 2751789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2752789Sahrens dmu_tx_t *tx; 2753789Sahrens blkptr_t blk; 2754789Sahrens uint64_t itor = 0; 2755789Sahrens zio_t *zio; 2756789Sahrens int error; 2757789Sahrens uint8_t c = 1; 2758789Sahrens 2759789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2760789Sahrens 2761789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2762789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2763789Sahrens 2764789Sahrens error = zio_wait(zio); 2765789Sahrens ASSERT3U(error, ==, 0); 2766789Sahrens 2767789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2768789Sahrens bplist_vacate(bpl, tx); 2769789Sahrens 2770789Sahrens /* 2771789Sahrens * Pre-dirty the first block so we sync to convergence faster. 2772789Sahrens * (Usually only the first block is needed.) 2773789Sahrens */ 2774789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2775789Sahrens dmu_tx_commit(tx); 2776789Sahrens } 2777789Sahrens 2778789Sahrens static void 27792082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 27802082Seschrock { 27812082Seschrock char *packed = NULL; 27822082Seschrock size_t nvsize = 0; 27832082Seschrock dmu_buf_t *db; 27842082Seschrock 27852082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 27862082Seschrock 27872082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 27882082Seschrock 27892082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 27902082Seschrock KM_SLEEP) == 0); 27912082Seschrock 27922082Seschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 27932082Seschrock 27942082Seschrock kmem_free(packed, nvsize); 27952082Seschrock 27962082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 27972082Seschrock dmu_buf_will_dirty(db, tx); 27982082Seschrock *(uint64_t *)db->db_data = nvsize; 27992082Seschrock dmu_buf_rele(db, FTAG); 28002082Seschrock } 28012082Seschrock 28022082Seschrock static void 28032082Seschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 28042082Seschrock { 28052082Seschrock nvlist_t *nvroot; 28062082Seschrock nvlist_t **spares; 28072082Seschrock int i; 28082082Seschrock 28092082Seschrock if (!spa->spa_sync_spares) 28102082Seschrock return; 28112082Seschrock 28122082Seschrock /* 28132082Seschrock * Update the MOS nvlist describing the list of available spares. 28142082Seschrock * spa_validate_spares() will have already made sure this nvlist is 28152082Seschrock * valid and the vdevs are labelled appropriately. 28162082Seschrock */ 28172082Seschrock if (spa->spa_spares_object == 0) { 28182082Seschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 28192082Seschrock DMU_OT_PACKED_NVLIST, 1 << 14, 28202082Seschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 28212082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 28222082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 28232082Seschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 28242082Seschrock } 28252082Seschrock 28262082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 28272082Seschrock if (spa->spa_nspares == 0) { 28282082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 28292082Seschrock NULL, 0) == 0); 28302082Seschrock } else { 28312082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 28322082Seschrock KM_SLEEP); 28332082Seschrock for (i = 0; i < spa->spa_nspares; i++) 28342082Seschrock spares[i] = vdev_config_generate(spa, 28352082Seschrock spa->spa_spares[i], B_FALSE, B_TRUE); 28362082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 28372082Seschrock spares, spa->spa_nspares) == 0); 28382082Seschrock for (i = 0; i < spa->spa_nspares; i++) 28392082Seschrock nvlist_free(spares[i]); 28402082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 28412082Seschrock } 28422082Seschrock 28432082Seschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 28442926Sek110237 nvlist_free(nvroot); 28452082Seschrock 28462082Seschrock spa->spa_sync_spares = B_FALSE; 28472082Seschrock } 28482082Seschrock 28492082Seschrock static void 2850789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2851789Sahrens { 2852789Sahrens nvlist_t *config; 2853789Sahrens 2854789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2855789Sahrens return; 2856789Sahrens 2857789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2858789Sahrens 28591635Sbonwick if (spa->spa_config_syncing) 28601635Sbonwick nvlist_free(spa->spa_config_syncing); 28611635Sbonwick spa->spa_config_syncing = config; 2862789Sahrens 28632082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2864789Sahrens } 2865789Sahrens 28663912Slling static void 28673912Slling spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 28683912Slling { 28693912Slling spa_t *spa = arg1; 28703912Slling nvlist_t *nvp = arg2; 28713912Slling nvpair_t *nvpair; 28723912Slling objset_t *mos = spa->spa_meta_objset; 28733912Slling uint64_t zapobj; 28743912Slling 28753912Slling mutex_enter(&spa->spa_props_lock); 28763912Slling if (spa->spa_pool_props_object == 0) { 28773912Slling zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 28783912Slling VERIFY(zapobj > 0); 28793912Slling 28803912Slling spa->spa_pool_props_object = zapobj; 28813912Slling 28823912Slling VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 28833912Slling DMU_POOL_PROPS, 8, 1, 28843912Slling &spa->spa_pool_props_object, tx) == 0); 28853912Slling } 28863912Slling mutex_exit(&spa->spa_props_lock); 28873912Slling 28883912Slling nvpair = NULL; 28893912Slling while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 28903912Slling switch (zpool_name_to_prop(nvpair_name(nvpair))) { 28913912Slling case ZFS_PROP_BOOTFS: 28923912Slling VERIFY(nvlist_lookup_uint64(nvp, 28933912Slling nvpair_name(nvpair), &spa->spa_bootfs) == 0); 28943912Slling VERIFY(zap_update(mos, 28953912Slling spa->spa_pool_props_object, 28963912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 28973912Slling &spa->spa_bootfs, tx) == 0); 28983912Slling break; 28993912Slling } 29003912Slling } 29013912Slling } 29023912Slling 2903789Sahrens /* 2904789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2905789Sahrens * part of the process, so we iterate until it converges. 2906789Sahrens */ 2907789Sahrens void 2908789Sahrens spa_sync(spa_t *spa, uint64_t txg) 2909789Sahrens { 2910789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 2911789Sahrens objset_t *mos = spa->spa_meta_objset; 2912789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 29131635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 2914789Sahrens vdev_t *vd; 2915789Sahrens dmu_tx_t *tx; 2916789Sahrens int dirty_vdevs; 2917789Sahrens 2918789Sahrens /* 2919789Sahrens * Lock out configuration changes. 2920789Sahrens */ 29211544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2922789Sahrens 2923789Sahrens spa->spa_syncing_txg = txg; 2924789Sahrens spa->spa_sync_pass = 0; 2925789Sahrens 29261544Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 2927789Sahrens 29282082Seschrock tx = dmu_tx_create_assigned(dp, txg); 29292082Seschrock 29302082Seschrock /* 29312082Seschrock * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg, 29322082Seschrock * set spa_deflate if we have no raid-z vdevs. 29332082Seschrock */ 29342082Seschrock if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE && 29352082Seschrock spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) { 29362082Seschrock int i; 29372082Seschrock 29382082Seschrock for (i = 0; i < rvd->vdev_children; i++) { 29392082Seschrock vd = rvd->vdev_child[i]; 29402082Seschrock if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 29412082Seschrock break; 29422082Seschrock } 29432082Seschrock if (i == rvd->vdev_children) { 29442082Seschrock spa->spa_deflate = TRUE; 29452082Seschrock VERIFY(0 == zap_add(spa->spa_meta_objset, 29462082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 29472082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 29482082Seschrock } 29492082Seschrock } 29502082Seschrock 2951789Sahrens /* 2952789Sahrens * If anything has changed in this txg, push the deferred frees 2953789Sahrens * from the previous txg. If not, leave them alone so that we 2954789Sahrens * don't generate work on an otherwise idle system. 2955789Sahrens */ 2956789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 29572329Sek110237 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 29582329Sek110237 !txg_list_empty(&dp->dp_sync_tasks, txg)) 2959789Sahrens spa_sync_deferred_frees(spa, txg); 2960789Sahrens 2961789Sahrens /* 2962789Sahrens * Iterate to convergence. 2963789Sahrens */ 2964789Sahrens do { 2965789Sahrens spa->spa_sync_pass++; 2966789Sahrens 2967789Sahrens spa_sync_config_object(spa, tx); 29682082Seschrock spa_sync_spares(spa, tx); 29691544Seschrock spa_errlog_sync(spa, txg); 2970789Sahrens dsl_pool_sync(dp, txg); 2971789Sahrens 2972789Sahrens dirty_vdevs = 0; 2973789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 2974789Sahrens vdev_sync(vd, txg); 2975789Sahrens dirty_vdevs++; 2976789Sahrens } 2977789Sahrens 2978789Sahrens bplist_sync(bpl, tx); 2979789Sahrens } while (dirty_vdevs); 2980789Sahrens 2981789Sahrens bplist_close(bpl); 2982789Sahrens 2983789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 2984789Sahrens 2985789Sahrens /* 2986789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 2987789Sahrens * to commit the transaction group. 29881635Sbonwick * 29891635Sbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs. 29901635Sbonwick * Otherwise, pick a random top-level vdev that's known to be 29911635Sbonwick * visible in the config cache (see spa_vdev_add() for details). 29921635Sbonwick * If the write fails, try the next vdev until we're tried them all. 2993789Sahrens */ 29941635Sbonwick if (!list_is_empty(&spa->spa_dirty_list)) { 29951635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 29961635Sbonwick } else { 29971635Sbonwick int children = rvd->vdev_children; 29981635Sbonwick int c0 = spa_get_random(children); 29991635Sbonwick int c; 30001635Sbonwick 30011635Sbonwick for (c = 0; c < children; c++) { 30021635Sbonwick vd = rvd->vdev_child[(c0 + c) % children]; 30031635Sbonwick if (vd->vdev_ms_array == 0) 30041635Sbonwick continue; 30051635Sbonwick if (vdev_config_sync(vd, txg) == 0) 30061635Sbonwick break; 30071635Sbonwick } 30081635Sbonwick if (c == children) 30091635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 30101635Sbonwick } 30111635Sbonwick 30122082Seschrock dmu_tx_commit(tx); 30132082Seschrock 30141635Sbonwick /* 30151635Sbonwick * Clear the dirty config list. 30161635Sbonwick */ 30171635Sbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 30181635Sbonwick vdev_config_clean(vd); 30191635Sbonwick 30201635Sbonwick /* 30211635Sbonwick * Now that the new config has synced transactionally, 30221635Sbonwick * let it become visible to the config cache. 30231635Sbonwick */ 30241635Sbonwick if (spa->spa_config_syncing != NULL) { 30251635Sbonwick spa_config_set(spa, spa->spa_config_syncing); 30261635Sbonwick spa->spa_config_txg = txg; 30271635Sbonwick spa->spa_config_syncing = NULL; 30281635Sbonwick } 3029789Sahrens 3030789Sahrens /* 3031789Sahrens * Make a stable copy of the fully synced uberblock. 3032789Sahrens * We use this as the root for pool traversals. 3033789Sahrens */ 3034789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3035789Sahrens 3036789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3037789Sahrens 3038789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3039789Sahrens spa->spa_traverse_wanted = 0; 3040789Sahrens spa->spa_ubsync = spa->spa_uberblock; 3041789Sahrens rw_exit(&spa->spa_traverse_lock); 3042789Sahrens 3043789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3044789Sahrens 3045789Sahrens /* 3046789Sahrens * Clean up the ZIL records for the synced txg. 3047789Sahrens */ 3048789Sahrens dsl_pool_zil_clean(dp); 3049789Sahrens 3050789Sahrens /* 3051789Sahrens * Update usable space statistics. 3052789Sahrens */ 3053789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3054789Sahrens vdev_sync_done(vd, txg); 3055789Sahrens 3056789Sahrens /* 3057789Sahrens * It had better be the case that we didn't dirty anything 30582082Seschrock * since vdev_config_sync(). 3059789Sahrens */ 3060789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3061789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3062789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3063789Sahrens ASSERT(bpl->bpl_queue == NULL); 3064789Sahrens 30651544Seschrock spa_config_exit(spa, FTAG); 30661544Seschrock 30671544Seschrock /* 30681544Seschrock * If any async tasks have been requested, kick them off. 30691544Seschrock */ 30701544Seschrock spa_async_dispatch(spa); 3071789Sahrens } 3072789Sahrens 3073789Sahrens /* 3074789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 3075789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 3076789Sahrens * sync. 3077789Sahrens */ 3078789Sahrens void 3079789Sahrens spa_sync_allpools(void) 3080789Sahrens { 3081789Sahrens spa_t *spa = NULL; 3082789Sahrens mutex_enter(&spa_namespace_lock); 3083789Sahrens while ((spa = spa_next(spa)) != NULL) { 3084789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 3085789Sahrens continue; 3086789Sahrens spa_open_ref(spa, FTAG); 3087789Sahrens mutex_exit(&spa_namespace_lock); 3088789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 3089789Sahrens mutex_enter(&spa_namespace_lock); 3090789Sahrens spa_close(spa, FTAG); 3091789Sahrens } 3092789Sahrens mutex_exit(&spa_namespace_lock); 3093789Sahrens } 3094789Sahrens 3095789Sahrens /* 3096789Sahrens * ========================================================================== 3097789Sahrens * Miscellaneous routines 3098789Sahrens * ========================================================================== 3099789Sahrens */ 3100789Sahrens 3101789Sahrens /* 3102789Sahrens * Remove all pools in the system. 3103789Sahrens */ 3104789Sahrens void 3105789Sahrens spa_evict_all(void) 3106789Sahrens { 3107789Sahrens spa_t *spa; 3108789Sahrens 3109789Sahrens /* 3110789Sahrens * Remove all cached state. All pools should be closed now, 3111789Sahrens * so every spa in the AVL tree should be unreferenced. 3112789Sahrens */ 3113789Sahrens mutex_enter(&spa_namespace_lock); 3114789Sahrens while ((spa = spa_next(NULL)) != NULL) { 3115789Sahrens /* 31161544Seschrock * Stop async tasks. The async thread may need to detach 31171544Seschrock * a device that's been replaced, which requires grabbing 31181544Seschrock * spa_namespace_lock, so we must drop it here. 3119789Sahrens */ 3120789Sahrens spa_open_ref(spa, FTAG); 3121789Sahrens mutex_exit(&spa_namespace_lock); 31221544Seschrock spa_async_suspend(spa); 3123789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3124789Sahrens mutex_enter(&spa_namespace_lock); 3125789Sahrens spa_close(spa, FTAG); 3126789Sahrens 3127789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3128789Sahrens spa_unload(spa); 3129789Sahrens spa_deactivate(spa); 3130789Sahrens } 3131789Sahrens spa_remove(spa); 3132789Sahrens } 3133789Sahrens mutex_exit(&spa_namespace_lock); 3134789Sahrens } 31351544Seschrock 31361544Seschrock vdev_t * 31371544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 31381544Seschrock { 31391544Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 31401544Seschrock } 31411760Seschrock 31421760Seschrock void 31431760Seschrock spa_upgrade(spa_t *spa) 31441760Seschrock { 31451760Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 31461760Seschrock 31471760Seschrock /* 31481760Seschrock * This should only be called for a non-faulted pool, and since a 31491760Seschrock * future version would result in an unopenable pool, this shouldn't be 31501760Seschrock * possible. 31511760Seschrock */ 31521760Seschrock ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 31531760Seschrock 31541760Seschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 31551760Seschrock vdev_config_dirty(spa->spa_root_vdev); 31561760Seschrock 31571760Seschrock spa_config_exit(spa, FTAG); 31582082Seschrock 31592082Seschrock txg_wait_synced(spa_get_dsl(spa), 0); 31601760Seschrock } 31612082Seschrock 31622082Seschrock boolean_t 31632082Seschrock spa_has_spare(spa_t *spa, uint64_t guid) 31642082Seschrock { 31652082Seschrock int i; 31663377Seschrock uint64_t spareguid; 31672082Seschrock 31682082Seschrock for (i = 0; i < spa->spa_nspares; i++) 31692082Seschrock if (spa->spa_spares[i]->vdev_guid == guid) 31702082Seschrock return (B_TRUE); 31712082Seschrock 31723377Seschrock for (i = 0; i < spa->spa_pending_nspares; i++) { 31733377Seschrock if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 31743377Seschrock ZPOOL_CONFIG_GUID, &spareguid) == 0 && 31753377Seschrock spareguid == guid) 31763377Seschrock return (B_TRUE); 31773377Seschrock } 31783377Seschrock 31792082Seschrock return (B_FALSE); 31802082Seschrock } 31813912Slling 31823912Slling int 31833912Slling spa_set_props(spa_t *spa, nvlist_t *nvp) 31843912Slling { 31853912Slling return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 31863912Slling spa, nvp, 3)); 31873912Slling } 31883912Slling 31893912Slling int 31903912Slling spa_get_props(spa_t *spa, nvlist_t **nvp) 31913912Slling { 31923912Slling zap_cursor_t zc; 31933912Slling zap_attribute_t za; 31943912Slling objset_t *mos = spa->spa_meta_objset; 31953912Slling zfs_source_t src; 31963912Slling zfs_prop_t prop; 31973912Slling nvlist_t *propval; 31983912Slling uint64_t value; 31993912Slling int err; 32003912Slling 32013912Slling VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 32023912Slling 32033912Slling mutex_enter(&spa->spa_props_lock); 32043912Slling /* If no props object, then just return empty nvlist */ 32053912Slling if (spa->spa_pool_props_object == 0) { 32063912Slling mutex_exit(&spa->spa_props_lock); 32073912Slling return (0); 32083912Slling } 32093912Slling 32103912Slling for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 32113912Slling (err = zap_cursor_retrieve(&zc, &za)) == 0; 32123912Slling zap_cursor_advance(&zc)) { 32133912Slling 32143912Slling if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 32153912Slling continue; 32163912Slling 32173912Slling VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 32183912Slling switch (za.za_integer_length) { 32193912Slling case 8: 32203912Slling if (zfs_prop_default_numeric(prop) == 32213912Slling za.za_first_integer) 32223912Slling src = ZFS_SRC_DEFAULT; 32233912Slling else 32243912Slling src = ZFS_SRC_LOCAL; 32253912Slling value = za.za_first_integer; 32263912Slling 32273912Slling if (prop == ZFS_PROP_BOOTFS) { 32283912Slling dsl_pool_t *dp; 32293912Slling dsl_dataset_t *ds = NULL; 32303912Slling char strval[MAXPATHLEN]; 32313912Slling 32323912Slling dp = spa_get_dsl(spa); 32333912Slling rw_enter(&dp->dp_config_rwlock, RW_READER); 32343912Slling if ((err = dsl_dataset_open_obj(dp, 32353912Slling za.za_first_integer, NULL, DS_MODE_NONE, 32363912Slling FTAG, &ds)) != 0) { 32373912Slling rw_exit(&dp->dp_config_rwlock); 32383912Slling break; 32393912Slling } 32403912Slling dsl_dataset_name(ds, strval); 32413912Slling dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 32423912Slling rw_exit(&dp->dp_config_rwlock); 32433912Slling 32443912Slling VERIFY(nvlist_add_uint64(propval, 32453912Slling ZFS_PROP_SOURCE, src) == 0); 32463912Slling VERIFY(nvlist_add_string(propval, 32473912Slling ZFS_PROP_VALUE, strval) == 0); 32483912Slling } else { 32493912Slling VERIFY(nvlist_add_uint64(propval, 32503912Slling ZFS_PROP_SOURCE, src) == 0); 32513912Slling VERIFY(nvlist_add_uint64(propval, 32523912Slling ZFS_PROP_VALUE, value) == 0); 32533912Slling } 32543912Slling VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 32553912Slling propval) == 0); 32563912Slling break; 32573912Slling } 32583912Slling nvlist_free(propval); 32593912Slling } 32603912Slling zap_cursor_fini(&zc); 32613912Slling mutex_exit(&spa->spa_props_lock); 32623912Slling if (err && err != ENOENT) { 32633912Slling nvlist_free(*nvp); 32643912Slling return (err); 32653912Slling } 32663912Slling 32673912Slling return (0); 32683912Slling } 32693912Slling 32703912Slling /* 32713912Slling * If the bootfs property value is dsobj, clear it. 32723912Slling */ 32733912Slling void 32743912Slling spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 32753912Slling { 32763912Slling if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 32773912Slling VERIFY(zap_remove(spa->spa_meta_objset, 32783912Slling spa->spa_pool_props_object, 32793912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 32803912Slling spa->spa_bootfs = 0; 32813912Slling } 32823912Slling } 3283