/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

int zio_taskq_threads = 8;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

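/*
 * Comparison function for the AVL trees that hold the per-pool error
 * lists: entries are ordered by a bytewise comparison of their zbookmarks.
 */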
static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

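	/*
	 * Create the per-I/O-type issue and interrupt taskqs that service
	 * all zio activity for this pool; each is created with
	 * zio_taskq_threads worker threads.
	 */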
	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in
	 * the active configuration, then we also mark this vdev as an active
	 * spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

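/*
 * Read a packed nvlist out of the MOS: the object's bonus buffer holds the
 * packed size, and the object data holds the packed bytes themselves.
 */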
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = ZFS_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > ZFS_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

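	/*
	 * If we were handed an untrusted config (e.g. the one from the
	 * label or the config cache), re-read the trusted copy stored in
	 * the MOS, verify that this host was the last one to access the
	 * pool, and reload using that config instead.
	 */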
	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if ((unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
	}

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), one of the vdevs has a label indicating that
			 * the pool has been exported or destroyed.  If this is
			 * the case, the config cache is out of sync and we
			 * should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

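/*
 * Generate the nvlist of hot spares for the given pool config, marking any
 * spare that is currently in use by some pool as VDEV_STATE_CANT_OPEN with
 * VDEV_AUX_SPARED so userland can report it as in use.
 */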
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we
	 * cheat and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < ZFS_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
	spa->spa_pending_spares = spares;
	spa->spa_pending_nspares = nspares;

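	/*
	 * Open and label each spare in turn.  On import (VDEV_ALLOC_SPARE),
	 * a spare that fails to open or label is tolerated rather than
	 * treated as fatal, so pools with damaged spares can still be
	 * imported.
	 */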
	for (i = 0; i < nspares; i++) {
		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		vd->vdev_top = vd;

		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg,
		    VDEV_LABEL_SPARE)) == 0) {
			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		if (error && mode != VDEV_ALLOC_SPARE)
			goto out;
		else
			error = 0;
	}

out:
	spa->spa_pending_spares = NULL;
	spa->spa_pending_nspares = 0;
	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
{
	spa_t *spa;
	vdev_t *rvd;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int c, error = 0;
	uint64_t txg = TXG_INITIAL;
	nvlist_t **spares;
	uint_t nspares;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Allocate a new spa_t structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_uberblock.ub_version = ZFS_VERSION;
	spa->spa_ubsync = spa->spa_uberblock;

	/*
	 * Create the root vdev.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);

	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	ASSERT(error != 0 || rvd != NULL);
	ASSERT(error != 0 || spa->spa_root_vdev == rvd);

	if (error == 0 && rvd->vdev_children == 0)
		error = EINVAL;

	if (error == 0 &&
	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
	    (error = spa_validate_spares(spa, nvroot, txg,
	    VDEV_ALLOC_ADD)) == 0) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_init(rvd->vdev_child[c], txg);
		vdev_config_dirty(rvd);
	}

	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Get the list of spares, if specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

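	/*
	 * Create the DSL pool; this also creates the meta-objset (MOS) in
	 * which the pool-wide metadata objects allocated below will live.
	 */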
	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add pool config");
	}

	/* Newly created pools are always deflated. */
	spa->spa_deflate = TRUE;
	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add deflate");
	}

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	if (zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
		cmn_err(CE_PANIC, "failed to add bplist");
	}

	/*
	 * Create the pool's history object.
	 */
	spa_history_create_obj(spa, tx);

	dmu_tx_commit(tx);

	spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS);
	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, const char *altroot)
{
	spa_t *spa;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares;
	uint_t nspares;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool, altroot);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);

	spa_config_enter(spa, RW_WRITER, FTAG);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity anymore,
	 * and conflicts with spa_has_spare().
	 */
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
		spa_load_spares(spa);
	}

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_spares(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	spa_config_exit(spa, FTAG);

	if (error != 0) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Override any spares as specified by the user, as these may have
	 * correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_sparelist)
			VERIFY(nvlist_remove(spa->spa_sparelist,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_sparelist,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
		spa->spa_sync_spares = B_TRUE;
	}

	/*
	 * Update the config cache to include the newly-imported pool.
	 */
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, NULL);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * Add the list of hot spares.
		 */
		spa_add_spares(spa, config);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

14511544Seschrock */ 14521544Seschrock if (!spa_refcount_zero(spa) || 14531544Seschrock (spa->spa_inject_ref != 0 && 14541544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1455789Sahrens spa_scrub_resume(spa); 14561544Seschrock spa_async_resume(spa); 1457789Sahrens mutex_exit(&spa_namespace_lock); 1458789Sahrens return (EBUSY); 1459789Sahrens } 1460789Sahrens 1461789Sahrens spa_scrub_resume(spa); 1462789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1463789Sahrens 1464789Sahrens /* 1465789Sahrens * We want this to be reflected on every label, 1466789Sahrens * so mark them all dirty. spa_unload() will do the 1467789Sahrens * final sync that pushes these changes out. 1468789Sahrens */ 14691544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 14701601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 14711544Seschrock spa->spa_state = new_state; 14721635Sbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 14731544Seschrock vdev_config_dirty(spa->spa_root_vdev); 14741601Sbonwick spa_config_exit(spa, FTAG); 14751544Seschrock } 1476789Sahrens } 1477789Sahrens 1478789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1479789Sahrens spa_unload(spa); 1480789Sahrens spa_deactivate(spa); 1481789Sahrens } 1482789Sahrens 14831775Sbillm if (oldconfig && spa->spa_config) 14841775Sbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 14851775Sbillm 14861544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 14871544Seschrock spa_remove(spa); 14881544Seschrock spa_config_sync(); 14891544Seschrock } 1490789Sahrens mutex_exit(&spa_namespace_lock); 1491789Sahrens 1492789Sahrens return (0); 1493789Sahrens } 1494789Sahrens 1495789Sahrens /* 1496789Sahrens * Destroy a storage pool. 1497789Sahrens */ 1498789Sahrens int 1499789Sahrens spa_destroy(char *pool) 1500789Sahrens { 15011775Sbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1502789Sahrens } 1503789Sahrens 1504789Sahrens /* 1505789Sahrens * Export a storage pool. 1506789Sahrens */ 1507789Sahrens int 15081775Sbillm spa_export(char *pool, nvlist_t **oldconfig) 1509789Sahrens { 15101775Sbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1511789Sahrens } 1512789Sahrens 1513789Sahrens /* 15141544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 15151544Seschrock * from the namespace in any way. 15161544Seschrock */ 15171544Seschrock int 15181544Seschrock spa_reset(char *pool) 15191544Seschrock { 15201775Sbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 15211544Seschrock } 15221544Seschrock 15231544Seschrock 15241544Seschrock /* 1525789Sahrens * ========================================================================== 1526789Sahrens * Device manipulation 1527789Sahrens * ========================================================================== 1528789Sahrens */ 1529789Sahrens 1530789Sahrens /* 1531789Sahrens * Add capacity to a storage pool. 
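 *
 * A minimal caller sketch (illustrative only -- error handling is
 * elided, and "nvroot" is assumed to be a vdev tree nvlist built by
 * the caller, e.g. from zpool(1M)'s parsed command line):
 *
 *	spa_t *spa;
 *
 *	if ((error = spa_open(name, &spa, FTAG)) != 0)
 *		return (error);
 *	error = spa_vdev_add(spa, nvroot);
 *	spa_close(spa, FTAG);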
1532789Sahrens */ 1533789Sahrens int 1534789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1535789Sahrens { 1536789Sahrens uint64_t txg; 15371635Sbonwick int c, error; 1538789Sahrens vdev_t *rvd = spa->spa_root_vdev; 15391585Sbonwick vdev_t *vd, *tvd; 15402082Seschrock nvlist_t **spares; 15412082Seschrock uint_t i, nspares; 1542789Sahrens 1543789Sahrens txg = spa_vdev_enter(spa); 1544789Sahrens 15452082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 15462082Seschrock VDEV_ALLOC_ADD)) != 0) 15472082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 15482082Seschrock 15493377Seschrock spa->spa_pending_vdev = vd; 1550789Sahrens 15512082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 15522082Seschrock &spares, &nspares) != 0) 15532082Seschrock nspares = 0; 15542082Seschrock 15553377Seschrock if (vd->vdev_children == 0 && nspares == 0) { 15563377Seschrock spa->spa_pending_vdev = NULL; 15572082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 15583377Seschrock } 15592082Seschrock 15602082Seschrock if (vd->vdev_children != 0) { 15613377Seschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 15623377Seschrock spa->spa_pending_vdev = NULL; 15632082Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15642082Seschrock } 15652082Seschrock } 15662082Seschrock 15673377Seschrock /* 15683377Seschrock * We must validate the spares after checking the children. Otherwise, 15693377Seschrock * vdev_inuse() will blindly overwrite the spare. 15703377Seschrock */ 15713377Seschrock if ((error = spa_validate_spares(spa, nvroot, txg, 15723377Seschrock VDEV_ALLOC_ADD)) != 0) { 15733377Seschrock spa->spa_pending_vdev = NULL; 15743377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15753377Seschrock } 15763377Seschrock 15773377Seschrock spa->spa_pending_vdev = NULL; 15783377Seschrock 15793377Seschrock /* 15803377Seschrock * Transfer each new top-level vdev from vd to rvd. 
15813377Seschrock */ 15823377Seschrock for (c = 0; c < vd->vdev_children; c++) { 15833377Seschrock tvd = vd->vdev_child[c]; 15843377Seschrock vdev_remove_child(vd, tvd); 15853377Seschrock tvd->vdev_id = rvd->vdev_children; 15863377Seschrock vdev_add_child(rvd, tvd); 15873377Seschrock vdev_config_dirty(tvd); 15883377Seschrock } 15893377Seschrock 15902082Seschrock if (nspares != 0) { 15912082Seschrock if (spa->spa_sparelist != NULL) { 15922082Seschrock nvlist_t **oldspares; 15932082Seschrock uint_t oldnspares; 15942082Seschrock nvlist_t **newspares; 15952082Seschrock 15962082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 15972082Seschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 15982082Seschrock 15992082Seschrock newspares = kmem_alloc(sizeof (void *) * 16002082Seschrock (nspares + oldnspares), KM_SLEEP); 16012082Seschrock for (i = 0; i < oldnspares; i++) 16022082Seschrock VERIFY(nvlist_dup(oldspares[i], 16032082Seschrock &newspares[i], KM_SLEEP) == 0); 16042082Seschrock for (i = 0; i < nspares; i++) 16052082Seschrock VERIFY(nvlist_dup(spares[i], 16062082Seschrock &newspares[i + oldnspares], 16072082Seschrock KM_SLEEP) == 0); 16082082Seschrock 16092082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 16102082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 16112082Seschrock 16122082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16132082Seschrock ZPOOL_CONFIG_SPARES, newspares, 16142082Seschrock nspares + oldnspares) == 0); 16152082Seschrock for (i = 0; i < oldnspares + nspares; i++) 16162082Seschrock nvlist_free(newspares[i]); 16172082Seschrock kmem_free(newspares, (oldnspares + nspares) * 16182082Seschrock sizeof (void *)); 16192082Seschrock } else { 16202082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 16212082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 16222082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16232082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 16242082Seschrock } 16252082Seschrock 16262082Seschrock spa_load_spares(spa); 16272082Seschrock spa->spa_sync_spares = B_TRUE; 1628789Sahrens } 1629789Sahrens 1630789Sahrens /* 16311585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 16321585Sbonwick * If other threads start allocating from these vdevs before we 16331585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 16341585Sbonwick * fail to open the pool because there are DVAs that the config cache 16351585Sbonwick * can't translate. Therefore, we first add the vdevs without 16361585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 16371635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 16381585Sbonwick * 16391585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 16401585Sbonwick * if we lose power at any point in this sequence, the remaining 16411585Sbonwick * steps will be completed the next time we load the pool. 1642789Sahrens */ 16431635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 16441585Sbonwick 16451635Sbonwick mutex_enter(&spa_namespace_lock); 16461635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 16471635Sbonwick mutex_exit(&spa_namespace_lock); 1648789Sahrens 16491635Sbonwick return (0); 1650789Sahrens } 1651789Sahrens 1652789Sahrens /* 1653789Sahrens * Attach a device to a mirror. The arguments are the path to any device 1654789Sahrens * in the mirror, and the nvroot for the new device. 
If the path specifies 1655789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 1656789Sahrens * 1657789Sahrens * If 'replacing' is specified, the new device is intended to replace the 1658789Sahrens * existing device; in this case the two devices are made into their own 1659789Sahrens * mirror using the 'replacing' vdev, which is functionally identical to 1660789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 1661789Sahrens * extra rules: you can't attach to it after it's been created, and upon 1662789Sahrens * completion of resilvering, the first disk (the one being replaced) 1663789Sahrens * is automatically detached. 1664789Sahrens */ 1665789Sahrens int 16661544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1667789Sahrens { 1668789Sahrens uint64_t txg, open_txg; 1669789Sahrens int error; 1670789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1671789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 16722082Seschrock vdev_ops_t *pvops; 1673789Sahrens 1674789Sahrens txg = spa_vdev_enter(spa); 1675789Sahrens 16761544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1677789Sahrens 1678789Sahrens if (oldvd == NULL) 1679789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1680789Sahrens 16811585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 16821585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 16831585Sbonwick 1684789Sahrens pvd = oldvd->vdev_parent; 1685789Sahrens 16862082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 16872082Seschrock VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1688789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1689789Sahrens 1690789Sahrens newvd = newrootvd->vdev_child[0]; 1691789Sahrens 1692789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1693789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1694789Sahrens 16952082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1696789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1697789Sahrens 16982082Seschrock if (!replacing) { 16992082Seschrock /* 17002082Seschrock * For attach, the only allowable parent is a mirror or the root 17012082Seschrock * vdev. 17022082Seschrock */ 17032082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 17042082Seschrock pvd->vdev_ops != &vdev_root_ops) 17052082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17062082Seschrock 17072082Seschrock pvops = &vdev_mirror_ops; 17082082Seschrock } else { 17092082Seschrock /* 17102082Seschrock * Active hot spares can only be replaced by inactive hot 17112082Seschrock * spares. 17122082Seschrock */ 17132082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 17142082Seschrock pvd->vdev_child[1] == oldvd && 17152082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 17162082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17172082Seschrock 17182082Seschrock /* 17192082Seschrock * If the source is a hot spare, and the parent isn't already a 17202082Seschrock * spare, then we want to create a new hot spare. Otherwise, we 17213377Seschrock * want to create a replacing vdev. The user is not allowed to 17223377Seschrock * attach to a spared vdev child unless the 'isspare' state is 17233377Seschrock * the same (spare replaces spare, non-spare replaces 17243377Seschrock * non-spare).
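 *
 * A summary of the parent-ops selection below (illustrative):
 *
 *	pvd->vdev_ops	newvd->vdev_isspare	result
 *	-------------	-------------------	-----------------
 *	replacing	any			ENOTSUP
 *	spare		!= oldvd's isspare	ENOTSUP
 *	not spare	B_TRUE			pvops = spare
 *	otherwise	any			pvops = replacing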
17252082Seschrock */ 17262082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 17272082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17283377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 17293377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 17303377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17312082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 17322082Seschrock newvd->vdev_isspare) 17332082Seschrock pvops = &vdev_spare_ops; 17342082Seschrock else 17352082Seschrock pvops = &vdev_replacing_ops; 17362082Seschrock } 17372082Seschrock 17381175Slling /* 17391175Slling * Compare the new device size with the replaceable/attachable 17401175Slling * device size. 17411175Slling */ 17421175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1743789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1744789Sahrens 17451732Sbonwick /* 17461732Sbonwick * The new device cannot have a higher alignment requirement 17471732Sbonwick * than the top-level vdev. 17481732Sbonwick */ 17491732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1750789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1751789Sahrens 1752789Sahrens /* 1753789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1754789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1755789Sahrens */ 1756789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1757789Sahrens spa_strfree(oldvd->vdev_path); 1758789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1759789Sahrens KM_SLEEP); 1760789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1761789Sahrens newvd->vdev_path, "old"); 1762789Sahrens if (oldvd->vdev_devid != NULL) { 1763789Sahrens spa_strfree(oldvd->vdev_devid); 1764789Sahrens oldvd->vdev_devid = NULL; 1765789Sahrens } 1766789Sahrens } 1767789Sahrens 1768789Sahrens /* 17692082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 17702082Seschrock * mirror/replacing/spare vdev above oldvd. 1771789Sahrens */ 1772789Sahrens if (pvd->vdev_ops != pvops) 1773789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1774789Sahrens 1775789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1776789Sahrens ASSERT(pvd->vdev_ops == pvops); 1777789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1778789Sahrens 1779789Sahrens /* 1780789Sahrens * Extract the new device from its root and add it to pvd. 1781789Sahrens */ 1782789Sahrens vdev_remove_child(newrootvd, newvd); 1783789Sahrens newvd->vdev_id = pvd->vdev_children; 1784789Sahrens vdev_add_child(pvd, newvd); 1785789Sahrens 17861544Seschrock /* 17871544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 17881544Seschrock * the addition of newvd may have decreased our parent's asize. 17891544Seschrock */ 17901544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 17911544Seschrock 1792789Sahrens tvd = newvd->vdev_top; 1793789Sahrens ASSERT(pvd->vdev_top == tvd); 1794789Sahrens ASSERT(tvd->vdev_parent == rvd); 1795789Sahrens 1796789Sahrens vdev_config_dirty(tvd); 1797789Sahrens 1798789Sahrens /* 1799789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1800789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 
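 * For example, assuming TXG_CONCURRENT_STATES == 3 (its historical
 * value): if txg == 100 then open_txg == 102, and the space_map_add()
 * below covers txgs TXG_INITIAL..102 inclusive, since its size
 * argument is open_txg - TXG_INITIAL + 1.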
1801789Sahrens */ 1802789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1803789Sahrens 1804789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1805789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1806789Sahrens open_txg - TXG_INITIAL + 1); 1807789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1808789Sahrens 18093377Seschrock if (newvd->vdev_isspare) 18103377Seschrock spa_spare_activate(newvd); 18111544Seschrock 1812789Sahrens /* 1813789Sahrens * Mark newvd's DTL dirty in this txg. 1814789Sahrens */ 18151732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1816789Sahrens 1817789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1818789Sahrens 1819789Sahrens /* 1820789Sahrens * Kick off a resilver to update newvd. 1821789Sahrens */ 1822789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1823789Sahrens 1824789Sahrens return (0); 1825789Sahrens } 1826789Sahrens 1827789Sahrens /* 1828789Sahrens * Detach a device from a mirror or replacing vdev. 1829789Sahrens * If 'replace_done' is specified, only detach if the parent 1830789Sahrens * is a replacing vdev. 1831789Sahrens */ 1832789Sahrens int 18331544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1834789Sahrens { 1835789Sahrens uint64_t txg; 1836789Sahrens int c, t, error; 1837789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1838789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 18392082Seschrock boolean_t unspare = B_FALSE; 18402082Seschrock uint64_t unspare_guid; 1841789Sahrens 1842789Sahrens txg = spa_vdev_enter(spa); 1843789Sahrens 18441544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1845789Sahrens 1846789Sahrens if (vd == NULL) 1847789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1848789Sahrens 18491585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 18501585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18511585Sbonwick 1852789Sahrens pvd = vd->vdev_parent; 1853789Sahrens 1854789Sahrens /* 1855789Sahrens * If replace_done is specified, only remove this device if it's 18562082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 18572082Seschrock * disk can be removed. 1858789Sahrens */ 18592082Seschrock if (replace_done) { 18602082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 18612082Seschrock if (vd->vdev_id != 0) 18622082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18632082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 18642082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18652082Seschrock } 18662082Seschrock } 18672082Seschrock 18682082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 18692082Seschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1870789Sahrens 1871789Sahrens /* 18722082Seschrock * Only mirror, replacing, and spare vdevs support detach. 1873789Sahrens */ 1874789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 18752082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 18762082Seschrock pvd->vdev_ops != &vdev_spare_ops) 1877789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1878789Sahrens 1879789Sahrens /* 1880789Sahrens * If there's only one replica, you can't detach it. 1881789Sahrens */ 1882789Sahrens if (pvd->vdev_children <= 1) 1883789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1884789Sahrens 1885789Sahrens /* 1886789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1887789Sahrens * valid copy of the data, which means we cannot safely detach it. 
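 * (The loop below breaks out early as soon as it finds one live
 * sibling whose DTLs -- both vdev_dtl_map and vdev_dtl_scrub -- are
 * empty, i.e. a sibling known to hold a complete copy of the data.)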
1888789Sahrens * 1889789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1890789Sahrens * precise DTL check. 1891789Sahrens */ 1892789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1893789Sahrens uint64_t dirty; 1894789Sahrens 1895789Sahrens cvd = pvd->vdev_child[c]; 1896789Sahrens if (cvd == vd) 1897789Sahrens continue; 1898789Sahrens if (vdev_is_dead(cvd)) 1899789Sahrens continue; 1900789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1901789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1902789Sahrens cvd->vdev_dtl_scrub.sm_space; 1903789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1904789Sahrens if (!dirty) 1905789Sahrens break; 1906789Sahrens } 19072082Seschrock 19082082Seschrock /* 19092082Seschrock * If we are a replacing or spare vdev, then we can always detach the 19102082Seschrock * latter child, as that is how one cancels the operation. 19112082Seschrock */ 19122082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 19132082Seschrock c == pvd->vdev_children) 1914789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1915789Sahrens 1916789Sahrens /* 19172082Seschrock * If we are detaching the original disk from a spare, then it implies 19182082Seschrock * that the spare should become a real disk, and be removed from the 19192082Seschrock * active spare list for the pool. 19202082Seschrock */ 19212082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 19222082Seschrock vd->vdev_id == 0) 19232082Seschrock unspare = B_TRUE; 19242082Seschrock 19252082Seschrock /* 1926789Sahrens * Erase the disk labels so the disk can be used for other things. 1927789Sahrens * This must be done after all other error cases are handled, 1928789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1929789Sahrens * But if we can't do it, don't treat the error as fatal -- 1930789Sahrens * it may be that the unwritability of the disk is the reason 1931789Sahrens * it's being detached! 1932789Sahrens */ 19333377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1934789Sahrens 1935789Sahrens /* 1936789Sahrens * Remove vd from its parent and compact the parent's children. 1937789Sahrens */ 1938789Sahrens vdev_remove_child(pvd, vd); 1939789Sahrens vdev_compact_children(pvd); 1940789Sahrens 1941789Sahrens /* 1942789Sahrens * Remember one of the remaining children so we can get tvd below. 1943789Sahrens */ 1944789Sahrens cvd = pvd->vdev_child[0]; 1945789Sahrens 1946789Sahrens /* 19472082Seschrock * If we need to remove the remaining child from the list of hot spares, 19482082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 19492082Seschrock * must do this before vdev_remove_parent(), because that can change the 19502082Seschrock * GUID if it creates a new toplevel GUID. 19512082Seschrock */ 19522082Seschrock if (unspare) { 19532082Seschrock ASSERT(cvd->vdev_isspare); 19543377Seschrock spa_spare_remove(cvd); 19552082Seschrock unspare_guid = cvd->vdev_guid; 19562082Seschrock } 19572082Seschrock 19582082Seschrock /* 1959789Sahrens * If the parent mirror/replacing vdev only has one child, 1960789Sahrens * the parent is no longer needed. Remove it from the tree. 1961789Sahrens */ 1962789Sahrens if (pvd->vdev_children == 1) 1963789Sahrens vdev_remove_parent(cvd); 1964789Sahrens 1965789Sahrens /* 1966789Sahrens * We don't set tvd until now because the parent we just removed 1967789Sahrens * may have been the previous top-level vdev. 
1968789Sahrens */ 1969789Sahrens tvd = cvd->vdev_top; 1970789Sahrens ASSERT(tvd->vdev_parent == rvd); 1971789Sahrens 1972789Sahrens /* 19733377Seschrock * Reevaluate the parent vdev state. 1974789Sahrens */ 19753377Seschrock vdev_propagate_state(cvd->vdev_parent); 1976789Sahrens 1977789Sahrens /* 19783377Seschrock * If the device we just detached was smaller than the others, it may be 19793377Seschrock * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 19803377Seschrock * can't fail because the existing metaslabs are already in core, so 19813377Seschrock * there's nothing to read from disk. 1982789Sahrens */ 19831732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1984789Sahrens 1985789Sahrens vdev_config_dirty(tvd); 1986789Sahrens 1987789Sahrens /* 19883377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 19893377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 19903377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 19913377Seschrock * prevent vd from being accessed after it's freed. 1992789Sahrens */ 1993789Sahrens for (t = 0; t < TXG_SIZE; t++) 1994789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 19951732Sbonwick vd->vdev_detached = B_TRUE; 19961732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 1997789Sahrens 19982082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 19992082Seschrock 20002082Seschrock /* 20013377Seschrock * If this was the removal of the original device in a hot spare vdev, 20023377Seschrock * then we want to go through and remove the device from the hot spare 20033377Seschrock * list of every other pool. 20042082Seschrock */ 20052082Seschrock if (unspare) { 20062082Seschrock spa = NULL; 20072082Seschrock mutex_enter(&spa_namespace_lock); 20082082Seschrock while ((spa = spa_next(spa)) != NULL) { 20092082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 20102082Seschrock continue; 20112082Seschrock 20122082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 20132082Seschrock } 20142082Seschrock mutex_exit(&spa_namespace_lock); 20152082Seschrock } 20162082Seschrock 20172082Seschrock return (error); 20182082Seschrock } 20192082Seschrock 20202082Seschrock /* 20212082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 20222082Seschrock * spares. 
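 *
 * A hedged usage sketch (illustrative only; "guid" is assumed to have
 * been read from the spare's ZPOOL_CONFIG_GUID entry by the caller):
 *
 *	if ((error = spa_vdev_remove(spa, guid, B_FALSE)) != 0)
 *		return (error);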
20232082Seschrock */ 20242082Seschrock int 20252082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 20262082Seschrock { 20272082Seschrock vdev_t *vd; 20282082Seschrock nvlist_t **spares, *nv, **newspares; 20292082Seschrock uint_t i, j, nspares; 20302082Seschrock int ret = 0; 20312082Seschrock 20322082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 20332082Seschrock 20342082Seschrock vd = spa_lookup_by_guid(spa, guid); 20352082Seschrock 20362082Seschrock nv = NULL; 20372082Seschrock if (spa->spa_spares != NULL && 20382082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20392082Seschrock &spares, &nspares) == 0) { 20402082Seschrock for (i = 0; i < nspares; i++) { 20412082Seschrock uint64_t theguid; 20422082Seschrock 20432082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 20442082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 20452082Seschrock if (theguid == guid) { 20462082Seschrock nv = spares[i]; 20472082Seschrock break; 20482082Seschrock } 20492082Seschrock } 20502082Seschrock } 20512082Seschrock 20522082Seschrock /* 20532082Seschrock * We only support removing a hot spare, and only if it's not currently 20542082Seschrock * in use in this pool. 20552082Seschrock */ 20562082Seschrock if (nv == NULL && vd == NULL) { 20572082Seschrock ret = ENOENT; 20582082Seschrock goto out; 20592082Seschrock } 20602082Seschrock 20612082Seschrock if (nv == NULL && vd != NULL) { 20622082Seschrock ret = ENOTSUP; 20632082Seschrock goto out; 20642082Seschrock } 20652082Seschrock 20662082Seschrock if (!unspare && nv != NULL && vd != NULL) { 20672082Seschrock ret = EBUSY; 20682082Seschrock goto out; 20692082Seschrock } 20702082Seschrock 20712082Seschrock if (nspares == 1) { 20722082Seschrock newspares = NULL; 20732082Seschrock } else { 20742082Seschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 20752082Seschrock KM_SLEEP); 20762082Seschrock for (i = 0, j = 0; i < nspares; i++) { 20772082Seschrock if (spares[i] != nv) 20782082Seschrock VERIFY(nvlist_dup(spares[i], 20792082Seschrock &newspares[j++], KM_SLEEP) == 0); 20802082Seschrock } 20812082Seschrock } 20822082Seschrock 20832082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20842082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 20852082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20862082Seschrock newspares, nspares - 1) == 0); 20872082Seschrock for (i = 0; i < nspares - 1; i++) 20882082Seschrock nvlist_free(newspares[i]); 20892082Seschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 20902082Seschrock spa_load_spares(spa); 20912082Seschrock spa->spa_sync_spares = B_TRUE; 20922082Seschrock 20932082Seschrock out: 20942082Seschrock spa_config_exit(spa, FTAG); 20952082Seschrock 20962082Seschrock return (ret); 2097789Sahrens } 2098789Sahrens 2099789Sahrens /* 21001544Seschrock * Find any device that's done replacing, so we can detach it. 
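 * A replacing vdev is considered done once the new child's DTLs --
 * both the persistent vdev_dtl_map and the transient vdev_dtl_scrub
 * map -- are empty; that is exactly the test applied below.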
2101789Sahrens */ 21021544Seschrock static vdev_t * 21031544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 2104789Sahrens { 21051544Seschrock vdev_t *newvd, *oldvd; 2106789Sahrens int c; 2107789Sahrens 21081544Seschrock for (c = 0; c < vd->vdev_children; c++) { 21091544Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 21101544Seschrock if (oldvd != NULL) 21111544Seschrock return (oldvd); 21121544Seschrock } 2113789Sahrens 2114789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 21151544Seschrock oldvd = vd->vdev_child[0]; 21161544Seschrock newvd = vd->vdev_child[1]; 2117789Sahrens 21181544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 21191544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 21201544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 21211544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 21221544Seschrock return (oldvd); 21231544Seschrock } 21241544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 21251544Seschrock } 2126789Sahrens 21271544Seschrock return (NULL); 2128789Sahrens } 2129789Sahrens 21301544Seschrock static void 2131789Sahrens spa_vdev_replace_done(spa_t *spa) 2132789Sahrens { 21331544Seschrock vdev_t *vd; 21342082Seschrock vdev_t *pvd; 21351544Seschrock uint64_t guid; 21362082Seschrock uint64_t pguid = 0; 2137789Sahrens 21381544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2139789Sahrens 21401544Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 21411544Seschrock guid = vd->vdev_guid; 21422082Seschrock /* 21432082Seschrock * If we have just finished replacing a hot spared device, then 21442082Seschrock * we need to detach the parent's first child (the original hot 21452082Seschrock * spare) as well. 21462082Seschrock */ 21472082Seschrock pvd = vd->vdev_parent; 21482082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 21492082Seschrock pvd->vdev_id == 0) { 21502082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 21512082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 21522082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 21532082Seschrock } 21541544Seschrock spa_config_exit(spa, FTAG); 21551544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 21561544Seschrock return; 21572082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 21582082Seschrock return; 21591544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2160789Sahrens } 2161789Sahrens 21621544Seschrock spa_config_exit(spa, FTAG); 2163789Sahrens } 2164789Sahrens 2165789Sahrens /* 21661354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 21671354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 21681354Seschrock */ 21691354Seschrock int 21701354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 21711354Seschrock { 21721354Seschrock vdev_t *rvd, *vd; 21731354Seschrock uint64_t txg; 21741354Seschrock 21751354Seschrock rvd = spa->spa_root_vdev; 21761354Seschrock 21771354Seschrock txg = spa_vdev_enter(spa); 21781354Seschrock 21792082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 21802082Seschrock /* 21812082Seschrock * Determine if this is a reference to a hot spare. In that 21822082Seschrock * case, update the path as stored in the spare list. 
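 * (Only the in-core sparelist nvlist is updated here; setting
 * spa_sync_spares below makes spa_sync_spares() rewrite the on-disk
 * copy during the next spa_sync().)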
21832082Seschrock */ 21842082Seschrock nvlist_t **spares; 21852082Seschrock uint_t i, nspares; 21862082Seschrock if (spa->spa_sparelist != NULL) { 21872082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 21882082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 21892082Seschrock for (i = 0; i < nspares; i++) { 21902082Seschrock uint64_t theguid; 21912082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 21922082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 21932082Seschrock if (theguid == guid) 21942082Seschrock break; 21952082Seschrock } 21962082Seschrock 21972082Seschrock if (i == nspares) 21982082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 21992082Seschrock 22002082Seschrock VERIFY(nvlist_add_string(spares[i], 22012082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 22022082Seschrock spa_load_spares(spa); 22032082Seschrock spa->spa_sync_spares = B_TRUE; 22042082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 22052082Seschrock } else { 22062082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 22072082Seschrock } 22082082Seschrock } 22091354Seschrock 22101585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 22111585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 22121585Sbonwick 22131354Seschrock spa_strfree(vd->vdev_path); 22141354Seschrock vd->vdev_path = spa_strdup(newpath); 22151354Seschrock 22161354Seschrock vdev_config_dirty(vd->vdev_top); 22171354Seschrock 22181354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 22191354Seschrock } 22201354Seschrock 22211354Seschrock /* 2222789Sahrens * ========================================================================== 2223789Sahrens * SPA Scrubbing 2224789Sahrens * ========================================================================== 2225789Sahrens */ 2226789Sahrens 2227789Sahrens static void 2228789Sahrens spa_scrub_io_done(zio_t *zio) 2229789Sahrens { 2230789Sahrens spa_t *spa = zio->io_spa; 2231789Sahrens 22323290Sjohansen zio_data_buf_free(zio->io_data, zio->io_size); 2233789Sahrens 2234789Sahrens mutex_enter(&spa->spa_scrub_lock); 22351544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 22361775Sbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2237789Sahrens spa->spa_scrub_errors++; 2238789Sahrens mutex_enter(&vd->vdev_stat_lock); 2239789Sahrens vd->vdev_stat.vs_scrub_errors++; 2240789Sahrens mutex_exit(&vd->vdev_stat_lock); 2241789Sahrens } 22423697Smishra 22433697Smishra if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 22441544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 22453697Smishra 22463697Smishra ASSERT(spa->spa_scrub_inflight >= 0); 22473697Smishra 22481544Seschrock mutex_exit(&spa->spa_scrub_lock); 2249789Sahrens } 2250789Sahrens 2251789Sahrens static void 22521544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 22531544Seschrock zbookmark_t *zb) 2254789Sahrens { 2255789Sahrens size_t size = BP_GET_LSIZE(bp); 22563697Smishra void *data; 2257789Sahrens 2258789Sahrens mutex_enter(&spa->spa_scrub_lock); 22593697Smishra /* 22603697Smishra * Do not give too much work to vdev(s). 
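 * (spa_scrub_maxinflight bounds the number of scrub reads in flight;
 * spa_scrub_io_done() broadcasts spa_scrub_io_cv as soon as the count
 * drops back below that limit, waking the waiter below.)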
22613697Smishra */ 22623697Smishra while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 22633697Smishra cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 22643697Smishra } 2265789Sahrens spa->spa_scrub_inflight++; 2266789Sahrens mutex_exit(&spa->spa_scrub_lock); 2267789Sahrens 22683697Smishra data = zio_data_buf_alloc(size); 22693697Smishra 22701544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 22711544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 22721544Seschrock 22731807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 22741544Seschrock 2275789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 22761544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2277789Sahrens } 2278789Sahrens 2279789Sahrens /* ARGSUSED */ 2280789Sahrens static int 2281789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2282789Sahrens { 2283789Sahrens blkptr_t *bp = &bc->bc_blkptr; 22841775Sbillm vdev_t *vd = spa->spa_root_vdev; 22851775Sbillm dva_t *dva = bp->blk_dva; 22861775Sbillm int needs_resilver = B_FALSE; 22871775Sbillm int d; 2288789Sahrens 22891775Sbillm if (bc->bc_errno) { 2290789Sahrens /* 2291789Sahrens * We can't scrub this block, but we can continue to scrub 2292789Sahrens * the rest of the pool. Note the error and move along. 2293789Sahrens */ 2294789Sahrens mutex_enter(&spa->spa_scrub_lock); 2295789Sahrens spa->spa_scrub_errors++; 2296789Sahrens mutex_exit(&spa->spa_scrub_lock); 2297789Sahrens 22981775Sbillm mutex_enter(&vd->vdev_stat_lock); 22991775Sbillm vd->vdev_stat.vs_scrub_errors++; 23001775Sbillm mutex_exit(&vd->vdev_stat_lock); 2301789Sahrens 2302789Sahrens return (ERESTART); 2303789Sahrens } 2304789Sahrens 2305789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2306789Sahrens 23071775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 23081775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 23091775Sbillm 23101775Sbillm ASSERT(vd != NULL); 23111775Sbillm 23121775Sbillm /* 23131775Sbillm * Keep track of how much data we've examined so that 23141775Sbillm * zpool(1M) status can make useful progress reports. 23151775Sbillm */ 23161775Sbillm mutex_enter(&vd->vdev_stat_lock); 23171775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 23181775Sbillm mutex_exit(&vd->vdev_stat_lock); 2319789Sahrens 23201775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 23211775Sbillm if (DVA_GET_GANG(&dva[d])) { 23221775Sbillm /* 23231775Sbillm * Gang members may be spread across multiple 23241775Sbillm * vdevs, so the best we can do is look at the 23251775Sbillm * pool-wide DTL. 23261775Sbillm * XXX -- it would be better to change our 23271775Sbillm * allocation policy to ensure that this can't 23281775Sbillm * happen. 
23291775Sbillm */ 23301775Sbillm vd = spa->spa_root_vdev; 23311775Sbillm } 23321775Sbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 23331775Sbillm bp->blk_birth, 1)) 23341775Sbillm needs_resilver = B_TRUE; 2335789Sahrens } 23361775Sbillm } 23371775Sbillm 23381775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2339789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 23401544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 23411775Sbillm else if (needs_resilver) 23421775Sbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 23431775Sbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2344789Sahrens 2345789Sahrens return (0); 2346789Sahrens } 2347789Sahrens 2348789Sahrens static void 2349789Sahrens spa_scrub_thread(spa_t *spa) 2350789Sahrens { 2351789Sahrens callb_cpr_t cprinfo; 2352789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2353789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2354789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2355789Sahrens int error = 0; 2356789Sahrens boolean_t complete; 2357789Sahrens 2358789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2359789Sahrens 2360797Sbonwick /* 2361797Sbonwick * If we're restarting due to a snapshot create/delete, 2362797Sbonwick * wait for that to complete. 2363797Sbonwick */ 2364797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2365797Sbonwick 23661544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 23671544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 23681544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 23691544Seschrock 23701544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 23711544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2372789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2373789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 23741544Seschrock spa_config_exit(spa, FTAG); 2375789Sahrens 2376789Sahrens mutex_enter(&spa->spa_scrub_lock); 2377789Sahrens spa->spa_scrub_errors = 0; 2378789Sahrens spa->spa_scrub_active = 1; 23791544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 2380789Sahrens 2381789Sahrens while (!spa->spa_scrub_stop) { 2382789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 23831544Seschrock while (spa->spa_scrub_suspended) { 2384789Sahrens spa->spa_scrub_active = 0; 2385789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2386789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2387789Sahrens spa->spa_scrub_active = 1; 2388789Sahrens } 2389789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2390789Sahrens 2391789Sahrens if (spa->spa_scrub_restart_txg != 0) 2392789Sahrens break; 2393789Sahrens 2394789Sahrens mutex_exit(&spa->spa_scrub_lock); 2395789Sahrens error = traverse_more(th); 2396789Sahrens mutex_enter(&spa->spa_scrub_lock); 2397789Sahrens if (error != EAGAIN) 2398789Sahrens break; 2399789Sahrens } 2400789Sahrens 2401789Sahrens while (spa->spa_scrub_inflight) 2402789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2403789Sahrens 24041601Sbonwick spa->spa_scrub_active = 0; 24051601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 24061601Sbonwick 24071601Sbonwick mutex_exit(&spa->spa_scrub_lock); 24081601Sbonwick 24091601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 24101601Sbonwick 24111601Sbonwick mutex_enter(&spa->spa_scrub_lock); 24121601Sbonwick 24131601Sbonwick /* 24141601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 24151601Sbonwick * AND the spa config lock to synchronize with any config changes 24161601Sbonwick * that revise the 
DTLs under spa_vdev_enter() / spa_vdev_exit(). 24171601Sbonwick */ 2418789Sahrens if (spa->spa_scrub_restart_txg != 0) 2419789Sahrens error = ERESTART; 2420789Sahrens 24211544Seschrock if (spa->spa_scrub_stop) 24221544Seschrock error = EINTR; 24231544Seschrock 2424789Sahrens /* 24251544Seschrock * Even if there were uncorrectable errors, we consider the scrub 24261544Seschrock * completed. The downside is that if there is a transient error during 24271544Seschrock * a resilver, we won't resilver the data properly to the target. But 24281544Seschrock * if the damage is permanent (more likely) we will resilver forever, 24291544Seschrock * which isn't really acceptable. Since there is enough information for 24301544Seschrock * the user to know what has failed and why, this seems like a more 24311544Seschrock * tractable approach. 2432789Sahrens */ 24331544Seschrock complete = (error == 0); 2434789Sahrens 24351544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 24361544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2437789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2438789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2439789Sahrens 2440789Sahrens mutex_exit(&spa->spa_scrub_lock); 2441789Sahrens 2442789Sahrens /* 2443789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2444789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2445789Sahrens */ 2446789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2447789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2448789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 24491544Seschrock spa_errlog_rotate(spa); 24501601Sbonwick 24511544Seschrock spa_config_exit(spa, FTAG); 2452789Sahrens 2453789Sahrens mutex_enter(&spa->spa_scrub_lock); 2454789Sahrens 24551544Seschrock /* 24561544Seschrock * We may have finished replacing a device. 24571544Seschrock * Let the async thread assess this and handle the detach. 24581544Seschrock */ 24591544Seschrock spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 2460789Sahrens 2461789Sahrens /* 2462789Sahrens * If we were told to restart, our final act is to start a new scrub. 2463789Sahrens */ 2464789Sahrens if (error == ERESTART) 24651544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 
24661544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2467789Sahrens 24681544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 24691544Seschrock spa->spa_scrub_active = 0; 24701544Seschrock spa->spa_scrub_thread = NULL; 24711544Seschrock cv_broadcast(&spa->spa_scrub_cv); 2472789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2473789Sahrens thread_exit(); 2474789Sahrens } 2475789Sahrens 2476789Sahrens void 2477789Sahrens spa_scrub_suspend(spa_t *spa) 2478789Sahrens { 2479789Sahrens mutex_enter(&spa->spa_scrub_lock); 24801544Seschrock spa->spa_scrub_suspended++; 2481789Sahrens while (spa->spa_scrub_active) { 2482789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2483789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2484789Sahrens } 2485789Sahrens while (spa->spa_scrub_inflight) 2486789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2487789Sahrens mutex_exit(&spa->spa_scrub_lock); 2488789Sahrens } 2489789Sahrens 2490789Sahrens void 2491789Sahrens spa_scrub_resume(spa_t *spa) 2492789Sahrens { 2493789Sahrens mutex_enter(&spa->spa_scrub_lock); 24941544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 24951544Seschrock if (--spa->spa_scrub_suspended == 0) 2496789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2497789Sahrens mutex_exit(&spa->spa_scrub_lock); 2498789Sahrens } 2499789Sahrens 2500789Sahrens void 2501789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 2502789Sahrens { 2503789Sahrens /* 2504789Sahrens * Something happened (e.g. snapshot create/delete) that means 2505789Sahrens * we must restart any in-progress scrubs. The itinerary will 2506789Sahrens * fix this properly. 2507789Sahrens */ 2508789Sahrens mutex_enter(&spa->spa_scrub_lock); 2509789Sahrens spa->spa_scrub_restart_txg = txg; 2510789Sahrens mutex_exit(&spa->spa_scrub_lock); 2511789Sahrens } 2512789Sahrens 25131544Seschrock int 25141544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 2515789Sahrens { 2516789Sahrens space_seg_t *ss; 2517789Sahrens uint64_t mintxg, maxtxg; 2518789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2519789Sahrens 2520789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 2521789Sahrens return (ENOTSUP); 2522789Sahrens 25231544Seschrock mutex_enter(&spa->spa_scrub_lock); 25241544Seschrock 2525789Sahrens /* 2526789Sahrens * If there's a scrub or resilver already in progress, stop it. 2527789Sahrens */ 2528789Sahrens while (spa->spa_scrub_thread != NULL) { 2529789Sahrens /* 2530789Sahrens * Don't stop a resilver unless forced. 2531789Sahrens */ 25321544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 25331544Seschrock mutex_exit(&spa->spa_scrub_lock); 2534789Sahrens return (EBUSY); 25351544Seschrock } 2536789Sahrens spa->spa_scrub_stop = 1; 2537789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2538789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2539789Sahrens } 2540789Sahrens 2541789Sahrens /* 2542789Sahrens * Terminate the previous traverse. 
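 * (traverse_fini() is understood to free the handle allocated by
 * traverse_init(); a restarted scrub always builds a fresh traverse
 * handle further below.)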
2543789Sahrens */ 2544789Sahrens if (spa->spa_scrub_th != NULL) { 2545789Sahrens traverse_fini(spa->spa_scrub_th); 2546789Sahrens spa->spa_scrub_th = NULL; 2547789Sahrens } 2548789Sahrens 25491544Seschrock if (rvd == NULL) { 25501544Seschrock ASSERT(spa->spa_scrub_stop == 0); 25511544Seschrock ASSERT(spa->spa_scrub_type == type); 25521544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 25531544Seschrock mutex_exit(&spa->spa_scrub_lock); 25541544Seschrock return (0); 25551544Seschrock } 2556789Sahrens 2557789Sahrens mintxg = TXG_INITIAL - 1; 2558789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 2559789Sahrens 25601544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 2561789Sahrens 25621544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 25631544Seschrock /* 25641544Seschrock * The pool-wide DTL is empty. 25651732Sbonwick * If this is a resilver, there's nothing to do except 25661732Sbonwick * check whether any in-progress replacements have completed. 25671544Seschrock */ 25681732Sbonwick if (type == POOL_SCRUB_RESILVER) { 25691544Seschrock type = POOL_SCRUB_NONE; 25701732Sbonwick spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); 25711732Sbonwick } 25721544Seschrock } else { 25731544Seschrock /* 25741544Seschrock * The pool-wide DTL is non-empty. 25751544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 25761544Seschrock */ 25771544Seschrock if (type == POOL_SCRUB_EVERYTHING) 25781544Seschrock type = POOL_SCRUB_RESILVER; 25791544Seschrock } 2580789Sahrens 25811544Seschrock if (type == POOL_SCRUB_RESILVER) { 2582789Sahrens /* 2583789Sahrens * Determine the resilvering boundaries. 2584789Sahrens * 2585789Sahrens * Note: (mintxg, maxtxg) is an open interval, 2586789Sahrens * i.e. mintxg and maxtxg themselves are not included. 2587789Sahrens * 2588789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 2589789Sahrens * so we don't claim to resilver a txg that's still changing. 
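 *
 * A worked example with illustrative numbers: if the pool-wide DTL
 * holds txgs 100..200 (one space map segment [100, 201)) and
 * spa_last_synced_txg(spa) is 150, then mintxg = 100 - 1 = 99 and
 * maxtxg = MIN(201, 151) = 151, so the traverse visits blocks born
 * in txgs 100..150 inclusive.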
2590789Sahrens */ 2591789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 25921544Seschrock mintxg = ss->ss_start - 1; 2593789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 25941544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 2595789Sahrens } 2596789Sahrens 25971544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 25981544Seschrock 25991544Seschrock spa->spa_scrub_stop = 0; 26001544Seschrock spa->spa_scrub_type = type; 26011544Seschrock spa->spa_scrub_restart_txg = 0; 26021544Seschrock 26031544Seschrock if (type != POOL_SCRUB_NONE) { 26041544Seschrock spa->spa_scrub_mintxg = mintxg; 2605789Sahrens spa->spa_scrub_maxtxg = maxtxg; 2606789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 26071635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 26081635Sbonwick ZIO_FLAG_CANFAIL); 2609789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 2610789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 2611789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 2612789Sahrens } 2613789Sahrens 26141544Seschrock mutex_exit(&spa->spa_scrub_lock); 26151544Seschrock 2616789Sahrens return (0); 2617789Sahrens } 2618789Sahrens 26191544Seschrock /* 26201544Seschrock * ========================================================================== 26211544Seschrock * SPA async task processing 26221544Seschrock * ========================================================================== 26231544Seschrock */ 26241544Seschrock 26251544Seschrock static void 26261544Seschrock spa_async_reopen(spa_t *spa) 2627789Sahrens { 26281544Seschrock vdev_t *rvd = spa->spa_root_vdev; 26291544Seschrock vdev_t *tvd; 26301544Seschrock int c; 26311544Seschrock 26321544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 26331544Seschrock 26341544Seschrock for (c = 0; c < rvd->vdev_children; c++) { 26351544Seschrock tvd = rvd->vdev_child[c]; 26361544Seschrock if (tvd->vdev_reopen_wanted) { 26371544Seschrock tvd->vdev_reopen_wanted = 0; 26381544Seschrock vdev_reopen(tvd); 26391544Seschrock } 26401544Seschrock } 2641789Sahrens 26421544Seschrock spa_config_exit(spa, FTAG); 26431544Seschrock } 26441544Seschrock 26451544Seschrock static void 26461544Seschrock spa_async_thread(spa_t *spa) 26471544Seschrock { 26481544Seschrock int tasks; 26491544Seschrock 26501544Seschrock ASSERT(spa->spa_sync_on); 2651789Sahrens 26521544Seschrock mutex_enter(&spa->spa_async_lock); 26531544Seschrock tasks = spa->spa_async_tasks; 26541544Seschrock spa->spa_async_tasks = 0; 26551544Seschrock mutex_exit(&spa->spa_async_lock); 26561544Seschrock 26571544Seschrock /* 26581635Sbonwick * See if the config needs to be updated. 26591635Sbonwick */ 26601635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 26611635Sbonwick mutex_enter(&spa_namespace_lock); 26621635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 26631635Sbonwick mutex_exit(&spa_namespace_lock); 26641635Sbonwick } 26651635Sbonwick 26661635Sbonwick /* 26671544Seschrock * See if any devices need to be reopened. 26681544Seschrock */ 26691544Seschrock if (tasks & SPA_ASYNC_REOPEN) 26701544Seschrock spa_async_reopen(spa); 26711544Seschrock 26721544Seschrock /* 26731544Seschrock * If any devices are done replacing, detach them. 26741544Seschrock */ 26751544Seschrock if (tasks & SPA_ASYNC_REPLACE_DONE) 2676789Sahrens spa_vdev_replace_done(spa); 2677789Sahrens 26781544Seschrock /* 26791544Seschrock * Kick off a scrub. 
26801544Seschrock */ 26811544Seschrock if (tasks & SPA_ASYNC_SCRUB) 26821544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 26831544Seschrock 26841544Seschrock /* 26851544Seschrock * Kick off a resilver. 26861544Seschrock */ 26871544Seschrock if (tasks & SPA_ASYNC_RESILVER) 26881544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 26891544Seschrock 26901544Seschrock /* 26911544Seschrock * Let the world know that we're done. 26921544Seschrock */ 26931544Seschrock mutex_enter(&spa->spa_async_lock); 26941544Seschrock spa->spa_async_thread = NULL; 26951544Seschrock cv_broadcast(&spa->spa_async_cv); 26961544Seschrock mutex_exit(&spa->spa_async_lock); 26971544Seschrock thread_exit(); 26981544Seschrock } 26991544Seschrock 27001544Seschrock void 27011544Seschrock spa_async_suspend(spa_t *spa) 27021544Seschrock { 27031544Seschrock mutex_enter(&spa->spa_async_lock); 27041544Seschrock spa->spa_async_suspended++; 27051544Seschrock while (spa->spa_async_thread != NULL) 27061544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 27071544Seschrock mutex_exit(&spa->spa_async_lock); 27081544Seschrock } 27091544Seschrock 27101544Seschrock void 27111544Seschrock spa_async_resume(spa_t *spa) 27121544Seschrock { 27131544Seschrock mutex_enter(&spa->spa_async_lock); 27141544Seschrock ASSERT(spa->spa_async_suspended != 0); 27151544Seschrock spa->spa_async_suspended--; 27161544Seschrock mutex_exit(&spa->spa_async_lock); 27171544Seschrock } 27181544Seschrock 27191544Seschrock static void 27201544Seschrock spa_async_dispatch(spa_t *spa) 27211544Seschrock { 27221544Seschrock mutex_enter(&spa->spa_async_lock); 27231544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 27241635Sbonwick spa->spa_async_thread == NULL && 27251635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 27261544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 27271544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 27281544Seschrock mutex_exit(&spa->spa_async_lock); 27291544Seschrock } 27301544Seschrock 27311544Seschrock void 27321544Seschrock spa_async_request(spa_t *spa, int task) 27331544Seschrock { 27341544Seschrock mutex_enter(&spa->spa_async_lock); 27351544Seschrock spa->spa_async_tasks |= task; 27361544Seschrock mutex_exit(&spa->spa_async_lock); 2737789Sahrens } 2738789Sahrens 2739789Sahrens /* 2740789Sahrens * ========================================================================== 2741789Sahrens * SPA syncing routines 2742789Sahrens * ========================================================================== 2743789Sahrens */ 2744789Sahrens 2745789Sahrens static void 2746789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2747789Sahrens { 2748789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2749789Sahrens dmu_tx_t *tx; 2750789Sahrens blkptr_t blk; 2751789Sahrens uint64_t itor = 0; 2752789Sahrens zio_t *zio; 2753789Sahrens int error; 2754789Sahrens uint8_t c = 1; 2755789Sahrens 2756789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2757789Sahrens 2758789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2759789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2760789Sahrens 2761789Sahrens error = zio_wait(zio); 2762789Sahrens ASSERT3U(error, ==, 0); 2763789Sahrens 2764789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2765789Sahrens bplist_vacate(bpl, tx); 2766789Sahrens 2767789Sahrens /* 2768789Sahrens * Pre-dirty the first block so we sync to convergence faster. 
2769789Sahrens * (Usually only the first block is needed.) 2770789Sahrens */ 2771789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2772789Sahrens dmu_tx_commit(tx); 2773789Sahrens } 2774789Sahrens 2775789Sahrens static void 27762082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 27772082Seschrock { 27782082Seschrock char *packed = NULL; 27792082Seschrock size_t nvsize = 0; 27802082Seschrock dmu_buf_t *db; 27812082Seschrock 27822082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 27832082Seschrock 27842082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 27852082Seschrock 27862082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 27872082Seschrock KM_SLEEP) == 0); 27882082Seschrock 27892082Seschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 27902082Seschrock 27912082Seschrock kmem_free(packed, nvsize); 27922082Seschrock 27932082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 27942082Seschrock dmu_buf_will_dirty(db, tx); 27952082Seschrock *(uint64_t *)db->db_data = nvsize; 27962082Seschrock dmu_buf_rele(db, FTAG); 27972082Seschrock } 27982082Seschrock 27992082Seschrock static void 28002082Seschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 28012082Seschrock { 28022082Seschrock nvlist_t *nvroot; 28032082Seschrock nvlist_t **spares; 28042082Seschrock int i; 28052082Seschrock 28062082Seschrock if (!spa->spa_sync_spares) 28072082Seschrock return; 28082082Seschrock 28092082Seschrock /* 28102082Seschrock * Update the MOS nvlist describing the list of available spares. 28112082Seschrock * spa_validate_spares() will have already made sure this nvlist is 28122082Seschrock * valid and the vdevs are labelled appropriately. 
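 * (The list lives in a DMU_OT_PACKED_NVLIST object referenced from
 * the pool directory under DMU_POOL_SPARES; spa_sync_nvlist() above
 * packs it with NV_ENCODE_XDR and records the packed size in the
 * object's bonus buffer.)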
28132082Seschrock */ 28142082Seschrock if (spa->spa_spares_object == 0) { 28152082Seschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 28162082Seschrock DMU_OT_PACKED_NVLIST, 1 << 14, 28172082Seschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 28182082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 28192082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 28202082Seschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 28212082Seschrock } 28222082Seschrock 28232082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 28242082Seschrock if (spa->spa_nspares == 0) { 28252082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 28262082Seschrock NULL, 0) == 0); 28272082Seschrock } else { 28282082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 28292082Seschrock KM_SLEEP); 28302082Seschrock for (i = 0; i < spa->spa_nspares; i++) 28312082Seschrock spares[i] = vdev_config_generate(spa, 28322082Seschrock spa->spa_spares[i], B_FALSE, B_TRUE); 28332082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 28342082Seschrock spares, spa->spa_nspares) == 0); 28352082Seschrock for (i = 0; i < spa->spa_nspares; i++) 28362082Seschrock nvlist_free(spares[i]); 28372082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 28382082Seschrock } 28392082Seschrock 28402082Seschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 28412926Sek110237 nvlist_free(nvroot); 28422082Seschrock 28432082Seschrock spa->spa_sync_spares = B_FALSE; 28442082Seschrock } 28452082Seschrock 28462082Seschrock static void 2847789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2848789Sahrens { 2849789Sahrens nvlist_t *config; 2850789Sahrens 2851789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2852789Sahrens return; 2853789Sahrens 2854789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2855789Sahrens 28561635Sbonwick if (spa->spa_config_syncing) 28571635Sbonwick nvlist_free(spa->spa_config_syncing); 28581635Sbonwick spa->spa_config_syncing = config; 2859789Sahrens 28602082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2861789Sahrens } 2862789Sahrens 28633912Slling static void 28643912Slling spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 28653912Slling { 28663912Slling spa_t *spa = arg1; 28673912Slling nvlist_t *nvp = arg2; 28683912Slling nvpair_t *nvpair; 28693912Slling objset_t *mos = spa->spa_meta_objset; 28703912Slling uint64_t zapobj; 28713912Slling 28723912Slling mutex_enter(&spa->spa_props_lock); 28733912Slling if (spa->spa_pool_props_object == 0) { 28743912Slling zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 28753912Slling VERIFY(zapobj > 0); 28763912Slling 28773912Slling spa->spa_pool_props_object = zapobj; 28783912Slling 28793912Slling VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 28803912Slling DMU_POOL_PROPS, 8, 1, 28813912Slling &spa->spa_pool_props_object, tx) == 0); 28823912Slling } 28833912Slling mutex_exit(&spa->spa_props_lock); 28843912Slling 28853912Slling nvpair = NULL; 28863912Slling while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 28873912Slling switch (zpool_name_to_prop(nvpair_name(nvpair))) { 28883912Slling case ZFS_PROP_BOOTFS: 28893912Slling VERIFY(nvlist_lookup_uint64(nvp, 28903912Slling nvpair_name(nvpair), &spa->spa_bootfs) == 0); 28913912Slling VERIFY(zap_update(mos, 28923912Slling spa->spa_pool_props_object, 28933912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 28943912Slling 
static void
spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	nvlist_t *nvp = arg2;
	nvpair_t *nvpair;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zapobj;

	mutex_enter(&spa->spa_props_lock);
	if (spa->spa_pool_props_object == 0) {
		zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx);
		VERIFY(zapobj > 0);

		spa->spa_pool_props_object = zapobj;

		VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_PROPS, 8, 1,
		    &spa->spa_pool_props_object, tx) == 0);
	}
	mutex_exit(&spa->spa_props_lock);

	nvpair = NULL;
	while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) {
		switch (zpool_name_to_prop(nvpair_name(nvpair))) {
		case ZFS_PROP_BOOTFS:
			VERIFY(nvlist_lookup_uint64(nvp,
			    nvpair_name(nvpair), &spa->spa_bootfs) == 0);
			VERIFY(zap_update(mos,
			    spa->spa_pool_props_object,
			    zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1,
			    &spa->spa_bootfs, tx) == 0);
			break;
		}
	}
}
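/*
 * Illustrative sketch with a hypothetical name, not part of the
 * original file: spa_sync_props() above stores each numeric property
 * as a single uint64 ZAP entry keyed by the property's name, so
 * reading one back is a plain zap_lookup() on spa_pool_props_object.
 */
static int
spa_prop_get_uint64_sketch(spa_t *spa, zfs_prop_t prop, uint64_t *valp)
{
	/* No props object means nothing has ever been set. */
	if (spa->spa_pool_props_object == 0)
		return (ENOENT);

	return (zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, valp));
}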
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
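/*
 * Usage sketch with a hypothetical name, not part of the original
 * file: spa_sync() is driven by the pool's sync thread, so outside
 * callers never invoke it directly.  Instead they wait for a txg to be
 * synced, exactly as spa_sync_allpools() below does for every pool.
 */
static void
spa_force_sync_sketch(spa_t *spa)
{
	/* txg 0 means "wait for everything open so far" */
	txg_wait_synced(spa_get_dsl(spa), 0);
}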
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}
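/*
 * Illustrative sketch with a hypothetical name, not part of the
 * original file: the hold-drop-reacquire pattern used by both
 * spa_sync_allpools() and spa_evict_all() above, reduced to its
 * skeleton.  The spa_open_ref() keeps the spa_t from disappearing
 * while spa_namespace_lock is dropped around the blocking work.
 */
static void
spa_foreach_sketch(void (*func)(spa_t *))
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		spa_open_ref(spa, FTAG);	/* pin the spa_t */
		mutex_exit(&spa_namespace_lock);
		func(spa);	/* may block or need the namespace lock */
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}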
void
spa_upgrade(spa_t *spa)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);

	spa->spa_uberblock.ub_version = ZFS_VERSION;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

int
spa_set_props(spa_t *spa, nvlist_t *nvp)
{
	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}
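/*
 * Usage sketch with a hypothetical name, not part of the original
 * file: a caller of spa_set_props() packs the desired properties into
 * an nvlist keyed by property name, and spa_sync_props() applies them
 * in syncing context.  For bootfs the value is assumed to already be a
 * dataset object number (name-to-objnum translation happens in the
 * caller).
 */
static int
spa_set_bootfs_sketch(spa_t *spa, uint64_t dsobj)
{
	nvlist_t *nvp;
	int error;

	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(nvp,
	    zpool_prop_to_name(ZFS_PROP_BOOTFS), dsobj) == 0);

	error = spa_set_props(spa, nvp);
	nvlist_free(nvp);

	return (error);
}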
int
spa_get_props(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	zfs_source_t src;
	zfs_prop_t prop;
	nvlist_t *propval;
	uint64_t value;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);
	/* If there is no props object, just return an empty nvlist. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {

		if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL)
			continue;

		VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		switch (za.za_integer_length) {
		case 8:
			if (zfs_prop_default_numeric(prop) ==
			    za.za_first_integer)
				src = ZFS_SRC_DEFAULT;
			else
				src = ZFS_SRC_LOCAL;
			value = za.za_first_integer;

			if (prop == ZFS_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;
				char strval[MAXPATHLEN];

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if ((err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) != 0) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);

				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_string(propval,
				    ZFS_PROP_VALUE, strval) == 0);
			} else {
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_SOURCE, src) == 0);
				VERIFY(nvlist_add_uint64(propval,
				    ZFS_PROP_VALUE, value) == 0);
			}
			VERIFY(nvlist_add_nvlist(*nvp, za.za_name,
			    propval) == 0);
			break;
		}
		nvlist_free(propval);
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}
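/*
 * Usage sketch with a hypothetical name, not part of the original
 * file: each entry returned by spa_get_props() is itself an nvlist
 * holding a ZFS_PROP_SOURCE pair and a ZFS_PROP_VALUE pair, so a
 * consumer walks it like this.  (The value lookup below only handles
 * the numeric case; bootfs stores its value as a string.)
 */
static void
spa_print_props_sketch(spa_t *spa)
{
	nvlist_t *nvp;
	nvpair_t *elem = NULL;

	if (spa_get_props(spa, &nvp) != 0)
		return;

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		nvlist_t *propval;
		uint64_t src, value;

		VERIFY(nvpair_value_nvlist(elem, &propval) == 0);
		VERIFY(nvlist_lookup_uint64(propval,
		    ZFS_PROP_SOURCE, &src) == 0);
		if (nvlist_lookup_uint64(propval,
		    ZFS_PROP_VALUE, &value) == 0)
			dprintf("%s = %llu (source %llu)\n",
			    nvpair_name(elem), (u_longlong_t)value,
			    (u_longlong_t)src);
	}
	nvlist_free(nvp);
}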