1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 233377Seschrock * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens /* 30789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32789Sahrens * pool. 
33789Sahrens */ 34789Sahrens 35789Sahrens #include <sys/zfs_context.h> 361544Seschrock #include <sys/fm/fs/zfs.h> 37789Sahrens #include <sys/spa_impl.h> 38789Sahrens #include <sys/zio.h> 39789Sahrens #include <sys/zio_checksum.h> 40789Sahrens #include <sys/zio_compress.h> 41789Sahrens #include <sys/dmu.h> 42789Sahrens #include <sys/dmu_tx.h> 43789Sahrens #include <sys/zap.h> 44789Sahrens #include <sys/zil.h> 45789Sahrens #include <sys/vdev_impl.h> 46789Sahrens #include <sys/metaslab.h> 47789Sahrens #include <sys/uberblock_impl.h> 48789Sahrens #include <sys/txg.h> 49789Sahrens #include <sys/avl.h> 50789Sahrens #include <sys/dmu_traverse.h> 513912Slling #include <sys/dmu_objset.h> 52789Sahrens #include <sys/unique.h> 53789Sahrens #include <sys/dsl_pool.h> 543912Slling #include <sys/dsl_dataset.h> 55789Sahrens #include <sys/dsl_dir.h> 56789Sahrens #include <sys/dsl_prop.h> 573912Slling #include <sys/dsl_synctask.h> 58789Sahrens #include <sys/fs/zfs.h> 59789Sahrens #include <sys/callb.h> 603975Sek110237 #include <sys/systeminfo.h> 613975Sek110237 #include <sys/sunddi.h> 62789Sahrens 632986Sek110237 int zio_taskq_threads = 8; 642986Sek110237 65789Sahrens /* 66789Sahrens * ========================================================================== 67789Sahrens * SPA state manipulation (open/create/destroy/import/export) 68789Sahrens * ========================================================================== 69789Sahrens */ 70789Sahrens 711544Seschrock static int 721544Seschrock spa_error_entry_compare(const void *a, const void *b) 731544Seschrock { 741544Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 751544Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 761544Seschrock int ret; 771544Seschrock 781544Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 791544Seschrock sizeof (zbookmark_t)); 801544Seschrock 811544Seschrock if (ret < 0) 821544Seschrock return (-1); 831544Seschrock else if (ret > 0) 841544Seschrock return (1); 851544Seschrock else 
861544Seschrock return (0); 871544Seschrock } 881544Seschrock 891544Seschrock /* 901544Seschrock * Utility function which retrieves copies of the current logs and 911544Seschrock * re-initializes them in the process. 921544Seschrock */ 931544Seschrock void 941544Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 951544Seschrock { 961544Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 971544Seschrock 981544Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 991544Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 1001544Seschrock 1011544Seschrock avl_create(&spa->spa_errlist_scrub, 1021544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1031544Seschrock offsetof(spa_error_entry_t, se_avl)); 1041544Seschrock avl_create(&spa->spa_errlist_last, 1051544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1061544Seschrock offsetof(spa_error_entry_t, se_avl)); 1071544Seschrock } 1081544Seschrock 109789Sahrens /* 110789Sahrens * Activate an uninitialized pool. 
111789Sahrens */ 112789Sahrens static void 113789Sahrens spa_activate(spa_t *spa) 114789Sahrens { 115789Sahrens int t; 116789Sahrens 117789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 118789Sahrens 119789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 120789Sahrens 121789Sahrens spa->spa_normal_class = metaslab_class_create(); 122789Sahrens 123789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 124789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 1252986Sek110237 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 126789Sahrens TASKQ_PREPOPULATE); 127789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 1282986Sek110237 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 129789Sahrens TASKQ_PREPOPULATE); 130789Sahrens } 131789Sahrens 132789Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 133789Sahrens 1342856Snd150628 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 1352856Snd150628 mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL); 1362856Snd150628 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 1372856Snd150628 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 1382856Snd150628 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 1392856Snd150628 mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL); 1402856Snd150628 mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); 1412926Sek110237 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 1423912Slling mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 1432856Snd150628 144789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 145789Sahrens offsetof(vdev_t, vdev_dirty_node)); 146789Sahrens 147789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 148789Sahrens offsetof(struct vdev, vdev_txg_node)); 1491544Seschrock 1501544Seschrock avl_create(&spa->spa_errlist_scrub, 1511544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1521544Seschrock 
offsetof(spa_error_entry_t, se_avl)); 1531544Seschrock avl_create(&spa->spa_errlist_last, 1541544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 1551544Seschrock offsetof(spa_error_entry_t, se_avl)); 156789Sahrens } 157789Sahrens 158789Sahrens /* 159789Sahrens * Opposite of spa_activate(). 160789Sahrens */ 161789Sahrens static void 162789Sahrens spa_deactivate(spa_t *spa) 163789Sahrens { 164789Sahrens int t; 165789Sahrens 166789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 167789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 168789Sahrens ASSERT(spa->spa_root_vdev == NULL); 169789Sahrens 170789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 171789Sahrens 172789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 173789Sahrens 174789Sahrens list_destroy(&spa->spa_dirty_list); 175789Sahrens 176789Sahrens rw_destroy(&spa->spa_traverse_lock); 177789Sahrens 178789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 179789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 180789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 181789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 182789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 183789Sahrens } 184789Sahrens 185789Sahrens metaslab_class_destroy(spa->spa_normal_class); 186789Sahrens spa->spa_normal_class = NULL; 187789Sahrens 1881544Seschrock /* 1891544Seschrock * If this was part of an import or the open otherwise failed, we may 1901544Seschrock * still have errors left in the queues. Empty them just in case. 1911544Seschrock */ 1921544Seschrock spa_errlog_drain(spa); 1931544Seschrock 1941544Seschrock avl_destroy(&spa->spa_errlist_scrub); 1951544Seschrock avl_destroy(&spa->spa_errlist_last); 1961544Seschrock 197789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 198789Sahrens } 199789Sahrens 200789Sahrens /* 201789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. 
This 202789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 203789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 204789Sahrens * All vdev validation is done by the vdev_alloc() routine. 205789Sahrens */ 2062082Seschrock static int 2072082Seschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 2082082Seschrock uint_t id, int atype) 209789Sahrens { 210789Sahrens nvlist_t **child; 211789Sahrens uint_t c, children; 2122082Seschrock int error; 2132082Seschrock 2142082Seschrock if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 2152082Seschrock return (error); 2162082Seschrock 2172082Seschrock if ((*vdp)->vdev_ops->vdev_op_leaf) 2182082Seschrock return (0); 219789Sahrens 220789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 221789Sahrens &child, &children) != 0) { 2222082Seschrock vdev_free(*vdp); 2232082Seschrock *vdp = NULL; 2242082Seschrock return (EINVAL); 225789Sahrens } 226789Sahrens 227789Sahrens for (c = 0; c < children; c++) { 2282082Seschrock vdev_t *vd; 2292082Seschrock if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 2302082Seschrock atype)) != 0) { 2312082Seschrock vdev_free(*vdp); 2322082Seschrock *vdp = NULL; 2332082Seschrock return (error); 234789Sahrens } 235789Sahrens } 236789Sahrens 2372082Seschrock ASSERT(*vdp != NULL); 2382082Seschrock 2392082Seschrock return (0); 240789Sahrens } 241789Sahrens 242789Sahrens /* 243789Sahrens * Opposite of spa_load(). 244789Sahrens */ 245789Sahrens static void 246789Sahrens spa_unload(spa_t *spa) 247789Sahrens { 2482082Seschrock int i; 2492082Seschrock 250789Sahrens /* 2511544Seschrock * Stop async tasks. 2521544Seschrock */ 2531544Seschrock spa_async_suspend(spa); 2541544Seschrock 2551544Seschrock /* 256789Sahrens * Stop syncing. 
257789Sahrens */ 258789Sahrens if (spa->spa_sync_on) { 259789Sahrens txg_sync_stop(spa->spa_dsl_pool); 260789Sahrens spa->spa_sync_on = B_FALSE; 261789Sahrens } 262789Sahrens 263789Sahrens /* 264789Sahrens * Wait for any outstanding prefetch I/O to complete. 265789Sahrens */ 2661544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 2671544Seschrock spa_config_exit(spa, FTAG); 268789Sahrens 269789Sahrens /* 270789Sahrens * Close the dsl pool. 271789Sahrens */ 272789Sahrens if (spa->spa_dsl_pool) { 273789Sahrens dsl_pool_close(spa->spa_dsl_pool); 274789Sahrens spa->spa_dsl_pool = NULL; 275789Sahrens } 276789Sahrens 277789Sahrens /* 278789Sahrens * Close all vdevs. 279789Sahrens */ 2801585Sbonwick if (spa->spa_root_vdev) 281789Sahrens vdev_free(spa->spa_root_vdev); 2821585Sbonwick ASSERT(spa->spa_root_vdev == NULL); 2831544Seschrock 2842082Seschrock for (i = 0; i < spa->spa_nspares; i++) 2852082Seschrock vdev_free(spa->spa_spares[i]); 2862082Seschrock if (spa->spa_spares) { 2872082Seschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 2882082Seschrock spa->spa_spares = NULL; 2892082Seschrock } 2902082Seschrock if (spa->spa_sparelist) { 2912082Seschrock nvlist_free(spa->spa_sparelist); 2922082Seschrock spa->spa_sparelist = NULL; 2932082Seschrock } 2942082Seschrock 2951544Seschrock spa->spa_async_suspended = 0; 296789Sahrens } 297789Sahrens 298789Sahrens /* 2992082Seschrock * Load (or re-load) the current list of vdevs describing the active spares for 3002082Seschrock * this pool. When this is called, we have some form of basic information in 3012082Seschrock * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 3022082Seschrock * re-generate a more complete list including status information. 
3032082Seschrock */ 3042082Seschrock static void 3052082Seschrock spa_load_spares(spa_t *spa) 3062082Seschrock { 3072082Seschrock nvlist_t **spares; 3082082Seschrock uint_t nspares; 3092082Seschrock int i; 3103377Seschrock vdev_t *vd, *tvd; 3112082Seschrock 3122082Seschrock /* 3132082Seschrock * First, close and free any existing spare vdevs. 3142082Seschrock */ 3152082Seschrock for (i = 0; i < spa->spa_nspares; i++) { 3163377Seschrock vd = spa->spa_spares[i]; 3173377Seschrock 3183377Seschrock /* Undo the call to spa_activate() below */ 3193377Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 3203377Seschrock tvd->vdev_isspare) 3213377Seschrock spa_spare_remove(tvd); 3223377Seschrock vdev_close(vd); 3233377Seschrock vdev_free(vd); 3242082Seschrock } 3253377Seschrock 3262082Seschrock if (spa->spa_spares) 3272082Seschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 3282082Seschrock 3292082Seschrock if (spa->spa_sparelist == NULL) 3302082Seschrock nspares = 0; 3312082Seschrock else 3322082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 3332082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3342082Seschrock 3352082Seschrock spa->spa_nspares = (int)nspares; 3362082Seschrock spa->spa_spares = NULL; 3372082Seschrock 3382082Seschrock if (nspares == 0) 3392082Seschrock return; 3402082Seschrock 3412082Seschrock /* 3422082Seschrock * Construct the array of vdevs, opening them to get status in the 3433377Seschrock * process. For each spare, there is potentially two different vdev_t 3443377Seschrock * structures associated with it: one in the list of spares (used only 3453377Seschrock * for basic validation purposes) and one in the active vdev 3463377Seschrock * configuration (if it's spared in). During this phase we open and 3473377Seschrock * validate each vdev on the spare list. If the vdev also exists in the 3483377Seschrock * active configuration, then we also mark this vdev as an active spare. 
3492082Seschrock */ 3502082Seschrock spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 3512082Seschrock for (i = 0; i < spa->spa_nspares; i++) { 3522082Seschrock VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 3532082Seschrock VDEV_ALLOC_SPARE) == 0); 3542082Seschrock ASSERT(vd != NULL); 3552082Seschrock 3562082Seschrock spa->spa_spares[i] = vd; 3572082Seschrock 3583377Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 3593377Seschrock if (!tvd->vdev_isspare) 3603377Seschrock spa_spare_add(tvd); 3613377Seschrock 3623377Seschrock /* 3633377Seschrock * We only mark the spare active if we were successfully 3643377Seschrock * able to load the vdev. Otherwise, importing a pool 3653377Seschrock * with a bad active spare would result in strange 3663377Seschrock * behavior, because multiple pool would think the spare 3673377Seschrock * is actively in use. 3683377Seschrock * 3693377Seschrock * There is a vulnerability here to an equally bizarre 3703377Seschrock * circumstance, where a dead active spare is later 3713377Seschrock * brought back to life (onlined or otherwise). Given 3723377Seschrock * the rarity of this scenario, and the extra complexity 3733377Seschrock * it adds, we ignore the possibility. 3743377Seschrock */ 3753377Seschrock if (!vdev_is_dead(tvd)) 3763377Seschrock spa_spare_activate(tvd); 3773377Seschrock } 3783377Seschrock 3792082Seschrock if (vdev_open(vd) != 0) 3802082Seschrock continue; 3812082Seschrock 3822082Seschrock vd->vdev_top = vd; 3832082Seschrock (void) vdev_validate_spare(vd); 3842082Seschrock } 3852082Seschrock 3862082Seschrock /* 3872082Seschrock * Recompute the stashed list of spares, with status information 3882082Seschrock * this time. 
3892082Seschrock */ 3902082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 3912082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 3922082Seschrock 3932082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 3942082Seschrock for (i = 0; i < spa->spa_nspares; i++) 3952082Seschrock spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 3962082Seschrock B_TRUE, B_TRUE); 3972082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 3982082Seschrock spares, spa->spa_nspares) == 0); 3992082Seschrock for (i = 0; i < spa->spa_nspares; i++) 4002082Seschrock nvlist_free(spares[i]); 4012082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 4022082Seschrock } 4032082Seschrock 4042082Seschrock static int 4052082Seschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 4062082Seschrock { 4072082Seschrock dmu_buf_t *db; 4082082Seschrock char *packed = NULL; 4092082Seschrock size_t nvsize = 0; 4102082Seschrock int error; 4112082Seschrock *value = NULL; 4122082Seschrock 4132082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 4142082Seschrock nvsize = *(uint64_t *)db->db_data; 4152082Seschrock dmu_buf_rele(db, FTAG); 4162082Seschrock 4172082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 4182082Seschrock error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 4192082Seschrock if (error == 0) 4202082Seschrock error = nvlist_unpack(packed, nvsize, value, 0); 4212082Seschrock kmem_free(packed, nvsize); 4222082Seschrock 4232082Seschrock return (error); 4242082Seschrock } 4252082Seschrock 4262082Seschrock /* 427789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 4281544Seschrock * source of configuration information. 
429789Sahrens */ 430789Sahrens static int 4311544Seschrock spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 432789Sahrens { 433789Sahrens int error = 0; 434789Sahrens nvlist_t *nvroot = NULL; 435789Sahrens vdev_t *rvd; 436789Sahrens uberblock_t *ub = &spa->spa_uberblock; 4371635Sbonwick uint64_t config_cache_txg = spa->spa_config_txg; 438789Sahrens uint64_t pool_guid; 4392082Seschrock uint64_t version; 440789Sahrens zio_t *zio; 441789Sahrens 4421544Seschrock spa->spa_load_state = state; 4431635Sbonwick 444789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 4451733Sbonwick nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 4461544Seschrock error = EINVAL; 4471544Seschrock goto out; 4481544Seschrock } 449789Sahrens 4502082Seschrock /* 4512082Seschrock * Versioning wasn't explicitly added to the label until later, so if 4522082Seschrock * it's not present treat it as the initial version. 4532082Seschrock */ 4542082Seschrock if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 4552082Seschrock version = ZFS_VERSION_INITIAL; 4562082Seschrock 4571733Sbonwick (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4581733Sbonwick &spa->spa_config_txg); 4591733Sbonwick 4601635Sbonwick if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 4611544Seschrock spa_guid_exists(pool_guid, 0)) { 4621544Seschrock error = EEXIST; 4631544Seschrock goto out; 4641544Seschrock } 465789Sahrens 4662174Seschrock spa->spa_load_guid = pool_guid; 4672174Seschrock 468789Sahrens /* 4692082Seschrock * Parse the configuration into a vdev tree. We explicitly set the 4702082Seschrock * value that will be returned by spa_version() since parsing the 4712082Seschrock * configuration requires knowing the version number. 
472789Sahrens */ 4731544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 4742082Seschrock spa->spa_ubsync.ub_version = version; 4752082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 4761544Seschrock spa_config_exit(spa, FTAG); 477789Sahrens 4782082Seschrock if (error != 0) 4791544Seschrock goto out; 480789Sahrens 4811585Sbonwick ASSERT(spa->spa_root_vdev == rvd); 482789Sahrens ASSERT(spa_guid(spa) == pool_guid); 483789Sahrens 484789Sahrens /* 485789Sahrens * Try to open all vdevs, loading each label in the process. 486789Sahrens */ 4874070Smc142369 error = vdev_open(rvd); 4884070Smc142369 if (error != 0) 4891544Seschrock goto out; 490789Sahrens 491789Sahrens /* 4921986Seschrock * Validate the labels for all leaf vdevs. We need to grab the config 4931986Seschrock * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 4941986Seschrock * flag. 4951986Seschrock */ 4961986Seschrock spa_config_enter(spa, RW_READER, FTAG); 4971986Seschrock error = vdev_validate(rvd); 4981986Seschrock spa_config_exit(spa, FTAG); 4991986Seschrock 5004070Smc142369 if (error != 0) 5011986Seschrock goto out; 5021986Seschrock 5031986Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 5041986Seschrock error = ENXIO; 5051986Seschrock goto out; 5061986Seschrock } 5071986Seschrock 5081986Seschrock /* 509789Sahrens * Find the best uberblock. 510789Sahrens */ 511789Sahrens bzero(ub, sizeof (uberblock_t)); 512789Sahrens 513789Sahrens zio = zio_root(spa, NULL, NULL, 514789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 515789Sahrens vdev_uberblock_load(zio, rvd, ub); 516789Sahrens error = zio_wait(zio); 517789Sahrens 518789Sahrens /* 519789Sahrens * If we weren't able to find a single valid uberblock, return failure. 
520789Sahrens */ 521789Sahrens if (ub->ub_txg == 0) { 5221760Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5231760Seschrock VDEV_AUX_CORRUPT_DATA); 5241544Seschrock error = ENXIO; 5251544Seschrock goto out; 5261544Seschrock } 5271544Seschrock 5281544Seschrock /* 5291544Seschrock * If the pool is newer than the code, we can't open it. 5301544Seschrock */ 5311760Seschrock if (ub->ub_version > ZFS_VERSION) { 5321760Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5331760Seschrock VDEV_AUX_VERSION_NEWER); 5341544Seschrock error = ENOTSUP; 5351544Seschrock goto out; 536789Sahrens } 537789Sahrens 538789Sahrens /* 539789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 540789Sahrens * incomplete configuration. 541789Sahrens */ 5421732Sbonwick if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 5431544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5441544Seschrock VDEV_AUX_BAD_GUID_SUM); 5451544Seschrock error = ENXIO; 5461544Seschrock goto out; 547789Sahrens } 548789Sahrens 549789Sahrens /* 550789Sahrens * Initialize internal SPA structures. 
551789Sahrens */ 552789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 553789Sahrens spa->spa_ubsync = spa->spa_uberblock; 554789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 5551544Seschrock error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 5561544Seschrock if (error) { 5571544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5581544Seschrock VDEV_AUX_CORRUPT_DATA); 5591544Seschrock goto out; 5601544Seschrock } 561789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 562789Sahrens 5631544Seschrock if (zap_lookup(spa->spa_meta_objset, 564789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 5651544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 5661544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5671544Seschrock VDEV_AUX_CORRUPT_DATA); 5681544Seschrock error = EIO; 5691544Seschrock goto out; 5701544Seschrock } 571789Sahrens 572789Sahrens if (!mosconfig) { 5732082Seschrock nvlist_t *newconfig; 5743975Sek110237 uint64_t hostid; 5752082Seschrock 5762082Seschrock if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 5771544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 5781544Seschrock VDEV_AUX_CORRUPT_DATA); 5791544Seschrock error = EIO; 5801544Seschrock goto out; 5811544Seschrock } 582789Sahrens 5833975Sek110237 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 5843975Sek110237 &hostid) == 0) { 5853975Sek110237 char *hostname; 5863975Sek110237 unsigned long myhostid = 0; 5873975Sek110237 5883975Sek110237 VERIFY(nvlist_lookup_string(newconfig, 5893975Sek110237 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 5903975Sek110237 5913975Sek110237 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 592*4178Slling if (hostid != 0 && myhostid != 0 && 593*4178Slling (unsigned long)hostid != myhostid) { 5943975Sek110237 cmn_err(CE_WARN, "pool '%s' could not be " 5953975Sek110237 "loaded as it was last accessed by " 5963975Sek110237 "another system (host: %s hostid: 
0x%lx). " 5973975Sek110237 "See: http://www.sun.com/msg/ZFS-8000-EY", 5983975Sek110237 spa->spa_name, hostname, 5993975Sek110237 (unsigned long)hostid); 6003975Sek110237 error = EBADF; 6013975Sek110237 goto out; 6023975Sek110237 } 6033975Sek110237 } 6043975Sek110237 605789Sahrens spa_config_set(spa, newconfig); 606789Sahrens spa_unload(spa); 607789Sahrens spa_deactivate(spa); 608789Sahrens spa_activate(spa); 609789Sahrens 6101544Seschrock return (spa_load(spa, newconfig, state, B_TRUE)); 6111544Seschrock } 6121544Seschrock 6131544Seschrock if (zap_lookup(spa->spa_meta_objset, 6141544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 6151544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 6161544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6171544Seschrock VDEV_AUX_CORRUPT_DATA); 6181544Seschrock error = EIO; 6191544Seschrock goto out; 620789Sahrens } 621789Sahrens 6221544Seschrock /* 6232082Seschrock * Load the bit that tells us to use the new accounting function 6242082Seschrock * (raid-z deflation). If we have an older pool, this will not 6252082Seschrock * be present. 6262082Seschrock */ 6272082Seschrock error = zap_lookup(spa->spa_meta_objset, 6282082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6292082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate); 6302082Seschrock if (error != 0 && error != ENOENT) { 6312082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6322082Seschrock VDEV_AUX_CORRUPT_DATA); 6332082Seschrock error = EIO; 6342082Seschrock goto out; 6352082Seschrock } 6362082Seschrock 6372082Seschrock /* 6381544Seschrock * Load the persistent error log. If we have an older pool, this will 6391544Seschrock * not be present. 
6401544Seschrock */ 6411544Seschrock error = zap_lookup(spa->spa_meta_objset, 6421544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 6431544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_last); 6441807Sbonwick if (error != 0 && error != ENOENT) { 6451544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6461544Seschrock VDEV_AUX_CORRUPT_DATA); 6471544Seschrock error = EIO; 6481544Seschrock goto out; 6491544Seschrock } 6501544Seschrock 6511544Seschrock error = zap_lookup(spa->spa_meta_objset, 6521544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 6531544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 6541544Seschrock if (error != 0 && error != ENOENT) { 6551544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6561544Seschrock VDEV_AUX_CORRUPT_DATA); 6571544Seschrock error = EIO; 6581544Seschrock goto out; 6591544Seschrock } 660789Sahrens 661789Sahrens /* 6622926Sek110237 * Load the history object. If we have an older pool, this 6632926Sek110237 * will not be present. 6642926Sek110237 */ 6652926Sek110237 error = zap_lookup(spa->spa_meta_objset, 6662926Sek110237 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 6672926Sek110237 sizeof (uint64_t), 1, &spa->spa_history); 6682926Sek110237 if (error != 0 && error != ENOENT) { 6692926Sek110237 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6702926Sek110237 VDEV_AUX_CORRUPT_DATA); 6712926Sek110237 error = EIO; 6722926Sek110237 goto out; 6732926Sek110237 } 6742926Sek110237 6752926Sek110237 /* 6762082Seschrock * Load any hot spares for this pool. 
6772082Seschrock */ 6782082Seschrock error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6792082Seschrock DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 6802082Seschrock if (error != 0 && error != ENOENT) { 6812082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6822082Seschrock VDEV_AUX_CORRUPT_DATA); 6832082Seschrock error = EIO; 6842082Seschrock goto out; 6852082Seschrock } 6862082Seschrock if (error == 0) { 6872082Seschrock ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES); 6882082Seschrock if (load_nvlist(spa, spa->spa_spares_object, 6892082Seschrock &spa->spa_sparelist) != 0) { 6902082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 6912082Seschrock VDEV_AUX_CORRUPT_DATA); 6922082Seschrock error = EIO; 6932082Seschrock goto out; 6942082Seschrock } 6952082Seschrock 6962082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 6972082Seschrock spa_load_spares(spa); 6982082Seschrock spa_config_exit(spa, FTAG); 6992082Seschrock } 7002082Seschrock 7013912Slling error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 7023912Slling DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 7033912Slling 7043912Slling if (error && error != ENOENT) { 7053912Slling vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 7063912Slling VDEV_AUX_CORRUPT_DATA); 7073912Slling error = EIO; 7083912Slling goto out; 7093912Slling } 7103912Slling 7113912Slling if (error == 0) { 7123912Slling (void) zap_lookup(spa->spa_meta_objset, 7133912Slling spa->spa_pool_props_object, 7143912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), 7153912Slling sizeof (uint64_t), 1, &spa->spa_bootfs); 7163912Slling } 7173912Slling 7182082Seschrock /* 7191986Seschrock * Load the vdev state for all toplevel vdevs. 720789Sahrens */ 7211986Seschrock vdev_load(rvd); 722789Sahrens 723789Sahrens /* 724789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 
725789Sahrens */ 7261544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 727789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 7281544Seschrock spa_config_exit(spa, FTAG); 729789Sahrens 730789Sahrens /* 731789Sahrens * Check the state of the root vdev. If it can't be opened, it 732789Sahrens * indicates one or more toplevel vdevs are faulted. 733789Sahrens */ 7341544Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 7351544Seschrock error = ENXIO; 7361544Seschrock goto out; 7371544Seschrock } 738789Sahrens 7391544Seschrock if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 7401635Sbonwick dmu_tx_t *tx; 7411635Sbonwick int need_update = B_FALSE; 7421585Sbonwick int c; 7431601Sbonwick 7441635Sbonwick /* 7451635Sbonwick * Claim log blocks that haven't been committed yet. 7461635Sbonwick * This must all happen in a single txg. 7471635Sbonwick */ 7481601Sbonwick tx = dmu_tx_create_assigned(spa_get_dsl(spa), 749789Sahrens spa_first_txg(spa)); 7502417Sahrens (void) dmu_objset_find(spa->spa_name, 7512417Sahrens zil_claim, tx, DS_FIND_CHILDREN); 752789Sahrens dmu_tx_commit(tx); 753789Sahrens 754789Sahrens spa->spa_sync_on = B_TRUE; 755789Sahrens txg_sync_start(spa->spa_dsl_pool); 756789Sahrens 757789Sahrens /* 758789Sahrens * Wait for all claims to sync. 759789Sahrens */ 760789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 7611585Sbonwick 7621585Sbonwick /* 7631635Sbonwick * If the config cache is stale, or we have uninitialized 7641635Sbonwick * metaslabs (see spa_vdev_add()), then update the config. 
7651585Sbonwick */ 7661635Sbonwick if (config_cache_txg != spa->spa_config_txg || 7671635Sbonwick state == SPA_LOAD_IMPORT) 7681635Sbonwick need_update = B_TRUE; 7691635Sbonwick 7701635Sbonwick for (c = 0; c < rvd->vdev_children; c++) 7711635Sbonwick if (rvd->vdev_child[c]->vdev_ms_array == 0) 7721635Sbonwick need_update = B_TRUE; 7731585Sbonwick 7741585Sbonwick /* 7751635Sbonwick * Update the config cache asychronously in case we're the 7761635Sbonwick * root pool, in which case the config cache isn't writable yet. 7771585Sbonwick */ 7781635Sbonwick if (need_update) 7791635Sbonwick spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 780789Sahrens } 781789Sahrens 7821544Seschrock error = 0; 7831544Seschrock out: 7842082Seschrock if (error && error != EBADF) 7851544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 7861544Seschrock spa->spa_load_state = SPA_LOAD_NONE; 7871544Seschrock spa->spa_ena = 0; 7881544Seschrock 7891544Seschrock return (error); 790789Sahrens } 791789Sahrens 792789Sahrens /* 793789Sahrens * Pool Open/Import 794789Sahrens * 795789Sahrens * The import case is identical to an open except that the configuration is sent 796789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 797789Sahrens * case of an open, the pool configuration will exist in the 798789Sahrens * POOL_STATE_UNITIALIZED state. 799789Sahrens * 800789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 801789Sahrens * the same time open the pool, without having to keep around the spa_t in some 802789Sahrens * ambiguous state. 
803789Sahrens */ 804789Sahrens static int 805789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 806789Sahrens { 807789Sahrens spa_t *spa; 808789Sahrens int error; 809789Sahrens int loaded = B_FALSE; 810789Sahrens int locked = B_FALSE; 811789Sahrens 812789Sahrens *spapp = NULL; 813789Sahrens 814789Sahrens /* 815789Sahrens * As disgusting as this is, we need to support recursive calls to this 816789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 817789Sahrens * up calling spa_open() again. The real fix is to figure out how to 818789Sahrens * avoid dsl_dir_open() calling this in the first place. 819789Sahrens */ 820789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 821789Sahrens mutex_enter(&spa_namespace_lock); 822789Sahrens locked = B_TRUE; 823789Sahrens } 824789Sahrens 825789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 826789Sahrens if (locked) 827789Sahrens mutex_exit(&spa_namespace_lock); 828789Sahrens return (ENOENT); 829789Sahrens } 830789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 831789Sahrens 832789Sahrens spa_activate(spa); 833789Sahrens 8341635Sbonwick error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 835789Sahrens 836789Sahrens if (error == EBADF) { 837789Sahrens /* 8381986Seschrock * If vdev_validate() returns failure (indicated by 8391986Seschrock * EBADF), it indicates that one of the vdevs indicates 8401986Seschrock * that the pool has been exported or destroyed. If 8411986Seschrock * this is the case, the config cache is out of sync and 8421986Seschrock * we should remove the pool from the namespace. 
843789Sahrens */ 8442082Seschrock zfs_post_ok(spa, NULL); 845789Sahrens spa_unload(spa); 846789Sahrens spa_deactivate(spa); 847789Sahrens spa_remove(spa); 848789Sahrens spa_config_sync(); 849789Sahrens if (locked) 850789Sahrens mutex_exit(&spa_namespace_lock); 851789Sahrens return (ENOENT); 8521544Seschrock } 8531544Seschrock 8541544Seschrock if (error) { 855789Sahrens /* 856789Sahrens * We can't open the pool, but we still have useful 857789Sahrens * information: the state of each vdev after the 858789Sahrens * attempted vdev_open(). Return this to the user. 859789Sahrens */ 8601635Sbonwick if (config != NULL && spa->spa_root_vdev != NULL) { 8611635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 862789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 863789Sahrens B_TRUE); 8641635Sbonwick spa_config_exit(spa, FTAG); 8651635Sbonwick } 866789Sahrens spa_unload(spa); 867789Sahrens spa_deactivate(spa); 8681544Seschrock spa->spa_last_open_failed = B_TRUE; 869789Sahrens if (locked) 870789Sahrens mutex_exit(&spa_namespace_lock); 871789Sahrens *spapp = NULL; 872789Sahrens return (error); 8731544Seschrock } else { 8741544Seschrock zfs_post_ok(spa, NULL); 8751544Seschrock spa->spa_last_open_failed = B_FALSE; 876789Sahrens } 877789Sahrens 878789Sahrens loaded = B_TRUE; 879789Sahrens } 880789Sahrens 881789Sahrens spa_open_ref(spa, tag); 882789Sahrens if (locked) 883789Sahrens mutex_exit(&spa_namespace_lock); 884789Sahrens 885789Sahrens *spapp = spa; 886789Sahrens 887789Sahrens if (config != NULL) { 8881544Seschrock spa_config_enter(spa, RW_READER, FTAG); 889789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 8901544Seschrock spa_config_exit(spa, FTAG); 891789Sahrens } 892789Sahrens 893789Sahrens /* 894789Sahrens * If we just loaded the pool, resilver anything that's out of date. 
895789Sahrens */ 896789Sahrens if (loaded && (spa_mode & FWRITE)) 897789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 898789Sahrens 899789Sahrens return (0); 900789Sahrens } 901789Sahrens 902789Sahrens int 903789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 904789Sahrens { 905789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 906789Sahrens } 907789Sahrens 9081544Seschrock /* 9091544Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 9101544Seschrock * preventing it from being exported or destroyed. 9111544Seschrock */ 9121544Seschrock spa_t * 9131544Seschrock spa_inject_addref(char *name) 9141544Seschrock { 9151544Seschrock spa_t *spa; 9161544Seschrock 9171544Seschrock mutex_enter(&spa_namespace_lock); 9181544Seschrock if ((spa = spa_lookup(name)) == NULL) { 9191544Seschrock mutex_exit(&spa_namespace_lock); 9201544Seschrock return (NULL); 9211544Seschrock } 9221544Seschrock spa->spa_inject_ref++; 9231544Seschrock mutex_exit(&spa_namespace_lock); 9241544Seschrock 9251544Seschrock return (spa); 9261544Seschrock } 9271544Seschrock 9281544Seschrock void 9291544Seschrock spa_inject_delref(spa_t *spa) 9301544Seschrock { 9311544Seschrock mutex_enter(&spa_namespace_lock); 9321544Seschrock spa->spa_inject_ref--; 9331544Seschrock mutex_exit(&spa_namespace_lock); 9341544Seschrock } 9351544Seschrock 9362082Seschrock static void 9372082Seschrock spa_add_spares(spa_t *spa, nvlist_t *config) 9382082Seschrock { 9392082Seschrock nvlist_t **spares; 9402082Seschrock uint_t i, nspares; 9412082Seschrock nvlist_t *nvroot; 9422082Seschrock uint64_t guid; 9432082Seschrock vdev_stat_t *vs; 9442082Seschrock uint_t vsc; 9453377Seschrock uint64_t pool; 9462082Seschrock 9472082Seschrock if (spa->spa_nspares == 0) 9482082Seschrock return; 9492082Seschrock 9502082Seschrock VERIFY(nvlist_lookup_nvlist(config, 9512082Seschrock ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 9522082Seschrock 
VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 9532082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 9542082Seschrock if (nspares != 0) { 9552082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, 9562082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 9572082Seschrock VERIFY(nvlist_lookup_nvlist_array(nvroot, 9582082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 9592082Seschrock 9602082Seschrock /* 9612082Seschrock * Go through and find any spares which have since been 9622082Seschrock * repurposed as an active spare. If this is the case, update 9632082Seschrock * their status appropriately. 9642082Seschrock */ 9652082Seschrock for (i = 0; i < nspares; i++) { 9662082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 9672082Seschrock ZPOOL_CONFIG_GUID, &guid) == 0); 9683377Seschrock if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 9692082Seschrock VERIFY(nvlist_lookup_uint64_array( 9702082Seschrock spares[i], ZPOOL_CONFIG_STATS, 9712082Seschrock (uint64_t **)&vs, &vsc) == 0); 9722082Seschrock vs->vs_state = VDEV_STATE_CANT_OPEN; 9732082Seschrock vs->vs_aux = VDEV_AUX_SPARED; 9742082Seschrock } 9752082Seschrock } 9762082Seschrock } 9772082Seschrock } 9782082Seschrock 979789Sahrens int 9801544Seschrock spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 981789Sahrens { 982789Sahrens int error; 983789Sahrens spa_t *spa; 984789Sahrens 985789Sahrens *config = NULL; 986789Sahrens error = spa_open_common(name, &spa, FTAG, config); 987789Sahrens 9882082Seschrock if (spa && *config != NULL) { 9891544Seschrock VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 9901544Seschrock spa_get_errlog_size(spa)) == 0); 9911544Seschrock 9922082Seschrock spa_add_spares(spa, *config); 9932082Seschrock } 9942082Seschrock 9951544Seschrock /* 9961544Seschrock * We want to get the alternate root even for faulted pools, so we cheat 9971544Seschrock * and call spa_lookup() directly. 
9981544Seschrock */ 9991544Seschrock if (altroot) { 10001544Seschrock if (spa == NULL) { 10011544Seschrock mutex_enter(&spa_namespace_lock); 10021544Seschrock spa = spa_lookup(name); 10031544Seschrock if (spa) 10041544Seschrock spa_altroot(spa, altroot, buflen); 10051544Seschrock else 10061544Seschrock altroot[0] = '\0'; 10071544Seschrock spa = NULL; 10081544Seschrock mutex_exit(&spa_namespace_lock); 10091544Seschrock } else { 10101544Seschrock spa_altroot(spa, altroot, buflen); 10111544Seschrock } 10121544Seschrock } 10131544Seschrock 1014789Sahrens if (spa != NULL) 1015789Sahrens spa_close(spa, FTAG); 1016789Sahrens 1017789Sahrens return (error); 1018789Sahrens } 1019789Sahrens 1020789Sahrens /* 10212082Seschrock * Validate that the 'spares' array is well formed. We must have an array of 10223377Seschrock * nvlists, each which describes a valid leaf vdev. If this is an import (mode 10233377Seschrock * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 10243377Seschrock * as they are well-formed. 10252082Seschrock */ 10262082Seschrock static int 10272082Seschrock spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 10282082Seschrock { 10292082Seschrock nvlist_t **spares; 10302082Seschrock uint_t i, nspares; 10312082Seschrock vdev_t *vd; 10322082Seschrock int error; 10332082Seschrock 10342082Seschrock /* 10352082Seschrock * It's acceptable to have no spares specified. 10362082Seschrock */ 10372082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 10382082Seschrock &spares, &nspares) != 0) 10392082Seschrock return (0); 10402082Seschrock 10412082Seschrock if (nspares == 0) 10422082Seschrock return (EINVAL); 10432082Seschrock 10442082Seschrock /* 10452082Seschrock * Make sure the pool is formatted with a version that supports hot 10462082Seschrock * spares. 
10472082Seschrock */ 10482082Seschrock if (spa_version(spa) < ZFS_VERSION_SPARES) 10492082Seschrock return (ENOTSUP); 10502082Seschrock 10513377Seschrock /* 10523377Seschrock * Set the pending spare list so we correctly handle device in-use 10533377Seschrock * checking. 10543377Seschrock */ 10553377Seschrock spa->spa_pending_spares = spares; 10563377Seschrock spa->spa_pending_nspares = nspares; 10573377Seschrock 10582082Seschrock for (i = 0; i < nspares; i++) { 10592082Seschrock if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 10602082Seschrock mode)) != 0) 10613377Seschrock goto out; 10622082Seschrock 10632082Seschrock if (!vd->vdev_ops->vdev_op_leaf) { 10642082Seschrock vdev_free(vd); 10653377Seschrock error = EINVAL; 10663377Seschrock goto out; 10672082Seschrock } 10682082Seschrock 10692082Seschrock vd->vdev_top = vd; 10703377Seschrock 10713377Seschrock if ((error = vdev_open(vd)) == 0 && 10723377Seschrock (error = vdev_label_init(vd, crtxg, 10733377Seschrock VDEV_LABEL_SPARE)) == 0) { 10743377Seschrock VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 10753377Seschrock vd->vdev_guid) == 0); 10762082Seschrock } 10772082Seschrock 10782082Seschrock vdev_free(vd); 10793377Seschrock 10803377Seschrock if (error && mode != VDEV_ALLOC_SPARE) 10813377Seschrock goto out; 10823377Seschrock else 10833377Seschrock error = 0; 10842082Seschrock } 10852082Seschrock 10863377Seschrock out: 10873377Seschrock spa->spa_pending_spares = NULL; 10883377Seschrock spa->spa_pending_nspares = 0; 10893377Seschrock return (error); 10902082Seschrock } 10912082Seschrock 10922082Seschrock /* 1093789Sahrens * Pool Creation 1094789Sahrens */ 1095789Sahrens int 10961635Sbonwick spa_create(const char *pool, nvlist_t *nvroot, const char *altroot) 1097789Sahrens { 1098789Sahrens spa_t *spa; 10991635Sbonwick vdev_t *rvd; 1100789Sahrens dsl_pool_t *dp; 1101789Sahrens dmu_tx_t *tx; 11022082Seschrock int c, error = 0; 1103789Sahrens uint64_t txg = TXG_INITIAL; 11042082Seschrock nvlist_t 
**spares; 11052082Seschrock uint_t nspares; 1106789Sahrens 1107789Sahrens /* 1108789Sahrens * If this pool already exists, return failure. 1109789Sahrens */ 1110789Sahrens mutex_enter(&spa_namespace_lock); 1111789Sahrens if (spa_lookup(pool) != NULL) { 1112789Sahrens mutex_exit(&spa_namespace_lock); 1113789Sahrens return (EEXIST); 1114789Sahrens } 1115789Sahrens 1116789Sahrens /* 1117789Sahrens * Allocate a new spa_t structure. 1118789Sahrens */ 11191635Sbonwick spa = spa_add(pool, altroot); 1120789Sahrens spa_activate(spa); 1121789Sahrens 1122789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 11231760Seschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 1124789Sahrens spa->spa_ubsync = spa->spa_uberblock; 1125789Sahrens 11261635Sbonwick /* 11271635Sbonwick * Create the root vdev. 11281635Sbonwick */ 11291635Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 11301635Sbonwick 11312082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 11322082Seschrock 11332082Seschrock ASSERT(error != 0 || rvd != NULL); 11342082Seschrock ASSERT(error != 0 || spa->spa_root_vdev == rvd); 11352082Seschrock 11362082Seschrock if (error == 0 && rvd->vdev_children == 0) 11371635Sbonwick error = EINVAL; 11382082Seschrock 11392082Seschrock if (error == 0 && 11402082Seschrock (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 11412082Seschrock (error = spa_validate_spares(spa, nvroot, txg, 11422082Seschrock VDEV_ALLOC_ADD)) == 0) { 11432082Seschrock for (c = 0; c < rvd->vdev_children; c++) 11442082Seschrock vdev_init(rvd->vdev_child[c], txg); 11452082Seschrock vdev_config_dirty(rvd); 11461635Sbonwick } 11471635Sbonwick 11481635Sbonwick spa_config_exit(spa, FTAG); 1149789Sahrens 11502082Seschrock if (error != 0) { 1151789Sahrens spa_unload(spa); 1152789Sahrens spa_deactivate(spa); 1153789Sahrens spa_remove(spa); 1154789Sahrens mutex_exit(&spa_namespace_lock); 1155789Sahrens return (error); 1156789Sahrens } 1157789Sahrens 11582082Seschrock /* 11592082Seschrock * Get the 
list of spares, if specified. 11602082Seschrock */ 11612082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 11622082Seschrock &spares, &nspares) == 0) { 11632082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 11642082Seschrock KM_SLEEP) == 0); 11652082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 11662082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 11672082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 11682082Seschrock spa_load_spares(spa); 11692082Seschrock spa_config_exit(spa, FTAG); 11702082Seschrock spa->spa_sync_spares = B_TRUE; 11712082Seschrock } 11722082Seschrock 1173789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1174789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 1175789Sahrens 1176789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1177789Sahrens 1178789Sahrens /* 1179789Sahrens * Create the pool config object. 1180789Sahrens */ 1181789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1182789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 1183789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1184789Sahrens 11851544Seschrock if (zap_add(spa->spa_meta_objset, 1186789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 11871544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 11881544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 11891544Seschrock } 1190789Sahrens 11912082Seschrock /* Newly created pools are always deflated. */ 11922082Seschrock spa->spa_deflate = TRUE; 11932082Seschrock if (zap_add(spa->spa_meta_objset, 11942082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 11952082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 11962082Seschrock cmn_err(CE_PANIC, "failed to add deflate"); 11972082Seschrock } 11982082Seschrock 1199789Sahrens /* 1200789Sahrens * Create the deferred-free bplist object. 
Turn off compression 1201789Sahrens * because sync-to-convergence takes longer if the blocksize 1202789Sahrens * keeps changing. 1203789Sahrens */ 1204789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1205789Sahrens 1 << 14, tx); 1206789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1207789Sahrens ZIO_COMPRESS_OFF, tx); 1208789Sahrens 12091544Seschrock if (zap_add(spa->spa_meta_objset, 1210789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 12111544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 12121544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 12131544Seschrock } 1214789Sahrens 12152926Sek110237 /* 12162926Sek110237 * Create the pool's history object. 12172926Sek110237 */ 12182926Sek110237 spa_history_create_obj(spa, tx); 12192926Sek110237 1220789Sahrens dmu_tx_commit(tx); 1221789Sahrens 12223912Slling spa->spa_bootfs = zfs_prop_default_numeric(ZFS_PROP_BOOTFS); 1223789Sahrens spa->spa_sync_on = B_TRUE; 1224789Sahrens txg_sync_start(spa->spa_dsl_pool); 1225789Sahrens 1226789Sahrens /* 1227789Sahrens * We explicitly wait for the first transaction to complete so that our 1228789Sahrens * bean counters are appropriately updated. 1229789Sahrens */ 1230789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 1231789Sahrens 1232789Sahrens spa_config_sync(); 1233789Sahrens 1234789Sahrens mutex_exit(&spa_namespace_lock); 1235789Sahrens 1236789Sahrens return (0); 1237789Sahrens } 1238789Sahrens 1239789Sahrens /* 1240789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 1241789Sahrens * then call spa_load() to do the dirty work. 
1242789Sahrens */ 1243789Sahrens int 12441635Sbonwick spa_import(const char *pool, nvlist_t *config, const char *altroot) 1245789Sahrens { 1246789Sahrens spa_t *spa; 1247789Sahrens int error; 12482082Seschrock nvlist_t *nvroot; 12492082Seschrock nvlist_t **spares; 12502082Seschrock uint_t nspares; 1251789Sahrens 1252789Sahrens if (!(spa_mode & FWRITE)) 1253789Sahrens return (EROFS); 1254789Sahrens 1255789Sahrens /* 1256789Sahrens * If a pool with this name exists, return failure. 1257789Sahrens */ 1258789Sahrens mutex_enter(&spa_namespace_lock); 1259789Sahrens if (spa_lookup(pool) != NULL) { 1260789Sahrens mutex_exit(&spa_namespace_lock); 1261789Sahrens return (EEXIST); 1262789Sahrens } 1263789Sahrens 1264789Sahrens /* 12651635Sbonwick * Create and initialize the spa structure. 1266789Sahrens */ 12671635Sbonwick spa = spa_add(pool, altroot); 1268789Sahrens spa_activate(spa); 1269789Sahrens 1270789Sahrens /* 12711635Sbonwick * Pass off the heavy lifting to spa_load(). 12721732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 12731732Sbonwick * is actually the one to trust when doing an import. 12741601Sbonwick */ 12751732Sbonwick error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1276789Sahrens 12772082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 12782082Seschrock /* 12792082Seschrock * Toss any existing sparelist, as it doesn't have any validity anymore, 12802082Seschrock * and conflicts with spa_has_spare(). 
12812082Seschrock */ 12822082Seschrock if (spa->spa_sparelist) { 12832082Seschrock nvlist_free(spa->spa_sparelist); 12842082Seschrock spa->spa_sparelist = NULL; 12852082Seschrock spa_load_spares(spa); 12862082Seschrock } 12872082Seschrock 12882082Seschrock VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 12892082Seschrock &nvroot) == 0); 12902082Seschrock if (error == 0) 12912082Seschrock error = spa_validate_spares(spa, nvroot, -1ULL, 12922082Seschrock VDEV_ALLOC_SPARE); 12932082Seschrock spa_config_exit(spa, FTAG); 12942082Seschrock 12952082Seschrock if (error != 0) { 1296789Sahrens spa_unload(spa); 1297789Sahrens spa_deactivate(spa); 1298789Sahrens spa_remove(spa); 1299789Sahrens mutex_exit(&spa_namespace_lock); 1300789Sahrens return (error); 1301789Sahrens } 1302789Sahrens 13031635Sbonwick /* 13042082Seschrock * Override any spares as specified by the user, as these may have 13052082Seschrock * correct device names/devids, etc. 13062082Seschrock */ 13072082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 13082082Seschrock &spares, &nspares) == 0) { 13092082Seschrock if (spa->spa_sparelist) 13102082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 13112082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 13122082Seschrock else 13132082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 13142082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 13152082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 13162082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 13172082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 13182082Seschrock spa_load_spares(spa); 13192082Seschrock spa_config_exit(spa, FTAG); 13202082Seschrock spa->spa_sync_spares = B_TRUE; 13212082Seschrock } 13222082Seschrock 13232082Seschrock /* 13241635Sbonwick * Update the config cache to include the newly-imported pool. 
13251635Sbonwick */ 13261635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 13271635Sbonwick 1328789Sahrens mutex_exit(&spa_namespace_lock); 1329789Sahrens 1330789Sahrens /* 1331789Sahrens * Resilver anything that's out of date. 1332789Sahrens */ 1333789Sahrens if (spa_mode & FWRITE) 1334789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1335789Sahrens 1336789Sahrens return (0); 1337789Sahrens } 1338789Sahrens 1339789Sahrens /* 1340789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 1341789Sahrens * to get the vdev stats associated with the imported devices. 1342789Sahrens */ 1343789Sahrens #define TRYIMPORT_NAME "$import" 1344789Sahrens 1345789Sahrens nvlist_t * 1346789Sahrens spa_tryimport(nvlist_t *tryconfig) 1347789Sahrens { 1348789Sahrens nvlist_t *config = NULL; 1349789Sahrens char *poolname; 1350789Sahrens spa_t *spa; 1351789Sahrens uint64_t state; 1352789Sahrens 1353789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1354789Sahrens return (NULL); 1355789Sahrens 1356789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1357789Sahrens return (NULL); 1358789Sahrens 13591635Sbonwick /* 13601635Sbonwick * Create and initialize the spa structure. 13611635Sbonwick */ 1362789Sahrens mutex_enter(&spa_namespace_lock); 13631635Sbonwick spa = spa_add(TRYIMPORT_NAME, NULL); 1364789Sahrens spa_activate(spa); 1365789Sahrens 1366789Sahrens /* 13671635Sbonwick * Pass off the heavy lifting to spa_load(). 13681732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 13691732Sbonwick * is actually the one to trust when doing an import. 1370789Sahrens */ 13711732Sbonwick (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1372789Sahrens 1373789Sahrens /* 1374789Sahrens * If 'tryconfig' was at least parsable, return the current config. 
1375789Sahrens */ 1376789Sahrens if (spa->spa_root_vdev != NULL) { 13771635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 1378789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 13791635Sbonwick spa_config_exit(spa, FTAG); 1380789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1381789Sahrens poolname) == 0); 1382789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1383789Sahrens state) == 0); 13843975Sek110237 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 13853975Sek110237 spa->spa_uberblock.ub_timestamp) == 0); 13862082Seschrock 13872082Seschrock /* 13882082Seschrock * Add the list of hot spares. 13892082Seschrock */ 13902082Seschrock spa_add_spares(spa, config); 1391789Sahrens } 1392789Sahrens 1393789Sahrens spa_unload(spa); 1394789Sahrens spa_deactivate(spa); 1395789Sahrens spa_remove(spa); 1396789Sahrens mutex_exit(&spa_namespace_lock); 1397789Sahrens 1398789Sahrens return (config); 1399789Sahrens } 1400789Sahrens 1401789Sahrens /* 1402789Sahrens * Pool export/destroy 1403789Sahrens * 1404789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 1405789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 1406789Sahrens * update the pool state and sync all the labels to disk, removing the 1407789Sahrens * configuration from the cache afterwards. 
1408789Sahrens */ 1409789Sahrens static int 14101775Sbillm spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1411789Sahrens { 1412789Sahrens spa_t *spa; 1413789Sahrens 14141775Sbillm if (oldconfig) 14151775Sbillm *oldconfig = NULL; 14161775Sbillm 1417789Sahrens if (!(spa_mode & FWRITE)) 1418789Sahrens return (EROFS); 1419789Sahrens 1420789Sahrens mutex_enter(&spa_namespace_lock); 1421789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 1422789Sahrens mutex_exit(&spa_namespace_lock); 1423789Sahrens return (ENOENT); 1424789Sahrens } 1425789Sahrens 1426789Sahrens /* 14271544Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 14281544Seschrock * reacquire the namespace lock, and see if we can export. 14291544Seschrock */ 14301544Seschrock spa_open_ref(spa, FTAG); 14311544Seschrock mutex_exit(&spa_namespace_lock); 14321544Seschrock spa_async_suspend(spa); 14331544Seschrock mutex_enter(&spa_namespace_lock); 14341544Seschrock spa_close(spa, FTAG); 14351544Seschrock 14361544Seschrock /* 1437789Sahrens * The pool will be in core if it's openable, 1438789Sahrens * in which case we can modify its state. 1439789Sahrens */ 1440789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1441789Sahrens /* 1442789Sahrens * Objsets may be open only because they're dirty, so we 1443789Sahrens * have to force it to sync before checking spa_refcnt. 1444789Sahrens */ 1445789Sahrens spa_scrub_suspend(spa); 1446789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 1447789Sahrens 14481544Seschrock /* 14491544Seschrock * A pool cannot be exported or destroyed if there are active 14501544Seschrock * references. If we are resetting a pool, allow references by 14511544Seschrock * fault injection handlers. 
14521544Seschrock */ 14531544Seschrock if (!spa_refcount_zero(spa) || 14541544Seschrock (spa->spa_inject_ref != 0 && 14551544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1456789Sahrens spa_scrub_resume(spa); 14571544Seschrock spa_async_resume(spa); 1458789Sahrens mutex_exit(&spa_namespace_lock); 1459789Sahrens return (EBUSY); 1460789Sahrens } 1461789Sahrens 1462789Sahrens spa_scrub_resume(spa); 1463789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1464789Sahrens 1465789Sahrens /* 1466789Sahrens * We want this to be reflected on every label, 1467789Sahrens * so mark them all dirty. spa_unload() will do the 1468789Sahrens * final sync that pushes these changes out. 1469789Sahrens */ 14701544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 14711601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 14721544Seschrock spa->spa_state = new_state; 14731635Sbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 14741544Seschrock vdev_config_dirty(spa->spa_root_vdev); 14751601Sbonwick spa_config_exit(spa, FTAG); 14761544Seschrock } 1477789Sahrens } 1478789Sahrens 1479789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1480789Sahrens spa_unload(spa); 1481789Sahrens spa_deactivate(spa); 1482789Sahrens } 1483789Sahrens 14841775Sbillm if (oldconfig && spa->spa_config) 14851775Sbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 14861775Sbillm 14871544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 14881544Seschrock spa_remove(spa); 14891544Seschrock spa_config_sync(); 14901544Seschrock } 1491789Sahrens mutex_exit(&spa_namespace_lock); 1492789Sahrens 1493789Sahrens return (0); 1494789Sahrens } 1495789Sahrens 1496789Sahrens /* 1497789Sahrens * Destroy a storage pool. 1498789Sahrens */ 1499789Sahrens int 1500789Sahrens spa_destroy(char *pool) 1501789Sahrens { 15021775Sbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1503789Sahrens } 1504789Sahrens 1505789Sahrens /* 1506789Sahrens * Export a storage pool. 
1507789Sahrens */ 1508789Sahrens int 15091775Sbillm spa_export(char *pool, nvlist_t **oldconfig) 1510789Sahrens { 15111775Sbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1512789Sahrens } 1513789Sahrens 1514789Sahrens /* 15151544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 15161544Seschrock * from the namespace in any way. 15171544Seschrock */ 15181544Seschrock int 15191544Seschrock spa_reset(char *pool) 15201544Seschrock { 15211775Sbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 15221544Seschrock } 15231544Seschrock 15241544Seschrock 15251544Seschrock /* 1526789Sahrens * ========================================================================== 1527789Sahrens * Device manipulation 1528789Sahrens * ========================================================================== 1529789Sahrens */ 1530789Sahrens 1531789Sahrens /* 1532789Sahrens * Add capacity to a storage pool. 1533789Sahrens */ 1534789Sahrens int 1535789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1536789Sahrens { 1537789Sahrens uint64_t txg; 15381635Sbonwick int c, error; 1539789Sahrens vdev_t *rvd = spa->spa_root_vdev; 15401585Sbonwick vdev_t *vd, *tvd; 15412082Seschrock nvlist_t **spares; 15422082Seschrock uint_t i, nspares; 1543789Sahrens 1544789Sahrens txg = spa_vdev_enter(spa); 1545789Sahrens 15462082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 15472082Seschrock VDEV_ALLOC_ADD)) != 0) 15482082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 15492082Seschrock 15503377Seschrock spa->spa_pending_vdev = vd; 1551789Sahrens 15522082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 15532082Seschrock &spares, &nspares) != 0) 15542082Seschrock nspares = 0; 15552082Seschrock 15563377Seschrock if (vd->vdev_children == 0 && nspares == 0) { 15573377Seschrock spa->spa_pending_vdev = NULL; 15582082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 
15593377Seschrock } 15602082Seschrock 15612082Seschrock if (vd->vdev_children != 0) { 15623377Seschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 15633377Seschrock spa->spa_pending_vdev = NULL; 15642082Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15652082Seschrock } 15662082Seschrock } 15672082Seschrock 15683377Seschrock /* 15693377Seschrock * We must validate the spares after checking the children. Otherwise, 15703377Seschrock * vdev_inuse() will blindly overwrite the spare. 15713377Seschrock */ 15723377Seschrock if ((error = spa_validate_spares(spa, nvroot, txg, 15733377Seschrock VDEV_ALLOC_ADD)) != 0) { 15743377Seschrock spa->spa_pending_vdev = NULL; 15753377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 15763377Seschrock } 15773377Seschrock 15783377Seschrock spa->spa_pending_vdev = NULL; 15793377Seschrock 15803377Seschrock /* 15813377Seschrock * Transfer each new top-level vdev from vd to rvd. 15823377Seschrock */ 15833377Seschrock for (c = 0; c < vd->vdev_children; c++) { 15843377Seschrock tvd = vd->vdev_child[c]; 15853377Seschrock vdev_remove_child(vd, tvd); 15863377Seschrock tvd->vdev_id = rvd->vdev_children; 15873377Seschrock vdev_add_child(rvd, tvd); 15883377Seschrock vdev_config_dirty(tvd); 15893377Seschrock } 15903377Seschrock 15912082Seschrock if (nspares != 0) { 15922082Seschrock if (spa->spa_sparelist != NULL) { 15932082Seschrock nvlist_t **oldspares; 15942082Seschrock uint_t oldnspares; 15952082Seschrock nvlist_t **newspares; 15962082Seschrock 15972082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 15982082Seschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 15992082Seschrock 16002082Seschrock newspares = kmem_alloc(sizeof (void *) * 16012082Seschrock (nspares + oldnspares), KM_SLEEP); 16022082Seschrock for (i = 0; i < oldnspares; i++) 16032082Seschrock VERIFY(nvlist_dup(oldspares[i], 16042082Seschrock &newspares[i], KM_SLEEP) == 0); 16052082Seschrock for (i = 0; i < nspares; i++) 
16062082Seschrock VERIFY(nvlist_dup(spares[i], 16072082Seschrock &newspares[i + oldnspares], 16082082Seschrock KM_SLEEP) == 0); 16092082Seschrock 16102082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 16112082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 16122082Seschrock 16132082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16142082Seschrock ZPOOL_CONFIG_SPARES, newspares, 16152082Seschrock nspares + oldnspares) == 0); 16162082Seschrock for (i = 0; i < oldnspares + nspares; i++) 16172082Seschrock nvlist_free(newspares[i]); 16182082Seschrock kmem_free(newspares, (oldnspares + nspares) * 16192082Seschrock sizeof (void *)); 16202082Seschrock } else { 16212082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 16222082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 16232082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16242082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 16252082Seschrock } 16262082Seschrock 16272082Seschrock spa_load_spares(spa); 16282082Seschrock spa->spa_sync_spares = B_TRUE; 1629789Sahrens } 1630789Sahrens 1631789Sahrens /* 16321585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 16331585Sbonwick * If other threads start allocating from these vdevs before we 16341585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 16351585Sbonwick * fail to open the pool because there are DVAs that the config cache 16361585Sbonwick * can't translate. Therefore, we first add the vdevs without 16371585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 16381635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 16391585Sbonwick * 16401585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 16411585Sbonwick * if we lose power at any point in this sequence, the remaining 16421585Sbonwick * steps will be completed the next time we load the pool. 
1643789Sahrens */ 16441635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 16451585Sbonwick 16461635Sbonwick mutex_enter(&spa_namespace_lock); 16471635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 16481635Sbonwick mutex_exit(&spa_namespace_lock); 1649789Sahrens 16501635Sbonwick return (0); 1651789Sahrens } 1652789Sahrens 1653789Sahrens /* 1654789Sahrens * Attach a device to a mirror. The arguments are the path to any device 1655789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 1656789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 1657789Sahrens * 1658789Sahrens * If 'replacing' is specified, the new device is intended to replace the 1659789Sahrens * existing device; in this case the two devices are made into their own 1660789Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 1661789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 1662789Sahrens * extra rules: you can't attach to it after it's been created, and upon 1663789Sahrens * completion of resilvering, the first disk (the one being replaced) 1664789Sahrens * is automatically detached. 
1665789Sahrens */ 1666789Sahrens int 16671544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1668789Sahrens { 1669789Sahrens uint64_t txg, open_txg; 1670789Sahrens int error; 1671789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1672789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 16732082Seschrock vdev_ops_t *pvops; 1674789Sahrens 1675789Sahrens txg = spa_vdev_enter(spa); 1676789Sahrens 16771544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1678789Sahrens 1679789Sahrens if (oldvd == NULL) 1680789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1681789Sahrens 16821585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 16831585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 16841585Sbonwick 1685789Sahrens pvd = oldvd->vdev_parent; 1686789Sahrens 16872082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 16882082Seschrock VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1) 1689789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1690789Sahrens 1691789Sahrens newvd = newrootvd->vdev_child[0]; 1692789Sahrens 1693789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1694789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1695789Sahrens 16962082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 1697789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1698789Sahrens 16992082Seschrock if (!replacing) { 17002082Seschrock /* 17012082Seschrock * For attach, the only allowable parent is a mirror or the root 17022082Seschrock * vdev. 17032082Seschrock */ 17042082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 17052082Seschrock pvd->vdev_ops != &vdev_root_ops) 17062082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17072082Seschrock 17082082Seschrock pvops = &vdev_mirror_ops; 17092082Seschrock } else { 17102082Seschrock /* 17112082Seschrock * Active hot spares can only be replaced by inactive hot 17122082Seschrock * spares. 
17132082Seschrock */ 17142082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 17152082Seschrock pvd->vdev_child[1] == oldvd && 17162082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 17172082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17182082Seschrock 17192082Seschrock /* 17202082Seschrock * If the source is a hot spare, and the parent isn't already a 17212082Seschrock * spare, then we want to create a new hot spare. Otherwise, we 17223377Seschrock * want to create a replacing vdev. The user is not allowed to 17233377Seschrock * attach to a spared vdev child unless the 'isspare' state is 17243377Seschrock * the same (spare replaces spare, non-spare replaces 17253377Seschrock * non-spare). 17262082Seschrock */ 17272082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 17282082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17293377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 17303377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 17313377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 17322082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 17332082Seschrock newvd->vdev_isspare) 17342082Seschrock pvops = &vdev_spare_ops; 17352082Seschrock else 17362082Seschrock pvops = &vdev_replacing_ops; 17372082Seschrock } 17382082Seschrock 17391175Slling /* 17401175Slling * Compare the new device size with the replaceable/attachable 17411175Slling * device size. 17421175Slling */ 17431175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1744789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1745789Sahrens 17461732Sbonwick /* 17471732Sbonwick * The new device cannot have a higher alignment requirement 17481732Sbonwick * than the top-level vdev. 
17491732Sbonwick */ 17501732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 1751789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1752789Sahrens 1753789Sahrens /* 1754789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1755789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1756789Sahrens */ 1757789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1758789Sahrens spa_strfree(oldvd->vdev_path); 1759789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1760789Sahrens KM_SLEEP); 1761789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1762789Sahrens newvd->vdev_path, "old"); 1763789Sahrens if (oldvd->vdev_devid != NULL) { 1764789Sahrens spa_strfree(oldvd->vdev_devid); 1765789Sahrens oldvd->vdev_devid = NULL; 1766789Sahrens } 1767789Sahrens } 1768789Sahrens 1769789Sahrens /* 17702082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 17712082Seschrock * mirror/replacing/spare vdev above oldvd. 1772789Sahrens */ 1773789Sahrens if (pvd->vdev_ops != pvops) 1774789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1775789Sahrens 1776789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1777789Sahrens ASSERT(pvd->vdev_ops == pvops); 1778789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1779789Sahrens 1780789Sahrens /* 1781789Sahrens * Extract the new device from its root and add it to pvd. 1782789Sahrens */ 1783789Sahrens vdev_remove_child(newrootvd, newvd); 1784789Sahrens newvd->vdev_id = pvd->vdev_children; 1785789Sahrens vdev_add_child(pvd, newvd); 1786789Sahrens 17871544Seschrock /* 17881544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 17891544Seschrock * the addition of newvd may have decreased our parent's asize. 
17901544Seschrock */ 17911544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 17921544Seschrock 1793789Sahrens tvd = newvd->vdev_top; 1794789Sahrens ASSERT(pvd->vdev_top == tvd); 1795789Sahrens ASSERT(tvd->vdev_parent == rvd); 1796789Sahrens 1797789Sahrens vdev_config_dirty(tvd); 1798789Sahrens 1799789Sahrens /* 1800789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1801789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1802789Sahrens */ 1803789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1804789Sahrens 1805789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1806789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1807789Sahrens open_txg - TXG_INITIAL + 1); 1808789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1809789Sahrens 18103377Seschrock if (newvd->vdev_isspare) 18113377Seschrock spa_spare_activate(newvd); 18121544Seschrock 1813789Sahrens /* 1814789Sahrens * Mark newvd's DTL dirty in this txg. 1815789Sahrens */ 18161732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 1817789Sahrens 1818789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1819789Sahrens 1820789Sahrens /* 1821789Sahrens * Kick off a resilver to update newvd. 1822789Sahrens */ 1823789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1824789Sahrens 1825789Sahrens return (0); 1826789Sahrens } 1827789Sahrens 1828789Sahrens /* 1829789Sahrens * Detach a device from a mirror or replacing vdev. 1830789Sahrens * If 'replace_done' is specified, only detach if the parent 1831789Sahrens * is a replacing vdev. 
1832789Sahrens */ 1833789Sahrens int 18341544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1835789Sahrens { 1836789Sahrens uint64_t txg; 1837789Sahrens int c, t, error; 1838789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1839789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 18402082Seschrock boolean_t unspare = B_FALSE; 18412082Seschrock uint64_t unspare_guid; 1842789Sahrens 1843789Sahrens txg = spa_vdev_enter(spa); 1844789Sahrens 18451544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1846789Sahrens 1847789Sahrens if (vd == NULL) 1848789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1849789Sahrens 18501585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 18511585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18521585Sbonwick 1853789Sahrens pvd = vd->vdev_parent; 1854789Sahrens 1855789Sahrens /* 1856789Sahrens * If replace_done is specified, only remove this device if it's 18572082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 18582082Seschrock * disk can be removed. 1859789Sahrens */ 18602082Seschrock if (replace_done) { 18612082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 18622082Seschrock if (vd->vdev_id != 0) 18632082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18642082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 18652082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 18662082Seschrock } 18672082Seschrock } 18682082Seschrock 18692082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 18702082Seschrock spa_version(spa) >= ZFS_VERSION_SPARES); 1871789Sahrens 1872789Sahrens /* 18732082Seschrock * Only mirror, replacing, and spare vdevs support detach. 
1874789Sahrens */ 1875789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 18762082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 18772082Seschrock pvd->vdev_ops != &vdev_spare_ops) 1878789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1879789Sahrens 1880789Sahrens /* 1881789Sahrens * If there's only one replica, you can't detach it. 1882789Sahrens */ 1883789Sahrens if (pvd->vdev_children <= 1) 1884789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1885789Sahrens 1886789Sahrens /* 1887789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1888789Sahrens * valid copy of the data, which means we cannot safely detach it. 1889789Sahrens * 1890789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1891789Sahrens * precise DTL check. 1892789Sahrens */ 1893789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1894789Sahrens uint64_t dirty; 1895789Sahrens 1896789Sahrens cvd = pvd->vdev_child[c]; 1897789Sahrens if (cvd == vd) 1898789Sahrens continue; 1899789Sahrens if (vdev_is_dead(cvd)) 1900789Sahrens continue; 1901789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1902789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1903789Sahrens cvd->vdev_dtl_scrub.sm_space; 1904789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1905789Sahrens if (!dirty) 1906789Sahrens break; 1907789Sahrens } 19082082Seschrock 19092082Seschrock /* 19102082Seschrock * If we are a replacing or spare vdev, then we can always detach the 19112082Seschrock * latter child, as that is how one cancels the operation. 
19122082Seschrock */ 19132082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 19142082Seschrock c == pvd->vdev_children) 1915789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1916789Sahrens 1917789Sahrens /* 19182082Seschrock * If we are detaching the original disk from a spare, then it implies 19192082Seschrock * that the spare should become a real disk, and be removed from the 19202082Seschrock * active spare list for the pool. 19212082Seschrock */ 19222082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 19232082Seschrock vd->vdev_id == 0) 19242082Seschrock unspare = B_TRUE; 19252082Seschrock 19262082Seschrock /* 1927789Sahrens * Erase the disk labels so the disk can be used for other things. 1928789Sahrens * This must be done after all other error cases are handled, 1929789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1930789Sahrens * But if we can't do it, don't treat the error as fatal -- 1931789Sahrens * it may be that the unwritability of the disk is the reason 1932789Sahrens * it's being detached! 1933789Sahrens */ 19343377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 1935789Sahrens 1936789Sahrens /* 1937789Sahrens * Remove vd from its parent and compact the parent's children. 1938789Sahrens */ 1939789Sahrens vdev_remove_child(pvd, vd); 1940789Sahrens vdev_compact_children(pvd); 1941789Sahrens 1942789Sahrens /* 1943789Sahrens * Remember one of the remaining children so we can get tvd below. 1944789Sahrens */ 1945789Sahrens cvd = pvd->vdev_child[0]; 1946789Sahrens 1947789Sahrens /* 19482082Seschrock * If we need to remove the remaining child from the list of hot spares, 19492082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 19502082Seschrock * must do this before vdev_remove_parent(), because that can change the 19512082Seschrock * GUID if it creates a new toplevel GUID. 
19522082Seschrock */ 19532082Seschrock if (unspare) { 19542082Seschrock ASSERT(cvd->vdev_isspare); 19553377Seschrock spa_spare_remove(cvd); 19562082Seschrock unspare_guid = cvd->vdev_guid; 19572082Seschrock } 19582082Seschrock 19592082Seschrock /* 1960789Sahrens * If the parent mirror/replacing vdev only has one child, 1961789Sahrens * the parent is no longer needed. Remove it from the tree. 1962789Sahrens */ 1963789Sahrens if (pvd->vdev_children == 1) 1964789Sahrens vdev_remove_parent(cvd); 1965789Sahrens 1966789Sahrens /* 1967789Sahrens * We don't set tvd until now because the parent we just removed 1968789Sahrens * may have been the previous top-level vdev. 1969789Sahrens */ 1970789Sahrens tvd = cvd->vdev_top; 1971789Sahrens ASSERT(tvd->vdev_parent == rvd); 1972789Sahrens 1973789Sahrens /* 19743377Seschrock * Reevaluate the parent vdev state. 1975789Sahrens */ 19763377Seschrock vdev_propagate_state(cvd->vdev_parent); 1977789Sahrens 1978789Sahrens /* 19793377Seschrock * If the device we just detached was smaller than the others, it may be 19803377Seschrock * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 19813377Seschrock * can't fail because the existing metaslabs are already in core, so 19823377Seschrock * there's nothing to read from disk. 1983789Sahrens */ 19841732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 1985789Sahrens 1986789Sahrens vdev_config_dirty(tvd); 1987789Sahrens 1988789Sahrens /* 19893377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 19903377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 19913377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 19923377Seschrock * prevent vd from being accessed after it's freed. 
1993789Sahrens */ 1994789Sahrens for (t = 0; t < TXG_SIZE; t++) 1995789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 19961732Sbonwick vd->vdev_detached = B_TRUE; 19971732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 1998789Sahrens 19992082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 20002082Seschrock 20012082Seschrock /* 20023377Seschrock * If this was the removal of the original device in a hot spare vdev, 20033377Seschrock * then we want to go through and remove the device from the hot spare 20043377Seschrock * list of every other pool. 20052082Seschrock */ 20062082Seschrock if (unspare) { 20072082Seschrock spa = NULL; 20082082Seschrock mutex_enter(&spa_namespace_lock); 20092082Seschrock while ((spa = spa_next(spa)) != NULL) { 20102082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 20112082Seschrock continue; 20122082Seschrock 20132082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 20142082Seschrock } 20152082Seschrock mutex_exit(&spa_namespace_lock); 20162082Seschrock } 20172082Seschrock 20182082Seschrock return (error); 20192082Seschrock } 20202082Seschrock 20212082Seschrock /* 20222082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 20232082Seschrock * spares. 
20242082Seschrock */ 20252082Seschrock int 20262082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 20272082Seschrock { 20282082Seschrock vdev_t *vd; 20292082Seschrock nvlist_t **spares, *nv, **newspares; 20302082Seschrock uint_t i, j, nspares; 20312082Seschrock int ret = 0; 20322082Seschrock 20332082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 20342082Seschrock 20352082Seschrock vd = spa_lookup_by_guid(spa, guid); 20362082Seschrock 20372082Seschrock nv = NULL; 20382082Seschrock if (spa->spa_spares != NULL && 20392082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20402082Seschrock &spares, &nspares) == 0) { 20412082Seschrock for (i = 0; i < nspares; i++) { 20422082Seschrock uint64_t theguid; 20432082Seschrock 20442082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 20452082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 20462082Seschrock if (theguid == guid) { 20472082Seschrock nv = spares[i]; 20482082Seschrock break; 20492082Seschrock } 20502082Seschrock } 20512082Seschrock } 20522082Seschrock 20532082Seschrock /* 20542082Seschrock * We only support removing a hot spare, and only if it's not currently 20552082Seschrock * in use in this pool. 
20562082Seschrock */ 20572082Seschrock if (nv == NULL && vd == NULL) { 20582082Seschrock ret = ENOENT; 20592082Seschrock goto out; 20602082Seschrock } 20612082Seschrock 20622082Seschrock if (nv == NULL && vd != NULL) { 20632082Seschrock ret = ENOTSUP; 20642082Seschrock goto out; 20652082Seschrock } 20662082Seschrock 20672082Seschrock if (!unspare && nv != NULL && vd != NULL) { 20682082Seschrock ret = EBUSY; 20692082Seschrock goto out; 20702082Seschrock } 20712082Seschrock 20722082Seschrock if (nspares == 1) { 20732082Seschrock newspares = NULL; 20742082Seschrock } else { 20752082Seschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 20762082Seschrock KM_SLEEP); 20772082Seschrock for (i = 0, j = 0; i < nspares; i++) { 20782082Seschrock if (spares[i] != nv) 20792082Seschrock VERIFY(nvlist_dup(spares[i], 20802082Seschrock &newspares[j++], KM_SLEEP) == 0); 20812082Seschrock } 20822082Seschrock } 20832082Seschrock 20842082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20852082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 20862082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 20872082Seschrock newspares, nspares - 1) == 0); 20882082Seschrock for (i = 0; i < nspares - 1; i++) 20892082Seschrock nvlist_free(newspares[i]); 20902082Seschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 20912082Seschrock spa_load_spares(spa); 20922082Seschrock spa->spa_sync_spares = B_TRUE; 20932082Seschrock 20942082Seschrock out: 20952082Seschrock spa_config_exit(spa, FTAG); 20962082Seschrock 20972082Seschrock return (ret); 2098789Sahrens } 2099789Sahrens 2100789Sahrens /* 21011544Seschrock * Find any device that's done replacing, so we can detach it. 
2102789Sahrens */ 21031544Seschrock static vdev_t * 21041544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 2105789Sahrens { 21061544Seschrock vdev_t *newvd, *oldvd; 2107789Sahrens int c; 2108789Sahrens 21091544Seschrock for (c = 0; c < vd->vdev_children; c++) { 21101544Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 21111544Seschrock if (oldvd != NULL) 21121544Seschrock return (oldvd); 21131544Seschrock } 2114789Sahrens 2115789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 21161544Seschrock oldvd = vd->vdev_child[0]; 21171544Seschrock newvd = vd->vdev_child[1]; 2118789Sahrens 21191544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 21201544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 21211544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 21221544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 21231544Seschrock return (oldvd); 21241544Seschrock } 21251544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 21261544Seschrock } 2127789Sahrens 21281544Seschrock return (NULL); 2129789Sahrens } 2130789Sahrens 21311544Seschrock static void 2132789Sahrens spa_vdev_replace_done(spa_t *spa) 2133789Sahrens { 21341544Seschrock vdev_t *vd; 21352082Seschrock vdev_t *pvd; 21361544Seschrock uint64_t guid; 21372082Seschrock uint64_t pguid = 0; 2138789Sahrens 21391544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2140789Sahrens 21411544Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 21421544Seschrock guid = vd->vdev_guid; 21432082Seschrock /* 21442082Seschrock * If we have just finished replacing a hot spared device, then 21452082Seschrock * we need to detach the parent's first child (the original hot 21462082Seschrock * spare) as well. 
21472082Seschrock */ 21482082Seschrock pvd = vd->vdev_parent; 21492082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 21502082Seschrock pvd->vdev_id == 0) { 21512082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 21522082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 21532082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 21542082Seschrock } 21551544Seschrock spa_config_exit(spa, FTAG); 21561544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 21571544Seschrock return; 21582082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 21592082Seschrock return; 21601544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2161789Sahrens } 2162789Sahrens 21631544Seschrock spa_config_exit(spa, FTAG); 2164789Sahrens } 2165789Sahrens 2166789Sahrens /* 21671354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 21681354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 21691354Seschrock */ 21701354Seschrock int 21711354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 21721354Seschrock { 21731354Seschrock vdev_t *rvd, *vd; 21741354Seschrock uint64_t txg; 21751354Seschrock 21761354Seschrock rvd = spa->spa_root_vdev; 21771354Seschrock 21781354Seschrock txg = spa_vdev_enter(spa); 21791354Seschrock 21802082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 21812082Seschrock /* 21822082Seschrock * Determine if this is a reference to a hot spare. In that 21832082Seschrock * case, update the path as stored in the spare list. 
21842082Seschrock */ 21852082Seschrock nvlist_t **spares; 21862082Seschrock uint_t i, nspares; 21872082Seschrock if (spa->spa_sparelist != NULL) { 21882082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 21892082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 21902082Seschrock for (i = 0; i < nspares; i++) { 21912082Seschrock uint64_t theguid; 21922082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 21932082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 21942082Seschrock if (theguid == guid) 21952082Seschrock break; 21962082Seschrock } 21972082Seschrock 21982082Seschrock if (i == nspares) 21992082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 22002082Seschrock 22012082Seschrock VERIFY(nvlist_add_string(spares[i], 22022082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 22032082Seschrock spa_load_spares(spa); 22042082Seschrock spa->spa_sync_spares = B_TRUE; 22052082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 22062082Seschrock } else { 22072082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 22082082Seschrock } 22092082Seschrock } 22101354Seschrock 22111585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 22121585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 22131585Sbonwick 22141354Seschrock spa_strfree(vd->vdev_path); 22151354Seschrock vd->vdev_path = spa_strdup(newpath); 22161354Seschrock 22171354Seschrock vdev_config_dirty(vd->vdev_top); 22181354Seschrock 22191354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 22201354Seschrock } 22211354Seschrock 22221354Seschrock /* 2223789Sahrens * ========================================================================== 2224789Sahrens * SPA Scrubbing 2225789Sahrens * ========================================================================== 2226789Sahrens */ 2227789Sahrens 2228789Sahrens static void 2229789Sahrens spa_scrub_io_done(zio_t *zio) 2230789Sahrens { 2231789Sahrens spa_t *spa = zio->io_spa; 2232789Sahrens 22333290Sjohansen zio_data_buf_free(zio->io_data, 
zio->io_size); 2234789Sahrens 2235789Sahrens mutex_enter(&spa->spa_scrub_lock); 22361544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 22371775Sbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2238789Sahrens spa->spa_scrub_errors++; 2239789Sahrens mutex_enter(&vd->vdev_stat_lock); 2240789Sahrens vd->vdev_stat.vs_scrub_errors++; 2241789Sahrens mutex_exit(&vd->vdev_stat_lock); 2242789Sahrens } 22433697Smishra 22443697Smishra if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 22451544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 22463697Smishra 22473697Smishra ASSERT(spa->spa_scrub_inflight >= 0); 22483697Smishra 22491544Seschrock mutex_exit(&spa->spa_scrub_lock); 2250789Sahrens } 2251789Sahrens 2252789Sahrens static void 22531544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 22541544Seschrock zbookmark_t *zb) 2255789Sahrens { 2256789Sahrens size_t size = BP_GET_LSIZE(bp); 22573697Smishra void *data; 2258789Sahrens 2259789Sahrens mutex_enter(&spa->spa_scrub_lock); 22603697Smishra /* 22613697Smishra * Do not give too much work to vdev(s). 
22623697Smishra */ 22633697Smishra while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 22643697Smishra cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 22653697Smishra } 2266789Sahrens spa->spa_scrub_inflight++; 2267789Sahrens mutex_exit(&spa->spa_scrub_lock); 2268789Sahrens 22693697Smishra data = zio_data_buf_alloc(size); 22703697Smishra 22711544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 22721544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 22731544Seschrock 22741807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 22751544Seschrock 2276789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 22771544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2278789Sahrens } 2279789Sahrens 2280789Sahrens /* ARGSUSED */ 2281789Sahrens static int 2282789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2283789Sahrens { 2284789Sahrens blkptr_t *bp = &bc->bc_blkptr; 22851775Sbillm vdev_t *vd = spa->spa_root_vdev; 22861775Sbillm dva_t *dva = bp->blk_dva; 22871775Sbillm int needs_resilver = B_FALSE; 22881775Sbillm int d; 2289789Sahrens 22901775Sbillm if (bc->bc_errno) { 2291789Sahrens /* 2292789Sahrens * We can't scrub this block, but we can continue to scrub 2293789Sahrens * the rest of the pool. Note the error and move along. 
2294789Sahrens */ 2295789Sahrens mutex_enter(&spa->spa_scrub_lock); 2296789Sahrens spa->spa_scrub_errors++; 2297789Sahrens mutex_exit(&spa->spa_scrub_lock); 2298789Sahrens 22991775Sbillm mutex_enter(&vd->vdev_stat_lock); 23001775Sbillm vd->vdev_stat.vs_scrub_errors++; 23011775Sbillm mutex_exit(&vd->vdev_stat_lock); 2302789Sahrens 2303789Sahrens return (ERESTART); 2304789Sahrens } 2305789Sahrens 2306789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2307789Sahrens 23081775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 23091775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 23101775Sbillm 23111775Sbillm ASSERT(vd != NULL); 23121775Sbillm 23131775Sbillm /* 23141775Sbillm * Keep track of how much data we've examined so that 23151775Sbillm * zpool(1M) status can make useful progress reports. 23161775Sbillm */ 23171775Sbillm mutex_enter(&vd->vdev_stat_lock); 23181775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 23191775Sbillm mutex_exit(&vd->vdev_stat_lock); 2320789Sahrens 23211775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 23221775Sbillm if (DVA_GET_GANG(&dva[d])) { 23231775Sbillm /* 23241775Sbillm * Gang members may be spread across multiple 23251775Sbillm * vdevs, so the best we can do is look at the 23261775Sbillm * pool-wide DTL. 23271775Sbillm * XXX -- it would be better to change our 23281775Sbillm * allocation policy to ensure that this can't 23291775Sbillm * happen. 
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}

/*
 * Body of the scrub/resilver worker thread that spa_scrub() creates.
 *
 * The thread repeatedly calls traverse_more() on spa->spa_scrub_th until
 * it returns something other than EAGAIN, spa_scrub_stop is set, or
 * spa_scrub_restart_txg becomes non-zero.  It then waits for all in-flight
 * scrub I/O to drain, reassesses the DTLs, updates the scrub statistics,
 * rotates the error log, posts any follow-up async requests and exits.
 *
 * The spa_scrub_* fields are read and written under spa_scrub_lock; the
 * lock is dropped around traverse_more() and around the sections that run
 * under the spa config lock.
 */
static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);

	while (!spa->spa_scrub_stop) {
		/*
		 * While suspended, advertise ourselves as inactive so that
		 * spa_scrub_suspend() can stop waiting, then sleep until
		 * spa_scrub_cv is signalled again.
		 */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		/* Traverse without holding spa_scrub_lock. */
		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	/* Let every scrub I/O that was already issued finish. */
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * Drop spa_scrub_lock before taking the config lock as writer, then
	 * retake it, so the config lock is always acquired first here.
	 */
	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	/* A stop request is checked last, so it overrides a restart. */
	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	/*
	 * Clearing spa_scrub_thread and broadcasting releases anyone
	 * waiting in spa_scrub() for this thread to go away.
	 */
	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

/*
 * Suspend scrubbing.  Bumps the suspend count, then blocks until the scrub
 * thread reports itself inactive and no scrub I/O remains in flight.
 * Each call must be paired with a spa_scrub_resume().
 */
void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Undo one spa_scrub_suspend().  When the suspend count drops to zero,
 * wake any thread sleeping on spa_scrub_cv.
 */
void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Record, under spa_scrub_lock, that a scrub in progress must restart.
 * The scrub thread notices a non-zero spa_scrub_restart_txg, finishes with
 * ERESTART and asks the async thread to start a new scrub or resilver.
 */
void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Stop any scrub or resilver in progress and, unless the resulting type is
 * POOL_SCRUB_NONE, start a new one of the requested type.
 *
 * Returns ENOTSUP if 'type' is not below POOL_SCRUB_TYPES, EBUSY if a
 * resilver is running and 'force' is not set, and 0 otherwise.
 *
 * The requested type may be adjusted: a resilver with an empty pool-wide
 * DTL becomes POOL_SCRUB_NONE, and a full scrub with a non-empty pool-wide
 * DTL is upgraded to a resilver.
 */
int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	/* Without a root vdev there is nothing to scrub. */
	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	/* Default range for a full scrub; narrowed below for a resilver. */
	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

/*
 * With the config lock held as writer, reopen every top-level vdev that
 * has vdev_reopen_wanted set, clearing the flag first.
 */
static void
spa_async_reopen(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;
	int c;

	spa_config_enter(spa, RW_WRITER, FTAG);

	for (c = 0; c < rvd->vdev_children; c++) {
		tvd = rvd->vdev_child[c];
		if (tvd->vdev_reopen_wanted) {
			tvd->vdev_reopen_wanted = 0;
			vdev_reopen(tvd);
		}
	}

	spa_config_exit(spa, FTAG);
}

/*
 * Body of the one-shot async worker thread created by spa_async_dispatch().
 * It takes and clears the pending task mask under spa_async_lock, handles
 * each requested task in a fixed order, then clears spa_async_thread,
 * wakes waiters on spa_async_cv and exits.
 */
static void
spa_async_thread(spa_t *spa)
{
	int tasks;

	ASSERT(spa->spa_sync_on);

	/* Take the whole pending mask; later requests go to the next run. */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be reopened.
	 */
	if (tasks & SPA_ASYNC_REOPEN)
		spa_async_reopen(spa);

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_REPLACE_DONE)
		spa_vdev_replace_done(spa);

	/*
	 * Kick off a scrub.
	 */
	if (tasks & SPA_ASYNC_SCRUB)
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

/*
 * Prevent new async threads from being dispatched and wait for any
 * running one to finish.  Each call must be paired with
 * spa_async_resume().
 */
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Undo one spa_async_suspend().  This only drops the suspend count; it
 * does not itself dispatch pending tasks.
 */
void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Create the async worker thread if tasks are pending, async processing
 * is not suspended, no worker is already running, and the root directory
 * exists and is not read-only.
 * NOTE(review): the rootdir test presumably exists because some tasks
 * need a writable root file system -- confirm.
 */
static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Add 'task' (one or more SPA_ASYNC_* bits) to the pending task mask.
 * The work is not started here; spa_async_dispatch() creates the worker.
 */
void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * ==========================================================================
2742789Sahrens * SPA syncing routines 2743789Sahrens * ========================================================================== 2744789Sahrens */ 2745789Sahrens 2746789Sahrens static void 2747789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 2748789Sahrens { 2749789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 2750789Sahrens dmu_tx_t *tx; 2751789Sahrens blkptr_t blk; 2752789Sahrens uint64_t itor = 0; 2753789Sahrens zio_t *zio; 2754789Sahrens int error; 2755789Sahrens uint8_t c = 1; 2756789Sahrens 2757789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 2758789Sahrens 2759789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 2760789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 2761789Sahrens 2762789Sahrens error = zio_wait(zio); 2763789Sahrens ASSERT3U(error, ==, 0); 2764789Sahrens 2765789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2766789Sahrens bplist_vacate(bpl, tx); 2767789Sahrens 2768789Sahrens /* 2769789Sahrens * Pre-dirty the first block so we sync to convergence faster. 2770789Sahrens * (Usually only the first block is needed.) 
2771789Sahrens */ 2772789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 2773789Sahrens dmu_tx_commit(tx); 2774789Sahrens } 2775789Sahrens 2776789Sahrens static void 27772082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 27782082Seschrock { 27792082Seschrock char *packed = NULL; 27802082Seschrock size_t nvsize = 0; 27812082Seschrock dmu_buf_t *db; 27822082Seschrock 27832082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 27842082Seschrock 27852082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 27862082Seschrock 27872082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 27882082Seschrock KM_SLEEP) == 0); 27892082Seschrock 27902082Seschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 27912082Seschrock 27922082Seschrock kmem_free(packed, nvsize); 27932082Seschrock 27942082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 27952082Seschrock dmu_buf_will_dirty(db, tx); 27962082Seschrock *(uint64_t *)db->db_data = nvsize; 27972082Seschrock dmu_buf_rele(db, FTAG); 27982082Seschrock } 27992082Seschrock 28002082Seschrock static void 28012082Seschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 28022082Seschrock { 28032082Seschrock nvlist_t *nvroot; 28042082Seschrock nvlist_t **spares; 28052082Seschrock int i; 28062082Seschrock 28072082Seschrock if (!spa->spa_sync_spares) 28082082Seschrock return; 28092082Seschrock 28102082Seschrock /* 28112082Seschrock * Update the MOS nvlist describing the list of available spares. 28122082Seschrock * spa_validate_spares() will have already made sure this nvlist is 28132082Seschrock * valid and the vdevs are labelled appropriately. 
28142082Seschrock */ 28152082Seschrock if (spa->spa_spares_object == 0) { 28162082Seschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 28172082Seschrock DMU_OT_PACKED_NVLIST, 1 << 14, 28182082Seschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 28192082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 28202082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 28212082Seschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 28222082Seschrock } 28232082Seschrock 28242082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 28252082Seschrock if (spa->spa_nspares == 0) { 28262082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 28272082Seschrock NULL, 0) == 0); 28282082Seschrock } else { 28292082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 28302082Seschrock KM_SLEEP); 28312082Seschrock for (i = 0; i < spa->spa_nspares; i++) 28322082Seschrock spares[i] = vdev_config_generate(spa, 28332082Seschrock spa->spa_spares[i], B_FALSE, B_TRUE); 28342082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 28352082Seschrock spares, spa->spa_nspares) == 0); 28362082Seschrock for (i = 0; i < spa->spa_nspares; i++) 28372082Seschrock nvlist_free(spares[i]); 28382082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 28392082Seschrock } 28402082Seschrock 28412082Seschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 28422926Sek110237 nvlist_free(nvroot); 28432082Seschrock 28442082Seschrock spa->spa_sync_spares = B_FALSE; 28452082Seschrock } 28462082Seschrock 28472082Seschrock static void 2848789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 2849789Sahrens { 2850789Sahrens nvlist_t *config; 2851789Sahrens 2852789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 2853789Sahrens return; 2854789Sahrens 2855789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 2856789Sahrens 28571635Sbonwick if 
(spa->spa_config_syncing) 28581635Sbonwick nvlist_free(spa->spa_config_syncing); 28591635Sbonwick spa->spa_config_syncing = config; 2860789Sahrens 28612082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 2862789Sahrens } 2863789Sahrens 28643912Slling static void 28653912Slling spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 28663912Slling { 28673912Slling spa_t *spa = arg1; 28683912Slling nvlist_t *nvp = arg2; 28693912Slling nvpair_t *nvpair; 28703912Slling objset_t *mos = spa->spa_meta_objset; 28713912Slling uint64_t zapobj; 28723912Slling 28733912Slling mutex_enter(&spa->spa_props_lock); 28743912Slling if (spa->spa_pool_props_object == 0) { 28753912Slling zapobj = zap_create(mos, DMU_OT_POOL_PROPS, DMU_OT_NONE, 0, tx); 28763912Slling VERIFY(zapobj > 0); 28773912Slling 28783912Slling spa->spa_pool_props_object = zapobj; 28793912Slling 28803912Slling VERIFY(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT, 28813912Slling DMU_POOL_PROPS, 8, 1, 28823912Slling &spa->spa_pool_props_object, tx) == 0); 28833912Slling } 28843912Slling mutex_exit(&spa->spa_props_lock); 28853912Slling 28863912Slling nvpair = NULL; 28873912Slling while ((nvpair = nvlist_next_nvpair(nvp, nvpair))) { 28883912Slling switch (zpool_name_to_prop(nvpair_name(nvpair))) { 28893912Slling case ZFS_PROP_BOOTFS: 28903912Slling VERIFY(nvlist_lookup_uint64(nvp, 28913912Slling nvpair_name(nvpair), &spa->spa_bootfs) == 0); 28923912Slling VERIFY(zap_update(mos, 28933912Slling spa->spa_pool_props_object, 28943912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), 8, 1, 28953912Slling &spa->spa_bootfs, tx) == 0); 28963912Slling break; 28973912Slling } 28983912Slling } 28993912Slling } 29003912Slling 2901789Sahrens /* 2902789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 2903789Sahrens * part of the process, so we iterate until it converges. 
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	/* One tx, assigned to this txg, is used for all MOS updates below. */
	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
		int i;

		/* Stop at the first vdev with a non-default deflate ratio. */
		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.  A pass that syncs no vdevs ends the loop.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		/* Walk the children once, starting at the random index c0. */
		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		/* No single vdev took the write: fall back to all of them. */
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
3075789Sahrens */ 3076789Sahrens void 3077789Sahrens spa_sync_allpools(void) 3078789Sahrens { 3079789Sahrens spa_t *spa = NULL; 3080789Sahrens mutex_enter(&spa_namespace_lock); 3081789Sahrens while ((spa = spa_next(spa)) != NULL) { 3082789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 3083789Sahrens continue; 3084789Sahrens spa_open_ref(spa, FTAG); 3085789Sahrens mutex_exit(&spa_namespace_lock); 3086789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 3087789Sahrens mutex_enter(&spa_namespace_lock); 3088789Sahrens spa_close(spa, FTAG); 3089789Sahrens } 3090789Sahrens mutex_exit(&spa_namespace_lock); 3091789Sahrens } 3092789Sahrens 3093789Sahrens /* 3094789Sahrens * ========================================================================== 3095789Sahrens * Miscellaneous routines 3096789Sahrens * ========================================================================== 3097789Sahrens */ 3098789Sahrens 3099789Sahrens /* 3100789Sahrens * Remove all pools in the system. 3101789Sahrens */ 3102789Sahrens void 3103789Sahrens spa_evict_all(void) 3104789Sahrens { 3105789Sahrens spa_t *spa; 3106789Sahrens 3107789Sahrens /* 3108789Sahrens * Remove all cached state. All pools should be closed now, 3109789Sahrens * so every spa in the AVL tree should be unreferenced. 3110789Sahrens */ 3111789Sahrens mutex_enter(&spa_namespace_lock); 3112789Sahrens while ((spa = spa_next(NULL)) != NULL) { 3113789Sahrens /* 31141544Seschrock * Stop async tasks. The async thread may need to detach 31151544Seschrock * a device that's been replaced, which requires grabbing 31161544Seschrock * spa_namespace_lock, so we must drop it here. 
3117789Sahrens */ 3118789Sahrens spa_open_ref(spa, FTAG); 3119789Sahrens mutex_exit(&spa_namespace_lock); 31201544Seschrock spa_async_suspend(spa); 3121789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3122789Sahrens mutex_enter(&spa_namespace_lock); 3123789Sahrens spa_close(spa, FTAG); 3124789Sahrens 3125789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3126789Sahrens spa_unload(spa); 3127789Sahrens spa_deactivate(spa); 3128789Sahrens } 3129789Sahrens spa_remove(spa); 3130789Sahrens } 3131789Sahrens mutex_exit(&spa_namespace_lock); 3132789Sahrens } 31331544Seschrock 31341544Seschrock vdev_t * 31351544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 31361544Seschrock { 31371544Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 31381544Seschrock } 31391760Seschrock 31401760Seschrock void 31411760Seschrock spa_upgrade(spa_t *spa) 31421760Seschrock { 31431760Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 31441760Seschrock 31451760Seschrock /* 31461760Seschrock * This should only be called for a non-faulted pool, and since a 31471760Seschrock * future version would result in an unopenable pool, this shouldn't be 31481760Seschrock * possible. 
31491760Seschrock */ 31501760Seschrock ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION); 31511760Seschrock 31521760Seschrock spa->spa_uberblock.ub_version = ZFS_VERSION; 31531760Seschrock vdev_config_dirty(spa->spa_root_vdev); 31541760Seschrock 31551760Seschrock spa_config_exit(spa, FTAG); 31562082Seschrock 31572082Seschrock txg_wait_synced(spa_get_dsl(spa), 0); 31581760Seschrock } 31592082Seschrock 31602082Seschrock boolean_t 31612082Seschrock spa_has_spare(spa_t *spa, uint64_t guid) 31622082Seschrock { 31632082Seschrock int i; 31643377Seschrock uint64_t spareguid; 31652082Seschrock 31662082Seschrock for (i = 0; i < spa->spa_nspares; i++) 31672082Seschrock if (spa->spa_spares[i]->vdev_guid == guid) 31682082Seschrock return (B_TRUE); 31692082Seschrock 31703377Seschrock for (i = 0; i < spa->spa_pending_nspares; i++) { 31713377Seschrock if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 31723377Seschrock ZPOOL_CONFIG_GUID, &spareguid) == 0 && 31733377Seschrock spareguid == guid) 31743377Seschrock return (B_TRUE); 31753377Seschrock } 31763377Seschrock 31772082Seschrock return (B_FALSE); 31782082Seschrock } 31793912Slling 31803912Slling int 31813912Slling spa_set_props(spa_t *spa, nvlist_t *nvp) 31823912Slling { 31833912Slling return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 31843912Slling spa, nvp, 3)); 31853912Slling } 31863912Slling 31873912Slling int 31883912Slling spa_get_props(spa_t *spa, nvlist_t **nvp) 31893912Slling { 31903912Slling zap_cursor_t zc; 31913912Slling zap_attribute_t za; 31923912Slling objset_t *mos = spa->spa_meta_objset; 31933912Slling zfs_source_t src; 31943912Slling zfs_prop_t prop; 31953912Slling nvlist_t *propval; 31963912Slling uint64_t value; 31973912Slling int err; 31983912Slling 31993912Slling VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 32003912Slling 32013912Slling mutex_enter(&spa->spa_props_lock); 32023912Slling /* If no props object, then just return empty nvlist */ 32033912Slling if 
(spa->spa_pool_props_object == 0) { 32043912Slling mutex_exit(&spa->spa_props_lock); 32053912Slling return (0); 32063912Slling } 32073912Slling 32083912Slling for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 32093912Slling (err = zap_cursor_retrieve(&zc, &za)) == 0; 32103912Slling zap_cursor_advance(&zc)) { 32113912Slling 32123912Slling if ((prop = zpool_name_to_prop(za.za_name)) == ZFS_PROP_INVAL) 32133912Slling continue; 32143912Slling 32153912Slling VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 32163912Slling switch (za.za_integer_length) { 32173912Slling case 8: 32183912Slling if (zfs_prop_default_numeric(prop) == 32193912Slling za.za_first_integer) 32203912Slling src = ZFS_SRC_DEFAULT; 32213912Slling else 32223912Slling src = ZFS_SRC_LOCAL; 32233912Slling value = za.za_first_integer; 32243912Slling 32253912Slling if (prop == ZFS_PROP_BOOTFS) { 32263912Slling dsl_pool_t *dp; 32273912Slling dsl_dataset_t *ds = NULL; 32283912Slling char strval[MAXPATHLEN]; 32293912Slling 32303912Slling dp = spa_get_dsl(spa); 32313912Slling rw_enter(&dp->dp_config_rwlock, RW_READER); 32323912Slling if ((err = dsl_dataset_open_obj(dp, 32333912Slling za.za_first_integer, NULL, DS_MODE_NONE, 32343912Slling FTAG, &ds)) != 0) { 32353912Slling rw_exit(&dp->dp_config_rwlock); 32363912Slling break; 32373912Slling } 32383912Slling dsl_dataset_name(ds, strval); 32393912Slling dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 32403912Slling rw_exit(&dp->dp_config_rwlock); 32413912Slling 32423912Slling VERIFY(nvlist_add_uint64(propval, 32433912Slling ZFS_PROP_SOURCE, src) == 0); 32443912Slling VERIFY(nvlist_add_string(propval, 32453912Slling ZFS_PROP_VALUE, strval) == 0); 32463912Slling } else { 32473912Slling VERIFY(nvlist_add_uint64(propval, 32483912Slling ZFS_PROP_SOURCE, src) == 0); 32493912Slling VERIFY(nvlist_add_uint64(propval, 32503912Slling ZFS_PROP_VALUE, value) == 0); 32513912Slling } 32523912Slling VERIFY(nvlist_add_nvlist(*nvp, za.za_name, 32533912Slling 
propval) == 0); 32543912Slling break; 32553912Slling } 32563912Slling nvlist_free(propval); 32573912Slling } 32583912Slling zap_cursor_fini(&zc); 32593912Slling mutex_exit(&spa->spa_props_lock); 32603912Slling if (err && err != ENOENT) { 32613912Slling nvlist_free(*nvp); 32623912Slling return (err); 32633912Slling } 32643912Slling 32653912Slling return (0); 32663912Slling } 32673912Slling 32683912Slling /* 32693912Slling * If the bootfs property value is dsobj, clear it. 32703912Slling */ 32713912Slling void 32723912Slling spa_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 32733912Slling { 32743912Slling if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 32753912Slling VERIFY(zap_remove(spa->spa_meta_objset, 32763912Slling spa->spa_pool_props_object, 32773912Slling zpool_prop_to_name(ZFS_PROP_BOOTFS), tx) == 0); 32783912Slling spa->spa_bootfs = 0; 32793912Slling } 32803912Slling } 3281