/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>

#include "zfs_prop.h"

int zio_taskq_threads = 8;

static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
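/*
 * Each property is added as a nested nvlist keyed by the property name,
 * carrying the value and the source it was obtained from, e.g.:
 *
 *	"version" -> { ZPROP_SOURCE = src, ZPROP_VALUE = 8 }
 *
 * On error the partially built sub-list is freed and the error returned.
 */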
static int
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;
	int err = 0;

	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
		goto out;

	if (strval != NULL) {
		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
			goto out;
	} else {
		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
			goto out;
	}

	err = nvlist_add_nvlist(nvl, propname, propval);
out:
	nvlist_free(propval);
	return (err);
}

/*
 * Get property values from the spa configuration.
 */
static int
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size = spa_get_space(spa);
	uint64_t used = spa_get_alloc(spa);
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	int err;

	/*
	 * readonly properties
	 */
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
	    0, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
	    size - used, src))
		return (err);

	cap = (size == 0) ? 0 : (used * 100 / size);
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
	    spa_guid(spa), src))
		return (err);

	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
	    spa->spa_root_vdev->vdev_state, src))
		return (err);

	/*
	 * settable properties that are not stored in the pool property object.
	 */
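	/*
	 * A value that still matches the property's default is reported
	 * with source ZPROP_SRC_DEFAULT; anything else is ZPROP_SRC_LOCAL.
	 */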
	version = spa_version(spa);
	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
	    version, src))
		return (err);

	if (spa->spa_root != NULL) {
		src = ZPROP_SRC_LOCAL;
		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
		    spa->spa_root, 0, src))
			return (err);
	}

	if (spa->spa_temporary ==
	    zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY))
		src = ZPROP_SRC_DEFAULT;
	else
		src = ZPROP_SRC_LOCAL;
	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_TEMPORARY, NULL,
	    spa->spa_temporary, src))
		return (err);

	return (0);
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	objset_t *mos = spa->spa_meta_objset;
	int err;

	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
		return (err);

	/*
	 * Get properties from the spa config.
	 */
	if (err = spa_prop_get_config(spa, nvp))
		goto out;

	mutex_enter(&spa->spa_props_lock);
	/* If no pool property object, there are no more properties to get. */
	if (spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
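	/*
	 * Entries with za_integer_length == 8 are numeric properties;
	 * bootfs is further translated from an object number into a dataset
	 * name.  Byte-sized entries are string properties read directly
	 * from the ZAP object.
	 */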
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_open_obj(dp,
				    za.za_first_integer, NULL, DS_MODE_NONE,
				    FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			err = spa_prop_add_list(*nvp, prop, strval,
			    intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		return (err);
	}

	return (0);
}

/*
 * Validate the given pool properties nvlist, modifying it as needed so
 * that the values can be set.
 */
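/*
 * In particular, a bootfs value arrives as a dataset name and leaves as the
 * corresponding object number, which is the form stored on disk.
 */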
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		vdev_t *rvdev;
		char *vdev_type;
		objset_t *os;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * A bootable filesystem cannot be on a RAIDZ pool
			 * or on a striped pool with more than one device.
			 */
			rvdev = spa->spa_root_vdev;
			vdev_type =
			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
			if (rvdev->vdev_children > 1 ||
			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
					break;
				objnum = dmu_objset_id(os);
				dmu_objset_close(os);
			}
			break;
		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
	    spa, nvp, 3));
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
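/*
 * On return, *last and *scrub hold the previous error-log trees and fresh,
 * empty trees take their place in the spa.  The caller must hold
 * spa_errlist_lock across the swap.
 */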
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();
	spa->spa_log_class = metaslab_class_create();

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));
	list_create(&spa->spa_zio_list, sizeof (zio_t),
	    offsetof(zio_t, zio_link_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
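/*
 * The pool must already be unloaded before it is deactivated: syncing
 * stopped, and the DSL pool and vdev tree torn down.
 */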
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);
	list_destroy(&spa->spa_zio_list);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa_config_exit(spa, FTAG);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_nspares; i++)
		vdev_free(spa->spa_spares[i]);
	if (spa->spa_spares) {
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
		spa->spa_spares = NULL;
	}
	if (spa->spa_sparelist) {
		nvlist_free(spa->spa_sparelist);
		spa->spa_sparelist = NULL;
	}

	spa->spa_async_suspended = 0;
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
 * re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_nspares; i++) {
		vd = spa->spa_spares[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares)
		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));

	if (spa->spa_sparelist == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_nspares = (int)nspares;
	spa->spa_spares = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).
	 * During this phase we open and validate each vdev on the spare list.
	 * If the vdev also exists in the active configuration, then we also
	 * mark this vdev as an active spare.
	 */
	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		(void) vdev_validate_spare(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
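	/*
	 * The old ZPOOL_CONFIG_SPARES array is removed and replaced with
	 * one regenerated from the spare vdevs opened above.
	 */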
	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
	for (i = 0; i < spa->spa_nspares; i++)
		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
		    B_TRUE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
	    spares, spa->spa_nspares) == 0);
	for (i = 0; i < spa->spa_nspares; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_nspares * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	uint64_t pool_guid;
	uint64_t version;
	zio_t *zio;
	uint64_t autoreplace = 0;

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
		version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		error = EEXIST;
		goto out;
	}

	spa->spa_load_guid = pool_guid;

	/*
	 * Parse the configuration into a vdev tree.  We explicitly set the
	 * value that will be returned by spa_version() since parsing the
	 * configuration requires knowing the version number.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	spa->spa_ubsync.ub_version = version;
	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	ASSERT(spa->spa_root_vdev == rvd);
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	error = vdev_open(rvd);
	if (error != 0)
		goto out;

	/*
	 * Validate the labels for all leaf vdevs.  We need to grab the config
	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
	 * flag.
	 */
	spa_config_enter(spa, RW_READER, FTAG);
	error = vdev_validate(rvd);
	spa_config_exit(spa, FTAG);

	if (error != 0)
		goto out;

	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = ENXIO;
		goto out;
	}

	/*
	 * If the pool is newer than the code, we can't open it.
	 */
	if (ub->ub_version > SPA_VERSION) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_VERSION_NEWER);
		error = ENOTSUP;
		goto out;
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
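	/*
	 * The guid sum is only enforced once we are loading the trusted
	 * config from the MOS (mosconfig != 0).
	 */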
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_GUID_SUM);
		error = ENXIO;
		goto out;
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
	if (error) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		goto out;
	}
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (!mosconfig) {
		nvlist_t *newconfig;
		uint64_t hostid;

		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
		    &hostid) == 0) {
			char *hostname;
			unsigned long myhostid = 0;

			VERIFY(nvlist_lookup_string(newconfig,
			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);

			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
			if (hostid != 0 && myhostid != 0 &&
			    (unsigned long)hostid != myhostid) {
				cmn_err(CE_WARN, "pool '%s' could not be "
				    "loaded as it was last accessed by "
				    "another system (host: %s hostid: 0x%lx). "
				    "See: http://www.sun.com/msg/ZFS-8000-EY",
				    spa->spa_name, hostname,
				    (unsigned long)hostid);
				error = EBADF;
				goto out;
			}
		}

		spa_config_set(spa, newconfig);
		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, state, B_TRUE));
	}

	if (zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the bit that tells us to use the new accounting function
	 * (raid-z deflation).  If we have an older pool, this will not
	 * be present.
	 */
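	/*
	 * ENOENT is therefore tolerated below; any other lookup failure
	 * means the MOS is corrupt.
	 */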
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
	    sizeof (uint64_t), 1, &spa->spa_deflate);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the persistent error log.  If we have an older pool, this will
	 * not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load the history object.  If we have an older pool, this
	 * will not be present.
	 */
	error = zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
	    sizeof (uint64_t), 1, &spa->spa_history);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	/*
	 * Load any hot spares for this pool.
	 */
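	/*
	 * The spares object only exists on pools at SPA_VERSION_SPARES or
	 * later, so a missing entry is likewise not an error.
	 */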
	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
	if (error != 0 && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}
	if (error == 0) {
		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
		if (load_nvlist(spa, spa->spa_spares_object,
		    &spa->spa_sparelist) != 0) {
			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			error = EIO;
			goto out;
		}

		spa_config_enter(spa, RW_WRITER, FTAG);
		spa_load_spares(spa);
		spa_config_exit(spa, FTAG);
	}

	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);

	if (error && error != ENOENT) {
		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		error = EIO;
		goto out;
	}

	if (error == 0) {
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
		    sizeof (uint64_t), 1, &spa->spa_bootfs);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
		    sizeof (uint64_t), 1, &autoreplace);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
		    sizeof (uint64_t), 1, &spa->spa_delegation);
		(void) zap_lookup(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
		    sizeof (uint64_t), 1, &spa->spa_failmode);
	}

	/*
	 * If the 'autoreplace' property is set, then post a resource notifying
	 * the ZFS DE that it should not issue any faults for unopenable
	 * devices.  We also iterate over the vdevs, and post a sysevent for any
	 * unopenable vdevs so that the normal autoreplace handler can take
	 * over.
	 */
	if (autoreplace)
		spa_check_removed(spa->spa_root_vdev);

	/*
	 * Load the vdev state for all toplevel vdevs.
	 */
	vdev_load(rvd);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa, FTAG);

	/*
	 * Check the state of the root vdev.
	 * If it can't be opened, it indicates one or more toplevel vdevs are
	 * faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
		error = ENXIO;
		goto out;
	}

	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
		dmu_tx_t *tx;
		int need_update = B_FALSE;
		int c;

		/*
		 * Claim log blocks that haven't been committed yet.
		 * This must all happen in a single txg.
		 */
		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		(void) dmu_objset_find(spa->spa_name,
		    zil_claim, tx, DS_FIND_CHILDREN);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * If the config cache is stale, or we have uninitialized
		 * metaslabs (see spa_vdev_add()), then update the config.
		 */
		if (config_cache_txg != spa->spa_config_txg ||
		    state == SPA_LOAD_IMPORT)
			need_update = B_TRUE;

		for (c = 0; c < rvd->vdev_children; c++)
			if (rvd->vdev_child[c]->vdev_ms_array == 0)
				need_update = B_TRUE;

		/*
		 * Update the config cache asynchronously in case we're the
		 * root pool, in which case the config cache isn't writable yet.
		 */
		if (need_update)
			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
	}

	error = 0;
out:
	if (error && error != EBADF)
		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.
	 * The real fix is to figure out how to avoid dsl_dir_open() calling
	 * this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it means that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync
			 * and we should remove the pool from the namespace.
			 */
			zfs_post_ok(spa, NULL);
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL) {
				spa_config_enter(spa, RW_READER, FTAG);
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
				spa_config_exit(spa, FTAG);
			}
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = B_TRUE;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		} else {
			zfs_post_ok(spa, NULL);
			spa->spa_last_open_failed = B_FALSE;
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
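	/*
	 * This is only done for writable opens (spa_mode & FWRITE).
	 */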
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER, FTAG);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa, FTAG);
	}

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

/*
 * Lookup the given spa_t, incrementing the inject count in the process,
 * preventing it from being exported or destroyed.
 */
spa_t *
spa_inject_addref(char *name)
{
	spa_t *spa;

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (NULL);
	}
	spa->spa_inject_ref++;
	mutex_exit(&spa_namespace_lock);

	return (spa);
}

void
spa_inject_delref(spa_t *spa)
{
	mutex_enter(&spa_namespace_lock);
	spa->spa_inject_ref--;
	mutex_exit(&spa_namespace_lock);
}

static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	if (spa->spa_nspares == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as active spares.  If this is the case, update
		 * their status appropriately.
		 */
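		/*
		 * A spare that is currently attached somewhere is reported
		 * as VDEV_STATE_CANT_OPEN with VDEV_AUX_SPARED.
		 */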
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}

int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa && *config != NULL) {
		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
		    spa_get_errlog_size(spa)) == 0);

		spa_add_spares(spa, *config);
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Validate that the 'spares' array is well formed.  We must have an array of
 * nvlists, each of which describes a valid leaf vdev.  If this is an import
 * (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified,
 * as long as they are well-formed.
 */
static int
spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	nvlist_t **spares;
	uint_t i, nspares;
	vdev_t *vd;
	int error;

	/*
	 * It's acceptable to have no spares specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) != 0)
		return (0);

	if (nspares == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports hot
	 * spares.
	 */
	if (spa_version(spa) < SPA_VERSION_SPARES)
		return (ENOTSUP);

	/*
	 * Set the pending spare list so we correctly handle device in-use
	 * checking.
	 */
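	/*
	 * The pending list is cleared again at 'out:' below, whether or
	 * not validation succeeds.
	 */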
14573377Seschrock */ 14583377Seschrock spa->spa_pending_spares = spares; 14593377Seschrock spa->spa_pending_nspares = nspares; 14603377Seschrock 14612082Seschrock for (i = 0; i < nspares; i++) { 14622082Seschrock if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 14632082Seschrock mode)) != 0) 14643377Seschrock goto out; 14652082Seschrock 14662082Seschrock if (!vd->vdev_ops->vdev_op_leaf) { 14672082Seschrock vdev_free(vd); 14683377Seschrock error = EINVAL; 14693377Seschrock goto out; 14702082Seschrock } 14712082Seschrock 14722082Seschrock vd->vdev_top = vd; 14733377Seschrock 14743377Seschrock if ((error = vdev_open(vd)) == 0 && 14753377Seschrock (error = vdev_label_init(vd, crtxg, 14763377Seschrock VDEV_LABEL_SPARE)) == 0) { 14773377Seschrock VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 14783377Seschrock vd->vdev_guid) == 0); 14792082Seschrock } 14802082Seschrock 14812082Seschrock vdev_free(vd); 14823377Seschrock 14833377Seschrock if (error && mode != VDEV_ALLOC_SPARE) 14843377Seschrock goto out; 14853377Seschrock else 14863377Seschrock error = 0; 14872082Seschrock } 14882082Seschrock 14893377Seschrock out: 14903377Seschrock spa->spa_pending_spares = NULL; 14913377Seschrock spa->spa_pending_nspares = 0; 14923377Seschrock return (error); 14932082Seschrock } 14942082Seschrock 14952082Seschrock /* 1496789Sahrens * Pool Creation 1497789Sahrens */ 1498789Sahrens int 14995094Slling spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 15004715Sek110237 const char *history_str) 1501789Sahrens { 1502789Sahrens spa_t *spa; 15035094Slling char *altroot = NULL; 15041635Sbonwick vdev_t *rvd; 1505789Sahrens dsl_pool_t *dp; 1506789Sahrens dmu_tx_t *tx; 15072082Seschrock int c, error = 0; 1508789Sahrens uint64_t txg = TXG_INITIAL; 15092082Seschrock nvlist_t **spares; 15102082Seschrock uint_t nspares; 15115094Slling uint64_t version; 1512789Sahrens 1513789Sahrens /* 1514789Sahrens * If this pool already exists, return failure. 1515789Sahrens */ 1516789Sahrens mutex_enter(&spa_namespace_lock); 1517789Sahrens if (spa_lookup(pool) != NULL) { 1518789Sahrens mutex_exit(&spa_namespace_lock); 1519789Sahrens return (EEXIST); 1520789Sahrens } 1521789Sahrens 1522789Sahrens /* 1523789Sahrens * Allocate a new spa_t structure. 1524789Sahrens */ 15255094Slling (void) nvlist_lookup_string(props, 15265094Slling zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 15271635Sbonwick spa = spa_add(pool, altroot); 1528789Sahrens spa_activate(spa); 1529789Sahrens 1530789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 15315094Slling 15325094Slling if (props && (error = spa_prop_validate(spa, props))) { 15335094Slling spa_unload(spa); 15345094Slling spa_deactivate(spa); 15355094Slling spa_remove(spa); 15365094Slling return (error); 15375094Slling } 15385094Slling 15395094Slling if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 15405094Slling &version) != 0) 15415094Slling version = SPA_VERSION; 15425094Slling ASSERT(version <= SPA_VERSION); 15435094Slling spa->spa_uberblock.ub_version = version; 1544789Sahrens spa->spa_ubsync = spa->spa_uberblock; 1545789Sahrens 15461635Sbonwick /* 15471635Sbonwick * Create the root vdev. 
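 *
 * A sketch of the steps below: parse nvroot into an in-core vdev tree,
 * verify the tree has children, create (label) the vdevs, and validate
 * any hot spares, all under the config lock:
 *
 *	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0,
 *	    VDEV_ALLOC_ADD);
 *	error = vdev_create(rvd, txg, B_FALSE);
 *	error = spa_validate_spares(spa, nvroot, txg, VDEV_ALLOC_ADD);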
15481635Sbonwick */ 15491635Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 15501635Sbonwick 15512082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 15522082Seschrock 15532082Seschrock ASSERT(error != 0 || rvd != NULL); 15542082Seschrock ASSERT(error != 0 || spa->spa_root_vdev == rvd); 15552082Seschrock 15562082Seschrock if (error == 0 && rvd->vdev_children == 0) 15571635Sbonwick error = EINVAL; 15582082Seschrock 15592082Seschrock if (error == 0 && 15602082Seschrock (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 15612082Seschrock (error = spa_validate_spares(spa, nvroot, txg, 15622082Seschrock VDEV_ALLOC_ADD)) == 0) { 15632082Seschrock for (c = 0; c < rvd->vdev_children; c++) 15642082Seschrock vdev_init(rvd->vdev_child[c], txg); 15652082Seschrock vdev_config_dirty(rvd); 15661635Sbonwick } 15671635Sbonwick 15681635Sbonwick spa_config_exit(spa, FTAG); 1569789Sahrens 15702082Seschrock if (error != 0) { 1571789Sahrens spa_unload(spa); 1572789Sahrens spa_deactivate(spa); 1573789Sahrens spa_remove(spa); 1574789Sahrens mutex_exit(&spa_namespace_lock); 1575789Sahrens return (error); 1576789Sahrens } 1577789Sahrens 15782082Seschrock /* 15792082Seschrock * Get the list of spares, if specified. 15802082Seschrock */ 15812082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 15822082Seschrock &spares, &nspares) == 0) { 15832082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 15842082Seschrock KM_SLEEP) == 0); 15852082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 15862082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 15872082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 15882082Seschrock spa_load_spares(spa); 15892082Seschrock spa_config_exit(spa, FTAG); 15902082Seschrock spa->spa_sync_spares = B_TRUE; 15912082Seschrock } 15922082Seschrock 1593789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1594789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 1595789Sahrens 1596789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1597789Sahrens 1598789Sahrens /* 1599789Sahrens * Create the pool config object. 1600789Sahrens */ 1601789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1602789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 1603789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1604789Sahrens 16051544Seschrock if (zap_add(spa->spa_meta_objset, 1606789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 16071544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 16081544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 16091544Seschrock } 1610789Sahrens 16115094Slling /* Newly created pools with the right version are always deflated. */ 16125094Slling if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 16135094Slling spa->spa_deflate = TRUE; 16145094Slling if (zap_add(spa->spa_meta_objset, 16155094Slling DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 16165094Slling sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 16175094Slling cmn_err(CE_PANIC, "failed to add deflate"); 16185094Slling } 16192082Seschrock } 16202082Seschrock 1621789Sahrens /* 1622789Sahrens * Create the deferred-free bplist object. Turn off compression 1623789Sahrens * because sync-to-convergence takes longer if the blocksize 1624789Sahrens * keeps changing. 
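 *
 * Each of these special objects follows the same pattern, in sketch form:
 * create the object in the MOS, then register it by name in the pool
 * directory so it can be found again at open time:
 *
 *	obj = <create object>(spa->spa_meta_objset, ..., tx);
 *	if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 *	    <name>, sizeof (uint64_t), 1, &obj, tx) != 0)
 *		cmn_err(CE_PANIC, "...");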
1625789Sahrens */ 1626789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1627789Sahrens 1 << 14, tx); 1628789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1629789Sahrens ZIO_COMPRESS_OFF, tx); 1630789Sahrens 16311544Seschrock if (zap_add(spa->spa_meta_objset, 1632789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 16331544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 16341544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 16351544Seschrock } 1636789Sahrens 16372926Sek110237 /* 16382926Sek110237 * Create the pool's history object. 16392926Sek110237 */ 16405094Slling if (version >= SPA_VERSION_ZPOOL_HISTORY) 16415094Slling spa_history_create_obj(spa, tx); 16425094Slling 16435094Slling /* 16445094Slling * Set pool properties. 16455094Slling */ 16465094Slling spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 16475094Slling spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 16485094Slling spa->spa_temporary = zpool_prop_default_numeric(ZPOOL_PROP_TEMPORARY); 1649*5329Sgw25295 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 16505094Slling if (props) 16515094Slling spa_sync_props(spa, props, CRED(), tx); 16522926Sek110237 1653789Sahrens dmu_tx_commit(tx); 1654789Sahrens 1655789Sahrens spa->spa_sync_on = B_TRUE; 1656789Sahrens txg_sync_start(spa->spa_dsl_pool); 1657789Sahrens 1658789Sahrens /* 1659789Sahrens * We explicitly wait for the first transaction to complete so that our 1660789Sahrens * bean counters are appropriately updated. 1661789Sahrens */ 1662789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 1663789Sahrens 1664789Sahrens spa_config_sync(); 1665789Sahrens 16665094Slling if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 16674715Sek110237 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 16684715Sek110237 1669789Sahrens mutex_exit(&spa_namespace_lock); 1670789Sahrens 1671789Sahrens return (0); 1672789Sahrens } 1673789Sahrens 1674789Sahrens /* 1675789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 1676789Sahrens * then call spa_load() to do the dirty work. 1677789Sahrens */ 1678789Sahrens int 16795094Slling spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 1680789Sahrens { 1681789Sahrens spa_t *spa; 16825094Slling char *altroot = NULL; 1683789Sahrens int error; 16842082Seschrock nvlist_t *nvroot; 16852082Seschrock nvlist_t **spares; 16862082Seschrock uint_t nspares; 1687789Sahrens 1688789Sahrens /* 1689789Sahrens * If a pool with this name exists, return failure. 1690789Sahrens */ 1691789Sahrens mutex_enter(&spa_namespace_lock); 1692789Sahrens if (spa_lookup(pool) != NULL) { 1693789Sahrens mutex_exit(&spa_namespace_lock); 1694789Sahrens return (EEXIST); 1695789Sahrens } 1696789Sahrens 1697789Sahrens /* 16981635Sbonwick * Create and initialize the spa structure. 1699789Sahrens */ 17005094Slling (void) nvlist_lookup_string(props, 17015094Slling zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 17021635Sbonwick spa = spa_add(pool, altroot); 1703789Sahrens spa_activate(spa); 1704789Sahrens 1705789Sahrens /* 17061635Sbonwick * Pass off the heavy lifting to spa_load(). 17071732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 17081732Sbonwick * is actually the one to trust when doing an import. 
17091601Sbonwick */ 17101732Sbonwick error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1711789Sahrens 17122082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 17132082Seschrock /* 17142082Seschrock * Toss any existing sparelist, as it doesn't have any validity anymore, 17152082Seschrock * and conflicts with spa_has_spare(). 17162082Seschrock */ 17172082Seschrock if (spa->spa_sparelist) { 17182082Seschrock nvlist_free(spa->spa_sparelist); 17192082Seschrock spa->spa_sparelist = NULL; 17202082Seschrock spa_load_spares(spa); 17212082Seschrock } 17222082Seschrock 17232082Seschrock VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 17242082Seschrock &nvroot) == 0); 17255094Slling if (error == 0) { 17262082Seschrock error = spa_validate_spares(spa, nvroot, -1ULL, 17272082Seschrock VDEV_ALLOC_SPARE); 17285094Slling } 17292082Seschrock spa_config_exit(spa, FTAG); 17302082Seschrock 17315094Slling if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 1732789Sahrens spa_unload(spa); 1733789Sahrens spa_deactivate(spa); 1734789Sahrens spa_remove(spa); 1735789Sahrens mutex_exit(&spa_namespace_lock); 1736789Sahrens return (error); 1737789Sahrens } 1738789Sahrens 17391635Sbonwick /* 17402082Seschrock * Override any spares as specified by the user, as these may have 17412082Seschrock * correct device names/devids, etc. 17422082Seschrock */ 17432082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 17442082Seschrock &spares, &nspares) == 0) { 17452082Seschrock if (spa->spa_sparelist) 17462082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 17472082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 17482082Seschrock else 17492082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 17502082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 17512082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 17522082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 17532082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 17542082Seschrock spa_load_spares(spa); 17552082Seschrock spa_config_exit(spa, FTAG); 17562082Seschrock spa->spa_sync_spares = B_TRUE; 17572082Seschrock } 17582082Seschrock 17592082Seschrock /* 17601635Sbonwick * Update the config cache to include the newly-imported pool. 17611635Sbonwick */ 17624627Sck153898 if (spa_mode & FWRITE) 17634627Sck153898 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 17641635Sbonwick 1765789Sahrens /* 1766789Sahrens * Resilver anything that's out of date. 1767789Sahrens */ 1768789Sahrens if (spa_mode & FWRITE) 1769789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1770789Sahrens 17714451Seschrock mutex_exit(&spa_namespace_lock); 17724451Seschrock 1773789Sahrens return (0); 1774789Sahrens } 1775789Sahrens 1776789Sahrens /* 1777789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 1778789Sahrens * to get the vdev stats associated with the imported devices. 
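 *
 * A sketch of the expected caller-side flow (names are hypothetical):
 *
 *	config = spa_tryimport(tryconfig);	(probes under "$import")
 *	if (config != NULL)
 *		error = spa_import(poolname, config, props);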
1779789Sahrens */ 1780789Sahrens #define TRYIMPORT_NAME "$import" 1781789Sahrens 1782789Sahrens nvlist_t * 1783789Sahrens spa_tryimport(nvlist_t *tryconfig) 1784789Sahrens { 1785789Sahrens nvlist_t *config = NULL; 1786789Sahrens char *poolname; 1787789Sahrens spa_t *spa; 1788789Sahrens uint64_t state; 1789789Sahrens 1790789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1791789Sahrens return (NULL); 1792789Sahrens 1793789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1794789Sahrens return (NULL); 1795789Sahrens 17961635Sbonwick /* 17971635Sbonwick * Create and initialize the spa structure. 17981635Sbonwick */ 1799789Sahrens mutex_enter(&spa_namespace_lock); 18001635Sbonwick spa = spa_add(TRYIMPORT_NAME, NULL); 1801789Sahrens spa_activate(spa); 1802789Sahrens 1803789Sahrens /* 18041635Sbonwick * Pass off the heavy lifting to spa_load(). 18051732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 18061732Sbonwick * is actually the one to trust when doing an import. 1807789Sahrens */ 18081732Sbonwick (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1809789Sahrens 1810789Sahrens /* 1811789Sahrens * If 'tryconfig' was at least parsable, return the current config. 1812789Sahrens */ 1813789Sahrens if (spa->spa_root_vdev != NULL) { 18141635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 1815789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 18161635Sbonwick spa_config_exit(spa, FTAG); 1817789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1818789Sahrens poolname) == 0); 1819789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1820789Sahrens state) == 0); 18213975Sek110237 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 18223975Sek110237 spa->spa_uberblock.ub_timestamp) == 0); 18232082Seschrock 18242082Seschrock /* 18252082Seschrock * Add the list of hot spares. 18262082Seschrock */ 18272082Seschrock spa_add_spares(spa, config); 1828789Sahrens } 1829789Sahrens 1830789Sahrens spa_unload(spa); 1831789Sahrens spa_deactivate(spa); 1832789Sahrens spa_remove(spa); 1833789Sahrens mutex_exit(&spa_namespace_lock); 1834789Sahrens 1835789Sahrens return (config); 1836789Sahrens } 1837789Sahrens 1838789Sahrens /* 1839789Sahrens * Pool export/destroy 1840789Sahrens * 1841789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 1842789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 1843789Sahrens * update the pool state and sync all the labels to disk, removing the 1844789Sahrens * configuration from the cache afterwards. 1845789Sahrens */ 1846789Sahrens static int 18471775Sbillm spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1848789Sahrens { 1849789Sahrens spa_t *spa; 1850789Sahrens 18511775Sbillm if (oldconfig) 18521775Sbillm *oldconfig = NULL; 18531775Sbillm 1854789Sahrens if (!(spa_mode & FWRITE)) 1855789Sahrens return (EROFS); 1856789Sahrens 1857789Sahrens mutex_enter(&spa_namespace_lock); 1858789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 1859789Sahrens mutex_exit(&spa_namespace_lock); 1860789Sahrens return (ENOENT); 1861789Sahrens } 1862789Sahrens 1863789Sahrens /* 18641544Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 18651544Seschrock * reacquire the namespace lock, and see if we can export. 
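 *
 * In sketch form (the reference keeps the spa_t from disappearing while
 * the namespace lock is dropped; dropping the lock around
 * spa_async_suspend() avoids holding it while the async thread winds
 * down, which is our reading of the code rather than a documented
 * contract):
 *
 *	spa_open_ref(spa, FTAG);
 *	mutex_exit(&spa_namespace_lock);
 *	spa_async_suspend(spa);
 *	mutex_enter(&spa_namespace_lock);
 *	spa_close(spa, FTAG);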
18661544Seschrock */ 18671544Seschrock spa_open_ref(spa, FTAG); 18681544Seschrock mutex_exit(&spa_namespace_lock); 18691544Seschrock spa_async_suspend(spa); 18701544Seschrock mutex_enter(&spa_namespace_lock); 18711544Seschrock spa_close(spa, FTAG); 18721544Seschrock 18731544Seschrock /* 1874789Sahrens * The pool will be in core if it's openable, 1875789Sahrens * in which case we can modify its state. 1876789Sahrens */ 1877789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1878789Sahrens /* 1879789Sahrens * Objsets may be open only because they're dirty, so we 1880789Sahrens * have to force it to sync before checking spa_refcnt. 1881789Sahrens */ 1882789Sahrens spa_scrub_suspend(spa); 1883789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 1884789Sahrens 18851544Seschrock /* 18861544Seschrock * A pool cannot be exported or destroyed if there are active 18871544Seschrock * references. If we are resetting a pool, allow references by 18881544Seschrock * fault injection handlers. 18891544Seschrock */ 18901544Seschrock if (!spa_refcount_zero(spa) || 18911544Seschrock (spa->spa_inject_ref != 0 && 18921544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1893789Sahrens spa_scrub_resume(spa); 18941544Seschrock spa_async_resume(spa); 1895789Sahrens mutex_exit(&spa_namespace_lock); 1896789Sahrens return (EBUSY); 1897789Sahrens } 1898789Sahrens 1899789Sahrens spa_scrub_resume(spa); 1900789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1901789Sahrens 1902789Sahrens /* 1903789Sahrens * We want this to be reflected on every label, 1904789Sahrens * so mark them all dirty. spa_unload() will do the 1905789Sahrens * final sync that pushes these changes out. 1906789Sahrens */ 19071544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 19081601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 19091544Seschrock spa->spa_state = new_state; 19101635Sbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 19111544Seschrock vdev_config_dirty(spa->spa_root_vdev); 19121601Sbonwick spa_config_exit(spa, FTAG); 19131544Seschrock } 1914789Sahrens } 1915789Sahrens 19164451Seschrock spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 19174451Seschrock 1918789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1919789Sahrens spa_unload(spa); 1920789Sahrens spa_deactivate(spa); 1921789Sahrens } 1922789Sahrens 19231775Sbillm if (oldconfig && spa->spa_config) 19241775Sbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 19251775Sbillm 19261544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 19271544Seschrock spa_remove(spa); 19281544Seschrock spa_config_sync(); 19291544Seschrock } 1930789Sahrens mutex_exit(&spa_namespace_lock); 1931789Sahrens 1932789Sahrens return (0); 1933789Sahrens } 1934789Sahrens 1935789Sahrens /* 1936789Sahrens * Destroy a storage pool. 1937789Sahrens */ 1938789Sahrens int 1939789Sahrens spa_destroy(char *pool) 1940789Sahrens { 19411775Sbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1942789Sahrens } 1943789Sahrens 1944789Sahrens /* 1945789Sahrens * Export a storage pool. 1946789Sahrens */ 1947789Sahrens int 19481775Sbillm spa_export(char *pool, nvlist_t **oldconfig) 1949789Sahrens { 19501775Sbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1951789Sahrens } 1952789Sahrens 1953789Sahrens /* 19541544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 19551544Seschrock * from the namespace in any way. 
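 *
 * For reference, all three entry points funnel into spa_export_common():
 *
 *	spa_destroy(pool)		new_state = POOL_STATE_DESTROYED
 *	spa_export(pool, oldconfig)	new_state = POOL_STATE_EXPORTED
 *	spa_reset(pool)			new_state = POOL_STATE_UNINITIALIZED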
19561544Seschrock */ 19571544Seschrock int 19581544Seschrock spa_reset(char *pool) 19591544Seschrock { 19601775Sbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 19611544Seschrock } 19621544Seschrock 19631544Seschrock 19641544Seschrock /* 1965789Sahrens * ========================================================================== 1966789Sahrens * Device manipulation 1967789Sahrens * ========================================================================== 1968789Sahrens */ 1969789Sahrens 1970789Sahrens /* 19714527Sperrin * Add a device to a storage pool. 1972789Sahrens */ 1973789Sahrens int 1974789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1975789Sahrens { 1976789Sahrens uint64_t txg; 19771635Sbonwick int c, error; 1978789Sahrens vdev_t *rvd = spa->spa_root_vdev; 19791585Sbonwick vdev_t *vd, *tvd; 19802082Seschrock nvlist_t **spares; 19812082Seschrock uint_t i, nspares; 1982789Sahrens 1983789Sahrens txg = spa_vdev_enter(spa); 1984789Sahrens 19852082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 19862082Seschrock VDEV_ALLOC_ADD)) != 0) 19872082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 19882082Seschrock 19893377Seschrock spa->spa_pending_vdev = vd; 1990789Sahrens 19912082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 19922082Seschrock &spares, &nspares) != 0) 19932082Seschrock nspares = 0; 19942082Seschrock 19953377Seschrock if (vd->vdev_children == 0 && nspares == 0) { 19963377Seschrock spa->spa_pending_vdev = NULL; 19972082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 19983377Seschrock } 19992082Seschrock 20002082Seschrock if (vd->vdev_children != 0) { 20013377Seschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 20023377Seschrock spa->spa_pending_vdev = NULL; 20032082Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 20042082Seschrock } 20052082Seschrock } 20062082Seschrock 20073377Seschrock /* 20083377Seschrock * We must validate the spares after checking the children. Otherwise, 20093377Seschrock * vdev_inuse() will blindly overwrite the spare. 20103377Seschrock */ 20113377Seschrock if ((error = spa_validate_spares(spa, nvroot, txg, 20123377Seschrock VDEV_ALLOC_ADD)) != 0) { 20133377Seschrock spa->spa_pending_vdev = NULL; 20143377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 20153377Seschrock } 20163377Seschrock 20173377Seschrock spa->spa_pending_vdev = NULL; 20183377Seschrock 20193377Seschrock /* 20203377Seschrock * Transfer each new top-level vdev from vd to rvd. 
20213377Seschrock */ 20223377Seschrock for (c = 0; c < vd->vdev_children; c++) { 20233377Seschrock tvd = vd->vdev_child[c]; 20243377Seschrock vdev_remove_child(vd, tvd); 20253377Seschrock tvd->vdev_id = rvd->vdev_children; 20263377Seschrock vdev_add_child(rvd, tvd); 20273377Seschrock vdev_config_dirty(tvd); 20283377Seschrock } 20293377Seschrock 20302082Seschrock if (nspares != 0) { 20312082Seschrock if (spa->spa_sparelist != NULL) { 20322082Seschrock nvlist_t **oldspares; 20332082Seschrock uint_t oldnspares; 20342082Seschrock nvlist_t **newspares; 20352082Seschrock 20362082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 20372082Seschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 20382082Seschrock 20392082Seschrock newspares = kmem_alloc(sizeof (void *) * 20402082Seschrock (nspares + oldnspares), KM_SLEEP); 20412082Seschrock for (i = 0; i < oldnspares; i++) 20422082Seschrock VERIFY(nvlist_dup(oldspares[i], 20432082Seschrock &newspares[i], KM_SLEEP) == 0); 20442082Seschrock for (i = 0; i < nspares; i++) 20452082Seschrock VERIFY(nvlist_dup(spares[i], 20462082Seschrock &newspares[i + oldnspares], 20472082Seschrock KM_SLEEP) == 0); 20482082Seschrock 20492082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 20502082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 20512082Seschrock 20522082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 20532082Seschrock ZPOOL_CONFIG_SPARES, newspares, 20542082Seschrock nspares + oldnspares) == 0); 20552082Seschrock for (i = 0; i < oldnspares + nspares; i++) 20562082Seschrock nvlist_free(newspares[i]); 20572082Seschrock kmem_free(newspares, (oldnspares + nspares) * 20582082Seschrock sizeof (void *)); 20592082Seschrock } else { 20602082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 20612082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 20622082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 20632082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 20642082Seschrock } 20652082Seschrock 20662082Seschrock spa_load_spares(spa); 20672082Seschrock spa->spa_sync_spares = B_TRUE; 2068789Sahrens } 2069789Sahrens 2070789Sahrens /* 20711585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 20721585Sbonwick * If other threads start allocating from these vdevs before we 20731585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 20741585Sbonwick * fail to open the pool because there are DVAs that the config cache 20751585Sbonwick * can't translate. Therefore, we first add the vdevs without 20761585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 20771635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 20781585Sbonwick * 20791585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 20801585Sbonwick * if we lose power at any point in this sequence, the remaining 20811585Sbonwick * steps will be completed the next time we load the pool. 2082789Sahrens */ 20831635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 20841585Sbonwick 20851635Sbonwick mutex_enter(&spa_namespace_lock); 20861635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 20871635Sbonwick mutex_exit(&spa_namespace_lock); 2088789Sahrens 20891635Sbonwick return (0); 2090789Sahrens } 2091789Sahrens 2092789Sahrens /* 2093789Sahrens * Attach a device to a mirror. The arguments are the path to any device 2094789Sahrens * in the mirror, and the nvroot for the new device. 
If the path specifies 2095789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 2096789Sahrens * 2097789Sahrens * If 'replacing' is specified, the new device is intended to replace the 2098789Sahrens * existing device; in this case the two devices are made into their own 20994451Seschrock * mirror using the 'replacing' vdev, which is functionally identical to 2100789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 2101789Sahrens * extra rules: you can't attach to it after it's been created, and upon 2102789Sahrens * completion of resilvering, the first disk (the one being replaced) 2103789Sahrens * is automatically detached. 2104789Sahrens */ 2105789Sahrens int 21061544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2107789Sahrens { 2108789Sahrens uint64_t txg, open_txg; 2109789Sahrens int error; 2110789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2111789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 21122082Seschrock vdev_ops_t *pvops; 21134527Sperrin int is_log; 2114789Sahrens 2115789Sahrens txg = spa_vdev_enter(spa); 2116789Sahrens 21171544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 2118789Sahrens 2119789Sahrens if (oldvd == NULL) 2120789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2121789Sahrens 21221585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 21231585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 21241585Sbonwick 2125789Sahrens pvd = oldvd->vdev_parent; 2126789Sahrens 21272082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 21284451Seschrock VDEV_ALLOC_ADD)) != 0) 21294451Seschrock return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 21304451Seschrock 21314451Seschrock if (newrootvd->vdev_children != 1) 2132789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2133789Sahrens 2134789Sahrens newvd = newrootvd->vdev_child[0]; 2135789Sahrens 2136789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 2137789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2138789Sahrens 21392082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2140789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 2141789Sahrens 21424527Sperrin /* 21434527Sperrin * Spares can't replace logs 21444527Sperrin */ 21454527Sperrin is_log = oldvd->vdev_islog; 21464527Sperrin if (is_log && newvd->vdev_isspare) 21474527Sperrin return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21484527Sperrin 21492082Seschrock if (!replacing) { 21502082Seschrock /* 21512082Seschrock * For attach, the only allowable parent is a mirror or the root 21522082Seschrock * vdev. 21532082Seschrock */ 21542082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 21552082Seschrock pvd->vdev_ops != &vdev_root_ops) 21562082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21572082Seschrock 21582082Seschrock pvops = &vdev_mirror_ops; 21592082Seschrock } else { 21602082Seschrock /* 21612082Seschrock * Active hot spares can only be replaced by inactive hot 21622082Seschrock * spares. 21632082Seschrock */ 21642082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 21652082Seschrock pvd->vdev_child[1] == oldvd && 21662082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 21672082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21682082Seschrock 21692082Seschrock /* 21702082Seschrock * If the source is a hot spare, and the parent isn't already a 21712082Seschrock * spare, then we want to create a new hot spare. 
Otherwise, we 21723377Seschrock * want to create a replacing vdev. The user is not allowed to 21733377Seschrock * attach to a spared vdev child unless the 'isspare' state is 21743377Seschrock * the same (spare replaces spare, non-spare replaces 21753377Seschrock * non-spare). 21762082Seschrock */ 21772082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 21782082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21793377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 21803377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 21813377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21822082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 21832082Seschrock newvd->vdev_isspare) 21842082Seschrock pvops = &vdev_spare_ops; 21852082Seschrock else 21862082Seschrock pvops = &vdev_replacing_ops; 21872082Seschrock } 21882082Seschrock 21891175Slling /* 21901175Slling * Compare the new device size with the replaceable/attachable 21911175Slling * device size. 21921175Slling */ 21931175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2194789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2195789Sahrens 21961732Sbonwick /* 21971732Sbonwick * The new device cannot have a higher alignment requirement 21981732Sbonwick * than the top-level vdev. 21991732Sbonwick */ 22001732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2201789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2202789Sahrens 2203789Sahrens /* 2204789Sahrens * If this is an in-place replacement, update oldvd's path and devid 2205789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 2206789Sahrens */ 2207789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2208789Sahrens spa_strfree(oldvd->vdev_path); 2209789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2210789Sahrens KM_SLEEP); 2211789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 2212789Sahrens newvd->vdev_path, "old"); 2213789Sahrens if (oldvd->vdev_devid != NULL) { 2214789Sahrens spa_strfree(oldvd->vdev_devid); 2215789Sahrens oldvd->vdev_devid = NULL; 2216789Sahrens } 2217789Sahrens } 2218789Sahrens 2219789Sahrens /* 22202082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 22212082Seschrock * mirror/replacing/spare vdev above oldvd. 2222789Sahrens */ 2223789Sahrens if (pvd->vdev_ops != pvops) 2224789Sahrens pvd = vdev_add_parent(oldvd, pvops); 2225789Sahrens 2226789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 2227789Sahrens ASSERT(pvd->vdev_ops == pvops); 2228789Sahrens ASSERT(oldvd->vdev_parent == pvd); 2229789Sahrens 2230789Sahrens /* 2231789Sahrens * Extract the new device from its root and add it to pvd. 2232789Sahrens */ 2233789Sahrens vdev_remove_child(newrootvd, newvd); 2234789Sahrens newvd->vdev_id = pvd->vdev_children; 2235789Sahrens vdev_add_child(pvd, newvd); 2236789Sahrens 22371544Seschrock /* 22381544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 22391544Seschrock * the addition of newvd may have decreased our parent's asize. 22401544Seschrock */ 22411544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 22421544Seschrock 2243789Sahrens tvd = newvd->vdev_top; 2244789Sahrens ASSERT(pvd->vdev_top == tvd); 2245789Sahrens ASSERT(tvd->vdev_parent == rvd); 2246789Sahrens 2247789Sahrens vdev_config_dirty(tvd); 2248789Sahrens 2249789Sahrens /* 2250789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. 
It will propagate 2251789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2252789Sahrens */ 2253789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 2254789Sahrens 2255789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 2256789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2257789Sahrens open_txg - TXG_INITIAL + 1); 2258789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 2259789Sahrens 22603377Seschrock if (newvd->vdev_isspare) 22613377Seschrock spa_spare_activate(newvd); 22621544Seschrock 2263789Sahrens /* 2264789Sahrens * Mark newvd's DTL dirty in this txg. 2265789Sahrens */ 22661732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 2267789Sahrens 2268789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2269789Sahrens 2270789Sahrens /* 22714451Seschrock * Kick off a resilver to update newvd. We need to grab the namespace 22724451Seschrock * lock because spa_scrub() needs to post a sysevent with the pool name. 2273789Sahrens */ 22744451Seschrock mutex_enter(&spa_namespace_lock); 2275789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 22764451Seschrock mutex_exit(&spa_namespace_lock); 2277789Sahrens 2278789Sahrens return (0); 2279789Sahrens } 2280789Sahrens 2281789Sahrens /* 2282789Sahrens * Detach a device from a mirror or replacing vdev. 2283789Sahrens * If 'replace_done' is specified, only detach if the parent 2284789Sahrens * is a replacing vdev. 2285789Sahrens */ 2286789Sahrens int 22871544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2288789Sahrens { 2289789Sahrens uint64_t txg; 2290789Sahrens int c, t, error; 2291789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2292789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 22932082Seschrock boolean_t unspare = B_FALSE; 22942082Seschrock uint64_t unspare_guid; 2295789Sahrens 2296789Sahrens txg = spa_vdev_enter(spa); 2297789Sahrens 22981544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 2299789Sahrens 2300789Sahrens if (vd == NULL) 2301789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2302789Sahrens 23031585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 23041585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 23051585Sbonwick 2306789Sahrens pvd = vd->vdev_parent; 2307789Sahrens 2308789Sahrens /* 2309789Sahrens * If replace_done is specified, only remove this device if it's 23102082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 23112082Seschrock * disk can be removed. 2312789Sahrens */ 23132082Seschrock if (replace_done) { 23142082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 23152082Seschrock if (vd->vdev_id != 0) 23162082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 23172082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 23182082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 23192082Seschrock } 23202082Seschrock } 23212082Seschrock 23222082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 23234577Sahrens spa_version(spa) >= SPA_VERSION_SPARES); 2324789Sahrens 2325789Sahrens /* 23262082Seschrock * Only mirror, replacing, and spare vdevs support detach. 2327789Sahrens */ 2328789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 23292082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 23302082Seschrock pvd->vdev_ops != &vdev_spare_ops) 2331789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2332789Sahrens 2333789Sahrens /* 2334789Sahrens * If there's only one replica, you can't detach it. 
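 *
 * To summarize the detach rules enforced in this function (a reading of
 * the code, not a separate specification):
 *
 *	replacing vdev	with 'replace_done', only the first child (the
 *			original) may go; the latter child may always be
 *			detached, which cancels the operation
 *	spare vdev	either disk may be detached
 *	mirror vdev	any child may go, provided some sibling has an
 *			empty DTL and more than one replica remains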
2335789Sahrens */ 2336789Sahrens if (pvd->vdev_children <= 1) 2337789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2338789Sahrens 2339789Sahrens /* 2340789Sahrens * If all siblings have non-empty DTLs, this device may have the only 2341789Sahrens * valid copy of the data, which means we cannot safely detach it. 2342789Sahrens * 2343789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 2344789Sahrens * precise DTL check. 2345789Sahrens */ 2346789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 2347789Sahrens uint64_t dirty; 2348789Sahrens 2349789Sahrens cvd = pvd->vdev_child[c]; 2350789Sahrens if (cvd == vd) 2351789Sahrens continue; 2352789Sahrens if (vdev_is_dead(cvd)) 2353789Sahrens continue; 2354789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 2355789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 2356789Sahrens cvd->vdev_dtl_scrub.sm_space; 2357789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 2358789Sahrens if (!dirty) 2359789Sahrens break; 2360789Sahrens } 23612082Seschrock 23622082Seschrock /* 23632082Seschrock * If we are a replacing or spare vdev, then we can always detach the 23642082Seschrock * latter child, as that is how one cancels the operation. 23652082Seschrock */ 23662082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 23672082Seschrock c == pvd->vdev_children) 2368789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2369789Sahrens 2370789Sahrens /* 23712082Seschrock * If we are detaching the original disk from a spare, then it implies 23722082Seschrock * that the spare should become a real disk, and be removed from the 23732082Seschrock * active spare list for the pool. 23742082Seschrock */ 23752082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 23762082Seschrock vd->vdev_id == 0) 23772082Seschrock unspare = B_TRUE; 23782082Seschrock 23792082Seschrock /* 2380789Sahrens * Erase the disk labels so the disk can be used for other things. 2381789Sahrens * This must be done after all other error cases are handled, 2382789Sahrens * but before we disembowel vd (so we can still do I/O to it). 2383789Sahrens * But if we can't do it, don't treat the error as fatal -- 2384789Sahrens * it may be that the unwritability of the disk is the reason 2385789Sahrens * it's being detached! 2386789Sahrens */ 23873377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 2388789Sahrens 2389789Sahrens /* 2390789Sahrens * Remove vd from its parent and compact the parent's children. 2391789Sahrens */ 2392789Sahrens vdev_remove_child(pvd, vd); 2393789Sahrens vdev_compact_children(pvd); 2394789Sahrens 2395789Sahrens /* 2396789Sahrens * Remember one of the remaining children so we can get tvd below. 2397789Sahrens */ 2398789Sahrens cvd = pvd->vdev_child[0]; 2399789Sahrens 2400789Sahrens /* 24012082Seschrock * If we need to remove the remaining child from the list of hot spares, 24022082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 24032082Seschrock * must do this before vdev_remove_parent(), because that can change the 24042082Seschrock * GUID if it creates a new toplevel GUID. 24052082Seschrock */ 24062082Seschrock if (unspare) { 24072082Seschrock ASSERT(cvd->vdev_isspare); 24083377Seschrock spa_spare_remove(cvd); 24092082Seschrock unspare_guid = cvd->vdev_guid; 24102082Seschrock } 24112082Seschrock 24122082Seschrock /* 2413789Sahrens * If the parent mirror/replacing vdev only has one child, 2414789Sahrens * the parent is no longer needed. Remove it from the tree. 
2415789Sahrens */ 2416789Sahrens if (pvd->vdev_children == 1) 2417789Sahrens vdev_remove_parent(cvd); 2418789Sahrens 2419789Sahrens /* 2420789Sahrens * We don't set tvd until now because the parent we just removed 2421789Sahrens * may have been the previous top-level vdev. 2422789Sahrens */ 2423789Sahrens tvd = cvd->vdev_top; 2424789Sahrens ASSERT(tvd->vdev_parent == rvd); 2425789Sahrens 2426789Sahrens /* 24273377Seschrock * Reevaluate the parent vdev state. 2428789Sahrens */ 24294451Seschrock vdev_propagate_state(cvd); 2430789Sahrens 2431789Sahrens /* 24323377Seschrock * If the device we just detached was smaller than the others, it may be 24333377Seschrock * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 24343377Seschrock * can't fail because the existing metaslabs are already in core, so 24353377Seschrock * there's nothing to read from disk. 2436789Sahrens */ 24371732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2438789Sahrens 2439789Sahrens vdev_config_dirty(tvd); 2440789Sahrens 2441789Sahrens /* 24423377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 24433377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 24443377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 24453377Seschrock * prevent vd from being accessed after it's freed. 2446789Sahrens */ 2447789Sahrens for (t = 0; t < TXG_SIZE; t++) 2448789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 24491732Sbonwick vd->vdev_detached = B_TRUE; 24501732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 2451789Sahrens 24524451Seschrock spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 24534451Seschrock 24542082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 24552082Seschrock 24562082Seschrock /* 24573377Seschrock * If this was the removal of the original device in a hot spare vdev, 24583377Seschrock * then we want to go through and remove the device from the hot spare 24593377Seschrock * list of every other pool. 24602082Seschrock */ 24612082Seschrock if (unspare) { 24622082Seschrock spa = NULL; 24632082Seschrock mutex_enter(&spa_namespace_lock); 24642082Seschrock while ((spa = spa_next(spa)) != NULL) { 24652082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 24662082Seschrock continue; 24672082Seschrock 24682082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 24692082Seschrock } 24702082Seschrock mutex_exit(&spa_namespace_lock); 24712082Seschrock } 24722082Seschrock 24732082Seschrock return (error); 24742082Seschrock } 24752082Seschrock 24762082Seschrock /* 24772082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 24782082Seschrock * spares. 
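 *
 * The lookups below resolve the request roughly as follows, where 'nv' is
 * the matching entry in the spare list and 'vd' is the in-core vdev, both
 * found by guid (a sketch of the code, not an independent contract):
 *
 *	nv == NULL && vd == NULL		ENOENT (no such device)
 *	nv == NULL && vd != NULL		ENOTSUP (not a hot spare)
 *	nv != NULL && vd != NULL, !unspare	EBUSY (spare is in use)
 *	otherwise				remove nv from the spare list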
24792082Seschrock */ 24802082Seschrock int 24812082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 24822082Seschrock { 24832082Seschrock vdev_t *vd; 24842082Seschrock nvlist_t **spares, *nv, **newspares; 24852082Seschrock uint_t i, j, nspares; 24862082Seschrock int ret = 0; 24872082Seschrock 24882082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 24892082Seschrock 24902082Seschrock vd = spa_lookup_by_guid(spa, guid); 24912082Seschrock 24922082Seschrock nv = NULL; 24932082Seschrock if (spa->spa_spares != NULL && 24942082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 24952082Seschrock &spares, &nspares) == 0) { 24962082Seschrock for (i = 0; i < nspares; i++) { 24972082Seschrock uint64_t theguid; 24982082Seschrock 24992082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 25002082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 25012082Seschrock if (theguid == guid) { 25022082Seschrock nv = spares[i]; 25032082Seschrock break; 25042082Seschrock } 25052082Seschrock } 25062082Seschrock } 25072082Seschrock 25082082Seschrock /* 25092082Seschrock * We only support removing a hot spare, and only if it's not currently 25102082Seschrock * in use in this pool. 25112082Seschrock */ 25122082Seschrock if (nv == NULL && vd == NULL) { 25132082Seschrock ret = ENOENT; 25142082Seschrock goto out; 25152082Seschrock } 25162082Seschrock 25172082Seschrock if (nv == NULL && vd != NULL) { 25182082Seschrock ret = ENOTSUP; 25192082Seschrock goto out; 25202082Seschrock } 25212082Seschrock 25222082Seschrock if (!unspare && nv != NULL && vd != NULL) { 25232082Seschrock ret = EBUSY; 25242082Seschrock goto out; 25252082Seschrock } 25262082Seschrock 25272082Seschrock if (nspares == 1) { 25282082Seschrock newspares = NULL; 25292082Seschrock } else { 25302082Seschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 25312082Seschrock KM_SLEEP); 25322082Seschrock for (i = 0, j = 0; i < nspares; i++) { 25332082Seschrock if (spares[i] != nv) 25342082Seschrock VERIFY(nvlist_dup(spares[i], 25352082Seschrock &newspares[j++], KM_SLEEP) == 0); 25362082Seschrock } 25372082Seschrock } 25382082Seschrock 25392082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 25402082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 25412082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 25422082Seschrock newspares, nspares - 1) == 0); 25432082Seschrock for (i = 0; i < nspares - 1; i++) 25442082Seschrock nvlist_free(newspares[i]); 25452082Seschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 25462082Seschrock spa_load_spares(spa); 25472082Seschrock spa->spa_sync_spares = B_TRUE; 25482082Seschrock 25492082Seschrock out: 25502082Seschrock spa_config_exit(spa, FTAG); 25512082Seschrock 25522082Seschrock return (ret); 2553789Sahrens } 2554789Sahrens 2555789Sahrens /* 25564451Seschrock * Find any device that's done replacing, or a vdev marked 'unspare' that's 25574451Seschrock * currently spared, so we can detach it. 2558789Sahrens */ 25591544Seschrock static vdev_t * 25604451Seschrock spa_vdev_resilver_done_hunt(vdev_t *vd) 2561789Sahrens { 25621544Seschrock vdev_t *newvd, *oldvd; 2563789Sahrens int c; 2564789Sahrens 25651544Seschrock for (c = 0; c < vd->vdev_children; c++) { 25664451Seschrock oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 25671544Seschrock if (oldvd != NULL) 25681544Seschrock return (oldvd); 25691544Seschrock } 2570789Sahrens 25714451Seschrock /* 25724451Seschrock * Check for a completed replacement.
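 *
 * In sketch form:
 *
 *	    replacing
 *	    /       \
 *	oldvd      newvd	(oldvd is returned for detach once newvd
 *				has fully resilvered, i.e. its DTLs are
 *				empty)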
25734451Seschrock */ 2574789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 25751544Seschrock oldvd = vd->vdev_child[0]; 25761544Seschrock newvd = vd->vdev_child[1]; 2577789Sahrens 25781544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 25791544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 25801544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 25811544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 25821544Seschrock return (oldvd); 25831544Seschrock } 25841544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 25851544Seschrock } 2586789Sahrens 25874451Seschrock /* 25884451Seschrock * Check for a completed resilver with the 'unspare' flag set. 25894451Seschrock */ 25904451Seschrock if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 25914451Seschrock newvd = vd->vdev_child[0]; 25924451Seschrock oldvd = vd->vdev_child[1]; 25934451Seschrock 25944451Seschrock mutex_enter(&newvd->vdev_dtl_lock); 25954451Seschrock if (newvd->vdev_unspare && 25964451Seschrock newvd->vdev_dtl_map.sm_space == 0 && 25974451Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 25984451Seschrock newvd->vdev_unspare = 0; 25994451Seschrock mutex_exit(&newvd->vdev_dtl_lock); 26004451Seschrock return (oldvd); 26014451Seschrock } 26024451Seschrock mutex_exit(&newvd->vdev_dtl_lock); 26034451Seschrock } 26044451Seschrock 26051544Seschrock return (NULL); 2606789Sahrens } 2607789Sahrens 26081544Seschrock static void 26094451Seschrock spa_vdev_resilver_done(spa_t *spa) 2610789Sahrens { 26111544Seschrock vdev_t *vd; 26122082Seschrock vdev_t *pvd; 26131544Seschrock uint64_t guid; 26142082Seschrock uint64_t pguid = 0; 2615789Sahrens 26161544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2617789Sahrens 26184451Seschrock while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 26191544Seschrock guid = vd->vdev_guid; 26202082Seschrock /* 26212082Seschrock * If we have just finished replacing a hot spared device, then 26222082Seschrock * we need to detach the parent's first child (the original hot 26232082Seschrock * spare) as well. 26242082Seschrock */ 26252082Seschrock pvd = vd->vdev_parent; 26262082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 26272082Seschrock pvd->vdev_id == 0) { 26282082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 26292082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 26302082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 26312082Seschrock } 26321544Seschrock spa_config_exit(spa, FTAG); 26331544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 26341544Seschrock return; 26352082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 26362082Seschrock return; 26371544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2638789Sahrens } 2639789Sahrens 26401544Seschrock spa_config_exit(spa, FTAG); 2641789Sahrens } 2642789Sahrens 2643789Sahrens /* 26441354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 26451354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 
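 *
 * Usage, in sketch form (the device path shown is hypothetical):
 *
 *	error = spa_vdev_setpath(spa, guid, "/dev/dsk/c3t0d0s0");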
26461354Seschrock */ 26471354Seschrock int 26481354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 26491354Seschrock { 26501354Seschrock vdev_t *rvd, *vd; 26511354Seschrock uint64_t txg; 26521354Seschrock 26531354Seschrock rvd = spa->spa_root_vdev; 26541354Seschrock 26551354Seschrock txg = spa_vdev_enter(spa); 26561354Seschrock 26572082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 26582082Seschrock /* 26592082Seschrock * Determine if this is a reference to a hot spare. In that 26602082Seschrock * case, update the path as stored in the spare list. 26612082Seschrock */ 26622082Seschrock nvlist_t **spares; 26632082Seschrock uint_t i, nspares; 26642082Seschrock if (spa->spa_sparelist != NULL) { 26652082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 26662082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 26672082Seschrock for (i = 0; i < nspares; i++) { 26682082Seschrock uint64_t theguid; 26692082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 26702082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 26712082Seschrock if (theguid == guid) 26722082Seschrock break; 26732082Seschrock } 26742082Seschrock 26752082Seschrock if (i == nspares) 26762082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 26772082Seschrock 26782082Seschrock VERIFY(nvlist_add_string(spares[i], 26792082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 26802082Seschrock spa_load_spares(spa); 26812082Seschrock spa->spa_sync_spares = B_TRUE; 26822082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 26832082Seschrock } else { 26842082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 26852082Seschrock } 26862082Seschrock } 26871354Seschrock 26881585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 26891585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 26901585Sbonwick 26911354Seschrock spa_strfree(vd->vdev_path); 26921354Seschrock vd->vdev_path = spa_strdup(newpath); 26931354Seschrock 26941354Seschrock vdev_config_dirty(vd->vdev_top); 26951354Seschrock 26961354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 26971354Seschrock } 26981354Seschrock 26991354Seschrock /* 2700789Sahrens * ========================================================================== 2701789Sahrens * SPA Scrubbing 2702789Sahrens * ========================================================================== 2703789Sahrens */ 2704789Sahrens 2705789Sahrens static void 2706789Sahrens spa_scrub_io_done(zio_t *zio) 2707789Sahrens { 2708789Sahrens spa_t *spa = zio->io_spa; 2709789Sahrens 27104309Smaybee arc_data_buf_free(zio->io_data, zio->io_size); 2711789Sahrens 2712789Sahrens mutex_enter(&spa->spa_scrub_lock); 27131544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 27141775Sbillm vdev_t *vd = zio->io_vd ? 
zio->io_vd : spa->spa_root_vdev; 2715789Sahrens spa->spa_scrub_errors++; 2716789Sahrens mutex_enter(&vd->vdev_stat_lock); 2717789Sahrens vd->vdev_stat.vs_scrub_errors++; 2718789Sahrens mutex_exit(&vd->vdev_stat_lock); 2719789Sahrens } 27203697Smishra 27213697Smishra if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 27221544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 27233697Smishra 27243697Smishra ASSERT(spa->spa_scrub_inflight >= 0); 27253697Smishra 27261544Seschrock mutex_exit(&spa->spa_scrub_lock); 2727789Sahrens } 2728789Sahrens 2729789Sahrens static void 27301544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 27311544Seschrock zbookmark_t *zb) 2732789Sahrens { 2733789Sahrens size_t size = BP_GET_LSIZE(bp); 27343697Smishra void *data; 2735789Sahrens 2736789Sahrens mutex_enter(&spa->spa_scrub_lock); 27373697Smishra /* 27383697Smishra * Do not give too much work to vdev(s). 27393697Smishra */ 27403697Smishra while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 27413697Smishra cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 27423697Smishra } 2743789Sahrens spa->spa_scrub_inflight++; 2744789Sahrens mutex_exit(&spa->spa_scrub_lock); 2745789Sahrens 27464309Smaybee data = arc_data_buf_alloc(size); 27473697Smishra 27481544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 27491544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 27501544Seschrock 27511807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 27521544Seschrock 2753789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 27541544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2755789Sahrens } 2756789Sahrens 2757789Sahrens /* ARGSUSED */ 2758789Sahrens static int 2759789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2760789Sahrens { 2761789Sahrens blkptr_t *bp = &bc->bc_blkptr; 27621775Sbillm vdev_t *vd = spa->spa_root_vdev; 27631775Sbillm dva_t *dva = bp->blk_dva; 27641775Sbillm int needs_resilver = B_FALSE; 27651775Sbillm int d; 2766789Sahrens 27671775Sbillm if (bc->bc_errno) { 2768789Sahrens /* 2769789Sahrens * We can't scrub this block, but we can continue to scrub 2770789Sahrens * the rest of the pool. Note the error and move along. 2771789Sahrens */ 2772789Sahrens mutex_enter(&spa->spa_scrub_lock); 2773789Sahrens spa->spa_scrub_errors++; 2774789Sahrens mutex_exit(&spa->spa_scrub_lock); 2775789Sahrens 27761775Sbillm mutex_enter(&vd->vdev_stat_lock); 27771775Sbillm vd->vdev_stat.vs_scrub_errors++; 27781775Sbillm mutex_exit(&vd->vdev_stat_lock); 2779789Sahrens 2780789Sahrens return (ERESTART); 2781789Sahrens } 2782789Sahrens 2783789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2784789Sahrens 27851775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 27861775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 27871775Sbillm 27881775Sbillm ASSERT(vd != NULL); 27891775Sbillm 27901775Sbillm /* 27911775Sbillm * Keep track of how much data we've examined so that 27921775Sbillm * zpool(1M) status can make useful progress reports. 
/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = spa->spa_root_vdev;
	dva_t *dva = bp->blk_dva;
	int needs_resilver = B_FALSE;
	int d;

	if (bc->bc_errno) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));

		ASSERT(vd != NULL);

		/*
		 * Keep track of how much data we've examined so that
		 * zpool(1M) status can make useful progress reports.
		 */
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
		mutex_exit(&vd->vdev_stat_lock);

		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
			if (DVA_GET_GANG(&dva[d])) {
				/*
				 * Gang members may be spread across multiple
				 * vdevs, so the best we can do is look at the
				 * pool-wide DTL.
				 * XXX -- it would be better to change our
				 * allocation policy to ensure that this can't
				 * happen.
				 */
				vd = spa->spa_root_vdev;
			}
			if (vdev_dtl_contains(&vd->vdev_dtl_map,
			    bp->blk_birth, 1))
				needs_resilver = B_TRUE;
		}
	}

	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
	else if (needs_resilver)
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);

	return (0);
}
static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	if (scrub_type == POOL_SCRUB_RESILVER && complete)
		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspended++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspended != 0);
	if (--spa->spa_scrub_suspended == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}
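/*
 * Start a scrub or resilver, or cancel one with POOL_SCRUB_NONE.
 * A scrub of a pool with a non-empty DTL is upgraded to a resilver,
 * and an in-progress resilver is only stopped when 'force' is set.
 * The caller must hold spa_namespace_lock.
 */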
int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(spa, RW_WRITER));

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
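		/*
		 * Illustrative example: if the only DTL segment covers
		 * txgs [100, 200), we get mintxg = 99 and
		 * maxtxg = MIN(200, spa_last_synced_txg(spa) + 1), so
		 * every txg in the segment falls inside the open interval.
		 */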
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);

		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */
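/*
 * Recursively walk the vdev tree, transitioning any vdev flagged
 * vdev_remove_wanted to the REMOVED state, clearing its error state
 * and dirtying its top-level vdev's config.
 */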
static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	vdev_t *tvd;
	int c;

	for (c = 0; c < vd->vdev_children; c++) {
		tvd = vd->vdev_child[c];
		if (tvd->vdev_remove_wanted) {
			tvd->vdev_remove_wanted = 0;
			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
			    VDEV_AUX_NONE);
			vdev_clear(spa, tvd, B_TRUE);
			vdev_config_dirty(tvd->vdev_top);
		}
		spa_async_remove(spa, tvd);
	}
}

static void
spa_async_thread(spa_t *spa)
{
	int tasks;
	uint64_t txg;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks = 0;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		mutex_enter(&spa_namespace_lock);
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * See if any devices need to be marked REMOVED.
	 *
	 * XXX - We avoid doing this when we are in I/O failure state,
	 * since spa_vdev_enter() grabs the namespace lock and would not
	 * be able to obtain the writer config lock.
	 */
	if (tasks & SPA_ASYNC_REMOVE &&
	    spa_state(spa) != POOL_STATE_IO_FAILURE) {
		txg = spa_vdev_enter(spa);
		spa_async_remove(spa, spa->spa_root_vdev);
		(void) spa_vdev_exit(spa, NULL, txg, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
	 * scrub, which can become a resilver), we need to hold
	 * spa_namespace_lock because the sysevent we post via
	 * spa_event_notify() needs to get the name of the pool.
	 */
	if (tasks & SPA_ASYNC_SCRUB) {
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER) {
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
		mutex_exit(&spa_namespace_lock);
	}

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL && !vn_is_readonly(rootdir))
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
}
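/*
 * Illustrative usage of the request/dispatch pattern: a context that
 * notices deferred work just posts a task bit, e.g.
 *
 *	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 *
 * and the task runs the next time spa_sync() calls spa_async_dispatch().
 */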
/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}
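/*
 * Pack an nvlist and write it to the given packed-nvlist object,
 * recording the packed size in the object's bonus buffer so the
 * reader knows how many bytes to unpack at load time.
 */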
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);

	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);

	kmem_free(packed, nvsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

static void
spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *nvroot;
	nvlist_t **spares;
	int i;

	if (!spa->spa_sync_spares)
		return;

	/*
	 * Update the MOS nvlist describing the list of available spares.
	 * spa_validate_spares() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (spa->spa_spares_object == 0) {
		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14,
		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
	}

	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	if (spa->spa_nspares == 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    NULL, 0) == 0);
	} else {
		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
		    KM_SLEEP);
		for (i = 0; i < spa->spa_nspares; i++)
			spares[i] = vdev_config_generate(spa,
			    spa->spa_spares[i], B_FALSE, B_TRUE);
		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    spares, spa->spa_nspares) == 0);
		for (i = 0; i < spa->spa_nspares; i++)
			nvlist_free(spares[i]);
		kmem_free(spares, spa->spa_nspares * sizeof (void *));
	}

	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
	nvlist_free(nvroot);

	spa->spa_sync_spares = B_FALSE;
}
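/*
 * If the pool configuration is dirty, write it out as a packed nvlist
 * and stash it in spa_config_syncing so it can be made visible to the
 * config cache once the txg commits.
 */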
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	if (spa->spa_config_syncing)
		nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Set zpool properties.
 */
static void
spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	spa_t *spa = arg1;
	objset_t *mos = spa->spa_meta_objset;
	nvlist_t *nvp = arg2;
	nvpair_t *elem;
	uint64_t intval;
	char *strval;
	zpool_prop_t prop;
	const char *propname;
	zprop_type_t proptype;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
		case ZPOOL_PROP_VERSION:
			/*
			 * Only set version for non-zpool-creation cases
			 * (set/import).  spa_create() needs special care
			 * for version setting.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				VERIFY(nvpair_value_uint64(elem,
				    &intval) == 0);
				ASSERT(intval <= SPA_VERSION);
				ASSERT(intval >= spa_version(spa));
				spa->spa_uberblock.ub_version = intval;
				vdev_config_dirty(spa->spa_root_vdev);
			}
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property.  It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_TEMPORARY:
			/*
			 * 'temporary' is a non-persistent property.
			 */
			VERIFY(nvpair_value_uint64(elem, &intval) == 0);
			spa->spa_temporary = intval;
			break;

		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			mutex_enter(&spa->spa_props_lock);
			if (spa->spa_pool_props_object == 0) {
				VERIFY((spa->spa_pool_props_object =
				    zap_create(mos, DMU_OT_POOL_PROPS,
				    DMU_OT_NONE, 0, tx)) > 0);

				VERIFY(zap_update(mos,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    8, 1, &spa->spa_pool_props_object, tx)
				    == 0);
			}
			mutex_exit(&spa->spa_props_lock);

			/* normalize the property name */
			propname = zpool_prop_to_name(prop);
			proptype = zpool_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				VERIFY(nvpair_value_string(elem, &strval) == 0);
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    1, strlen(strval) + 1, strval, tx) == 0);

			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				VERIFY(nvpair_value_uint64(elem, &intval) == 0);

				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY(zpool_prop_index_to_string(
					    prop, intval, &unused) == 0);
				}
				VERIFY(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx) == 0);
			} else {
				ASSERT(0); /* not allowed */
			}

			switch (prop) {
			case ZPOOL_PROP_DELEGATION:
				spa->spa_delegation = intval;
				break;
			case ZPOOL_PROP_BOOTFS:
				spa->spa_bootfs = intval;
				break;
			case ZPOOL_PROP_FAILUREMODE:
				spa->spa_failmode = intval;
				break;
			default:
				break;
			}
		}

		/* log internal history if this is not a zpool create */
		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
		    tx->tx_txg != TXG_INITIAL) {
			spa_history_internal_log(LOG_POOL_PROPSET,
			    spa, tx, cr, "%s %lld %s",
			    nvpair_name(elem), intval, spa->spa_name);
		}
	}
}
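/*
 * spa_sync_props() runs in syncing context as a synctask.  Illustratively
 * (the exact call site may differ), a property-set request is driven by
 * something like
 *
 *	dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 *	    spa, nvp, 3);
 *
 * so all of the ZAP updates above are covered by the synctask's tx.
 */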
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_spares(spa, tx);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
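/*
 * spa_sync() is normally driven by the pool's txg sync thread; other
 * threads wait on it indirectly, e.g. txg_wait_synced(spa_get_dsl(spa),
 * txg) blocks until the given txg has been pushed to stable storage.
 */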
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
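/*
 * Convenience wrapper: find the vdev with the given guid anywhere in
 * the pool's vdev tree, or NULL if no such vdev exists.
 */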
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid)
{
	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, RW_WRITER, FTAG);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;

	for (i = 0; i < spa->spa_nspares; i++)
		if (spa->spa_spares[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < spa->spa_pending_nspares; i++) {
		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
		    spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t		*ev;
	sysevent_attr_list_t	*attr = NULL;
	sysevent_value_t	value;
	sysevent_id_t		eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}