1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51544Seschrock * Common Development and Distribution License (the "License"). 61544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 233377Seschrock * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens /* 30789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32789Sahrens * pool. 33789Sahrens */ 34789Sahrens 35789Sahrens #include <sys/zfs_context.h> 361544Seschrock #include <sys/fm/fs/zfs.h> 37789Sahrens #include <sys/spa_impl.h> 38789Sahrens #include <sys/zio.h> 39789Sahrens #include <sys/zio_checksum.h> 40789Sahrens #include <sys/zio_compress.h> 41789Sahrens #include <sys/dmu.h> 42789Sahrens #include <sys/dmu_tx.h> 43789Sahrens #include <sys/zap.h> 44789Sahrens #include <sys/zil.h> 45789Sahrens #include <sys/vdev_impl.h> 46789Sahrens #include <sys/metaslab.h> 47789Sahrens #include <sys/uberblock_impl.h> 48789Sahrens #include <sys/txg.h> 49789Sahrens #include <sys/avl.h> 50789Sahrens #include <sys/dmu_traverse.h> 513912Slling #include <sys/dmu_objset.h> 52789Sahrens #include <sys/unique.h> 53789Sahrens #include <sys/dsl_pool.h> 543912Slling #include <sys/dsl_dataset.h> 55789Sahrens #include <sys/dsl_dir.h> 56789Sahrens #include <sys/dsl_prop.h> 573912Slling #include <sys/dsl_synctask.h> 58789Sahrens #include <sys/fs/zfs.h> 59789Sahrens #include <sys/callb.h> 603975Sek110237 #include <sys/systeminfo.h> 613975Sek110237 #include <sys/sunddi.h> 62789Sahrens 635094Slling #include "zfs_prop.h" 645094Slling 652986Sek110237 int zio_taskq_threads = 8; 662986Sek110237 675094Slling static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); 685094Slling 695094Slling /* 705094Slling * ========================================================================== 715094Slling * SPA properties routines 725094Slling * ========================================================================== 735094Slling */ 745094Slling 755094Slling /* 765094Slling * Add a (source=src, propname=propval) list to an nvlist. 775094Slling */ 785094Slling static int 795094Slling spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 805094Slling uint64_t intval, zprop_source_t src) 815094Slling { 825094Slling const char *propname = zpool_prop_to_name(prop); 835094Slling nvlist_t *propval; 845094Slling int err = 0; 855094Slling 865094Slling if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)) 875094Slling return (err); 885094Slling 895094Slling if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src)) 905094Slling goto out; 915094Slling 925094Slling if (strval != NULL) { 935094Slling if (err = nvlist_add_string(propval, ZPROP_VALUE, strval)) 945094Slling goto out; 955094Slling } else { 965094Slling if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval)) 975094Slling goto out; 985094Slling } 995094Slling 1005094Slling err = nvlist_add_nvlist(nvl, propname, propval); 1015094Slling out: 1025094Slling nvlist_free(propval); 1035094Slling return (err); 1045094Slling } 1055094Slling 1065094Slling /* 1075094Slling * Get property values from the spa configuration. 1085094Slling */ 1095094Slling static int 1105094Slling spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 1115094Slling { 1125094Slling uint64_t size = spa_get_space(spa); 1135094Slling uint64_t used = spa_get_alloc(spa); 1145094Slling uint64_t cap, version; 1155094Slling zprop_source_t src = ZPROP_SRC_NONE; 1165094Slling int err; 117*5363Seschrock char *cachefile; 118*5363Seschrock size_t len; 1195094Slling 1205094Slling /* 1215094Slling * readonly properties 1225094Slling */ 1235094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name, 1245094Slling 0, src)) 1255094Slling return (err); 1265094Slling 1275094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src)) 1285094Slling return (err); 1295094Slling 1305094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src)) 1315094Slling return (err); 1325094Slling 1335094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, 1345094Slling size - used, src)) 1355094Slling return (err); 1365094Slling 1375094Slling cap = (size == 0) ? 0 : (used * 100 / size); 1385094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src)) 1395094Slling return (err); 1405094Slling 1415094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, 1425094Slling spa_guid(spa), src)) 1435094Slling return (err); 1445094Slling 1455094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 1465094Slling spa->spa_root_vdev->vdev_state, src)) 1475094Slling return (err); 1485094Slling 1495094Slling /* 1505094Slling * settable properties that are not stored in the pool property object. 1515094Slling */ 1525094Slling version = spa_version(spa); 1535094Slling if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 1545094Slling src = ZPROP_SRC_DEFAULT; 1555094Slling else 1565094Slling src = ZPROP_SRC_LOCAL; 1575094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 1585094Slling version, src)) 1595094Slling return (err); 1605094Slling 1615094Slling if (spa->spa_root != NULL) { 1625094Slling src = ZPROP_SRC_LOCAL; 1635094Slling if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, 1645094Slling spa->spa_root, 0, src)) 1655094Slling return (err); 1665094Slling } 1675094Slling 168*5363Seschrock if (spa->spa_config_dir != NULL) { 169*5363Seschrock if (strcmp(spa->spa_config_dir, "none") == 0) { 170*5363Seschrock err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 171*5363Seschrock spa->spa_config_dir, 0, ZPROP_SRC_LOCAL); 172*5363Seschrock } else { 173*5363Seschrock len = strlen(spa->spa_config_dir) + 174*5363Seschrock strlen(spa->spa_config_file) + 2; 175*5363Seschrock cachefile = kmem_alloc(len, KM_SLEEP); 176*5363Seschrock (void) snprintf(cachefile, len, "%s/%s", 177*5363Seschrock spa->spa_config_dir, spa->spa_config_file); 178*5363Seschrock err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 179*5363Seschrock cachefile, 0, ZPROP_SRC_LOCAL); 180*5363Seschrock kmem_free(cachefile, len); 181*5363Seschrock } 182*5363Seschrock 183*5363Seschrock if (err) 184*5363Seschrock return (err); 185*5363Seschrock } 1865094Slling 1875094Slling return (0); 1885094Slling } 1895094Slling 1905094Slling /* 1915094Slling * Get zpool property values. 1925094Slling */ 1935094Slling int 1945094Slling spa_prop_get(spa_t *spa, nvlist_t **nvp) 1955094Slling { 1965094Slling zap_cursor_t zc; 1975094Slling zap_attribute_t za; 1985094Slling objset_t *mos = spa->spa_meta_objset; 1995094Slling int err; 2005094Slling 2015094Slling if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)) 2025094Slling return (err); 2035094Slling 2045094Slling /* 2055094Slling * Get properties from the spa config. 2065094Slling */ 2075094Slling if (err = spa_prop_get_config(spa, nvp)) 2085094Slling goto out; 2095094Slling 2105094Slling mutex_enter(&spa->spa_props_lock); 2115094Slling /* If no pool property object, no more prop to get. */ 2125094Slling if (spa->spa_pool_props_object == 0) { 2135094Slling mutex_exit(&spa->spa_props_lock); 2145094Slling return (0); 2155094Slling } 2165094Slling 2175094Slling /* 2185094Slling * Get properties from the MOS pool property object. 2195094Slling */ 2205094Slling for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 2215094Slling (err = zap_cursor_retrieve(&zc, &za)) == 0; 2225094Slling zap_cursor_advance(&zc)) { 2235094Slling uint64_t intval = 0; 2245094Slling char *strval = NULL; 2255094Slling zprop_source_t src = ZPROP_SRC_DEFAULT; 2265094Slling zpool_prop_t prop; 2275094Slling 2285094Slling if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 2295094Slling continue; 2305094Slling 2315094Slling switch (za.za_integer_length) { 2325094Slling case 8: 2335094Slling /* integer property */ 2345094Slling if (za.za_first_integer != 2355094Slling zpool_prop_default_numeric(prop)) 2365094Slling src = ZPROP_SRC_LOCAL; 2375094Slling 2385094Slling if (prop == ZPOOL_PROP_BOOTFS) { 2395094Slling dsl_pool_t *dp; 2405094Slling dsl_dataset_t *ds = NULL; 2415094Slling 2425094Slling dp = spa_get_dsl(spa); 2435094Slling rw_enter(&dp->dp_config_rwlock, RW_READER); 2445094Slling if (err = dsl_dataset_open_obj(dp, 2455094Slling za.za_first_integer, NULL, DS_MODE_NONE, 2465094Slling FTAG, &ds)) { 2475094Slling rw_exit(&dp->dp_config_rwlock); 2485094Slling break; 2495094Slling } 2505094Slling 2515094Slling strval = kmem_alloc( 2525094Slling MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 2535094Slling KM_SLEEP); 2545094Slling dsl_dataset_name(ds, strval); 2555094Slling dsl_dataset_close(ds, DS_MODE_NONE, FTAG); 2565094Slling rw_exit(&dp->dp_config_rwlock); 2575094Slling } else { 2585094Slling strval = NULL; 2595094Slling intval = za.za_first_integer; 2605094Slling } 2615094Slling 2625094Slling err = spa_prop_add_list(*nvp, prop, strval, 2635094Slling intval, src); 2645094Slling 2655094Slling if (strval != NULL) 2665094Slling kmem_free(strval, 2675094Slling MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 2685094Slling 2695094Slling break; 2705094Slling 2715094Slling case 1: 2725094Slling /* string property */ 2735094Slling strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 2745094Slling err = zap_lookup(mos, spa->spa_pool_props_object, 2755094Slling za.za_name, 1, za.za_num_integers, strval); 2765094Slling if (err) { 2775094Slling kmem_free(strval, za.za_num_integers); 2785094Slling break; 2795094Slling } 2805094Slling err = spa_prop_add_list(*nvp, prop, strval, 0, src); 2815094Slling kmem_free(strval, za.za_num_integers); 2825094Slling break; 2835094Slling 2845094Slling default: 2855094Slling break; 2865094Slling } 2875094Slling } 2885094Slling zap_cursor_fini(&zc); 2895094Slling mutex_exit(&spa->spa_props_lock); 2905094Slling out: 2915094Slling if (err && err != ENOENT) { 2925094Slling nvlist_free(*nvp); 2935094Slling return (err); 2945094Slling } 2955094Slling 2965094Slling return (0); 2975094Slling } 2985094Slling 2995094Slling /* 3005094Slling * Validate the given pool properties nvlist and modify the list 3015094Slling * for the property values to be set. 3025094Slling */ 3035094Slling static int 3045094Slling spa_prop_validate(spa_t *spa, nvlist_t *props) 3055094Slling { 3065094Slling nvpair_t *elem; 3075094Slling int error = 0, reset_bootfs = 0; 3085094Slling uint64_t objnum; 3095094Slling 3105094Slling elem = NULL; 3115094Slling while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 3125094Slling zpool_prop_t prop; 3135094Slling char *propname, *strval; 3145094Slling uint64_t intval; 3155094Slling vdev_t *rvdev; 3165094Slling char *vdev_type; 3175094Slling objset_t *os; 318*5363Seschrock char *slash; 3195094Slling 3205094Slling propname = nvpair_name(elem); 3215094Slling 3225094Slling if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 3235094Slling return (EINVAL); 3245094Slling 3255094Slling switch (prop) { 3265094Slling case ZPOOL_PROP_VERSION: 3275094Slling error = nvpair_value_uint64(elem, &intval); 3285094Slling if (!error && 3295094Slling (intval < spa_version(spa) || intval > SPA_VERSION)) 3305094Slling error = EINVAL; 3315094Slling break; 3325094Slling 3335094Slling case ZPOOL_PROP_DELEGATION: 3345094Slling case ZPOOL_PROP_AUTOREPLACE: 3355094Slling error = nvpair_value_uint64(elem, &intval); 3365094Slling if (!error && intval > 1) 3375094Slling error = EINVAL; 3385094Slling break; 3395094Slling 3405094Slling case ZPOOL_PROP_BOOTFS: 3415094Slling if (spa_version(spa) < SPA_VERSION_BOOTFS) { 3425094Slling error = ENOTSUP; 3435094Slling break; 3445094Slling } 3455094Slling 3465094Slling /* 3475094Slling * A bootable filesystem can not be on a RAIDZ pool 3485094Slling * nor a striped pool with more than 1 device. 3495094Slling */ 3505094Slling rvdev = spa->spa_root_vdev; 3515094Slling vdev_type = 3525094Slling rvdev->vdev_child[0]->vdev_ops->vdev_op_type; 3535094Slling if (rvdev->vdev_children > 1 || 3545094Slling strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3555094Slling strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3565094Slling error = ENOTSUP; 3575094Slling break; 3585094Slling } 3595094Slling 3605094Slling reset_bootfs = 1; 3615094Slling 3625094Slling error = nvpair_value_string(elem, &strval); 3635094Slling 3645094Slling if (!error) { 3655094Slling if (strval == NULL || strval[0] == '\0') { 3665094Slling objnum = zpool_prop_default_numeric( 3675094Slling ZPOOL_PROP_BOOTFS); 3685094Slling break; 3695094Slling } 3705094Slling 3715094Slling if (error = dmu_objset_open(strval, DMU_OST_ZFS, 3725094Slling DS_MODE_STANDARD | DS_MODE_READONLY, &os)) 3735094Slling break; 3745094Slling objnum = dmu_objset_id(os); 3755094Slling dmu_objset_close(os); 3765094Slling } 3775094Slling break; 3785329Sgw25295 case ZPOOL_PROP_FAILUREMODE: 3795329Sgw25295 error = nvpair_value_uint64(elem, &intval); 3805329Sgw25295 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 3815329Sgw25295 intval > ZIO_FAILURE_MODE_PANIC)) 3825329Sgw25295 error = EINVAL; 3835329Sgw25295 3845329Sgw25295 /* 3855329Sgw25295 * This is a special case which only occurs when 3865329Sgw25295 * the pool has completely failed. This allows 3875329Sgw25295 * the user to change the in-core failmode property 3885329Sgw25295 * without syncing it out to disk (I/Os might 3895329Sgw25295 * currently be blocked). We do this by returning 3905329Sgw25295 * EIO to the caller (spa_prop_set) to trick it 3915329Sgw25295 * into thinking we encountered a property validation 3925329Sgw25295 * error. 3935329Sgw25295 */ 3945329Sgw25295 if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) { 3955329Sgw25295 spa->spa_failmode = intval; 3965329Sgw25295 error = EIO; 3975329Sgw25295 } 3985329Sgw25295 break; 399*5363Seschrock 400*5363Seschrock case ZPOOL_PROP_CACHEFILE: 401*5363Seschrock if ((error = nvpair_value_string(elem, &strval)) != 0) 402*5363Seschrock break; 403*5363Seschrock 404*5363Seschrock if (strval[0] == '\0') 405*5363Seschrock break; 406*5363Seschrock 407*5363Seschrock if (strcmp(strval, "none") == 0) 408*5363Seschrock break; 409*5363Seschrock 410*5363Seschrock if (strval[0] != '/') { 411*5363Seschrock error = EINVAL; 412*5363Seschrock break; 413*5363Seschrock } 414*5363Seschrock 415*5363Seschrock slash = strrchr(strval, '/'); 416*5363Seschrock ASSERT(slash != NULL); 417*5363Seschrock 418*5363Seschrock if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 419*5363Seschrock strcmp(slash, "/..") == 0) 420*5363Seschrock error = EINVAL; 421*5363Seschrock break; 4225094Slling } 4235094Slling 4245094Slling if (error) 4255094Slling break; 4265094Slling } 4275094Slling 4285094Slling if (!error && reset_bootfs) { 4295094Slling error = nvlist_remove(props, 4305094Slling zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 4315094Slling 4325094Slling if (!error) { 4335094Slling error = nvlist_add_uint64(props, 4345094Slling zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 4355094Slling } 4365094Slling } 4375094Slling 4385094Slling return (error); 4395094Slling } 4405094Slling 4415094Slling int 4425094Slling spa_prop_set(spa_t *spa, nvlist_t *nvp) 4435094Slling { 4445094Slling int error; 4455094Slling 4465094Slling if ((error = spa_prop_validate(spa, nvp)) != 0) 4475094Slling return (error); 4485094Slling 4495094Slling return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 4505094Slling spa, nvp, 3)); 4515094Slling } 4525094Slling 4535094Slling /* 4545094Slling * If the bootfs property value is dsobj, clear it. 4555094Slling */ 4565094Slling void 4575094Slling spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 4585094Slling { 4595094Slling if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 4605094Slling VERIFY(zap_remove(spa->spa_meta_objset, 4615094Slling spa->spa_pool_props_object, 4625094Slling zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 4635094Slling spa->spa_bootfs = 0; 4645094Slling } 4655094Slling } 4665094Slling 467789Sahrens /* 468789Sahrens * ========================================================================== 469789Sahrens * SPA state manipulation (open/create/destroy/import/export) 470789Sahrens * ========================================================================== 471789Sahrens */ 472789Sahrens 4731544Seschrock static int 4741544Seschrock spa_error_entry_compare(const void *a, const void *b) 4751544Seschrock { 4761544Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 4771544Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 4781544Seschrock int ret; 4791544Seschrock 4801544Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 4811544Seschrock sizeof (zbookmark_t)); 4821544Seschrock 4831544Seschrock if (ret < 0) 4841544Seschrock return (-1); 4851544Seschrock else if (ret > 0) 4861544Seschrock return (1); 4871544Seschrock else 4881544Seschrock return (0); 4891544Seschrock } 4901544Seschrock 4911544Seschrock /* 4921544Seschrock * Utility function which retrieves copies of the current logs and 4931544Seschrock * re-initializes them in the process. 4941544Seschrock */ 4951544Seschrock void 4961544Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 4971544Seschrock { 4981544Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 4991544Seschrock 5001544Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 5011544Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 5021544Seschrock 5031544Seschrock avl_create(&spa->spa_errlist_scrub, 5041544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 5051544Seschrock offsetof(spa_error_entry_t, se_avl)); 5061544Seschrock avl_create(&spa->spa_errlist_last, 5071544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 5081544Seschrock offsetof(spa_error_entry_t, se_avl)); 5091544Seschrock } 5101544Seschrock 511789Sahrens /* 512789Sahrens * Activate an uninitialized pool. 513789Sahrens */ 514789Sahrens static void 515789Sahrens spa_activate(spa_t *spa) 516789Sahrens { 517789Sahrens int t; 518789Sahrens 519789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 520789Sahrens 521789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 522789Sahrens 523789Sahrens spa->spa_normal_class = metaslab_class_create(); 5244527Sperrin spa->spa_log_class = metaslab_class_create(); 525789Sahrens 526789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 527789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 5282986Sek110237 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 529789Sahrens TASKQ_PREPOPULATE); 530789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 5312986Sek110237 zio_taskq_threads, maxclsyspri, 50, INT_MAX, 532789Sahrens TASKQ_PREPOPULATE); 533789Sahrens } 534789Sahrens 535789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 536789Sahrens offsetof(vdev_t, vdev_dirty_node)); 5375329Sgw25295 list_create(&spa->spa_zio_list, sizeof (zio_t), 5385329Sgw25295 offsetof(zio_t, zio_link_node)); 539789Sahrens 540789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 541789Sahrens offsetof(struct vdev, vdev_txg_node)); 5421544Seschrock 5431544Seschrock avl_create(&spa->spa_errlist_scrub, 5441544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 5451544Seschrock offsetof(spa_error_entry_t, se_avl)); 5461544Seschrock avl_create(&spa->spa_errlist_last, 5471544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 5481544Seschrock offsetof(spa_error_entry_t, se_avl)); 549789Sahrens } 550789Sahrens 551789Sahrens /* 552789Sahrens * Opposite of spa_activate(). 553789Sahrens */ 554789Sahrens static void 555789Sahrens spa_deactivate(spa_t *spa) 556789Sahrens { 557789Sahrens int t; 558789Sahrens 559789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 560789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 561789Sahrens ASSERT(spa->spa_root_vdev == NULL); 562789Sahrens 563789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 564789Sahrens 565789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 566789Sahrens 567789Sahrens list_destroy(&spa->spa_dirty_list); 5685329Sgw25295 list_destroy(&spa->spa_zio_list); 569789Sahrens 570789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 571789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 572789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 573789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 574789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 575789Sahrens } 576789Sahrens 577789Sahrens metaslab_class_destroy(spa->spa_normal_class); 578789Sahrens spa->spa_normal_class = NULL; 579789Sahrens 5804527Sperrin metaslab_class_destroy(spa->spa_log_class); 5814527Sperrin spa->spa_log_class = NULL; 5824527Sperrin 5831544Seschrock /* 5841544Seschrock * If this was part of an import or the open otherwise failed, we may 5851544Seschrock * still have errors left in the queues. Empty them just in case. 5861544Seschrock */ 5871544Seschrock spa_errlog_drain(spa); 5881544Seschrock 5891544Seschrock avl_destroy(&spa->spa_errlist_scrub); 5901544Seschrock avl_destroy(&spa->spa_errlist_last); 5911544Seschrock 592789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 593789Sahrens } 594789Sahrens 595789Sahrens /* 596789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 597789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 598789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 599789Sahrens * All vdev validation is done by the vdev_alloc() routine. 600789Sahrens */ 6012082Seschrock static int 6022082Seschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 6032082Seschrock uint_t id, int atype) 604789Sahrens { 605789Sahrens nvlist_t **child; 606789Sahrens uint_t c, children; 6072082Seschrock int error; 6082082Seschrock 6092082Seschrock if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 6102082Seschrock return (error); 6112082Seschrock 6122082Seschrock if ((*vdp)->vdev_ops->vdev_op_leaf) 6132082Seschrock return (0); 614789Sahrens 615789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 616789Sahrens &child, &children) != 0) { 6172082Seschrock vdev_free(*vdp); 6182082Seschrock *vdp = NULL; 6192082Seschrock return (EINVAL); 620789Sahrens } 621789Sahrens 622789Sahrens for (c = 0; c < children; c++) { 6232082Seschrock vdev_t *vd; 6242082Seschrock if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 6252082Seschrock atype)) != 0) { 6262082Seschrock vdev_free(*vdp); 6272082Seschrock *vdp = NULL; 6282082Seschrock return (error); 629789Sahrens } 630789Sahrens } 631789Sahrens 6322082Seschrock ASSERT(*vdp != NULL); 6332082Seschrock 6342082Seschrock return (0); 635789Sahrens } 636789Sahrens 637789Sahrens /* 638789Sahrens * Opposite of spa_load(). 639789Sahrens */ 640789Sahrens static void 641789Sahrens spa_unload(spa_t *spa) 642789Sahrens { 6432082Seschrock int i; 6442082Seschrock 645789Sahrens /* 6461544Seschrock * Stop async tasks. 6471544Seschrock */ 6481544Seschrock spa_async_suspend(spa); 6491544Seschrock 6501544Seschrock /* 651789Sahrens * Stop syncing. 652789Sahrens */ 653789Sahrens if (spa->spa_sync_on) { 654789Sahrens txg_sync_stop(spa->spa_dsl_pool); 655789Sahrens spa->spa_sync_on = B_FALSE; 656789Sahrens } 657789Sahrens 658789Sahrens /* 659789Sahrens * Wait for any outstanding prefetch I/O to complete. 660789Sahrens */ 6611544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 6621544Seschrock spa_config_exit(spa, FTAG); 663789Sahrens 664789Sahrens /* 665789Sahrens * Close the dsl pool. 666789Sahrens */ 667789Sahrens if (spa->spa_dsl_pool) { 668789Sahrens dsl_pool_close(spa->spa_dsl_pool); 669789Sahrens spa->spa_dsl_pool = NULL; 670789Sahrens } 671789Sahrens 672789Sahrens /* 673789Sahrens * Close all vdevs. 674789Sahrens */ 6751585Sbonwick if (spa->spa_root_vdev) 676789Sahrens vdev_free(spa->spa_root_vdev); 6771585Sbonwick ASSERT(spa->spa_root_vdev == NULL); 6781544Seschrock 6792082Seschrock for (i = 0; i < spa->spa_nspares; i++) 6802082Seschrock vdev_free(spa->spa_spares[i]); 6812082Seschrock if (spa->spa_spares) { 6822082Seschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 6832082Seschrock spa->spa_spares = NULL; 6842082Seschrock } 6852082Seschrock if (spa->spa_sparelist) { 6862082Seschrock nvlist_free(spa->spa_sparelist); 6872082Seschrock spa->spa_sparelist = NULL; 6882082Seschrock } 6892082Seschrock 6901544Seschrock spa->spa_async_suspended = 0; 691789Sahrens } 692789Sahrens 693789Sahrens /* 6942082Seschrock * Load (or re-load) the current list of vdevs describing the active spares for 6952082Seschrock * this pool. When this is called, we have some form of basic information in 6962082Seschrock * 'spa_sparelist'. We parse this into vdevs, try to open them, and then 6972082Seschrock * re-generate a more complete list including status information. 6982082Seschrock */ 6992082Seschrock static void 7002082Seschrock spa_load_spares(spa_t *spa) 7012082Seschrock { 7022082Seschrock nvlist_t **spares; 7032082Seschrock uint_t nspares; 7042082Seschrock int i; 7053377Seschrock vdev_t *vd, *tvd; 7062082Seschrock 7072082Seschrock /* 7082082Seschrock * First, close and free any existing spare vdevs. 7092082Seschrock */ 7102082Seschrock for (i = 0; i < spa->spa_nspares; i++) { 7113377Seschrock vd = spa->spa_spares[i]; 7123377Seschrock 7133377Seschrock /* Undo the call to spa_activate() below */ 7143377Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && 7153377Seschrock tvd->vdev_isspare) 7163377Seschrock spa_spare_remove(tvd); 7173377Seschrock vdev_close(vd); 7183377Seschrock vdev_free(vd); 7192082Seschrock } 7203377Seschrock 7212082Seschrock if (spa->spa_spares) 7222082Seschrock kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); 7232082Seschrock 7242082Seschrock if (spa->spa_sparelist == NULL) 7252082Seschrock nspares = 0; 7262082Seschrock else 7272082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 7282082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 7292082Seschrock 7302082Seschrock spa->spa_nspares = (int)nspares; 7312082Seschrock spa->spa_spares = NULL; 7322082Seschrock 7332082Seschrock if (nspares == 0) 7342082Seschrock return; 7352082Seschrock 7362082Seschrock /* 7372082Seschrock * Construct the array of vdevs, opening them to get status in the 7383377Seschrock * process. For each spare, there is potentially two different vdev_t 7393377Seschrock * structures associated with it: one in the list of spares (used only 7403377Seschrock * for basic validation purposes) and one in the active vdev 7413377Seschrock * configuration (if it's spared in). During this phase we open and 7423377Seschrock * validate each vdev on the spare list. If the vdev also exists in the 7433377Seschrock * active configuration, then we also mark this vdev as an active spare. 7442082Seschrock */ 7452082Seschrock spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); 7462082Seschrock for (i = 0; i < spa->spa_nspares; i++) { 7472082Seschrock VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 7482082Seschrock VDEV_ALLOC_SPARE) == 0); 7492082Seschrock ASSERT(vd != NULL); 7502082Seschrock 7512082Seschrock spa->spa_spares[i] = vd; 7522082Seschrock 7533377Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { 7543377Seschrock if (!tvd->vdev_isspare) 7553377Seschrock spa_spare_add(tvd); 7563377Seschrock 7573377Seschrock /* 7583377Seschrock * We only mark the spare active if we were successfully 7593377Seschrock * able to load the vdev. Otherwise, importing a pool 7603377Seschrock * with a bad active spare would result in strange 7613377Seschrock * behavior, because multiple pool would think the spare 7623377Seschrock * is actively in use. 7633377Seschrock * 7643377Seschrock * There is a vulnerability here to an equally bizarre 7653377Seschrock * circumstance, where a dead active spare is later 7663377Seschrock * brought back to life (onlined or otherwise). Given 7673377Seschrock * the rarity of this scenario, and the extra complexity 7683377Seschrock * it adds, we ignore the possibility. 7693377Seschrock */ 7703377Seschrock if (!vdev_is_dead(tvd)) 7713377Seschrock spa_spare_activate(tvd); 7723377Seschrock } 7733377Seschrock 7742082Seschrock if (vdev_open(vd) != 0) 7752082Seschrock continue; 7762082Seschrock 7772082Seschrock vd->vdev_top = vd; 7782082Seschrock (void) vdev_validate_spare(vd); 7792082Seschrock } 7802082Seschrock 7812082Seschrock /* 7822082Seschrock * Recompute the stashed list of spares, with status information 7832082Seschrock * this time. 7842082Seschrock */ 7852082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 7862082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 7872082Seschrock 7882082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); 7892082Seschrock for (i = 0; i < spa->spa_nspares; i++) 7902082Seschrock spares[i] = vdev_config_generate(spa, spa->spa_spares[i], 7912082Seschrock B_TRUE, B_TRUE); 7922082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 7932082Seschrock spares, spa->spa_nspares) == 0); 7942082Seschrock for (i = 0; i < spa->spa_nspares; i++) 7952082Seschrock nvlist_free(spares[i]); 7962082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 7972082Seschrock } 7982082Seschrock 7992082Seschrock static int 8002082Seschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 8012082Seschrock { 8022082Seschrock dmu_buf_t *db; 8032082Seschrock char *packed = NULL; 8042082Seschrock size_t nvsize = 0; 8052082Seschrock int error; 8062082Seschrock *value = NULL; 8072082Seschrock 8082082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8092082Seschrock nvsize = *(uint64_t *)db->db_data; 8102082Seschrock dmu_buf_rele(db, FTAG); 8112082Seschrock 8122082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 8132082Seschrock error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); 8142082Seschrock if (error == 0) 8152082Seschrock error = nvlist_unpack(packed, nvsize, value, 0); 8162082Seschrock kmem_free(packed, nvsize); 8172082Seschrock 8182082Seschrock return (error); 8192082Seschrock } 8202082Seschrock 8212082Seschrock /* 8224451Seschrock * Checks to see if the given vdev could not be opened, in which case we post a 8234451Seschrock * sysevent to notify the autoreplace code that the device has been removed. 8244451Seschrock */ 8254451Seschrock static void 8264451Seschrock spa_check_removed(vdev_t *vd) 8274451Seschrock { 8284451Seschrock int c; 8294451Seschrock 8304451Seschrock for (c = 0; c < vd->vdev_children; c++) 8314451Seschrock spa_check_removed(vd->vdev_child[c]); 8324451Seschrock 8334451Seschrock if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 8344451Seschrock zfs_post_autoreplace(vd->vdev_spa, vd); 8354451Seschrock spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 8364451Seschrock } 8374451Seschrock } 8384451Seschrock 8394451Seschrock /* 840789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 8411544Seschrock * source of configuration information. 842789Sahrens */ 843789Sahrens static int 8441544Seschrock spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 845789Sahrens { 846789Sahrens int error = 0; 847789Sahrens nvlist_t *nvroot = NULL; 848789Sahrens vdev_t *rvd; 849789Sahrens uberblock_t *ub = &spa->spa_uberblock; 8501635Sbonwick uint64_t config_cache_txg = spa->spa_config_txg; 851789Sahrens uint64_t pool_guid; 8522082Seschrock uint64_t version; 853789Sahrens zio_t *zio; 8544451Seschrock uint64_t autoreplace = 0; 855789Sahrens 8561544Seschrock spa->spa_load_state = state; 8571635Sbonwick 858789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 8591733Sbonwick nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 8601544Seschrock error = EINVAL; 8611544Seschrock goto out; 8621544Seschrock } 863789Sahrens 8642082Seschrock /* 8652082Seschrock * Versioning wasn't explicitly added to the label until later, so if 8662082Seschrock * it's not present treat it as the initial version. 8672082Seschrock */ 8682082Seschrock if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) 8694577Sahrens version = SPA_VERSION_INITIAL; 8702082Seschrock 8711733Sbonwick (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 8721733Sbonwick &spa->spa_config_txg); 8731733Sbonwick 8741635Sbonwick if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 8751544Seschrock spa_guid_exists(pool_guid, 0)) { 8761544Seschrock error = EEXIST; 8771544Seschrock goto out; 8781544Seschrock } 879789Sahrens 8802174Seschrock spa->spa_load_guid = pool_guid; 8812174Seschrock 882789Sahrens /* 8832082Seschrock * Parse the configuration into a vdev tree. We explicitly set the 8842082Seschrock * value that will be returned by spa_version() since parsing the 8852082Seschrock * configuration requires knowing the version number. 886789Sahrens */ 8871544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 8882082Seschrock spa->spa_ubsync.ub_version = version; 8892082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 8901544Seschrock spa_config_exit(spa, FTAG); 891789Sahrens 8922082Seschrock if (error != 0) 8931544Seschrock goto out; 894789Sahrens 8951585Sbonwick ASSERT(spa->spa_root_vdev == rvd); 896789Sahrens ASSERT(spa_guid(spa) == pool_guid); 897789Sahrens 898789Sahrens /* 899789Sahrens * Try to open all vdevs, loading each label in the process. 900789Sahrens */ 9014070Smc142369 error = vdev_open(rvd); 9024070Smc142369 if (error != 0) 9031544Seschrock goto out; 904789Sahrens 905789Sahrens /* 9061986Seschrock * Validate the labels for all leaf vdevs. We need to grab the config 9071986Seschrock * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD 9081986Seschrock * flag. 9091986Seschrock */ 9101986Seschrock spa_config_enter(spa, RW_READER, FTAG); 9111986Seschrock error = vdev_validate(rvd); 9121986Seschrock spa_config_exit(spa, FTAG); 9131986Seschrock 9144070Smc142369 if (error != 0) 9151986Seschrock goto out; 9161986Seschrock 9171986Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 9181986Seschrock error = ENXIO; 9191986Seschrock goto out; 9201986Seschrock } 9211986Seschrock 9221986Seschrock /* 923789Sahrens * Find the best uberblock. 924789Sahrens */ 925789Sahrens bzero(ub, sizeof (uberblock_t)); 926789Sahrens 927789Sahrens zio = zio_root(spa, NULL, NULL, 928789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 929789Sahrens vdev_uberblock_load(zio, rvd, ub); 930789Sahrens error = zio_wait(zio); 931789Sahrens 932789Sahrens /* 933789Sahrens * If we weren't able to find a single valid uberblock, return failure. 934789Sahrens */ 935789Sahrens if (ub->ub_txg == 0) { 9361760Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 9371760Seschrock VDEV_AUX_CORRUPT_DATA); 9381544Seschrock error = ENXIO; 9391544Seschrock goto out; 9401544Seschrock } 9411544Seschrock 9421544Seschrock /* 9431544Seschrock * If the pool is newer than the code, we can't open it. 9441544Seschrock */ 9454577Sahrens if (ub->ub_version > SPA_VERSION) { 9461760Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 9471760Seschrock VDEV_AUX_VERSION_NEWER); 9481544Seschrock error = ENOTSUP; 9491544Seschrock goto out; 950789Sahrens } 951789Sahrens 952789Sahrens /* 953789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 954789Sahrens * incomplete configuration. 955789Sahrens */ 9561732Sbonwick if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 9571544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 9581544Seschrock VDEV_AUX_BAD_GUID_SUM); 9591544Seschrock error = ENXIO; 9601544Seschrock goto out; 961789Sahrens } 962789Sahrens 963789Sahrens /* 964789Sahrens * Initialize internal SPA structures. 965789Sahrens */ 966789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 967789Sahrens spa->spa_ubsync = spa->spa_uberblock; 968789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 9691544Seschrock error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 9701544Seschrock if (error) { 9711544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 9721544Seschrock VDEV_AUX_CORRUPT_DATA); 9731544Seschrock goto out; 9741544Seschrock } 975789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 976789Sahrens 9771544Seschrock if (zap_lookup(spa->spa_meta_objset, 978789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 9791544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 9801544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 9811544Seschrock VDEV_AUX_CORRUPT_DATA); 9821544Seschrock error = EIO; 9831544Seschrock goto out; 9841544Seschrock } 985789Sahrens 986789Sahrens if (!mosconfig) { 9872082Seschrock nvlist_t *newconfig; 9883975Sek110237 uint64_t hostid; 9892082Seschrock 9902082Seschrock if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { 9911544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 9921544Seschrock VDEV_AUX_CORRUPT_DATA); 9931544Seschrock error = EIO; 9941544Seschrock goto out; 9951544Seschrock } 996789Sahrens 9973975Sek110237 if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID, 9983975Sek110237 &hostid) == 0) { 9993975Sek110237 char *hostname; 10003975Sek110237 unsigned long myhostid = 0; 10013975Sek110237 10023975Sek110237 VERIFY(nvlist_lookup_string(newconfig, 10033975Sek110237 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 10043975Sek110237 10053975Sek110237 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 10064178Slling if (hostid != 0 && myhostid != 0 && 10074178Slling (unsigned long)hostid != myhostid) { 10083975Sek110237 cmn_err(CE_WARN, "pool '%s' could not be " 10093975Sek110237 "loaded as it was last accessed by " 10103975Sek110237 "another system (host: %s hostid: 0x%lx). " 10113975Sek110237 "See: http://www.sun.com/msg/ZFS-8000-EY", 10123975Sek110237 spa->spa_name, hostname, 10133975Sek110237 (unsigned long)hostid); 10143975Sek110237 error = EBADF; 10153975Sek110237 goto out; 10163975Sek110237 } 10173975Sek110237 } 10183975Sek110237 1019789Sahrens spa_config_set(spa, newconfig); 1020789Sahrens spa_unload(spa); 1021789Sahrens spa_deactivate(spa); 1022789Sahrens spa_activate(spa); 1023789Sahrens 10241544Seschrock return (spa_load(spa, newconfig, state, B_TRUE)); 10251544Seschrock } 10261544Seschrock 10271544Seschrock if (zap_lookup(spa->spa_meta_objset, 10281544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 10291544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 10301544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 10311544Seschrock VDEV_AUX_CORRUPT_DATA); 10321544Seschrock error = EIO; 10331544Seschrock goto out; 1034789Sahrens } 1035789Sahrens 10361544Seschrock /* 10372082Seschrock * Load the bit that tells us to use the new accounting function 10382082Seschrock * (raid-z deflation). If we have an older pool, this will not 10392082Seschrock * be present. 10402082Seschrock */ 10412082Seschrock error = zap_lookup(spa->spa_meta_objset, 10422082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 10432082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate); 10442082Seschrock if (error != 0 && error != ENOENT) { 10452082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 10462082Seschrock VDEV_AUX_CORRUPT_DATA); 10472082Seschrock error = EIO; 10482082Seschrock goto out; 10492082Seschrock } 10502082Seschrock 10512082Seschrock /* 10521544Seschrock * Load the persistent error log. If we have an older pool, this will 10531544Seschrock * not be present. 10541544Seschrock */ 10551544Seschrock error = zap_lookup(spa->spa_meta_objset, 10561544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 10571544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_last); 10581807Sbonwick if (error != 0 && error != ENOENT) { 10591544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 10601544Seschrock VDEV_AUX_CORRUPT_DATA); 10611544Seschrock error = EIO; 10621544Seschrock goto out; 10631544Seschrock } 10641544Seschrock 10651544Seschrock error = zap_lookup(spa->spa_meta_objset, 10661544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 10671544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 10681544Seschrock if (error != 0 && error != ENOENT) { 10691544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 10701544Seschrock VDEV_AUX_CORRUPT_DATA); 10711544Seschrock error = EIO; 10721544Seschrock goto out; 10731544Seschrock } 1074789Sahrens 1075789Sahrens /* 10762926Sek110237 * Load the history object. If we have an older pool, this 10772926Sek110237 * will not be present. 10782926Sek110237 */ 10792926Sek110237 error = zap_lookup(spa->spa_meta_objset, 10802926Sek110237 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, 10812926Sek110237 sizeof (uint64_t), 1, &spa->spa_history); 10822926Sek110237 if (error != 0 && error != ENOENT) { 10832926Sek110237 vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 10842926Sek110237 VDEV_AUX_CORRUPT_DATA); 10852926Sek110237 error = EIO; 10862926Sek110237 goto out; 10872926Sek110237 } 10882926Sek110237 10892926Sek110237 /* 10902082Seschrock * Load any hot spares for this pool. 10912082Seschrock */ 10922082Seschrock error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 10932082Seschrock DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); 10942082Seschrock if (error != 0 && error != ENOENT) { 10952082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 10962082Seschrock VDEV_AUX_CORRUPT_DATA); 10972082Seschrock error = EIO; 10982082Seschrock goto out; 10992082Seschrock } 11002082Seschrock if (error == 0) { 11014577Sahrens ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 11022082Seschrock if (load_nvlist(spa, spa->spa_spares_object, 11032082Seschrock &spa->spa_sparelist) != 0) { 11042082Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 11052082Seschrock VDEV_AUX_CORRUPT_DATA); 11062082Seschrock error = EIO; 11072082Seschrock goto out; 11082082Seschrock } 11092082Seschrock 11102082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 11112082Seschrock spa_load_spares(spa); 11122082Seschrock spa_config_exit(spa, FTAG); 11132082Seschrock } 11142082Seschrock 11155094Slling spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 11164543Smarks 11173912Slling error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 11183912Slling DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); 11193912Slling 11203912Slling if (error && error != ENOENT) { 11213912Slling vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 11223912Slling VDEV_AUX_CORRUPT_DATA); 11233912Slling error = EIO; 11243912Slling goto out; 11253912Slling } 11263912Slling 11273912Slling if (error == 0) { 11283912Slling (void) zap_lookup(spa->spa_meta_objset, 11293912Slling spa->spa_pool_props_object, 11304451Seschrock zpool_prop_to_name(ZPOOL_PROP_BOOTFS), 11313912Slling sizeof (uint64_t), 1, &spa->spa_bootfs); 11324451Seschrock (void) zap_lookup(spa->spa_meta_objset, 11334451Seschrock spa->spa_pool_props_object, 11344451Seschrock zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 11354451Seschrock sizeof (uint64_t), 1, &autoreplace); 11364543Smarks (void) zap_lookup(spa->spa_meta_objset, 11374543Smarks spa->spa_pool_props_object, 11384543Smarks zpool_prop_to_name(ZPOOL_PROP_DELEGATION), 11394543Smarks sizeof (uint64_t), 1, &spa->spa_delegation); 11405329Sgw25295 (void) zap_lookup(spa->spa_meta_objset, 11415329Sgw25295 spa->spa_pool_props_object, 11425329Sgw25295 zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 11435329Sgw25295 sizeof (uint64_t), 1, &spa->spa_failmode); 11443912Slling } 11453912Slling 11462082Seschrock /* 11474451Seschrock * If the 'autoreplace' property is set, then post a resource notifying 11484451Seschrock * the ZFS DE that it should not issue any faults for unopenable 11494451Seschrock * devices. We also iterate over the vdevs, and post a sysevent for any 11504451Seschrock * unopenable vdevs so that the normal autoreplace handler can take 11514451Seschrock * over. 11524451Seschrock */ 11534451Seschrock if (autoreplace) 11544451Seschrock spa_check_removed(spa->spa_root_vdev); 11554451Seschrock 11564451Seschrock /* 11571986Seschrock * Load the vdev state for all toplevel vdevs. 1158789Sahrens */ 11591986Seschrock vdev_load(rvd); 1160789Sahrens 1161789Sahrens /* 1162789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 1163789Sahrens */ 11641544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 1165789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 11661544Seschrock spa_config_exit(spa, FTAG); 1167789Sahrens 1168789Sahrens /* 1169789Sahrens * Check the state of the root vdev. If it can't be opened, it 1170789Sahrens * indicates one or more toplevel vdevs are faulted. 1171789Sahrens */ 11721544Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 11731544Seschrock error = ENXIO; 11741544Seschrock goto out; 11751544Seschrock } 1176789Sahrens 11771544Seschrock if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 11781635Sbonwick dmu_tx_t *tx; 11791635Sbonwick int need_update = B_FALSE; 11801585Sbonwick int c; 11811601Sbonwick 11821635Sbonwick /* 11831635Sbonwick * Claim log blocks that haven't been committed yet. 11841635Sbonwick * This must all happen in a single txg. 11851635Sbonwick */ 11861601Sbonwick tx = dmu_tx_create_assigned(spa_get_dsl(spa), 1187789Sahrens spa_first_txg(spa)); 11882417Sahrens (void) dmu_objset_find(spa->spa_name, 11892417Sahrens zil_claim, tx, DS_FIND_CHILDREN); 1190789Sahrens dmu_tx_commit(tx); 1191789Sahrens 1192789Sahrens spa->spa_sync_on = B_TRUE; 1193789Sahrens txg_sync_start(spa->spa_dsl_pool); 1194789Sahrens 1195789Sahrens /* 1196789Sahrens * Wait for all claims to sync. 1197789Sahrens */ 1198789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 11991585Sbonwick 12001585Sbonwick /* 12011635Sbonwick * If the config cache is stale, or we have uninitialized 12021635Sbonwick * metaslabs (see spa_vdev_add()), then update the config. 12031585Sbonwick */ 12041635Sbonwick if (config_cache_txg != spa->spa_config_txg || 12051635Sbonwick state == SPA_LOAD_IMPORT) 12061635Sbonwick need_update = B_TRUE; 12071635Sbonwick 12081635Sbonwick for (c = 0; c < rvd->vdev_children; c++) 12091635Sbonwick if (rvd->vdev_child[c]->vdev_ms_array == 0) 12101635Sbonwick need_update = B_TRUE; 12111585Sbonwick 12121585Sbonwick /* 12131635Sbonwick * Update the config cache asychronously in case we're the 12141635Sbonwick * root pool, in which case the config cache isn't writable yet. 12151585Sbonwick */ 12161635Sbonwick if (need_update) 12171635Sbonwick spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 1218789Sahrens } 1219789Sahrens 12201544Seschrock error = 0; 12211544Seschrock out: 12222082Seschrock if (error && error != EBADF) 12231544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 12241544Seschrock spa->spa_load_state = SPA_LOAD_NONE; 12251544Seschrock spa->spa_ena = 0; 12261544Seschrock 12271544Seschrock return (error); 1228789Sahrens } 1229789Sahrens 1230789Sahrens /* 1231789Sahrens * Pool Open/Import 1232789Sahrens * 1233789Sahrens * The import case is identical to an open except that the configuration is sent 1234789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 1235789Sahrens * case of an open, the pool configuration will exist in the 12364451Seschrock * POOL_STATE_UNINITIALIZED state. 1237789Sahrens * 1238789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 1239789Sahrens * the same time open the pool, without having to keep around the spa_t in some 1240789Sahrens * ambiguous state. 1241789Sahrens */ 1242789Sahrens static int 1243789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 1244789Sahrens { 1245789Sahrens spa_t *spa; 1246789Sahrens int error; 1247789Sahrens int loaded = B_FALSE; 1248789Sahrens int locked = B_FALSE; 1249789Sahrens 1250789Sahrens *spapp = NULL; 1251789Sahrens 1252789Sahrens /* 1253789Sahrens * As disgusting as this is, we need to support recursive calls to this 1254789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 1255789Sahrens * up calling spa_open() again. The real fix is to figure out how to 1256789Sahrens * avoid dsl_dir_open() calling this in the first place. 1257789Sahrens */ 1258789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 1259789Sahrens mutex_enter(&spa_namespace_lock); 1260789Sahrens locked = B_TRUE; 1261789Sahrens } 1262789Sahrens 1263789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 1264789Sahrens if (locked) 1265789Sahrens mutex_exit(&spa_namespace_lock); 1266789Sahrens return (ENOENT); 1267789Sahrens } 1268789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 1269789Sahrens 1270789Sahrens spa_activate(spa); 1271789Sahrens 12721635Sbonwick error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); 1273789Sahrens 1274789Sahrens if (error == EBADF) { 1275789Sahrens /* 12761986Seschrock * If vdev_validate() returns failure (indicated by 12771986Seschrock * EBADF), it indicates that one of the vdevs indicates 12781986Seschrock * that the pool has been exported or destroyed. If 12791986Seschrock * this is the case, the config cache is out of sync and 12801986Seschrock * we should remove the pool from the namespace. 1281789Sahrens */ 12822082Seschrock zfs_post_ok(spa, NULL); 1283789Sahrens spa_unload(spa); 1284789Sahrens spa_deactivate(spa); 1285789Sahrens spa_remove(spa); 1286789Sahrens spa_config_sync(); 1287789Sahrens if (locked) 1288789Sahrens mutex_exit(&spa_namespace_lock); 1289789Sahrens return (ENOENT); 12901544Seschrock } 12911544Seschrock 12921544Seschrock if (error) { 1293789Sahrens /* 1294789Sahrens * We can't open the pool, but we still have useful 1295789Sahrens * information: the state of each vdev after the 1296789Sahrens * attempted vdev_open(). Return this to the user. 1297789Sahrens */ 12981635Sbonwick if (config != NULL && spa->spa_root_vdev != NULL) { 12991635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 1300789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 1301789Sahrens B_TRUE); 13021635Sbonwick spa_config_exit(spa, FTAG); 13031635Sbonwick } 1304789Sahrens spa_unload(spa); 1305789Sahrens spa_deactivate(spa); 13061544Seschrock spa->spa_last_open_failed = B_TRUE; 1307789Sahrens if (locked) 1308789Sahrens mutex_exit(&spa_namespace_lock); 1309789Sahrens *spapp = NULL; 1310789Sahrens return (error); 13111544Seschrock } else { 13121544Seschrock zfs_post_ok(spa, NULL); 13131544Seschrock spa->spa_last_open_failed = B_FALSE; 1314789Sahrens } 1315789Sahrens 1316789Sahrens loaded = B_TRUE; 1317789Sahrens } 1318789Sahrens 1319789Sahrens spa_open_ref(spa, tag); 13204451Seschrock 13214451Seschrock /* 13224451Seschrock * If we just loaded the pool, resilver anything that's out of date. 13234451Seschrock */ 13244451Seschrock if (loaded && (spa_mode & FWRITE)) 13254451Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 13264451Seschrock 1327789Sahrens if (locked) 1328789Sahrens mutex_exit(&spa_namespace_lock); 1329789Sahrens 1330789Sahrens *spapp = spa; 1331789Sahrens 1332789Sahrens if (config != NULL) { 13331544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1334789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 13351544Seschrock spa_config_exit(spa, FTAG); 1336789Sahrens } 1337789Sahrens 1338789Sahrens return (0); 1339789Sahrens } 1340789Sahrens 1341789Sahrens int 1342789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 1343789Sahrens { 1344789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 1345789Sahrens } 1346789Sahrens 13471544Seschrock /* 13481544Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 13491544Seschrock * preventing it from being exported or destroyed. 13501544Seschrock */ 13511544Seschrock spa_t * 13521544Seschrock spa_inject_addref(char *name) 13531544Seschrock { 13541544Seschrock spa_t *spa; 13551544Seschrock 13561544Seschrock mutex_enter(&spa_namespace_lock); 13571544Seschrock if ((spa = spa_lookup(name)) == NULL) { 13581544Seschrock mutex_exit(&spa_namespace_lock); 13591544Seschrock return (NULL); 13601544Seschrock } 13611544Seschrock spa->spa_inject_ref++; 13621544Seschrock mutex_exit(&spa_namespace_lock); 13631544Seschrock 13641544Seschrock return (spa); 13651544Seschrock } 13661544Seschrock 13671544Seschrock void 13681544Seschrock spa_inject_delref(spa_t *spa) 13691544Seschrock { 13701544Seschrock mutex_enter(&spa_namespace_lock); 13711544Seschrock spa->spa_inject_ref--; 13721544Seschrock mutex_exit(&spa_namespace_lock); 13731544Seschrock } 13741544Seschrock 13752082Seschrock static void 13762082Seschrock spa_add_spares(spa_t *spa, nvlist_t *config) 13772082Seschrock { 13782082Seschrock nvlist_t **spares; 13792082Seschrock uint_t i, nspares; 13802082Seschrock nvlist_t *nvroot; 13812082Seschrock uint64_t guid; 13822082Seschrock vdev_stat_t *vs; 13832082Seschrock uint_t vsc; 13843377Seschrock uint64_t pool; 13852082Seschrock 13862082Seschrock if (spa->spa_nspares == 0) 13872082Seschrock return; 13882082Seschrock 13892082Seschrock VERIFY(nvlist_lookup_nvlist(config, 13902082Seschrock ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 13912082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 13922082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 13932082Seschrock if (nspares != 0) { 13942082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, 13952082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 13962082Seschrock VERIFY(nvlist_lookup_nvlist_array(nvroot, 13972082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 13982082Seschrock 13992082Seschrock /* 14002082Seschrock * Go through and find any spares which have since been 14012082Seschrock * repurposed as an active spare. If this is the case, update 14022082Seschrock * their status appropriately. 14032082Seschrock */ 14042082Seschrock for (i = 0; i < nspares; i++) { 14052082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 14062082Seschrock ZPOOL_CONFIG_GUID, &guid) == 0); 14073377Seschrock if (spa_spare_exists(guid, &pool) && pool != 0ULL) { 14082082Seschrock VERIFY(nvlist_lookup_uint64_array( 14092082Seschrock spares[i], ZPOOL_CONFIG_STATS, 14102082Seschrock (uint64_t **)&vs, &vsc) == 0); 14112082Seschrock vs->vs_state = VDEV_STATE_CANT_OPEN; 14122082Seschrock vs->vs_aux = VDEV_AUX_SPARED; 14132082Seschrock } 14142082Seschrock } 14152082Seschrock } 14162082Seschrock } 14172082Seschrock 1418789Sahrens int 14191544Seschrock spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 1420789Sahrens { 1421789Sahrens int error; 1422789Sahrens spa_t *spa; 1423789Sahrens 1424789Sahrens *config = NULL; 1425789Sahrens error = spa_open_common(name, &spa, FTAG, config); 1426789Sahrens 14272082Seschrock if (spa && *config != NULL) { 14281544Seschrock VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 14291544Seschrock spa_get_errlog_size(spa)) == 0); 14301544Seschrock 14312082Seschrock spa_add_spares(spa, *config); 14322082Seschrock } 14332082Seschrock 14341544Seschrock /* 14351544Seschrock * We want to get the alternate root even for faulted pools, so we cheat 14361544Seschrock * and call spa_lookup() directly. 14371544Seschrock */ 14381544Seschrock if (altroot) { 14391544Seschrock if (spa == NULL) { 14401544Seschrock mutex_enter(&spa_namespace_lock); 14411544Seschrock spa = spa_lookup(name); 14421544Seschrock if (spa) 14431544Seschrock spa_altroot(spa, altroot, buflen); 14441544Seschrock else 14451544Seschrock altroot[0] = '\0'; 14461544Seschrock spa = NULL; 14471544Seschrock mutex_exit(&spa_namespace_lock); 14481544Seschrock } else { 14491544Seschrock spa_altroot(spa, altroot, buflen); 14501544Seschrock } 14511544Seschrock } 14521544Seschrock 1453789Sahrens if (spa != NULL) 1454789Sahrens spa_close(spa, FTAG); 1455789Sahrens 1456789Sahrens return (error); 1457789Sahrens } 1458789Sahrens 1459789Sahrens /* 14602082Seschrock * Validate that the 'spares' array is well formed. We must have an array of 14613377Seschrock * nvlists, each which describes a valid leaf vdev. If this is an import (mode 14623377Seschrock * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long 14633377Seschrock * as they are well-formed. 14642082Seschrock */ 14652082Seschrock static int 14662082Seschrock spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 14672082Seschrock { 14682082Seschrock nvlist_t **spares; 14692082Seschrock uint_t i, nspares; 14702082Seschrock vdev_t *vd; 14712082Seschrock int error; 14722082Seschrock 14732082Seschrock /* 14742082Seschrock * It's acceptable to have no spares specified. 14752082Seschrock */ 14762082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 14772082Seschrock &spares, &nspares) != 0) 14782082Seschrock return (0); 14792082Seschrock 14802082Seschrock if (nspares == 0) 14812082Seschrock return (EINVAL); 14822082Seschrock 14832082Seschrock /* 14842082Seschrock * Make sure the pool is formatted with a version that supports hot 14852082Seschrock * spares. 14862082Seschrock */ 14874577Sahrens if (spa_version(spa) < SPA_VERSION_SPARES) 14882082Seschrock return (ENOTSUP); 14892082Seschrock 14903377Seschrock /* 14913377Seschrock * Set the pending spare list so we correctly handle device in-use 14923377Seschrock * checking. 14933377Seschrock */ 14943377Seschrock spa->spa_pending_spares = spares; 14953377Seschrock spa->spa_pending_nspares = nspares; 14963377Seschrock 14972082Seschrock for (i = 0; i < nspares; i++) { 14982082Seschrock if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, 14992082Seschrock mode)) != 0) 15003377Seschrock goto out; 15012082Seschrock 15022082Seschrock if (!vd->vdev_ops->vdev_op_leaf) { 15032082Seschrock vdev_free(vd); 15043377Seschrock error = EINVAL; 15053377Seschrock goto out; 15062082Seschrock } 15072082Seschrock 15082082Seschrock vd->vdev_top = vd; 15093377Seschrock 15103377Seschrock if ((error = vdev_open(vd)) == 0 && 15113377Seschrock (error = vdev_label_init(vd, crtxg, 15123377Seschrock VDEV_LABEL_SPARE)) == 0) { 15133377Seschrock VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, 15143377Seschrock vd->vdev_guid) == 0); 15152082Seschrock } 15162082Seschrock 15172082Seschrock vdev_free(vd); 15183377Seschrock 15193377Seschrock if (error && mode != VDEV_ALLOC_SPARE) 15203377Seschrock goto out; 15213377Seschrock else 15223377Seschrock error = 0; 15232082Seschrock } 15242082Seschrock 15253377Seschrock out: 15263377Seschrock spa->spa_pending_spares = NULL; 15273377Seschrock spa->spa_pending_nspares = 0; 15283377Seschrock return (error); 15292082Seschrock } 15302082Seschrock 15312082Seschrock /* 1532789Sahrens * Pool Creation 1533789Sahrens */ 1534789Sahrens int 15355094Slling spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 15364715Sek110237 const char *history_str) 1537789Sahrens { 1538789Sahrens spa_t *spa; 15395094Slling char *altroot = NULL; 15401635Sbonwick vdev_t *rvd; 1541789Sahrens dsl_pool_t *dp; 1542789Sahrens dmu_tx_t *tx; 15432082Seschrock int c, error = 0; 1544789Sahrens uint64_t txg = TXG_INITIAL; 15452082Seschrock nvlist_t **spares; 15462082Seschrock uint_t nspares; 15475094Slling uint64_t version; 1548789Sahrens 1549789Sahrens /* 1550789Sahrens * If this pool already exists, return failure. 1551789Sahrens */ 1552789Sahrens mutex_enter(&spa_namespace_lock); 1553789Sahrens if (spa_lookup(pool) != NULL) { 1554789Sahrens mutex_exit(&spa_namespace_lock); 1555789Sahrens return (EEXIST); 1556789Sahrens } 1557789Sahrens 1558789Sahrens /* 1559789Sahrens * Allocate a new spa_t structure. 1560789Sahrens */ 15615094Slling (void) nvlist_lookup_string(props, 15625094Slling zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 15631635Sbonwick spa = spa_add(pool, altroot); 1564789Sahrens spa_activate(spa); 1565789Sahrens 1566789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 15675094Slling 15685094Slling if (props && (error = spa_prop_validate(spa, props))) { 15695094Slling spa_unload(spa); 15705094Slling spa_deactivate(spa); 15715094Slling spa_remove(spa); 15725094Slling return (error); 15735094Slling } 15745094Slling 15755094Slling if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 15765094Slling &version) != 0) 15775094Slling version = SPA_VERSION; 15785094Slling ASSERT(version <= SPA_VERSION); 15795094Slling spa->spa_uberblock.ub_version = version; 1580789Sahrens spa->spa_ubsync = spa->spa_uberblock; 1581789Sahrens 15821635Sbonwick /* 15831635Sbonwick * Create the root vdev. 15841635Sbonwick */ 15851635Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 15861635Sbonwick 15872082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 15882082Seschrock 15892082Seschrock ASSERT(error != 0 || rvd != NULL); 15902082Seschrock ASSERT(error != 0 || spa->spa_root_vdev == rvd); 15912082Seschrock 15922082Seschrock if (error == 0 && rvd->vdev_children == 0) 15931635Sbonwick error = EINVAL; 15942082Seschrock 15952082Seschrock if (error == 0 && 15962082Seschrock (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 15972082Seschrock (error = spa_validate_spares(spa, nvroot, txg, 15982082Seschrock VDEV_ALLOC_ADD)) == 0) { 15992082Seschrock for (c = 0; c < rvd->vdev_children; c++) 16002082Seschrock vdev_init(rvd->vdev_child[c], txg); 16012082Seschrock vdev_config_dirty(rvd); 16021635Sbonwick } 16031635Sbonwick 16041635Sbonwick spa_config_exit(spa, FTAG); 1605789Sahrens 16062082Seschrock if (error != 0) { 1607789Sahrens spa_unload(spa); 1608789Sahrens spa_deactivate(spa); 1609789Sahrens spa_remove(spa); 1610789Sahrens mutex_exit(&spa_namespace_lock); 1611789Sahrens return (error); 1612789Sahrens } 1613789Sahrens 16142082Seschrock /* 16152082Seschrock * Get the list of spares, if specified. 16162082Seschrock */ 16172082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 16182082Seschrock &spares, &nspares) == 0) { 16192082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, 16202082Seschrock KM_SLEEP) == 0); 16212082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 16222082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 16232082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 16242082Seschrock spa_load_spares(spa); 16252082Seschrock spa_config_exit(spa, FTAG); 16262082Seschrock spa->spa_sync_spares = B_TRUE; 16272082Seschrock } 16282082Seschrock 1629789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 1630789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 1631789Sahrens 1632789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1633789Sahrens 1634789Sahrens /* 1635789Sahrens * Create the pool config object. 1636789Sahrens */ 1637789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 1638789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 1639789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 1640789Sahrens 16411544Seschrock if (zap_add(spa->spa_meta_objset, 1642789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 16431544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 16441544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 16451544Seschrock } 1646789Sahrens 16475094Slling /* Newly created pools with the right version are always deflated. */ 16485094Slling if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 16495094Slling spa->spa_deflate = TRUE; 16505094Slling if (zap_add(spa->spa_meta_objset, 16515094Slling DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 16525094Slling sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 16535094Slling cmn_err(CE_PANIC, "failed to add deflate"); 16545094Slling } 16552082Seschrock } 16562082Seschrock 1657789Sahrens /* 1658789Sahrens * Create the deferred-free bplist object. Turn off compression 1659789Sahrens * because sync-to-convergence takes longer if the blocksize 1660789Sahrens * keeps changing. 1661789Sahrens */ 1662789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 1663789Sahrens 1 << 14, tx); 1664789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 1665789Sahrens ZIO_COMPRESS_OFF, tx); 1666789Sahrens 16671544Seschrock if (zap_add(spa->spa_meta_objset, 1668789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 16691544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 16701544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 16711544Seschrock } 1672789Sahrens 16732926Sek110237 /* 16742926Sek110237 * Create the pool's history object. 16752926Sek110237 */ 16765094Slling if (version >= SPA_VERSION_ZPOOL_HISTORY) 16775094Slling spa_history_create_obj(spa, tx); 16785094Slling 16795094Slling /* 16805094Slling * Set pool properties. 16815094Slling */ 16825094Slling spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 16835094Slling spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 16845329Sgw25295 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 16855094Slling if (props) 16865094Slling spa_sync_props(spa, props, CRED(), tx); 16872926Sek110237 1688789Sahrens dmu_tx_commit(tx); 1689789Sahrens 1690789Sahrens spa->spa_sync_on = B_TRUE; 1691789Sahrens txg_sync_start(spa->spa_dsl_pool); 1692789Sahrens 1693789Sahrens /* 1694789Sahrens * We explicitly wait for the first transaction to complete so that our 1695789Sahrens * bean counters are appropriately updated. 1696789Sahrens */ 1697789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 1698789Sahrens 1699789Sahrens spa_config_sync(); 1700789Sahrens 17015094Slling if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 17024715Sek110237 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 17034715Sek110237 1704789Sahrens mutex_exit(&spa_namespace_lock); 1705789Sahrens 1706789Sahrens return (0); 1707789Sahrens } 1708789Sahrens 1709789Sahrens /* 1710789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 1711789Sahrens * then call spa_load() to do the dirty work. 1712789Sahrens */ 1713789Sahrens int 17145094Slling spa_import(const char *pool, nvlist_t *config, nvlist_t *props) 1715789Sahrens { 1716789Sahrens spa_t *spa; 17175094Slling char *altroot = NULL; 1718789Sahrens int error; 17192082Seschrock nvlist_t *nvroot; 17202082Seschrock nvlist_t **spares; 17212082Seschrock uint_t nspares; 1722789Sahrens 1723789Sahrens /* 1724789Sahrens * If a pool with this name exists, return failure. 1725789Sahrens */ 1726789Sahrens mutex_enter(&spa_namespace_lock); 1727789Sahrens if (spa_lookup(pool) != NULL) { 1728789Sahrens mutex_exit(&spa_namespace_lock); 1729789Sahrens return (EEXIST); 1730789Sahrens } 1731789Sahrens 1732789Sahrens /* 17331635Sbonwick * Create and initialize the spa structure. 1734789Sahrens */ 17355094Slling (void) nvlist_lookup_string(props, 17365094Slling zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 17371635Sbonwick spa = spa_add(pool, altroot); 1738789Sahrens spa_activate(spa); 1739789Sahrens 1740789Sahrens /* 17411635Sbonwick * Pass off the heavy lifting to spa_load(). 17421732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 17431732Sbonwick * is actually the one to trust when doing an import. 17441601Sbonwick */ 17451732Sbonwick error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 1746789Sahrens 17472082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 17482082Seschrock /* 17492082Seschrock * Toss any existing sparelist, as it doesn't have any validity anymore, 17502082Seschrock * and conflicts with spa_has_spare(). 17512082Seschrock */ 17522082Seschrock if (spa->spa_sparelist) { 17532082Seschrock nvlist_free(spa->spa_sparelist); 17542082Seschrock spa->spa_sparelist = NULL; 17552082Seschrock spa_load_spares(spa); 17562082Seschrock } 17572082Seschrock 17582082Seschrock VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 17592082Seschrock &nvroot) == 0); 17605094Slling if (error == 0) { 17612082Seschrock error = spa_validate_spares(spa, nvroot, -1ULL, 17622082Seschrock VDEV_ALLOC_SPARE); 17635094Slling } 17642082Seschrock spa_config_exit(spa, FTAG); 17652082Seschrock 17665094Slling if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { 1767789Sahrens spa_unload(spa); 1768789Sahrens spa_deactivate(spa); 1769789Sahrens spa_remove(spa); 1770789Sahrens mutex_exit(&spa_namespace_lock); 1771789Sahrens return (error); 1772789Sahrens } 1773789Sahrens 17741635Sbonwick /* 17752082Seschrock * Override any spares as specified by the user, as these may have 17762082Seschrock * correct device names/devids, etc. 17772082Seschrock */ 17782082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 17792082Seschrock &spares, &nspares) == 0) { 17802082Seschrock if (spa->spa_sparelist) 17812082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 17822082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 17832082Seschrock else 17842082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 17852082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 17862082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 17872082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 17882082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 17892082Seschrock spa_load_spares(spa); 17902082Seschrock spa_config_exit(spa, FTAG); 17912082Seschrock spa->spa_sync_spares = B_TRUE; 17922082Seschrock } 17932082Seschrock 17942082Seschrock /* 17951635Sbonwick * Update the config cache to include the newly-imported pool. 17961635Sbonwick */ 17974627Sck153898 if (spa_mode & FWRITE) 17984627Sck153898 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 17991635Sbonwick 1800789Sahrens /* 1801789Sahrens * Resilver anything that's out of date. 1802789Sahrens */ 1803789Sahrens if (spa_mode & FWRITE) 1804789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1805789Sahrens 18064451Seschrock mutex_exit(&spa_namespace_lock); 18074451Seschrock 1808789Sahrens return (0); 1809789Sahrens } 1810789Sahrens 1811789Sahrens /* 1812789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 1813789Sahrens * to get the vdev stats associated with the imported devices. 1814789Sahrens */ 1815789Sahrens #define TRYIMPORT_NAME "$import" 1816789Sahrens 1817789Sahrens nvlist_t * 1818789Sahrens spa_tryimport(nvlist_t *tryconfig) 1819789Sahrens { 1820789Sahrens nvlist_t *config = NULL; 1821789Sahrens char *poolname; 1822789Sahrens spa_t *spa; 1823789Sahrens uint64_t state; 1824789Sahrens 1825789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 1826789Sahrens return (NULL); 1827789Sahrens 1828789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 1829789Sahrens return (NULL); 1830789Sahrens 18311635Sbonwick /* 18321635Sbonwick * Create and initialize the spa structure. 18331635Sbonwick */ 1834789Sahrens mutex_enter(&spa_namespace_lock); 18351635Sbonwick spa = spa_add(TRYIMPORT_NAME, NULL); 1836789Sahrens spa_activate(spa); 1837789Sahrens 1838789Sahrens /* 18391635Sbonwick * Pass off the heavy lifting to spa_load(). 18401732Sbonwick * Pass TRUE for mosconfig because the user-supplied config 18411732Sbonwick * is actually the one to trust when doing an import. 1842789Sahrens */ 18431732Sbonwick (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 1844789Sahrens 1845789Sahrens /* 1846789Sahrens * If 'tryconfig' was at least parsable, return the current config. 1847789Sahrens */ 1848789Sahrens if (spa->spa_root_vdev != NULL) { 18491635Sbonwick spa_config_enter(spa, RW_READER, FTAG); 1850789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 18511635Sbonwick spa_config_exit(spa, FTAG); 1852789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 1853789Sahrens poolname) == 0); 1854789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 1855789Sahrens state) == 0); 18563975Sek110237 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 18573975Sek110237 spa->spa_uberblock.ub_timestamp) == 0); 18582082Seschrock 18592082Seschrock /* 18602082Seschrock * Add the list of hot spares. 18612082Seschrock */ 18622082Seschrock spa_add_spares(spa, config); 1863789Sahrens } 1864789Sahrens 1865789Sahrens spa_unload(spa); 1866789Sahrens spa_deactivate(spa); 1867789Sahrens spa_remove(spa); 1868789Sahrens mutex_exit(&spa_namespace_lock); 1869789Sahrens 1870789Sahrens return (config); 1871789Sahrens } 1872789Sahrens 1873789Sahrens /* 1874789Sahrens * Pool export/destroy 1875789Sahrens * 1876789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 1877789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 1878789Sahrens * update the pool state and sync all the labels to disk, removing the 1879789Sahrens * configuration from the cache afterwards. 1880789Sahrens */ 1881789Sahrens static int 18821775Sbillm spa_export_common(char *pool, int new_state, nvlist_t **oldconfig) 1883789Sahrens { 1884789Sahrens spa_t *spa; 1885789Sahrens 18861775Sbillm if (oldconfig) 18871775Sbillm *oldconfig = NULL; 18881775Sbillm 1889789Sahrens if (!(spa_mode & FWRITE)) 1890789Sahrens return (EROFS); 1891789Sahrens 1892789Sahrens mutex_enter(&spa_namespace_lock); 1893789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 1894789Sahrens mutex_exit(&spa_namespace_lock); 1895789Sahrens return (ENOENT); 1896789Sahrens } 1897789Sahrens 1898789Sahrens /* 18991544Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 19001544Seschrock * reacquire the namespace lock, and see if we can export. 19011544Seschrock */ 19021544Seschrock spa_open_ref(spa, FTAG); 19031544Seschrock mutex_exit(&spa_namespace_lock); 19041544Seschrock spa_async_suspend(spa); 19051544Seschrock mutex_enter(&spa_namespace_lock); 19061544Seschrock spa_close(spa, FTAG); 19071544Seschrock 19081544Seschrock /* 1909789Sahrens * The pool will be in core if it's openable, 1910789Sahrens * in which case we can modify its state. 1911789Sahrens */ 1912789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 1913789Sahrens /* 1914789Sahrens * Objsets may be open only because they're dirty, so we 1915789Sahrens * have to force it to sync before checking spa_refcnt. 1916789Sahrens */ 1917789Sahrens spa_scrub_suspend(spa); 1918789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 1919789Sahrens 19201544Seschrock /* 19211544Seschrock * A pool cannot be exported or destroyed if there are active 19221544Seschrock * references. If we are resetting a pool, allow references by 19231544Seschrock * fault injection handlers. 19241544Seschrock */ 19251544Seschrock if (!spa_refcount_zero(spa) || 19261544Seschrock (spa->spa_inject_ref != 0 && 19271544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 1928789Sahrens spa_scrub_resume(spa); 19291544Seschrock spa_async_resume(spa); 1930789Sahrens mutex_exit(&spa_namespace_lock); 1931789Sahrens return (EBUSY); 1932789Sahrens } 1933789Sahrens 1934789Sahrens spa_scrub_resume(spa); 1935789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1936789Sahrens 1937789Sahrens /* 1938789Sahrens * We want this to be reflected on every label, 1939789Sahrens * so mark them all dirty. spa_unload() will do the 1940789Sahrens * final sync that pushes these changes out. 1941789Sahrens */ 19421544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 19431601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 19441544Seschrock spa->spa_state = new_state; 19451635Sbonwick spa->spa_final_txg = spa_last_synced_txg(spa) + 1; 19461544Seschrock vdev_config_dirty(spa->spa_root_vdev); 19471601Sbonwick spa_config_exit(spa, FTAG); 19481544Seschrock } 1949789Sahrens } 1950789Sahrens 19514451Seschrock spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 19524451Seschrock 1953789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1954789Sahrens spa_unload(spa); 1955789Sahrens spa_deactivate(spa); 1956789Sahrens } 1957789Sahrens 19581775Sbillm if (oldconfig && spa->spa_config) 19591775Sbillm VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 19601775Sbillm 19611544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 1962*5363Seschrock spa_config_check(spa->spa_config_dir, 1963*5363Seschrock spa->spa_config_file); 19641544Seschrock spa_remove(spa); 19651544Seschrock spa_config_sync(); 19661544Seschrock } 1967789Sahrens mutex_exit(&spa_namespace_lock); 1968789Sahrens 1969789Sahrens return (0); 1970789Sahrens } 1971789Sahrens 1972789Sahrens /* 1973789Sahrens * Destroy a storage pool. 1974789Sahrens */ 1975789Sahrens int 1976789Sahrens spa_destroy(char *pool) 1977789Sahrens { 19781775Sbillm return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL)); 1979789Sahrens } 1980789Sahrens 1981789Sahrens /* 1982789Sahrens * Export a storage pool. 1983789Sahrens */ 1984789Sahrens int 19851775Sbillm spa_export(char *pool, nvlist_t **oldconfig) 1986789Sahrens { 19871775Sbillm return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig)); 1988789Sahrens } 1989789Sahrens 1990789Sahrens /* 19911544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 19921544Seschrock * from the namespace in any way. 19931544Seschrock */ 19941544Seschrock int 19951544Seschrock spa_reset(char *pool) 19961544Seschrock { 19971775Sbillm return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL)); 19981544Seschrock } 19991544Seschrock 20001544Seschrock 20011544Seschrock /* 2002789Sahrens * ========================================================================== 2003789Sahrens * Device manipulation 2004789Sahrens * ========================================================================== 2005789Sahrens */ 2006789Sahrens 2007789Sahrens /* 20084527Sperrin * Add a device to a storage pool. 2009789Sahrens */ 2010789Sahrens int 2011789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 2012789Sahrens { 2013789Sahrens uint64_t txg; 20141635Sbonwick int c, error; 2015789Sahrens vdev_t *rvd = spa->spa_root_vdev; 20161585Sbonwick vdev_t *vd, *tvd; 20172082Seschrock nvlist_t **spares; 20182082Seschrock uint_t i, nspares; 2019789Sahrens 2020789Sahrens txg = spa_vdev_enter(spa); 2021789Sahrens 20222082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 20232082Seschrock VDEV_ALLOC_ADD)) != 0) 20242082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 20252082Seschrock 20263377Seschrock spa->spa_pending_vdev = vd; 2027789Sahrens 20282082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 20292082Seschrock &spares, &nspares) != 0) 20302082Seschrock nspares = 0; 20312082Seschrock 20323377Seschrock if (vd->vdev_children == 0 && nspares == 0) { 20333377Seschrock spa->spa_pending_vdev = NULL; 20342082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 20353377Seschrock } 20362082Seschrock 20372082Seschrock if (vd->vdev_children != 0) { 20383377Seschrock if ((error = vdev_create(vd, txg, B_FALSE)) != 0) { 20393377Seschrock spa->spa_pending_vdev = NULL; 20402082Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 20412082Seschrock } 20422082Seschrock } 20432082Seschrock 20443377Seschrock /* 20453377Seschrock * We must validate the spares after checking the children. Otherwise, 20463377Seschrock * vdev_inuse() will blindly overwrite the spare. 20473377Seschrock */ 20483377Seschrock if ((error = spa_validate_spares(spa, nvroot, txg, 20493377Seschrock VDEV_ALLOC_ADD)) != 0) { 20503377Seschrock spa->spa_pending_vdev = NULL; 20513377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 20523377Seschrock } 20533377Seschrock 20543377Seschrock spa->spa_pending_vdev = NULL; 20553377Seschrock 20563377Seschrock /* 20573377Seschrock * Transfer each new top-level vdev from vd to rvd. 20583377Seschrock */ 20593377Seschrock for (c = 0; c < vd->vdev_children; c++) { 20603377Seschrock tvd = vd->vdev_child[c]; 20613377Seschrock vdev_remove_child(vd, tvd); 20623377Seschrock tvd->vdev_id = rvd->vdev_children; 20633377Seschrock vdev_add_child(rvd, tvd); 20643377Seschrock vdev_config_dirty(tvd); 20653377Seschrock } 20663377Seschrock 20672082Seschrock if (nspares != 0) { 20682082Seschrock if (spa->spa_sparelist != NULL) { 20692082Seschrock nvlist_t **oldspares; 20702082Seschrock uint_t oldnspares; 20712082Seschrock nvlist_t **newspares; 20722082Seschrock 20732082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 20742082Seschrock ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); 20752082Seschrock 20762082Seschrock newspares = kmem_alloc(sizeof (void *) * 20772082Seschrock (nspares + oldnspares), KM_SLEEP); 20782082Seschrock for (i = 0; i < oldnspares; i++) 20792082Seschrock VERIFY(nvlist_dup(oldspares[i], 20802082Seschrock &newspares[i], KM_SLEEP) == 0); 20812082Seschrock for (i = 0; i < nspares; i++) 20822082Seschrock VERIFY(nvlist_dup(spares[i], 20832082Seschrock &newspares[i + oldnspares], 20842082Seschrock KM_SLEEP) == 0); 20852082Seschrock 20862082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, 20872082Seschrock ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 20882082Seschrock 20892082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 20902082Seschrock ZPOOL_CONFIG_SPARES, newspares, 20912082Seschrock nspares + oldnspares) == 0); 20922082Seschrock for (i = 0; i < oldnspares + nspares; i++) 20932082Seschrock nvlist_free(newspares[i]); 20942082Seschrock kmem_free(newspares, (oldnspares + nspares) * 20952082Seschrock sizeof (void *)); 20962082Seschrock } else { 20972082Seschrock VERIFY(nvlist_alloc(&spa->spa_sparelist, 20982082Seschrock NV_UNIQUE_NAME, KM_SLEEP) == 0); 20992082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, 21002082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 21012082Seschrock } 21022082Seschrock 21032082Seschrock spa_load_spares(spa); 21042082Seschrock spa->spa_sync_spares = B_TRUE; 2105789Sahrens } 2106789Sahrens 2107789Sahrens /* 21081585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 21091585Sbonwick * If other threads start allocating from these vdevs before we 21101585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 21111585Sbonwick * fail to open the pool because there are DVAs that the config cache 21121585Sbonwick * can't translate. Therefore, we first add the vdevs without 21131585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 21141635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 21151585Sbonwick * 21161585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 21171585Sbonwick * if we lose power at any point in this sequence, the remaining 21181585Sbonwick * steps will be completed the next time we load the pool. 2119789Sahrens */ 21201635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 21211585Sbonwick 21221635Sbonwick mutex_enter(&spa_namespace_lock); 21231635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 21241635Sbonwick mutex_exit(&spa_namespace_lock); 2125789Sahrens 21261635Sbonwick return (0); 2127789Sahrens } 2128789Sahrens 2129789Sahrens /* 2130789Sahrens * Attach a device to a mirror. The arguments are the path to any device 2131789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 2132789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 2133789Sahrens * 2134789Sahrens * If 'replacing' is specified, the new device is intended to replace the 2135789Sahrens * existing device; in this case the two devices are made into their own 21364451Seschrock * mirror using the 'replacing' vdev, which is functionally identical to 2137789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 2138789Sahrens * extra rules: you can't attach to it after it's been created, and upon 2139789Sahrens * completion of resilvering, the first disk (the one being replaced) 2140789Sahrens * is automatically detached. 2141789Sahrens */ 2142789Sahrens int 21431544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 2144789Sahrens { 2145789Sahrens uint64_t txg, open_txg; 2146789Sahrens int error; 2147789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2148789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 21492082Seschrock vdev_ops_t *pvops; 21504527Sperrin int is_log; 2151789Sahrens 2152789Sahrens txg = spa_vdev_enter(spa); 2153789Sahrens 21541544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 2155789Sahrens 2156789Sahrens if (oldvd == NULL) 2157789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2158789Sahrens 21591585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 21601585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 21611585Sbonwick 2162789Sahrens pvd = oldvd->vdev_parent; 2163789Sahrens 21642082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 21654451Seschrock VDEV_ALLOC_ADD)) != 0) 21664451Seschrock return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 21674451Seschrock 21684451Seschrock if (newrootvd->vdev_children != 1) 2169789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2170789Sahrens 2171789Sahrens newvd = newrootvd->vdev_child[0]; 2172789Sahrens 2173789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 2174789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 2175789Sahrens 21762082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 2177789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 2178789Sahrens 21794527Sperrin /* 21804527Sperrin * Spares can't replace logs 21814527Sperrin */ 21824527Sperrin is_log = oldvd->vdev_islog; 21834527Sperrin if (is_log && newvd->vdev_isspare) 21844527Sperrin return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21854527Sperrin 21862082Seschrock if (!replacing) { 21872082Seschrock /* 21882082Seschrock * For attach, the only allowable parent is a mirror or the root 21892082Seschrock * vdev. 21902082Seschrock */ 21912082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 21922082Seschrock pvd->vdev_ops != &vdev_root_ops) 21932082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 21942082Seschrock 21952082Seschrock pvops = &vdev_mirror_ops; 21962082Seschrock } else { 21972082Seschrock /* 21982082Seschrock * Active hot spares can only be replaced by inactive hot 21992082Seschrock * spares. 22002082Seschrock */ 22012082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 22022082Seschrock pvd->vdev_child[1] == oldvd && 22032082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 22042082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 22052082Seschrock 22062082Seschrock /* 22072082Seschrock * If the source is a hot spare, and the parent isn't already a 22082082Seschrock * spare, then we want to create a new hot spare. Otherwise, we 22093377Seschrock * want to create a replacing vdev. The user is not allowed to 22103377Seschrock * attach to a spared vdev child unless the 'isspare' state is 22113377Seschrock * the same (spare replaces spare, non-spare replaces 22123377Seschrock * non-spare). 22132082Seschrock */ 22142082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 22152082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 22163377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 22173377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 22183377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 22192082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 22202082Seschrock newvd->vdev_isspare) 22212082Seschrock pvops = &vdev_spare_ops; 22222082Seschrock else 22232082Seschrock pvops = &vdev_replacing_ops; 22242082Seschrock } 22252082Seschrock 22261175Slling /* 22271175Slling * Compare the new device size with the replaceable/attachable 22281175Slling * device size. 22291175Slling */ 22301175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 2231789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 2232789Sahrens 22331732Sbonwick /* 22341732Sbonwick * The new device cannot have a higher alignment requirement 22351732Sbonwick * than the top-level vdev. 22361732Sbonwick */ 22371732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 2238789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 2239789Sahrens 2240789Sahrens /* 2241789Sahrens * If this is an in-place replacement, update oldvd's path and devid 2242789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 2243789Sahrens */ 2244789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 2245789Sahrens spa_strfree(oldvd->vdev_path); 2246789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 2247789Sahrens KM_SLEEP); 2248789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 2249789Sahrens newvd->vdev_path, "old"); 2250789Sahrens if (oldvd->vdev_devid != NULL) { 2251789Sahrens spa_strfree(oldvd->vdev_devid); 2252789Sahrens oldvd->vdev_devid = NULL; 2253789Sahrens } 2254789Sahrens } 2255789Sahrens 2256789Sahrens /* 22572082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 22582082Seschrock * mirror/replacing/spare vdev above oldvd. 2259789Sahrens */ 2260789Sahrens if (pvd->vdev_ops != pvops) 2261789Sahrens pvd = vdev_add_parent(oldvd, pvops); 2262789Sahrens 2263789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 2264789Sahrens ASSERT(pvd->vdev_ops == pvops); 2265789Sahrens ASSERT(oldvd->vdev_parent == pvd); 2266789Sahrens 2267789Sahrens /* 2268789Sahrens * Extract the new device from its root and add it to pvd. 2269789Sahrens */ 2270789Sahrens vdev_remove_child(newrootvd, newvd); 2271789Sahrens newvd->vdev_id = pvd->vdev_children; 2272789Sahrens vdev_add_child(pvd, newvd); 2273789Sahrens 22741544Seschrock /* 22751544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 22761544Seschrock * the addition of newvd may have decreased our parent's asize. 22771544Seschrock */ 22781544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 22791544Seschrock 2280789Sahrens tvd = newvd->vdev_top; 2281789Sahrens ASSERT(pvd->vdev_top == tvd); 2282789Sahrens ASSERT(tvd->vdev_parent == rvd); 2283789Sahrens 2284789Sahrens vdev_config_dirty(tvd); 2285789Sahrens 2286789Sahrens /* 2287789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 2288789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 2289789Sahrens */ 2290789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 2291789Sahrens 2292789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 2293789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 2294789Sahrens open_txg - TXG_INITIAL + 1); 2295789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 2296789Sahrens 22973377Seschrock if (newvd->vdev_isspare) 22983377Seschrock spa_spare_activate(newvd); 22991544Seschrock 2300789Sahrens /* 2301789Sahrens * Mark newvd's DTL dirty in this txg. 2302789Sahrens */ 23031732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 2304789Sahrens 2305789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 2306789Sahrens 2307789Sahrens /* 23084451Seschrock * Kick off a resilver to update newvd. We need to grab the namespace 23094451Seschrock * lock because spa_scrub() needs to post a sysevent with the pool name. 2310789Sahrens */ 23114451Seschrock mutex_enter(&spa_namespace_lock); 2312789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 23134451Seschrock mutex_exit(&spa_namespace_lock); 2314789Sahrens 2315789Sahrens return (0); 2316789Sahrens } 2317789Sahrens 2318789Sahrens /* 2319789Sahrens * Detach a device from a mirror or replacing vdev. 2320789Sahrens * If 'replace_done' is specified, only detach if the parent 2321789Sahrens * is a replacing vdev. 2322789Sahrens */ 2323789Sahrens int 23241544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 2325789Sahrens { 2326789Sahrens uint64_t txg; 2327789Sahrens int c, t, error; 2328789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2329789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 23302082Seschrock boolean_t unspare = B_FALSE; 23312082Seschrock uint64_t unspare_guid; 2332789Sahrens 2333789Sahrens txg = spa_vdev_enter(spa); 2334789Sahrens 23351544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 2336789Sahrens 2337789Sahrens if (vd == NULL) 2338789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 2339789Sahrens 23401585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 23411585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 23421585Sbonwick 2343789Sahrens pvd = vd->vdev_parent; 2344789Sahrens 2345789Sahrens /* 2346789Sahrens * If replace_done is specified, only remove this device if it's 23472082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 23482082Seschrock * disk can be removed. 2349789Sahrens */ 23502082Seschrock if (replace_done) { 23512082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 23522082Seschrock if (vd->vdev_id != 0) 23532082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 23542082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 23552082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 23562082Seschrock } 23572082Seschrock } 23582082Seschrock 23592082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 23604577Sahrens spa_version(spa) >= SPA_VERSION_SPARES); 2361789Sahrens 2362789Sahrens /* 23632082Seschrock * Only mirror, replacing, and spare vdevs support detach. 2364789Sahrens */ 2365789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 23662082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 23672082Seschrock pvd->vdev_ops != &vdev_spare_ops) 2368789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 2369789Sahrens 2370789Sahrens /* 2371789Sahrens * If there's only one replica, you can't detach it. 2372789Sahrens */ 2373789Sahrens if (pvd->vdev_children <= 1) 2374789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2375789Sahrens 2376789Sahrens /* 2377789Sahrens * If all siblings have non-empty DTLs, this device may have the only 2378789Sahrens * valid copy of the data, which means we cannot safely detach it. 2379789Sahrens * 2380789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 2381789Sahrens * precise DTL check. 2382789Sahrens */ 2383789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 2384789Sahrens uint64_t dirty; 2385789Sahrens 2386789Sahrens cvd = pvd->vdev_child[c]; 2387789Sahrens if (cvd == vd) 2388789Sahrens continue; 2389789Sahrens if (vdev_is_dead(cvd)) 2390789Sahrens continue; 2391789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 2392789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 2393789Sahrens cvd->vdev_dtl_scrub.sm_space; 2394789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 2395789Sahrens if (!dirty) 2396789Sahrens break; 2397789Sahrens } 23982082Seschrock 23992082Seschrock /* 24002082Seschrock * If we are a replacing or spare vdev, then we can always detach the 24012082Seschrock * latter child, as that is how one cancels the operation. 24022082Seschrock */ 24032082Seschrock if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) && 24042082Seschrock c == pvd->vdev_children) 2405789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 2406789Sahrens 2407789Sahrens /* 24082082Seschrock * If we are detaching the original disk from a spare, then it implies 24092082Seschrock * that the spare should become a real disk, and be removed from the 24102082Seschrock * active spare list for the pool. 24112082Seschrock */ 24122082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 24132082Seschrock vd->vdev_id == 0) 24142082Seschrock unspare = B_TRUE; 24152082Seschrock 24162082Seschrock /* 2417789Sahrens * Erase the disk labels so the disk can be used for other things. 2418789Sahrens * This must be done after all other error cases are handled, 2419789Sahrens * but before we disembowel vd (so we can still do I/O to it). 2420789Sahrens * But if we can't do it, don't treat the error as fatal -- 2421789Sahrens * it may be that the unwritability of the disk is the reason 2422789Sahrens * it's being detached! 2423789Sahrens */ 24243377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 2425789Sahrens 2426789Sahrens /* 2427789Sahrens * Remove vd from its parent and compact the parent's children. 2428789Sahrens */ 2429789Sahrens vdev_remove_child(pvd, vd); 2430789Sahrens vdev_compact_children(pvd); 2431789Sahrens 2432789Sahrens /* 2433789Sahrens * Remember one of the remaining children so we can get tvd below. 2434789Sahrens */ 2435789Sahrens cvd = pvd->vdev_child[0]; 2436789Sahrens 2437789Sahrens /* 24382082Seschrock * If we need to remove the remaining child from the list of hot spares, 24392082Seschrock * do it now, marking the vdev as no longer a spare in the process. We 24402082Seschrock * must do this before vdev_remove_parent(), because that can change the 24412082Seschrock * GUID if it creates a new toplevel GUID. 24422082Seschrock */ 24432082Seschrock if (unspare) { 24442082Seschrock ASSERT(cvd->vdev_isspare); 24453377Seschrock spa_spare_remove(cvd); 24462082Seschrock unspare_guid = cvd->vdev_guid; 24472082Seschrock } 24482082Seschrock 24492082Seschrock /* 2450789Sahrens * If the parent mirror/replacing vdev only has one child, 2451789Sahrens * the parent is no longer needed. Remove it from the tree. 2452789Sahrens */ 2453789Sahrens if (pvd->vdev_children == 1) 2454789Sahrens vdev_remove_parent(cvd); 2455789Sahrens 2456789Sahrens /* 2457789Sahrens * We don't set tvd until now because the parent we just removed 2458789Sahrens * may have been the previous top-level vdev. 2459789Sahrens */ 2460789Sahrens tvd = cvd->vdev_top; 2461789Sahrens ASSERT(tvd->vdev_parent == rvd); 2462789Sahrens 2463789Sahrens /* 24643377Seschrock * Reevaluate the parent vdev state. 2465789Sahrens */ 24664451Seschrock vdev_propagate_state(cvd); 2467789Sahrens 2468789Sahrens /* 24693377Seschrock * If the device we just detached was smaller than the others, it may be 24703377Seschrock * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() 24713377Seschrock * can't fail because the existing metaslabs are already in core, so 24723377Seschrock * there's nothing to read from disk. 2473789Sahrens */ 24741732Sbonwick VERIFY(vdev_metaslab_init(tvd, txg) == 0); 2475789Sahrens 2476789Sahrens vdev_config_dirty(tvd); 2477789Sahrens 2478789Sahrens /* 24793377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 24803377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 24813377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 24823377Seschrock * prevent vd from being accessed after it's freed. 2483789Sahrens */ 2484789Sahrens for (t = 0; t < TXG_SIZE; t++) 2485789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 24861732Sbonwick vd->vdev_detached = B_TRUE; 24871732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 2488789Sahrens 24894451Seschrock spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 24904451Seschrock 24912082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 24922082Seschrock 24932082Seschrock /* 24943377Seschrock * If this was the removal of the original device in a hot spare vdev, 24953377Seschrock * then we want to go through and remove the device from the hot spare 24963377Seschrock * list of every other pool. 24972082Seschrock */ 24982082Seschrock if (unspare) { 24992082Seschrock spa = NULL; 25002082Seschrock mutex_enter(&spa_namespace_lock); 25012082Seschrock while ((spa = spa_next(spa)) != NULL) { 25022082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 25032082Seschrock continue; 25042082Seschrock 25052082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 25062082Seschrock } 25072082Seschrock mutex_exit(&spa_namespace_lock); 25082082Seschrock } 25092082Seschrock 25102082Seschrock return (error); 25112082Seschrock } 25122082Seschrock 25132082Seschrock /* 25142082Seschrock * Remove a device from the pool. Currently, this supports removing only hot 25152082Seschrock * spares. 25162082Seschrock */ 25172082Seschrock int 25182082Seschrock spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 25192082Seschrock { 25202082Seschrock vdev_t *vd; 25212082Seschrock nvlist_t **spares, *nv, **newspares; 25222082Seschrock uint_t i, j, nspares; 25232082Seschrock int ret = 0; 25242082Seschrock 25252082Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 25262082Seschrock 25272082Seschrock vd = spa_lookup_by_guid(spa, guid); 25282082Seschrock 25292082Seschrock nv = NULL; 25302082Seschrock if (spa->spa_spares != NULL && 25312082Seschrock nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 25322082Seschrock &spares, &nspares) == 0) { 25332082Seschrock for (i = 0; i < nspares; i++) { 25342082Seschrock uint64_t theguid; 25352082Seschrock 25362082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 25372082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 25382082Seschrock if (theguid == guid) { 25392082Seschrock nv = spares[i]; 25402082Seschrock break; 25412082Seschrock } 25422082Seschrock } 25432082Seschrock } 25442082Seschrock 25452082Seschrock /* 25462082Seschrock * We only support removing a hot spare, and only if it's not currently 25472082Seschrock * in use in this pool. 25482082Seschrock */ 25492082Seschrock if (nv == NULL && vd == NULL) { 25502082Seschrock ret = ENOENT; 25512082Seschrock goto out; 25522082Seschrock } 25532082Seschrock 25542082Seschrock if (nv == NULL && vd != NULL) { 25552082Seschrock ret = ENOTSUP; 25562082Seschrock goto out; 25572082Seschrock } 25582082Seschrock 25592082Seschrock if (!unspare && nv != NULL && vd != NULL) { 25602082Seschrock ret = EBUSY; 25612082Seschrock goto out; 25622082Seschrock } 25632082Seschrock 25642082Seschrock if (nspares == 1) { 25652082Seschrock newspares = NULL; 25662082Seschrock } else { 25672082Seschrock newspares = kmem_alloc((nspares - 1) * sizeof (void *), 25682082Seschrock KM_SLEEP); 25692082Seschrock for (i = 0, j = 0; i < nspares; i++) { 25702082Seschrock if (spares[i] != nv) 25712082Seschrock VERIFY(nvlist_dup(spares[i], 25722082Seschrock &newspares[j++], KM_SLEEP) == 0); 25732082Seschrock } 25742082Seschrock } 25752082Seschrock 25762082Seschrock VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 25772082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 25782082Seschrock VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, 25792082Seschrock newspares, nspares - 1) == 0); 25802082Seschrock for (i = 0; i < nspares - 1; i++) 25812082Seschrock nvlist_free(newspares[i]); 25822082Seschrock kmem_free(newspares, (nspares - 1) * sizeof (void *)); 25832082Seschrock spa_load_spares(spa); 25842082Seschrock spa->spa_sync_spares = B_TRUE; 25852082Seschrock 25862082Seschrock out: 25872082Seschrock spa_config_exit(spa, FTAG); 25882082Seschrock 25892082Seschrock return (ret); 2590789Sahrens } 2591789Sahrens 2592789Sahrens /* 25934451Seschrock * Find any device that's done replacing, or a vdev marked 'unspare' that's 25944451Seschrock * current spared, so we can detach it. 2595789Sahrens */ 25961544Seschrock static vdev_t * 25974451Seschrock spa_vdev_resilver_done_hunt(vdev_t *vd) 2598789Sahrens { 25991544Seschrock vdev_t *newvd, *oldvd; 2600789Sahrens int c; 2601789Sahrens 26021544Seschrock for (c = 0; c < vd->vdev_children; c++) { 26034451Seschrock oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 26041544Seschrock if (oldvd != NULL) 26051544Seschrock return (oldvd); 26061544Seschrock } 2607789Sahrens 26084451Seschrock /* 26094451Seschrock * Check for a completed replacement. 26104451Seschrock */ 2611789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 26121544Seschrock oldvd = vd->vdev_child[0]; 26131544Seschrock newvd = vd->vdev_child[1]; 2614789Sahrens 26151544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 26161544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 26171544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 26181544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 26191544Seschrock return (oldvd); 26201544Seschrock } 26211544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 26221544Seschrock } 2623789Sahrens 26244451Seschrock /* 26254451Seschrock * Check for a completed resilver with the 'unspare' flag set. 26264451Seschrock */ 26274451Seschrock if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 26284451Seschrock newvd = vd->vdev_child[0]; 26294451Seschrock oldvd = vd->vdev_child[1]; 26304451Seschrock 26314451Seschrock mutex_enter(&newvd->vdev_dtl_lock); 26324451Seschrock if (newvd->vdev_unspare && 26334451Seschrock newvd->vdev_dtl_map.sm_space == 0 && 26344451Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 26354451Seschrock newvd->vdev_unspare = 0; 26364451Seschrock mutex_exit(&newvd->vdev_dtl_lock); 26374451Seschrock return (oldvd); 26384451Seschrock } 26394451Seschrock mutex_exit(&newvd->vdev_dtl_lock); 26404451Seschrock } 26414451Seschrock 26421544Seschrock return (NULL); 2643789Sahrens } 2644789Sahrens 26451544Seschrock static void 26464451Seschrock spa_vdev_resilver_done(spa_t *spa) 2647789Sahrens { 26481544Seschrock vdev_t *vd; 26492082Seschrock vdev_t *pvd; 26501544Seschrock uint64_t guid; 26512082Seschrock uint64_t pguid = 0; 2652789Sahrens 26531544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2654789Sahrens 26554451Seschrock while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 26561544Seschrock guid = vd->vdev_guid; 26572082Seschrock /* 26582082Seschrock * If we have just finished replacing a hot spared device, then 26592082Seschrock * we need to detach the parent's first child (the original hot 26602082Seschrock * spare) as well. 26612082Seschrock */ 26622082Seschrock pvd = vd->vdev_parent; 26632082Seschrock if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && 26642082Seschrock pvd->vdev_id == 0) { 26652082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 26662082Seschrock ASSERT(pvd->vdev_parent->vdev_children == 2); 26672082Seschrock pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; 26682082Seschrock } 26691544Seschrock spa_config_exit(spa, FTAG); 26701544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 26711544Seschrock return; 26722082Seschrock if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) 26732082Seschrock return; 26741544Seschrock spa_config_enter(spa, RW_READER, FTAG); 2675789Sahrens } 2676789Sahrens 26771544Seschrock spa_config_exit(spa, FTAG); 2678789Sahrens } 2679789Sahrens 2680789Sahrens /* 26811354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 26821354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 26831354Seschrock */ 26841354Seschrock int 26851354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 26861354Seschrock { 26871354Seschrock vdev_t *rvd, *vd; 26881354Seschrock uint64_t txg; 26891354Seschrock 26901354Seschrock rvd = spa->spa_root_vdev; 26911354Seschrock 26921354Seschrock txg = spa_vdev_enter(spa); 26931354Seschrock 26942082Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 26952082Seschrock /* 26962082Seschrock * Determine if this is a reference to a hot spare. In that 26972082Seschrock * case, update the path as stored in the spare list. 26982082Seschrock */ 26992082Seschrock nvlist_t **spares; 27002082Seschrock uint_t i, nspares; 27012082Seschrock if (spa->spa_sparelist != NULL) { 27022082Seschrock VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, 27032082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 27042082Seschrock for (i = 0; i < nspares; i++) { 27052082Seschrock uint64_t theguid; 27062082Seschrock VERIFY(nvlist_lookup_uint64(spares[i], 27072082Seschrock ZPOOL_CONFIG_GUID, &theguid) == 0); 27082082Seschrock if (theguid == guid) 27092082Seschrock break; 27102082Seschrock } 27112082Seschrock 27122082Seschrock if (i == nspares) 27132082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 27142082Seschrock 27152082Seschrock VERIFY(nvlist_add_string(spares[i], 27162082Seschrock ZPOOL_CONFIG_PATH, newpath) == 0); 27172082Seschrock spa_load_spares(spa); 27182082Seschrock spa->spa_sync_spares = B_TRUE; 27192082Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 27202082Seschrock } else { 27212082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 27222082Seschrock } 27232082Seschrock } 27241354Seschrock 27251585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 27261585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 27271585Sbonwick 27281354Seschrock spa_strfree(vd->vdev_path); 27291354Seschrock vd->vdev_path = spa_strdup(newpath); 27301354Seschrock 27311354Seschrock vdev_config_dirty(vd->vdev_top); 27321354Seschrock 27331354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 27341354Seschrock } 27351354Seschrock 27361354Seschrock /* 2737789Sahrens * ========================================================================== 2738789Sahrens * SPA Scrubbing 2739789Sahrens * ========================================================================== 2740789Sahrens */ 2741789Sahrens 2742789Sahrens static void 2743789Sahrens spa_scrub_io_done(zio_t *zio) 2744789Sahrens { 2745789Sahrens spa_t *spa = zio->io_spa; 2746789Sahrens 27474309Smaybee arc_data_buf_free(zio->io_data, zio->io_size); 2748789Sahrens 2749789Sahrens mutex_enter(&spa->spa_scrub_lock); 27501544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 27511775Sbillm vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev; 2752789Sahrens spa->spa_scrub_errors++; 2753789Sahrens mutex_enter(&vd->vdev_stat_lock); 2754789Sahrens vd->vdev_stat.vs_scrub_errors++; 2755789Sahrens mutex_exit(&vd->vdev_stat_lock); 2756789Sahrens } 27573697Smishra 27583697Smishra if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight) 27591544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 27603697Smishra 27613697Smishra ASSERT(spa->spa_scrub_inflight >= 0); 27623697Smishra 27631544Seschrock mutex_exit(&spa->spa_scrub_lock); 2764789Sahrens } 2765789Sahrens 2766789Sahrens static void 27671544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 27681544Seschrock zbookmark_t *zb) 2769789Sahrens { 2770789Sahrens size_t size = BP_GET_LSIZE(bp); 27713697Smishra void *data; 2772789Sahrens 2773789Sahrens mutex_enter(&spa->spa_scrub_lock); 27743697Smishra /* 27753697Smishra * Do not give too much work to vdev(s). 27763697Smishra */ 27773697Smishra while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) { 27783697Smishra cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 27793697Smishra } 2780789Sahrens spa->spa_scrub_inflight++; 2781789Sahrens mutex_exit(&spa->spa_scrub_lock); 2782789Sahrens 27834309Smaybee data = arc_data_buf_alloc(size); 27843697Smishra 27851544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 27861544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 27871544Seschrock 27881807Sbonwick flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; 27891544Seschrock 2790789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 27911544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 2792789Sahrens } 2793789Sahrens 2794789Sahrens /* ARGSUSED */ 2795789Sahrens static int 2796789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 2797789Sahrens { 2798789Sahrens blkptr_t *bp = &bc->bc_blkptr; 27991775Sbillm vdev_t *vd = spa->spa_root_vdev; 28001775Sbillm dva_t *dva = bp->blk_dva; 28011775Sbillm int needs_resilver = B_FALSE; 28021775Sbillm int d; 2803789Sahrens 28041775Sbillm if (bc->bc_errno) { 2805789Sahrens /* 2806789Sahrens * We can't scrub this block, but we can continue to scrub 2807789Sahrens * the rest of the pool. Note the error and move along. 2808789Sahrens */ 2809789Sahrens mutex_enter(&spa->spa_scrub_lock); 2810789Sahrens spa->spa_scrub_errors++; 2811789Sahrens mutex_exit(&spa->spa_scrub_lock); 2812789Sahrens 28131775Sbillm mutex_enter(&vd->vdev_stat_lock); 28141775Sbillm vd->vdev_stat.vs_scrub_errors++; 28151775Sbillm mutex_exit(&vd->vdev_stat_lock); 2816789Sahrens 2817789Sahrens return (ERESTART); 2818789Sahrens } 2819789Sahrens 2820789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 2821789Sahrens 28221775Sbillm for (d = 0; d < BP_GET_NDVAS(bp); d++) { 28231775Sbillm vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d])); 28241775Sbillm 28251775Sbillm ASSERT(vd != NULL); 28261775Sbillm 28271775Sbillm /* 28281775Sbillm * Keep track of how much data we've examined so that 28291775Sbillm * zpool(1M) status can make useful progress reports. 28301775Sbillm */ 28311775Sbillm mutex_enter(&vd->vdev_stat_lock); 28321775Sbillm vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]); 28331775Sbillm mutex_exit(&vd->vdev_stat_lock); 2834789Sahrens 28351775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 28361775Sbillm if (DVA_GET_GANG(&dva[d])) { 28371775Sbillm /* 28381775Sbillm * Gang members may be spread across multiple 28391775Sbillm * vdevs, so the best we can do is look at the 28401775Sbillm * pool-wide DTL. 28411775Sbillm * XXX -- it would be better to change our 28421775Sbillm * allocation policy to ensure that this can't 28431775Sbillm * happen. 28441775Sbillm */ 28451775Sbillm vd = spa->spa_root_vdev; 28461775Sbillm } 28471775Sbillm if (vdev_dtl_contains(&vd->vdev_dtl_map, 28481775Sbillm bp->blk_birth, 1)) 28491775Sbillm needs_resilver = B_TRUE; 2850789Sahrens } 28511775Sbillm } 28521775Sbillm 28531775Sbillm if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING) 2854789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 28551544Seschrock ZIO_FLAG_SCRUB, &bc->bc_bookmark); 28561775Sbillm else if (needs_resilver) 28571775Sbillm spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 28581775Sbillm ZIO_FLAG_RESILVER, &bc->bc_bookmark); 2859789Sahrens 2860789Sahrens return (0); 2861789Sahrens } 2862789Sahrens 2863789Sahrens static void 2864789Sahrens spa_scrub_thread(spa_t *spa) 2865789Sahrens { 2866789Sahrens callb_cpr_t cprinfo; 2867789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 2868789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2869789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 2870789Sahrens int error = 0; 2871789Sahrens boolean_t complete; 2872789Sahrens 2873789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 2874789Sahrens 2875797Sbonwick /* 2876797Sbonwick * If we're restarting due to a snapshot create/delete, 2877797Sbonwick * wait for that to complete. 2878797Sbonwick */ 2879797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 2880797Sbonwick 28811544Seschrock dprintf("start %s mintxg=%llu maxtxg=%llu\n", 28821544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 28831544Seschrock spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); 28841544Seschrock 28851544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 28861544Seschrock vdev_reopen(rvd); /* purge all vdev caches */ 2887789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 2888789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 28891544Seschrock spa_config_exit(spa, FTAG); 2890789Sahrens 2891789Sahrens mutex_enter(&spa->spa_scrub_lock); 2892789Sahrens spa->spa_scrub_errors = 0; 2893789Sahrens spa->spa_scrub_active = 1; 28941544Seschrock ASSERT(spa->spa_scrub_inflight == 0); 2895789Sahrens 2896789Sahrens while (!spa->spa_scrub_stop) { 2897789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 28981544Seschrock while (spa->spa_scrub_suspended) { 2899789Sahrens spa->spa_scrub_active = 0; 2900789Sahrens cv_broadcast(&spa->spa_scrub_cv); 2901789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 2902789Sahrens spa->spa_scrub_active = 1; 2903789Sahrens } 2904789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 2905789Sahrens 2906789Sahrens if (spa->spa_scrub_restart_txg != 0) 2907789Sahrens break; 2908789Sahrens 2909789Sahrens mutex_exit(&spa->spa_scrub_lock); 2910789Sahrens error = traverse_more(th); 2911789Sahrens mutex_enter(&spa->spa_scrub_lock); 2912789Sahrens if (error != EAGAIN) 2913789Sahrens break; 2914789Sahrens } 2915789Sahrens 2916789Sahrens while (spa->spa_scrub_inflight) 2917789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2918789Sahrens 29191601Sbonwick spa->spa_scrub_active = 0; 29201601Sbonwick cv_broadcast(&spa->spa_scrub_cv); 29211601Sbonwick 29221601Sbonwick mutex_exit(&spa->spa_scrub_lock); 29231601Sbonwick 29241601Sbonwick spa_config_enter(spa, RW_WRITER, FTAG); 29251601Sbonwick 29261601Sbonwick mutex_enter(&spa->spa_scrub_lock); 29271601Sbonwick 29281601Sbonwick /* 29291601Sbonwick * Note: we check spa_scrub_restart_txg under both spa_scrub_lock 29301601Sbonwick * AND the spa config lock to synchronize with any config changes 29311601Sbonwick * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit(). 29321601Sbonwick */ 2933789Sahrens if (spa->spa_scrub_restart_txg != 0) 2934789Sahrens error = ERESTART; 2935789Sahrens 29361544Seschrock if (spa->spa_scrub_stop) 29371544Seschrock error = EINTR; 29381544Seschrock 2939789Sahrens /* 29401544Seschrock * Even if there were uncorrectable errors, we consider the scrub 29411544Seschrock * completed. The downside is that if there is a transient error during 29421544Seschrock * a resilver, we won't resilver the data properly to the target. But 29431544Seschrock * if the damage is permanent (more likely) we will resilver forever, 29441544Seschrock * which isn't really acceptable. Since there is enough information for 29451544Seschrock * the user to know what has failed and why, this seems like a more 29461544Seschrock * tractable approach. 2947789Sahrens */ 29481544Seschrock complete = (error == 0); 2949789Sahrens 29501544Seschrock dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 29511544Seschrock scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", 2952789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 2953789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 2954789Sahrens 2955789Sahrens mutex_exit(&spa->spa_scrub_lock); 2956789Sahrens 2957789Sahrens /* 2958789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 2959789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 2960789Sahrens */ 2961789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 2962789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 2963789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 29641544Seschrock spa_errlog_rotate(spa); 29651601Sbonwick 29664451Seschrock if (scrub_type == POOL_SCRUB_RESILVER && complete) 29674451Seschrock spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH); 29684451Seschrock 29691544Seschrock spa_config_exit(spa, FTAG); 2970789Sahrens 2971789Sahrens mutex_enter(&spa->spa_scrub_lock); 2972789Sahrens 29731544Seschrock /* 29741544Seschrock * We may have finished replacing a device. 29751544Seschrock * Let the async thread assess this and handle the detach. 29761544Seschrock */ 29774451Seschrock spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 2978789Sahrens 2979789Sahrens /* 2980789Sahrens * If we were told to restart, our final act is to start a new scrub. 2981789Sahrens */ 2982789Sahrens if (error == ERESTART) 29831544Seschrock spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? 29841544Seschrock SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); 2985789Sahrens 29861544Seschrock spa->spa_scrub_type = POOL_SCRUB_NONE; 29871544Seschrock spa->spa_scrub_active = 0; 29881544Seschrock spa->spa_scrub_thread = NULL; 29891544Seschrock cv_broadcast(&spa->spa_scrub_cv); 2990789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 2991789Sahrens thread_exit(); 2992789Sahrens } 2993789Sahrens 2994789Sahrens void 2995789Sahrens spa_scrub_suspend(spa_t *spa) 2996789Sahrens { 2997789Sahrens mutex_enter(&spa->spa_scrub_lock); 29981544Seschrock spa->spa_scrub_suspended++; 2999789Sahrens while (spa->spa_scrub_active) { 3000789Sahrens cv_broadcast(&spa->spa_scrub_cv); 3001789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3002789Sahrens } 3003789Sahrens while (spa->spa_scrub_inflight) 3004789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3005789Sahrens mutex_exit(&spa->spa_scrub_lock); 3006789Sahrens } 3007789Sahrens 3008789Sahrens void 3009789Sahrens spa_scrub_resume(spa_t *spa) 3010789Sahrens { 3011789Sahrens mutex_enter(&spa->spa_scrub_lock); 30121544Seschrock ASSERT(spa->spa_scrub_suspended != 0); 30131544Seschrock if (--spa->spa_scrub_suspended == 0) 3014789Sahrens cv_broadcast(&spa->spa_scrub_cv); 3015789Sahrens mutex_exit(&spa->spa_scrub_lock); 3016789Sahrens } 3017789Sahrens 3018789Sahrens void 3019789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 3020789Sahrens { 3021789Sahrens /* 3022789Sahrens * Something happened (e.g. snapshot create/delete) that means 3023789Sahrens * we must restart any in-progress scrubs. The itinerary will 3024789Sahrens * fix this properly. 3025789Sahrens */ 3026789Sahrens mutex_enter(&spa->spa_scrub_lock); 3027789Sahrens spa->spa_scrub_restart_txg = txg; 3028789Sahrens mutex_exit(&spa->spa_scrub_lock); 3029789Sahrens } 3030789Sahrens 30311544Seschrock int 30321544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 3033789Sahrens { 3034789Sahrens space_seg_t *ss; 3035789Sahrens uint64_t mintxg, maxtxg; 3036789Sahrens vdev_t *rvd = spa->spa_root_vdev; 3037789Sahrens 30384808Sek110237 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 30394808Sek110237 ASSERT(!spa_config_held(spa, RW_WRITER)); 30404808Sek110237 3041789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 3042789Sahrens return (ENOTSUP); 3043789Sahrens 30441544Seschrock mutex_enter(&spa->spa_scrub_lock); 30451544Seschrock 3046789Sahrens /* 3047789Sahrens * If there's a scrub or resilver already in progress, stop it. 3048789Sahrens */ 3049789Sahrens while (spa->spa_scrub_thread != NULL) { 3050789Sahrens /* 3051789Sahrens * Don't stop a resilver unless forced. 3052789Sahrens */ 30531544Seschrock if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { 30541544Seschrock mutex_exit(&spa->spa_scrub_lock); 3055789Sahrens return (EBUSY); 30561544Seschrock } 3057789Sahrens spa->spa_scrub_stop = 1; 3058789Sahrens cv_broadcast(&spa->spa_scrub_cv); 3059789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 3060789Sahrens } 3061789Sahrens 3062789Sahrens /* 3063789Sahrens * Terminate the previous traverse. 3064789Sahrens */ 3065789Sahrens if (spa->spa_scrub_th != NULL) { 3066789Sahrens traverse_fini(spa->spa_scrub_th); 3067789Sahrens spa->spa_scrub_th = NULL; 3068789Sahrens } 3069789Sahrens 30701544Seschrock if (rvd == NULL) { 30711544Seschrock ASSERT(spa->spa_scrub_stop == 0); 30721544Seschrock ASSERT(spa->spa_scrub_type == type); 30731544Seschrock ASSERT(spa->spa_scrub_restart_txg == 0); 30741544Seschrock mutex_exit(&spa->spa_scrub_lock); 30751544Seschrock return (0); 30761544Seschrock } 3077789Sahrens 3078789Sahrens mintxg = TXG_INITIAL - 1; 3079789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 3080789Sahrens 30811544Seschrock mutex_enter(&rvd->vdev_dtl_lock); 3082789Sahrens 30831544Seschrock if (rvd->vdev_dtl_map.sm_space == 0) { 30841544Seschrock /* 30851544Seschrock * The pool-wide DTL is empty. 30861732Sbonwick * If this is a resilver, there's nothing to do except 30871732Sbonwick * check whether any in-progress replacements have completed. 30881544Seschrock */ 30891732Sbonwick if (type == POOL_SCRUB_RESILVER) { 30901544Seschrock type = POOL_SCRUB_NONE; 30914451Seschrock spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 30921732Sbonwick } 30931544Seschrock } else { 30941544Seschrock /* 30951544Seschrock * The pool-wide DTL is non-empty. 30961544Seschrock * If this is a normal scrub, upgrade to a resilver instead. 30971544Seschrock */ 30981544Seschrock if (type == POOL_SCRUB_EVERYTHING) 30991544Seschrock type = POOL_SCRUB_RESILVER; 31001544Seschrock } 3101789Sahrens 31021544Seschrock if (type == POOL_SCRUB_RESILVER) { 3103789Sahrens /* 3104789Sahrens * Determine the resilvering boundaries. 3105789Sahrens * 3106789Sahrens * Note: (mintxg, maxtxg) is an open interval, 3107789Sahrens * i.e. mintxg and maxtxg themselves are not included. 3108789Sahrens * 3109789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 3110789Sahrens * so we don't claim to resilver a txg that's still changing. 3111789Sahrens */ 3112789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 31131544Seschrock mintxg = ss->ss_start - 1; 3114789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 31151544Seschrock maxtxg = MIN(ss->ss_end, maxtxg); 31164451Seschrock 31174451Seschrock spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); 3118789Sahrens } 3119789Sahrens 31201544Seschrock mutex_exit(&rvd->vdev_dtl_lock); 31211544Seschrock 31221544Seschrock spa->spa_scrub_stop = 0; 31231544Seschrock spa->spa_scrub_type = type; 31241544Seschrock spa->spa_scrub_restart_txg = 0; 31251544Seschrock 31261544Seschrock if (type != POOL_SCRUB_NONE) { 31271544Seschrock spa->spa_scrub_mintxg = mintxg; 3128789Sahrens spa->spa_scrub_maxtxg = maxtxg; 3129789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 31301635Sbonwick ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL, 31311635Sbonwick ZIO_FLAG_CANFAIL); 3132789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 3133789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 3134789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 3135789Sahrens } 3136789Sahrens 31371544Seschrock mutex_exit(&spa->spa_scrub_lock); 31381544Seschrock 3139789Sahrens return (0); 3140789Sahrens } 3141789Sahrens 31421544Seschrock /* 31431544Seschrock * ========================================================================== 31441544Seschrock * SPA async task processing 31451544Seschrock * ========================================================================== 31461544Seschrock */ 31471544Seschrock 31481544Seschrock static void 31494451Seschrock spa_async_remove(spa_t *spa, vdev_t *vd) 3150789Sahrens { 31511544Seschrock vdev_t *tvd; 31521544Seschrock int c; 31531544Seschrock 31544451Seschrock for (c = 0; c < vd->vdev_children; c++) { 31554451Seschrock tvd = vd->vdev_child[c]; 31564451Seschrock if (tvd->vdev_remove_wanted) { 31574451Seschrock tvd->vdev_remove_wanted = 0; 31584451Seschrock vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED, 31594451Seschrock VDEV_AUX_NONE); 31605329Sgw25295 vdev_clear(spa, tvd, B_TRUE); 31614451Seschrock vdev_config_dirty(tvd->vdev_top); 31621544Seschrock } 31634451Seschrock spa_async_remove(spa, tvd); 31641544Seschrock } 31651544Seschrock } 31661544Seschrock 31671544Seschrock static void 31681544Seschrock spa_async_thread(spa_t *spa) 31691544Seschrock { 31701544Seschrock int tasks; 31714451Seschrock uint64_t txg; 31721544Seschrock 31731544Seschrock ASSERT(spa->spa_sync_on); 3174789Sahrens 31751544Seschrock mutex_enter(&spa->spa_async_lock); 31761544Seschrock tasks = spa->spa_async_tasks; 31771544Seschrock spa->spa_async_tasks = 0; 31781544Seschrock mutex_exit(&spa->spa_async_lock); 31791544Seschrock 31801544Seschrock /* 31811635Sbonwick * See if the config needs to be updated. 31821635Sbonwick */ 31831635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 31841635Sbonwick mutex_enter(&spa_namespace_lock); 31851635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 31861635Sbonwick mutex_exit(&spa_namespace_lock); 31871635Sbonwick } 31881635Sbonwick 31891635Sbonwick /* 31904451Seschrock * See if any devices need to be marked REMOVED. 31915329Sgw25295 * 31925329Sgw25295 * XXX - We avoid doing this when we are in 31935329Sgw25295 * I/O failure state since spa_vdev_enter() grabs 31945329Sgw25295 * the namespace lock and would not be able to obtain 31955329Sgw25295 * the writer config lock. 31961544Seschrock */ 31975329Sgw25295 if (tasks & SPA_ASYNC_REMOVE && 31985329Sgw25295 spa_state(spa) != POOL_STATE_IO_FAILURE) { 31994451Seschrock txg = spa_vdev_enter(spa); 32004451Seschrock spa_async_remove(spa, spa->spa_root_vdev); 32014451Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 32024451Seschrock } 32031544Seschrock 32041544Seschrock /* 32051544Seschrock * If any devices are done replacing, detach them. 32061544Seschrock */ 32074451Seschrock if (tasks & SPA_ASYNC_RESILVER_DONE) 32084451Seschrock spa_vdev_resilver_done(spa); 3209789Sahrens 32101544Seschrock /* 32114451Seschrock * Kick off a scrub. When starting a RESILVER scrub (or an EVERYTHING 32124451Seschrock * scrub which can become a resilver), we need to hold 32134451Seschrock * spa_namespace_lock() because the sysevent we post via 32144451Seschrock * spa_event_notify() needs to get the name of the pool. 32151544Seschrock */ 32164451Seschrock if (tasks & SPA_ASYNC_SCRUB) { 32174451Seschrock mutex_enter(&spa_namespace_lock); 32181544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); 32194451Seschrock mutex_exit(&spa_namespace_lock); 32204451Seschrock } 32211544Seschrock 32221544Seschrock /* 32231544Seschrock * Kick off a resilver. 32241544Seschrock */ 32254451Seschrock if (tasks & SPA_ASYNC_RESILVER) { 32264451Seschrock mutex_enter(&spa_namespace_lock); 32271544Seschrock VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 32284451Seschrock mutex_exit(&spa_namespace_lock); 32294451Seschrock } 32301544Seschrock 32311544Seschrock /* 32321544Seschrock * Let the world know that we're done. 32331544Seschrock */ 32341544Seschrock mutex_enter(&spa->spa_async_lock); 32351544Seschrock spa->spa_async_thread = NULL; 32361544Seschrock cv_broadcast(&spa->spa_async_cv); 32371544Seschrock mutex_exit(&spa->spa_async_lock); 32381544Seschrock thread_exit(); 32391544Seschrock } 32401544Seschrock 32411544Seschrock void 32421544Seschrock spa_async_suspend(spa_t *spa) 32431544Seschrock { 32441544Seschrock mutex_enter(&spa->spa_async_lock); 32451544Seschrock spa->spa_async_suspended++; 32461544Seschrock while (spa->spa_async_thread != NULL) 32471544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 32481544Seschrock mutex_exit(&spa->spa_async_lock); 32491544Seschrock } 32501544Seschrock 32511544Seschrock void 32521544Seschrock spa_async_resume(spa_t *spa) 32531544Seschrock { 32541544Seschrock mutex_enter(&spa->spa_async_lock); 32551544Seschrock ASSERT(spa->spa_async_suspended != 0); 32561544Seschrock spa->spa_async_suspended--; 32571544Seschrock mutex_exit(&spa->spa_async_lock); 32581544Seschrock } 32591544Seschrock 32601544Seschrock static void 32611544Seschrock spa_async_dispatch(spa_t *spa) 32621544Seschrock { 32631544Seschrock mutex_enter(&spa->spa_async_lock); 32641544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 32651635Sbonwick spa->spa_async_thread == NULL && 32661635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 32671544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 32681544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 32691544Seschrock mutex_exit(&spa->spa_async_lock); 32701544Seschrock } 32711544Seschrock 32721544Seschrock void 32731544Seschrock spa_async_request(spa_t *spa, int task) 32741544Seschrock { 32751544Seschrock mutex_enter(&spa->spa_async_lock); 32761544Seschrock spa->spa_async_tasks |= task; 32771544Seschrock mutex_exit(&spa->spa_async_lock); 3278789Sahrens } 3279789Sahrens 3280789Sahrens /* 3281789Sahrens * ========================================================================== 3282789Sahrens * SPA syncing routines 3283789Sahrens * ========================================================================== 3284789Sahrens */ 3285789Sahrens 3286789Sahrens static void 3287789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 3288789Sahrens { 3289789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 3290789Sahrens dmu_tx_t *tx; 3291789Sahrens blkptr_t blk; 3292789Sahrens uint64_t itor = 0; 3293789Sahrens zio_t *zio; 3294789Sahrens int error; 3295789Sahrens uint8_t c = 1; 3296789Sahrens 3297789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 3298789Sahrens 3299789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 3300789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 3301789Sahrens 3302789Sahrens error = zio_wait(zio); 3303789Sahrens ASSERT3U(error, ==, 0); 3304789Sahrens 3305789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3306789Sahrens bplist_vacate(bpl, tx); 3307789Sahrens 3308789Sahrens /* 3309789Sahrens * Pre-dirty the first block so we sync to convergence faster. 3310789Sahrens * (Usually only the first block is needed.) 3311789Sahrens */ 3312789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 3313789Sahrens dmu_tx_commit(tx); 3314789Sahrens } 3315789Sahrens 3316789Sahrens static void 33172082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 33182082Seschrock { 33192082Seschrock char *packed = NULL; 33202082Seschrock size_t nvsize = 0; 33212082Seschrock dmu_buf_t *db; 33222082Seschrock 33232082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 33242082Seschrock 33252082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 33262082Seschrock 33272082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 33282082Seschrock KM_SLEEP) == 0); 33292082Seschrock 33302082Seschrock dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx); 33312082Seschrock 33322082Seschrock kmem_free(packed, nvsize); 33332082Seschrock 33342082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 33352082Seschrock dmu_buf_will_dirty(db, tx); 33362082Seschrock *(uint64_t *)db->db_data = nvsize; 33372082Seschrock dmu_buf_rele(db, FTAG); 33382082Seschrock } 33392082Seschrock 33402082Seschrock static void 33412082Seschrock spa_sync_spares(spa_t *spa, dmu_tx_t *tx) 33422082Seschrock { 33432082Seschrock nvlist_t *nvroot; 33442082Seschrock nvlist_t **spares; 33452082Seschrock int i; 33462082Seschrock 33472082Seschrock if (!spa->spa_sync_spares) 33482082Seschrock return; 33492082Seschrock 33502082Seschrock /* 33512082Seschrock * Update the MOS nvlist describing the list of available spares. 33522082Seschrock * spa_validate_spares() will have already made sure this nvlist is 33534451Seschrock * valid and the vdevs are labeled appropriately. 33542082Seschrock */ 33552082Seschrock if (spa->spa_spares_object == 0) { 33562082Seschrock spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, 33572082Seschrock DMU_OT_PACKED_NVLIST, 1 << 14, 33582082Seschrock DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 33592082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 33602082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, 33612082Seschrock sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); 33622082Seschrock } 33632082Seschrock 33642082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 33652082Seschrock if (spa->spa_nspares == 0) { 33662082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 33672082Seschrock NULL, 0) == 0); 33682082Seschrock } else { 33692082Seschrock spares = kmem_alloc(spa->spa_nspares * sizeof (void *), 33702082Seschrock KM_SLEEP); 33712082Seschrock for (i = 0; i < spa->spa_nspares; i++) 33722082Seschrock spares[i] = vdev_config_generate(spa, 33732082Seschrock spa->spa_spares[i], B_FALSE, B_TRUE); 33742082Seschrock VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 33752082Seschrock spares, spa->spa_nspares) == 0); 33762082Seschrock for (i = 0; i < spa->spa_nspares; i++) 33772082Seschrock nvlist_free(spares[i]); 33782082Seschrock kmem_free(spares, spa->spa_nspares * sizeof (void *)); 33792082Seschrock } 33802082Seschrock 33812082Seschrock spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); 33822926Sek110237 nvlist_free(nvroot); 33832082Seschrock 33842082Seschrock spa->spa_sync_spares = B_FALSE; 33852082Seschrock } 33862082Seschrock 33872082Seschrock static void 3388789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 3389789Sahrens { 3390789Sahrens nvlist_t *config; 3391789Sahrens 3392789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 3393789Sahrens return; 3394789Sahrens 3395789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 3396789Sahrens 33971635Sbonwick if (spa->spa_config_syncing) 33981635Sbonwick nvlist_free(spa->spa_config_syncing); 33991635Sbonwick spa->spa_config_syncing = config; 3400789Sahrens 34012082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 3402789Sahrens } 3403789Sahrens 34045094Slling /* 34055094Slling * Set zpool properties. 34065094Slling */ 34073912Slling static void 34084543Smarks spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 34093912Slling { 34103912Slling spa_t *spa = arg1; 34115094Slling objset_t *mos = spa->spa_meta_objset; 34123912Slling nvlist_t *nvp = arg2; 34135094Slling nvpair_t *elem; 34144451Seschrock uint64_t intval; 3415*5363Seschrock char *strval, *slash; 34165094Slling zpool_prop_t prop; 34175094Slling const char *propname; 34185094Slling zprop_type_t proptype; 34195094Slling 34205094Slling elem = NULL; 34215094Slling while ((elem = nvlist_next_nvpair(nvp, elem))) { 34225094Slling switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 34235094Slling case ZPOOL_PROP_VERSION: 34245094Slling /* 34255094Slling * Only set version for non-zpool-creation cases 34265094Slling * (set/import). spa_create() needs special care 34275094Slling * for version setting. 34285094Slling */ 34295094Slling if (tx->tx_txg != TXG_INITIAL) { 34305094Slling VERIFY(nvpair_value_uint64(elem, 34315094Slling &intval) == 0); 34325094Slling ASSERT(intval <= SPA_VERSION); 34335094Slling ASSERT(intval >= spa_version(spa)); 34345094Slling spa->spa_uberblock.ub_version = intval; 34355094Slling vdev_config_dirty(spa->spa_root_vdev); 34365094Slling } 34375094Slling break; 34385094Slling 34395094Slling case ZPOOL_PROP_ALTROOT: 34405094Slling /* 34415094Slling * 'altroot' is a non-persistent property. It should 34425094Slling * have been set temporarily at creation or import time. 34435094Slling */ 34445094Slling ASSERT(spa->spa_root != NULL); 34455094Slling break; 34465094Slling 3447*5363Seschrock case ZPOOL_PROP_CACHEFILE: 34485094Slling /* 3449*5363Seschrock * 'cachefile' is a non-persistent property, but note 3450*5363Seschrock * an async request that the config cache needs to be 3451*5363Seschrock * udpated. 34525094Slling */ 3453*5363Seschrock VERIFY(nvpair_value_string(elem, &strval) == 0); 3454*5363Seschrock if (spa->spa_config_dir) 3455*5363Seschrock spa_strfree(spa->spa_config_dir); 3456*5363Seschrock if (spa->spa_config_file) 3457*5363Seschrock spa_strfree(spa->spa_config_file); 3458*5363Seschrock 3459*5363Seschrock if (strval[0] == '\0') { 3460*5363Seschrock spa->spa_config_dir = NULL; 3461*5363Seschrock spa->spa_config_file = NULL; 3462*5363Seschrock } else if (strcmp(strval, "none") == 0) { 3463*5363Seschrock spa->spa_config_dir = spa_strdup(strval); 3464*5363Seschrock spa->spa_config_file = NULL; 3465*5363Seschrock } else { 3466*5363Seschrock slash = strrchr(strval, '/'); 3467*5363Seschrock ASSERT(slash != NULL); 3468*5363Seschrock *slash = '\0'; 3469*5363Seschrock spa->spa_config_dir = spa_strdup(strval); 3470*5363Seschrock spa->spa_config_file = spa_strdup(slash + 1); 3471*5363Seschrock } 3472*5363Seschrock spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 34734543Smarks break; 34745094Slling default: 34755094Slling /* 34765094Slling * Set pool property values in the poolprops mos object. 34775094Slling */ 34785094Slling mutex_enter(&spa->spa_props_lock); 34795094Slling if (spa->spa_pool_props_object == 0) { 34805094Slling objset_t *mos = spa->spa_meta_objset; 34815094Slling 34825094Slling VERIFY((spa->spa_pool_props_object = 34835094Slling zap_create(mos, DMU_OT_POOL_PROPS, 34845094Slling DMU_OT_NONE, 0, tx)) > 0); 34855094Slling 34865094Slling VERIFY(zap_update(mos, 34875094Slling DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 34885094Slling 8, 1, &spa->spa_pool_props_object, tx) 34895094Slling == 0); 34905094Slling } 34915094Slling mutex_exit(&spa->spa_props_lock); 34925094Slling 34935094Slling /* normalize the property name */ 34945094Slling propname = zpool_prop_to_name(prop); 34955094Slling proptype = zpool_prop_get_type(prop); 34965094Slling 34975094Slling if (nvpair_type(elem) == DATA_TYPE_STRING) { 34985094Slling ASSERT(proptype == PROP_TYPE_STRING); 34995094Slling VERIFY(nvpair_value_string(elem, &strval) == 0); 35005094Slling VERIFY(zap_update(mos, 35015094Slling spa->spa_pool_props_object, propname, 35025094Slling 1, strlen(strval) + 1, strval, tx) == 0); 35035094Slling 35045094Slling } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 35055094Slling VERIFY(nvpair_value_uint64(elem, &intval) == 0); 35065094Slling 35075094Slling if (proptype == PROP_TYPE_INDEX) { 35085094Slling const char *unused; 35095094Slling VERIFY(zpool_prop_index_to_string( 35105094Slling prop, intval, &unused) == 0); 35115094Slling } 35125094Slling VERIFY(zap_update(mos, 35135094Slling spa->spa_pool_props_object, propname, 35145094Slling 8, 1, &intval, tx) == 0); 35155094Slling } else { 35165094Slling ASSERT(0); /* not allowed */ 35175094Slling } 35185094Slling 35195329Sgw25295 switch (prop) { 35205329Sgw25295 case ZPOOL_PROP_DELEGATION: 35215094Slling spa->spa_delegation = intval; 35225329Sgw25295 break; 35235329Sgw25295 case ZPOOL_PROP_BOOTFS: 35245094Slling spa->spa_bootfs = intval; 35255329Sgw25295 break; 35265329Sgw25295 case ZPOOL_PROP_FAILUREMODE: 35275329Sgw25295 spa->spa_failmode = intval; 35285329Sgw25295 break; 35295329Sgw25295 default: 35305329Sgw25295 break; 35315329Sgw25295 } 35323912Slling } 35335094Slling 35345094Slling /* log internal history if this is not a zpool create */ 35355094Slling if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 35365094Slling tx->tx_txg != TXG_INITIAL) { 35375094Slling spa_history_internal_log(LOG_POOL_PROPSET, 35385094Slling spa, tx, cr, "%s %lld %s", 35395094Slling nvpair_name(elem), intval, spa->spa_name); 35405094Slling } 35413912Slling } 35423912Slling } 35433912Slling 3544789Sahrens /* 3545789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 3546789Sahrens * part of the process, so we iterate until it converges. 3547789Sahrens */ 3548789Sahrens void 3549789Sahrens spa_sync(spa_t *spa, uint64_t txg) 3550789Sahrens { 3551789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 3552789Sahrens objset_t *mos = spa->spa_meta_objset; 3553789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 35541635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 3555789Sahrens vdev_t *vd; 3556789Sahrens dmu_tx_t *tx; 3557789Sahrens int dirty_vdevs; 3558789Sahrens 3559789Sahrens /* 3560789Sahrens * Lock out configuration changes. 3561789Sahrens */ 35621544Seschrock spa_config_enter(spa, RW_READER, FTAG); 3563789Sahrens 3564789Sahrens spa->spa_syncing_txg = txg; 3565789Sahrens spa->spa_sync_pass = 0; 3566789Sahrens 35671544Seschrock VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); 3568789Sahrens 35692082Seschrock tx = dmu_tx_create_assigned(dp, txg); 35702082Seschrock 35712082Seschrock /* 35724577Sahrens * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 35732082Seschrock * set spa_deflate if we have no raid-z vdevs. 35742082Seschrock */ 35754577Sahrens if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 35764577Sahrens spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 35772082Seschrock int i; 35782082Seschrock 35792082Seschrock for (i = 0; i < rvd->vdev_children; i++) { 35802082Seschrock vd = rvd->vdev_child[i]; 35812082Seschrock if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 35822082Seschrock break; 35832082Seschrock } 35842082Seschrock if (i == rvd->vdev_children) { 35852082Seschrock spa->spa_deflate = TRUE; 35862082Seschrock VERIFY(0 == zap_add(spa->spa_meta_objset, 35872082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 35882082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 35892082Seschrock } 35902082Seschrock } 35912082Seschrock 3592789Sahrens /* 3593789Sahrens * If anything has changed in this txg, push the deferred frees 3594789Sahrens * from the previous txg. If not, leave them alone so that we 3595789Sahrens * don't generate work on an otherwise idle system. 3596789Sahrens */ 3597789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 35982329Sek110237 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 35992329Sek110237 !txg_list_empty(&dp->dp_sync_tasks, txg)) 3600789Sahrens spa_sync_deferred_frees(spa, txg); 3601789Sahrens 3602789Sahrens /* 3603789Sahrens * Iterate to convergence. 3604789Sahrens */ 3605789Sahrens do { 3606789Sahrens spa->spa_sync_pass++; 3607789Sahrens 3608789Sahrens spa_sync_config_object(spa, tx); 36092082Seschrock spa_sync_spares(spa, tx); 36101544Seschrock spa_errlog_sync(spa, txg); 3611789Sahrens dsl_pool_sync(dp, txg); 3612789Sahrens 3613789Sahrens dirty_vdevs = 0; 3614789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 3615789Sahrens vdev_sync(vd, txg); 3616789Sahrens dirty_vdevs++; 3617789Sahrens } 3618789Sahrens 3619789Sahrens bplist_sync(bpl, tx); 3620789Sahrens } while (dirty_vdevs); 3621789Sahrens 3622789Sahrens bplist_close(bpl); 3623789Sahrens 3624789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 3625789Sahrens 3626789Sahrens /* 3627789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 3628789Sahrens * to commit the transaction group. 36291635Sbonwick * 36301635Sbonwick * If there are any dirty vdevs, sync the uberblock to all vdevs. 36311635Sbonwick * Otherwise, pick a random top-level vdev that's known to be 36321635Sbonwick * visible in the config cache (see spa_vdev_add() for details). 36331635Sbonwick * If the write fails, try the next vdev until we're tried them all. 3634789Sahrens */ 36351635Sbonwick if (!list_is_empty(&spa->spa_dirty_list)) { 36361635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 36371635Sbonwick } else { 36381635Sbonwick int children = rvd->vdev_children; 36391635Sbonwick int c0 = spa_get_random(children); 36401635Sbonwick int c; 36411635Sbonwick 36421635Sbonwick for (c = 0; c < children; c++) { 36431635Sbonwick vd = rvd->vdev_child[(c0 + c) % children]; 36441635Sbonwick if (vd->vdev_ms_array == 0) 36451635Sbonwick continue; 36461635Sbonwick if (vdev_config_sync(vd, txg) == 0) 36471635Sbonwick break; 36481635Sbonwick } 36491635Sbonwick if (c == children) 36501635Sbonwick VERIFY(vdev_config_sync(rvd, txg) == 0); 36511635Sbonwick } 36521635Sbonwick 36532082Seschrock dmu_tx_commit(tx); 36542082Seschrock 36551635Sbonwick /* 36561635Sbonwick * Clear the dirty config list. 36571635Sbonwick */ 36581635Sbonwick while ((vd = list_head(&spa->spa_dirty_list)) != NULL) 36591635Sbonwick vdev_config_clean(vd); 36601635Sbonwick 36611635Sbonwick /* 36621635Sbonwick * Now that the new config has synced transactionally, 36631635Sbonwick * let it become visible to the config cache. 36641635Sbonwick */ 36651635Sbonwick if (spa->spa_config_syncing != NULL) { 36661635Sbonwick spa_config_set(spa, spa->spa_config_syncing); 36671635Sbonwick spa->spa_config_txg = txg; 36681635Sbonwick spa->spa_config_syncing = NULL; 36691635Sbonwick } 3670789Sahrens 3671789Sahrens /* 3672789Sahrens * Make a stable copy of the fully synced uberblock. 3673789Sahrens * We use this as the root for pool traversals. 3674789Sahrens */ 3675789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 3676789Sahrens 3677789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 3678789Sahrens 3679789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 3680789Sahrens spa->spa_traverse_wanted = 0; 3681789Sahrens spa->spa_ubsync = spa->spa_uberblock; 3682789Sahrens rw_exit(&spa->spa_traverse_lock); 3683789Sahrens 3684789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 3685789Sahrens 3686789Sahrens /* 3687789Sahrens * Clean up the ZIL records for the synced txg. 3688789Sahrens */ 3689789Sahrens dsl_pool_zil_clean(dp); 3690789Sahrens 3691789Sahrens /* 3692789Sahrens * Update usable space statistics. 3693789Sahrens */ 3694789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 3695789Sahrens vdev_sync_done(vd, txg); 3696789Sahrens 3697789Sahrens /* 3698789Sahrens * It had better be the case that we didn't dirty anything 36992082Seschrock * since vdev_config_sync(). 3700789Sahrens */ 3701789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 3702789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 3703789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 3704789Sahrens ASSERT(bpl->bpl_queue == NULL); 3705789Sahrens 37061544Seschrock spa_config_exit(spa, FTAG); 37071544Seschrock 37081544Seschrock /* 37091544Seschrock * If any async tasks have been requested, kick them off. 37101544Seschrock */ 37111544Seschrock spa_async_dispatch(spa); 3712789Sahrens } 3713789Sahrens 3714789Sahrens /* 3715789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 3716789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 3717789Sahrens * sync. 3718789Sahrens */ 3719789Sahrens void 3720789Sahrens spa_sync_allpools(void) 3721789Sahrens { 3722789Sahrens spa_t *spa = NULL; 3723789Sahrens mutex_enter(&spa_namespace_lock); 3724789Sahrens while ((spa = spa_next(spa)) != NULL) { 3725789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 3726789Sahrens continue; 3727789Sahrens spa_open_ref(spa, FTAG); 3728789Sahrens mutex_exit(&spa_namespace_lock); 3729789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 3730789Sahrens mutex_enter(&spa_namespace_lock); 3731789Sahrens spa_close(spa, FTAG); 3732789Sahrens } 3733789Sahrens mutex_exit(&spa_namespace_lock); 3734789Sahrens } 3735789Sahrens 3736789Sahrens /* 3737789Sahrens * ========================================================================== 3738789Sahrens * Miscellaneous routines 3739789Sahrens * ========================================================================== 3740789Sahrens */ 3741789Sahrens 3742789Sahrens /* 3743789Sahrens * Remove all pools in the system. 3744789Sahrens */ 3745789Sahrens void 3746789Sahrens spa_evict_all(void) 3747789Sahrens { 3748789Sahrens spa_t *spa; 3749789Sahrens 3750789Sahrens /* 3751789Sahrens * Remove all cached state. All pools should be closed now, 3752789Sahrens * so every spa in the AVL tree should be unreferenced. 3753789Sahrens */ 3754789Sahrens mutex_enter(&spa_namespace_lock); 3755789Sahrens while ((spa = spa_next(NULL)) != NULL) { 3756789Sahrens /* 37571544Seschrock * Stop async tasks. The async thread may need to detach 37581544Seschrock * a device that's been replaced, which requires grabbing 37591544Seschrock * spa_namespace_lock, so we must drop it here. 3760789Sahrens */ 3761789Sahrens spa_open_ref(spa, FTAG); 3762789Sahrens mutex_exit(&spa_namespace_lock); 37631544Seschrock spa_async_suspend(spa); 37644808Sek110237 mutex_enter(&spa_namespace_lock); 3765789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 3766789Sahrens spa_close(spa, FTAG); 3767789Sahrens 3768789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3769789Sahrens spa_unload(spa); 3770789Sahrens spa_deactivate(spa); 3771789Sahrens } 3772789Sahrens spa_remove(spa); 3773789Sahrens } 3774789Sahrens mutex_exit(&spa_namespace_lock); 3775789Sahrens } 37761544Seschrock 37771544Seschrock vdev_t * 37781544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid) 37791544Seschrock { 37801544Seschrock return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); 37811544Seschrock } 37821760Seschrock 37831760Seschrock void 37845094Slling spa_upgrade(spa_t *spa, uint64_t version) 37851760Seschrock { 37861760Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 37871760Seschrock 37881760Seschrock /* 37891760Seschrock * This should only be called for a non-faulted pool, and since a 37901760Seschrock * future version would result in an unopenable pool, this shouldn't be 37911760Seschrock * possible. 37921760Seschrock */ 37934577Sahrens ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 37945094Slling ASSERT(version >= spa->spa_uberblock.ub_version); 37955094Slling 37965094Slling spa->spa_uberblock.ub_version = version; 37971760Seschrock vdev_config_dirty(spa->spa_root_vdev); 37981760Seschrock 37991760Seschrock spa_config_exit(spa, FTAG); 38002082Seschrock 38012082Seschrock txg_wait_synced(spa_get_dsl(spa), 0); 38021760Seschrock } 38032082Seschrock 38042082Seschrock boolean_t 38052082Seschrock spa_has_spare(spa_t *spa, uint64_t guid) 38062082Seschrock { 38072082Seschrock int i; 38083377Seschrock uint64_t spareguid; 38092082Seschrock 38102082Seschrock for (i = 0; i < spa->spa_nspares; i++) 38112082Seschrock if (spa->spa_spares[i]->vdev_guid == guid) 38122082Seschrock return (B_TRUE); 38132082Seschrock 38143377Seschrock for (i = 0; i < spa->spa_pending_nspares; i++) { 38153377Seschrock if (nvlist_lookup_uint64(spa->spa_pending_spares[i], 38163377Seschrock ZPOOL_CONFIG_GUID, &spareguid) == 0 && 38173377Seschrock spareguid == guid) 38183377Seschrock return (B_TRUE); 38193377Seschrock } 38203377Seschrock 38212082Seschrock return (B_FALSE); 38222082Seschrock } 38233912Slling 38244451Seschrock /* 38254451Seschrock * Post a sysevent corresponding to the given event. The 'name' must be one of 38264451Seschrock * the event definitions in sys/sysevent/eventdefs.h. The payload will be 38274451Seschrock * filled in from the spa and (optionally) the vdev. This doesn't do anything 38284451Seschrock * in the userland libzpool, as we don't want consumers to misinterpret ztest 38294451Seschrock * or zdb as real changes. 38304451Seschrock */ 38314451Seschrock void 38324451Seschrock spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 38334451Seschrock { 38344451Seschrock #ifdef _KERNEL 38354451Seschrock sysevent_t *ev; 38364451Seschrock sysevent_attr_list_t *attr = NULL; 38374451Seschrock sysevent_value_t value; 38384451Seschrock sysevent_id_t eid; 38394451Seschrock 38404451Seschrock ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 38414451Seschrock SE_SLEEP); 38424451Seschrock 38434451Seschrock value.value_type = SE_DATA_TYPE_STRING; 38444451Seschrock value.value.sv_string = spa_name(spa); 38454451Seschrock if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 38464451Seschrock goto done; 38474451Seschrock 38484451Seschrock value.value_type = SE_DATA_TYPE_UINT64; 38494451Seschrock value.value.sv_uint64 = spa_guid(spa); 38504451Seschrock if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 38514451Seschrock goto done; 38524451Seschrock 38534451Seschrock if (vd) { 38544451Seschrock value.value_type = SE_DATA_TYPE_UINT64; 38554451Seschrock value.value.sv_uint64 = vd->vdev_guid; 38564451Seschrock if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 38574451Seschrock SE_SLEEP) != 0) 38584451Seschrock goto done; 38594451Seschrock 38604451Seschrock if (vd->vdev_path) { 38614451Seschrock value.value_type = SE_DATA_TYPE_STRING; 38624451Seschrock value.value.sv_string = vd->vdev_path; 38634451Seschrock if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 38644451Seschrock &value, SE_SLEEP) != 0) 38654451Seschrock goto done; 38664451Seschrock } 38674451Seschrock } 38684451Seschrock 38694451Seschrock (void) log_sysevent(ev, SE_SLEEP, &eid); 38704451Seschrock 38714451Seschrock done: 38724451Seschrock if (attr) 38734451Seschrock sysevent_free_attr(attr); 38744451Seschrock sysevent_free(ev); 38754451Seschrock #endif 38764451Seschrock } 3877