/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
30789Sahrens */ 31789Sahrens 32789Sahrens #include <sys/zfs_context.h> 331544Seschrock #include <sys/fm/fs/zfs.h> 34789Sahrens #include <sys/spa_impl.h> 35789Sahrens #include <sys/zio.h> 36789Sahrens #include <sys/zio_checksum.h> 37789Sahrens #include <sys/dmu.h> 38789Sahrens #include <sys/dmu_tx.h> 39789Sahrens #include <sys/zap.h> 40789Sahrens #include <sys/zil.h> 4110922SJeff.Bonwick@Sun.COM #include <sys/ddt.h> 42789Sahrens #include <sys/vdev_impl.h> 43789Sahrens #include <sys/metaslab.h> 4410594SGeorge.Wilson@Sun.COM #include <sys/metaslab_impl.h> 45789Sahrens #include <sys/uberblock_impl.h> 46789Sahrens #include <sys/txg.h> 47789Sahrens #include <sys/avl.h> 48789Sahrens #include <sys/dmu_traverse.h> 493912Slling #include <sys/dmu_objset.h> 50789Sahrens #include <sys/unique.h> 51789Sahrens #include <sys/dsl_pool.h> 523912Slling #include <sys/dsl_dataset.h> 53789Sahrens #include <sys/dsl_dir.h> 54789Sahrens #include <sys/dsl_prop.h> 553912Slling #include <sys/dsl_synctask.h> 56789Sahrens #include <sys/fs/zfs.h> 575450Sbrendan #include <sys/arc.h> 58789Sahrens #include <sys/callb.h> 593975Sek110237 #include <sys/systeminfo.h> 606423Sgw25295 #include <sys/spa_boot.h> 619816SGeorge.Wilson@Sun.COM #include <sys/zfs_ioctl.h> 62789Sahrens 638662SJordan.Vaughan@Sun.com #ifdef _KERNEL 6411173SJonathan.Adams@Sun.COM #include <sys/bootprops.h> 6511173SJonathan.Adams@Sun.COM #include <sys/callb.h> 6611173SJonathan.Adams@Sun.COM #include <sys/cpupart.h> 6711173SJonathan.Adams@Sun.COM #include <sys/pool.h> 6811173SJonathan.Adams@Sun.COM #include <sys/sysdc.h> 698662SJordan.Vaughan@Sun.com #include <sys/zone.h> 708662SJordan.Vaughan@Sun.com #endif /* _KERNEL */ 718662SJordan.Vaughan@Sun.com 725094Slling #include "zfs_prop.h" 735913Sperrin #include "zfs_comutil.h" 745094Slling 7511173SJonathan.Adams@Sun.COM typedef enum zti_modes { 769515SJonathan.Adams@Sun.COM zti_mode_fixed, /* value is # of threads (min 1) */ 779515SJonathan.Adams@Sun.COM zti_mode_online_percent, /* value 
is % of online CPUs */ 7811173SJonathan.Adams@Sun.COM zti_mode_batch, /* cpu-intensive; value is ignored */ 7911146SGeorge.Wilson@Sun.COM zti_mode_null, /* don't create a taskq */ 809515SJonathan.Adams@Sun.COM zti_nmodes 8111173SJonathan.Adams@Sun.COM } zti_modes_t; 822986Sek110237 8311146SGeorge.Wilson@Sun.COM #define ZTI_FIX(n) { zti_mode_fixed, (n) } 8411146SGeorge.Wilson@Sun.COM #define ZTI_PCT(n) { zti_mode_online_percent, (n) } 8511173SJonathan.Adams@Sun.COM #define ZTI_BATCH { zti_mode_batch, 0 } 8611146SGeorge.Wilson@Sun.COM #define ZTI_NULL { zti_mode_null, 0 } 8711146SGeorge.Wilson@Sun.COM 8811146SGeorge.Wilson@Sun.COM #define ZTI_ONE ZTI_FIX(1) 899515SJonathan.Adams@Sun.COM 909515SJonathan.Adams@Sun.COM typedef struct zio_taskq_info { 9111146SGeorge.Wilson@Sun.COM enum zti_modes zti_mode; 9211146SGeorge.Wilson@Sun.COM uint_t zti_value; 939515SJonathan.Adams@Sun.COM } zio_taskq_info_t; 949515SJonathan.Adams@Sun.COM 959515SJonathan.Adams@Sun.COM static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 9611173SJonathan.Adams@Sun.COM "issue", "issue_high", "intr", "intr_high" 979515SJonathan.Adams@Sun.COM }; 989515SJonathan.Adams@Sun.COM 9911146SGeorge.Wilson@Sun.COM /* 10011146SGeorge.Wilson@Sun.COM * Define the taskq threads for the following I/O types: 10111146SGeorge.Wilson@Sun.COM * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 10211146SGeorge.Wilson@Sun.COM */ 10311146SGeorge.Wilson@Sun.COM const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 10411146SGeorge.Wilson@Sun.COM /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 10511146SGeorge.Wilson@Sun.COM { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 10611173SJonathan.Adams@Sun.COM { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 10711173SJonathan.Adams@Sun.COM { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 10811826SGeorge.Wilson@Sun.COM { ZTI_FIX(10), ZTI_NULL, ZTI_FIX(10), ZTI_NULL }, 10911146SGeorge.Wilson@Sun.COM { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 11011146SGeorge.Wilson@Sun.COM { ZTI_ONE, 
ZTI_NULL, ZTI_ONE, ZTI_NULL }, 1119515SJonathan.Adams@Sun.COM }; 1129515SJonathan.Adams@Sun.COM 1135094Slling static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); 1147214Slling static boolean_t spa_has_active_shared_spare(spa_t *spa); 11511422SMark.Musante@Sun.COM static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 11611422SMark.Musante@Sun.COM spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 11711422SMark.Musante@Sun.COM char **ereport); 1185094Slling 11911173SJonathan.Adams@Sun.COM uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 12011173SJonathan.Adams@Sun.COM id_t zio_taskq_psrset_bind = PS_NONE; 12111173SJonathan.Adams@Sun.COM boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 12211173SJonathan.Adams@Sun.COM uint_t zio_taskq_basedc = 80; /* base duty cycle */ 12311173SJonathan.Adams@Sun.COM 12411173SJonathan.Adams@Sun.COM boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 12511173SJonathan.Adams@Sun.COM 12611173SJonathan.Adams@Sun.COM /* 12711173SJonathan.Adams@Sun.COM * This (illegal) pool name is used when temporarily importing a spa_t in order 12811173SJonathan.Adams@Sun.COM * to get the vdev stats associated with the imported devices. 12911173SJonathan.Adams@Sun.COM */ 13011173SJonathan.Adams@Sun.COM #define TRYIMPORT_NAME "$import" 13111173SJonathan.Adams@Sun.COM 1325094Slling /* 1335094Slling * ========================================================================== 1345094Slling * SPA properties routines 1355094Slling * ========================================================================== 1365094Slling */ 1375094Slling 1385094Slling /* 1395094Slling * Add a (source=src, propname=propval) list to an nvlist. 
1405094Slling */ 1415949Slling static void 1425094Slling spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 1435094Slling uint64_t intval, zprop_source_t src) 1445094Slling { 1455094Slling const char *propname = zpool_prop_to_name(prop); 1465094Slling nvlist_t *propval; 1475949Slling 1485949Slling VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1495949Slling VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 1505949Slling 1515949Slling if (strval != NULL) 1525949Slling VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 1535949Slling else 1545949Slling VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 1555949Slling 1565949Slling VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 1575094Slling nvlist_free(propval); 1585094Slling } 1595094Slling 1605094Slling /* 1615094Slling * Get property values from the spa configuration. 1625094Slling */ 1635949Slling static void 1645094Slling spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 1655094Slling { 1668525SEric.Schrock@Sun.COM uint64_t size; 16710956SGeorge.Wilson@Sun.COM uint64_t alloc; 1685094Slling uint64_t cap, version; 1695094Slling zprop_source_t src = ZPROP_SRC_NONE; 1706643Seschrock spa_config_dirent_t *dp; 1715094Slling 1727754SJeff.Bonwick@Sun.COM ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 1737754SJeff.Bonwick@Sun.COM 1748525SEric.Schrock@Sun.COM if (spa->spa_root_vdev != NULL) { 17510956SGeorge.Wilson@Sun.COM alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 17610922SJeff.Bonwick@Sun.COM size = metaslab_class_get_space(spa_normal_class(spa)); 1778525SEric.Schrock@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 1788525SEric.Schrock@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 17910956SGeorge.Wilson@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 18010956SGeorge.Wilson@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 18110956SGeorge.Wilson@Sun.COM size - alloc, 
src); 18210956SGeorge.Wilson@Sun.COM 18310956SGeorge.Wilson@Sun.COM cap = (size == 0) ? 0 : (alloc * 100 / size); 1848525SEric.Schrock@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 1858525SEric.Schrock@Sun.COM 18610922SJeff.Bonwick@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 18710922SJeff.Bonwick@Sun.COM ddt_get_pool_dedup_ratio(spa), src); 18810922SJeff.Bonwick@Sun.COM 1898525SEric.Schrock@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 1908525SEric.Schrock@Sun.COM spa->spa_root_vdev->vdev_state, src); 1918525SEric.Schrock@Sun.COM 1928525SEric.Schrock@Sun.COM version = spa_version(spa); 1938525SEric.Schrock@Sun.COM if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 1948525SEric.Schrock@Sun.COM src = ZPROP_SRC_DEFAULT; 1958525SEric.Schrock@Sun.COM else 1968525SEric.Schrock@Sun.COM src = ZPROP_SRC_LOCAL; 1978525SEric.Schrock@Sun.COM spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 1988525SEric.Schrock@Sun.COM } 1995949Slling 2005949Slling spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 2015949Slling 2025949Slling if (spa->spa_root != NULL) 2035949Slling spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 2045949Slling 0, ZPROP_SRC_LOCAL); 2055094Slling 2066643Seschrock if ((dp = list_head(&spa->spa_config_list)) != NULL) { 2076643Seschrock if (dp->scd_path == NULL) { 2085949Slling spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 2096643Seschrock "none", 0, ZPROP_SRC_LOCAL); 2106643Seschrock } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 2115949Slling spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 2126643Seschrock dp->scd_path, 0, ZPROP_SRC_LOCAL); 2135363Seschrock } 2145363Seschrock } 2155094Slling } 2165094Slling 2175094Slling /* 2185094Slling * Get zpool property values. 
2195094Slling */ 2205094Slling int 2215094Slling spa_prop_get(spa_t *spa, nvlist_t **nvp) 2225094Slling { 22310922SJeff.Bonwick@Sun.COM objset_t *mos = spa->spa_meta_objset; 2245094Slling zap_cursor_t zc; 2255094Slling zap_attribute_t za; 2265094Slling int err; 2275094Slling 2285949Slling VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 2295094Slling 2307754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_props_lock); 2317754SJeff.Bonwick@Sun.COM 2325094Slling /* 2335094Slling * Get properties from the spa config. 2345094Slling */ 2355949Slling spa_prop_get_config(spa, nvp); 2365094Slling 2375094Slling /* If no pool property object, no more prop to get. */ 23811619SGeorge.Wilson@Sun.COM if (mos == NULL || spa->spa_pool_props_object == 0) { 2395094Slling mutex_exit(&spa->spa_props_lock); 2405094Slling return (0); 2415094Slling } 2425094Slling 2435094Slling /* 2445094Slling * Get properties from the MOS pool property object. 2455094Slling */ 2465094Slling for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 2475094Slling (err = zap_cursor_retrieve(&zc, &za)) == 0; 2485094Slling zap_cursor_advance(&zc)) { 2495094Slling uint64_t intval = 0; 2505094Slling char *strval = NULL; 2515094Slling zprop_source_t src = ZPROP_SRC_DEFAULT; 2525094Slling zpool_prop_t prop; 2535094Slling 2545094Slling if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 2555094Slling continue; 2565094Slling 2575094Slling switch (za.za_integer_length) { 2585094Slling case 8: 2595094Slling /* integer property */ 2605094Slling if (za.za_first_integer != 2615094Slling zpool_prop_default_numeric(prop)) 2625094Slling src = ZPROP_SRC_LOCAL; 2635094Slling 2645094Slling if (prop == ZPOOL_PROP_BOOTFS) { 2655094Slling dsl_pool_t *dp; 2665094Slling dsl_dataset_t *ds = NULL; 2675094Slling 2685094Slling dp = spa_get_dsl(spa); 2695094Slling rw_enter(&dp->dp_config_rwlock, RW_READER); 2706689Smaybee if (err = dsl_dataset_hold_obj(dp, 2716689Smaybee za.za_first_integer, FTAG, &ds)) { 2725094Slling 
rw_exit(&dp->dp_config_rwlock); 2735094Slling break; 2745094Slling } 2755094Slling 2765094Slling strval = kmem_alloc( 2775094Slling MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 2785094Slling KM_SLEEP); 2795094Slling dsl_dataset_name(ds, strval); 2806689Smaybee dsl_dataset_rele(ds, FTAG); 2815094Slling rw_exit(&dp->dp_config_rwlock); 2825094Slling } else { 2835094Slling strval = NULL; 2845094Slling intval = za.za_first_integer; 2855094Slling } 2865094Slling 2875949Slling spa_prop_add_list(*nvp, prop, strval, intval, src); 2885094Slling 2895094Slling if (strval != NULL) 2905094Slling kmem_free(strval, 2915094Slling MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 2925094Slling 2935094Slling break; 2945094Slling 2955094Slling case 1: 2965094Slling /* string property */ 2975094Slling strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 2985094Slling err = zap_lookup(mos, spa->spa_pool_props_object, 2995094Slling za.za_name, 1, za.za_num_integers, strval); 3005094Slling if (err) { 3015094Slling kmem_free(strval, za.za_num_integers); 3025094Slling break; 3035094Slling } 3045949Slling spa_prop_add_list(*nvp, prop, strval, 0, src); 3055094Slling kmem_free(strval, za.za_num_integers); 3065094Slling break; 3075094Slling 3085094Slling default: 3095094Slling break; 3105094Slling } 3115094Slling } 3125094Slling zap_cursor_fini(&zc); 3135094Slling mutex_exit(&spa->spa_props_lock); 3145094Slling out: 3155094Slling if (err && err != ENOENT) { 3165094Slling nvlist_free(*nvp); 3175949Slling *nvp = NULL; 3185094Slling return (err); 3195094Slling } 3205094Slling 3215094Slling return (0); 3225094Slling } 3235094Slling 3245094Slling /* 3255094Slling * Validate the given pool properties nvlist and modify the list 3265094Slling * for the property values to be set. 
3275094Slling */ 3285094Slling static int 3295094Slling spa_prop_validate(spa_t *spa, nvlist_t *props) 3305094Slling { 3315094Slling nvpair_t *elem; 3325094Slling int error = 0, reset_bootfs = 0; 3335094Slling uint64_t objnum; 3345094Slling 3355094Slling elem = NULL; 3365094Slling while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 3375094Slling zpool_prop_t prop; 3385094Slling char *propname, *strval; 3395094Slling uint64_t intval; 3405094Slling objset_t *os; 3415363Seschrock char *slash; 3425094Slling 3435094Slling propname = nvpair_name(elem); 3445094Slling 3455094Slling if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 3465094Slling return (EINVAL); 3475094Slling 3485094Slling switch (prop) { 3495094Slling case ZPOOL_PROP_VERSION: 3505094Slling error = nvpair_value_uint64(elem, &intval); 3515094Slling if (!error && 3525094Slling (intval < spa_version(spa) || intval > SPA_VERSION)) 3535094Slling error = EINVAL; 3545094Slling break; 3555094Slling 3565094Slling case ZPOOL_PROP_DELEGATION: 3575094Slling case ZPOOL_PROP_AUTOREPLACE: 3587538SRichard.Morris@Sun.COM case ZPOOL_PROP_LISTSNAPS: 3599816SGeorge.Wilson@Sun.COM case ZPOOL_PROP_AUTOEXPAND: 3605094Slling error = nvpair_value_uint64(elem, &intval); 3615094Slling if (!error && intval > 1) 3625094Slling error = EINVAL; 3635094Slling break; 3645094Slling 3655094Slling case ZPOOL_PROP_BOOTFS: 3669630SJeff.Bonwick@Sun.COM /* 3679630SJeff.Bonwick@Sun.COM * If the pool version is less than SPA_VERSION_BOOTFS, 3689630SJeff.Bonwick@Sun.COM * or the pool is still being created (version == 0), 3699630SJeff.Bonwick@Sun.COM * the bootfs property cannot be set. 
3709630SJeff.Bonwick@Sun.COM */ 3715094Slling if (spa_version(spa) < SPA_VERSION_BOOTFS) { 3725094Slling error = ENOTSUP; 3735094Slling break; 3745094Slling } 3755094Slling 3765094Slling /* 3777042Sgw25295 * Make sure the vdev config is bootable 3785094Slling */ 3797042Sgw25295 if (!vdev_is_bootable(spa->spa_root_vdev)) { 3805094Slling error = ENOTSUP; 3815094Slling break; 3825094Slling } 3835094Slling 3845094Slling reset_bootfs = 1; 3855094Slling 3865094Slling error = nvpair_value_string(elem, &strval); 3875094Slling 3885094Slling if (!error) { 3897042Sgw25295 uint64_t compress; 3907042Sgw25295 3915094Slling if (strval == NULL || strval[0] == '\0') { 3925094Slling objnum = zpool_prop_default_numeric( 3935094Slling ZPOOL_PROP_BOOTFS); 3945094Slling break; 3955094Slling } 3965094Slling 39710298SMatthew.Ahrens@Sun.COM if (error = dmu_objset_hold(strval, FTAG, &os)) 3985094Slling break; 3997042Sgw25295 40010298SMatthew.Ahrens@Sun.COM /* Must be ZPL and not gzip compressed. */ 40110298SMatthew.Ahrens@Sun.COM 40210298SMatthew.Ahrens@Sun.COM if (dmu_objset_type(os) != DMU_OST_ZFS) { 40310298SMatthew.Ahrens@Sun.COM error = ENOTSUP; 40410298SMatthew.Ahrens@Sun.COM } else if ((error = dsl_prop_get_integer(strval, 4057042Sgw25295 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 4067042Sgw25295 &compress, NULL)) == 0 && 4077042Sgw25295 !BOOTFS_COMPRESS_VALID(compress)) { 4087042Sgw25295 error = ENOTSUP; 4097042Sgw25295 } else { 4107042Sgw25295 objnum = dmu_objset_id(os); 4117042Sgw25295 } 41210298SMatthew.Ahrens@Sun.COM dmu_objset_rele(os, FTAG); 4135094Slling } 4145094Slling break; 4157754SJeff.Bonwick@Sun.COM 4165329Sgw25295 case ZPOOL_PROP_FAILUREMODE: 4175329Sgw25295 error = nvpair_value_uint64(elem, &intval); 4185329Sgw25295 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 4195329Sgw25295 intval > ZIO_FAILURE_MODE_PANIC)) 4205329Sgw25295 error = EINVAL; 4215329Sgw25295 4225329Sgw25295 /* 4235329Sgw25295 * This is a special case which only occurs when 4245329Sgw25295 * the pool 
has completely failed. This allows 4255329Sgw25295 * the user to change the in-core failmode property 4265329Sgw25295 * without syncing it out to disk (I/Os might 4275329Sgw25295 * currently be blocked). We do this by returning 4285329Sgw25295 * EIO to the caller (spa_prop_set) to trick it 4295329Sgw25295 * into thinking we encountered a property validation 4305329Sgw25295 * error. 4315329Sgw25295 */ 4327754SJeff.Bonwick@Sun.COM if (!error && spa_suspended(spa)) { 4335329Sgw25295 spa->spa_failmode = intval; 4345329Sgw25295 error = EIO; 4355329Sgw25295 } 4365329Sgw25295 break; 4375363Seschrock 4385363Seschrock case ZPOOL_PROP_CACHEFILE: 4395363Seschrock if ((error = nvpair_value_string(elem, &strval)) != 0) 4405363Seschrock break; 4415363Seschrock 4425363Seschrock if (strval[0] == '\0') 4435363Seschrock break; 4445363Seschrock 4455363Seschrock if (strcmp(strval, "none") == 0) 4465363Seschrock break; 4475363Seschrock 4485363Seschrock if (strval[0] != '/') { 4495363Seschrock error = EINVAL; 4505363Seschrock break; 4515363Seschrock } 4525363Seschrock 4535363Seschrock slash = strrchr(strval, '/'); 4545363Seschrock ASSERT(slash != NULL); 4555363Seschrock 4565363Seschrock if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 4575363Seschrock strcmp(slash, "/..") == 0) 4585363Seschrock error = EINVAL; 4595363Seschrock break; 46010922SJeff.Bonwick@Sun.COM 46110922SJeff.Bonwick@Sun.COM case ZPOOL_PROP_DEDUPDITTO: 46210922SJeff.Bonwick@Sun.COM if (spa_version(spa) < SPA_VERSION_DEDUP) 46310922SJeff.Bonwick@Sun.COM error = ENOTSUP; 46410922SJeff.Bonwick@Sun.COM else 46510922SJeff.Bonwick@Sun.COM error = nvpair_value_uint64(elem, &intval); 46610922SJeff.Bonwick@Sun.COM if (error == 0 && 46710922SJeff.Bonwick@Sun.COM intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 46810922SJeff.Bonwick@Sun.COM error = EINVAL; 46910922SJeff.Bonwick@Sun.COM break; 4705094Slling } 4715094Slling 4725094Slling if (error) 4735094Slling break; 4745094Slling } 4755094Slling 4765094Slling if (!error && 
reset_bootfs) { 4775094Slling error = nvlist_remove(props, 4785094Slling zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 4795094Slling 4805094Slling if (!error) { 4815094Slling error = nvlist_add_uint64(props, 4825094Slling zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 4835094Slling } 4845094Slling } 4855094Slling 4865094Slling return (error); 4875094Slling } 4885094Slling 4898525SEric.Schrock@Sun.COM void 4908525SEric.Schrock@Sun.COM spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 4918525SEric.Schrock@Sun.COM { 4928525SEric.Schrock@Sun.COM char *cachefile; 4938525SEric.Schrock@Sun.COM spa_config_dirent_t *dp; 4948525SEric.Schrock@Sun.COM 4958525SEric.Schrock@Sun.COM if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 4968525SEric.Schrock@Sun.COM &cachefile) != 0) 4978525SEric.Schrock@Sun.COM return; 4988525SEric.Schrock@Sun.COM 4998525SEric.Schrock@Sun.COM dp = kmem_alloc(sizeof (spa_config_dirent_t), 5008525SEric.Schrock@Sun.COM KM_SLEEP); 5018525SEric.Schrock@Sun.COM 5028525SEric.Schrock@Sun.COM if (cachefile[0] == '\0') 5038525SEric.Schrock@Sun.COM dp->scd_path = spa_strdup(spa_config_path); 5048525SEric.Schrock@Sun.COM else if (strcmp(cachefile, "none") == 0) 5058525SEric.Schrock@Sun.COM dp->scd_path = NULL; 5068525SEric.Schrock@Sun.COM else 5078525SEric.Schrock@Sun.COM dp->scd_path = spa_strdup(cachefile); 5088525SEric.Schrock@Sun.COM 5098525SEric.Schrock@Sun.COM list_insert_head(&spa->spa_config_list, dp); 5108525SEric.Schrock@Sun.COM if (need_sync) 5118525SEric.Schrock@Sun.COM spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 5128525SEric.Schrock@Sun.COM } 5138525SEric.Schrock@Sun.COM 5145094Slling int 5155094Slling spa_prop_set(spa_t *spa, nvlist_t *nvp) 5165094Slling { 5175094Slling int error; 5188525SEric.Schrock@Sun.COM nvpair_t *elem; 5198525SEric.Schrock@Sun.COM boolean_t need_sync = B_FALSE; 5208525SEric.Schrock@Sun.COM zpool_prop_t prop; 5215094Slling 5225094Slling if ((error = spa_prop_validate(spa, 
nvp)) != 0) 5235094Slling return (error); 5245094Slling 5258525SEric.Schrock@Sun.COM elem = NULL; 5268525SEric.Schrock@Sun.COM while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 5278525SEric.Schrock@Sun.COM if ((prop = zpool_name_to_prop( 5288525SEric.Schrock@Sun.COM nvpair_name(elem))) == ZPROP_INVAL) 5298525SEric.Schrock@Sun.COM return (EINVAL); 5308525SEric.Schrock@Sun.COM 5318525SEric.Schrock@Sun.COM if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) 5328525SEric.Schrock@Sun.COM continue; 5338525SEric.Schrock@Sun.COM 5348525SEric.Schrock@Sun.COM need_sync = B_TRUE; 5358525SEric.Schrock@Sun.COM break; 5368525SEric.Schrock@Sun.COM } 5378525SEric.Schrock@Sun.COM 5388525SEric.Schrock@Sun.COM if (need_sync) 5398525SEric.Schrock@Sun.COM return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 5408525SEric.Schrock@Sun.COM spa, nvp, 3)); 5418525SEric.Schrock@Sun.COM else 5428525SEric.Schrock@Sun.COM return (0); 5435094Slling } 5445094Slling 5455094Slling /* 5465094Slling * If the bootfs property value is dsobj, clear it. 
5475094Slling */ 5485094Slling void 5495094Slling spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 5505094Slling { 5515094Slling if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 5525094Slling VERIFY(zap_remove(spa->spa_meta_objset, 5535094Slling spa->spa_pool_props_object, 5545094Slling zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 5555094Slling spa->spa_bootfs = 0; 5565094Slling } 5575094Slling } 5585094Slling 559789Sahrens /* 560789Sahrens * ========================================================================== 561789Sahrens * SPA state manipulation (open/create/destroy/import/export) 562789Sahrens * ========================================================================== 563789Sahrens */ 564789Sahrens 5651544Seschrock static int 5661544Seschrock spa_error_entry_compare(const void *a, const void *b) 5671544Seschrock { 5681544Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 5691544Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 5701544Seschrock int ret; 5711544Seschrock 5721544Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 5731544Seschrock sizeof (zbookmark_t)); 5741544Seschrock 5751544Seschrock if (ret < 0) 5761544Seschrock return (-1); 5771544Seschrock else if (ret > 0) 5781544Seschrock return (1); 5791544Seschrock else 5801544Seschrock return (0); 5811544Seschrock } 5821544Seschrock 5831544Seschrock /* 5841544Seschrock * Utility function which retrieves copies of the current logs and 5851544Seschrock * re-initializes them in the process. 
5861544Seschrock */ 5871544Seschrock void 5881544Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 5891544Seschrock { 5901544Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 5911544Seschrock 5921544Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 5931544Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 5941544Seschrock 5951544Seschrock avl_create(&spa->spa_errlist_scrub, 5961544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 5971544Seschrock offsetof(spa_error_entry_t, se_avl)); 5981544Seschrock avl_create(&spa->spa_errlist_last, 5991544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 6001544Seschrock offsetof(spa_error_entry_t, se_avl)); 6011544Seschrock } 6021544Seschrock 60311173SJonathan.Adams@Sun.COM static taskq_t * 60411173SJonathan.Adams@Sun.COM spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 60511173SJonathan.Adams@Sun.COM uint_t value) 60611173SJonathan.Adams@Sun.COM { 60711173SJonathan.Adams@Sun.COM uint_t flags = TASKQ_PREPOPULATE; 60811173SJonathan.Adams@Sun.COM boolean_t batch = B_FALSE; 60911173SJonathan.Adams@Sun.COM 61011173SJonathan.Adams@Sun.COM switch (mode) { 61111173SJonathan.Adams@Sun.COM case zti_mode_null: 61211173SJonathan.Adams@Sun.COM return (NULL); /* no taskq needed */ 61311173SJonathan.Adams@Sun.COM 61411173SJonathan.Adams@Sun.COM case zti_mode_fixed: 61511173SJonathan.Adams@Sun.COM ASSERT3U(value, >=, 1); 61611173SJonathan.Adams@Sun.COM value = MAX(value, 1); 61711173SJonathan.Adams@Sun.COM break; 61811173SJonathan.Adams@Sun.COM 61911173SJonathan.Adams@Sun.COM case zti_mode_batch: 62011173SJonathan.Adams@Sun.COM batch = B_TRUE; 62111173SJonathan.Adams@Sun.COM flags |= TASKQ_THREADS_CPU_PCT; 62211173SJonathan.Adams@Sun.COM value = zio_taskq_batch_pct; 62311173SJonathan.Adams@Sun.COM break; 62411173SJonathan.Adams@Sun.COM 62511173SJonathan.Adams@Sun.COM case zti_mode_online_percent: 62611173SJonathan.Adams@Sun.COM 
flags |= TASKQ_THREADS_CPU_PCT; 62711173SJonathan.Adams@Sun.COM break; 62811173SJonathan.Adams@Sun.COM 62911173SJonathan.Adams@Sun.COM default: 63011173SJonathan.Adams@Sun.COM panic("unrecognized mode for %s taskq (%u:%u) in " 63111173SJonathan.Adams@Sun.COM "spa_activate()", 63211173SJonathan.Adams@Sun.COM name, mode, value); 63311173SJonathan.Adams@Sun.COM break; 63411173SJonathan.Adams@Sun.COM } 63511173SJonathan.Adams@Sun.COM 63611173SJonathan.Adams@Sun.COM if (zio_taskq_sysdc && spa->spa_proc != &p0) { 63711173SJonathan.Adams@Sun.COM if (batch) 63811173SJonathan.Adams@Sun.COM flags |= TASKQ_DC_BATCH; 63911173SJonathan.Adams@Sun.COM 64011173SJonathan.Adams@Sun.COM return (taskq_create_sysdc(name, value, 50, INT_MAX, 64111173SJonathan.Adams@Sun.COM spa->spa_proc, zio_taskq_basedc, flags)); 64211173SJonathan.Adams@Sun.COM } 64311173SJonathan.Adams@Sun.COM return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 64411173SJonathan.Adams@Sun.COM spa->spa_proc, flags)); 64511173SJonathan.Adams@Sun.COM } 64611173SJonathan.Adams@Sun.COM 64711173SJonathan.Adams@Sun.COM static void 64811173SJonathan.Adams@Sun.COM spa_create_zio_taskqs(spa_t *spa) 64911173SJonathan.Adams@Sun.COM { 65011173SJonathan.Adams@Sun.COM for (int t = 0; t < ZIO_TYPES; t++) { 65111173SJonathan.Adams@Sun.COM for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 65211173SJonathan.Adams@Sun.COM const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 65311173SJonathan.Adams@Sun.COM enum zti_modes mode = ztip->zti_mode; 65411173SJonathan.Adams@Sun.COM uint_t value = ztip->zti_value; 65511173SJonathan.Adams@Sun.COM char name[32]; 65611173SJonathan.Adams@Sun.COM 65711173SJonathan.Adams@Sun.COM (void) snprintf(name, sizeof (name), 65811173SJonathan.Adams@Sun.COM "%s_%s", zio_type_name[t], zio_taskq_types[q]); 65911173SJonathan.Adams@Sun.COM 66011173SJonathan.Adams@Sun.COM spa->spa_zio_taskq[t][q] = 66111173SJonathan.Adams@Sun.COM spa_taskq_create(spa, name, mode, value); 66211173SJonathan.Adams@Sun.COM } 
66311173SJonathan.Adams@Sun.COM } 66411173SJonathan.Adams@Sun.COM } 66511173SJonathan.Adams@Sun.COM 66611173SJonathan.Adams@Sun.COM #ifdef _KERNEL 66711173SJonathan.Adams@Sun.COM static void 66811173SJonathan.Adams@Sun.COM spa_thread(void *arg) 66911173SJonathan.Adams@Sun.COM { 67011173SJonathan.Adams@Sun.COM callb_cpr_t cprinfo; 67111173SJonathan.Adams@Sun.COM 67211173SJonathan.Adams@Sun.COM spa_t *spa = arg; 67311173SJonathan.Adams@Sun.COM user_t *pu = PTOU(curproc); 67411173SJonathan.Adams@Sun.COM 67511173SJonathan.Adams@Sun.COM CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 67611173SJonathan.Adams@Sun.COM spa->spa_name); 67711173SJonathan.Adams@Sun.COM 67811173SJonathan.Adams@Sun.COM ASSERT(curproc != &p0); 67911173SJonathan.Adams@Sun.COM (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 68011173SJonathan.Adams@Sun.COM "zpool-%s", spa->spa_name); 68111173SJonathan.Adams@Sun.COM (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 68211173SJonathan.Adams@Sun.COM 68311173SJonathan.Adams@Sun.COM /* bind this thread to the requested psrset */ 68411173SJonathan.Adams@Sun.COM if (zio_taskq_psrset_bind != PS_NONE) { 68511173SJonathan.Adams@Sun.COM pool_lock(); 68611173SJonathan.Adams@Sun.COM mutex_enter(&cpu_lock); 68711173SJonathan.Adams@Sun.COM mutex_enter(&pidlock); 68811173SJonathan.Adams@Sun.COM mutex_enter(&curproc->p_lock); 68911173SJonathan.Adams@Sun.COM 69011173SJonathan.Adams@Sun.COM if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 69111173SJonathan.Adams@Sun.COM 0, NULL, NULL) == 0) { 69211173SJonathan.Adams@Sun.COM curthread->t_bind_pset = zio_taskq_psrset_bind; 69311173SJonathan.Adams@Sun.COM } else { 69411173SJonathan.Adams@Sun.COM cmn_err(CE_WARN, 69511173SJonathan.Adams@Sun.COM "Couldn't bind process for zfs pool \"%s\" to " 69611173SJonathan.Adams@Sun.COM "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 69711173SJonathan.Adams@Sun.COM } 69811173SJonathan.Adams@Sun.COM 69911173SJonathan.Adams@Sun.COM 
mutex_exit(&curproc->p_lock); 70011173SJonathan.Adams@Sun.COM mutex_exit(&pidlock); 70111173SJonathan.Adams@Sun.COM mutex_exit(&cpu_lock); 70211173SJonathan.Adams@Sun.COM pool_unlock(); 70311173SJonathan.Adams@Sun.COM } 70411173SJonathan.Adams@Sun.COM 70511173SJonathan.Adams@Sun.COM if (zio_taskq_sysdc) { 70611173SJonathan.Adams@Sun.COM sysdc_thread_enter(curthread, 100, 0); 70711173SJonathan.Adams@Sun.COM } 70811173SJonathan.Adams@Sun.COM 70911173SJonathan.Adams@Sun.COM spa->spa_proc = curproc; 71011173SJonathan.Adams@Sun.COM spa->spa_did = curthread->t_did; 71111173SJonathan.Adams@Sun.COM 71211173SJonathan.Adams@Sun.COM spa_create_zio_taskqs(spa); 71311173SJonathan.Adams@Sun.COM 71411173SJonathan.Adams@Sun.COM mutex_enter(&spa->spa_proc_lock); 71511173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 71611173SJonathan.Adams@Sun.COM 71711173SJonathan.Adams@Sun.COM spa->spa_proc_state = SPA_PROC_ACTIVE; 71811173SJonathan.Adams@Sun.COM cv_broadcast(&spa->spa_proc_cv); 71911173SJonathan.Adams@Sun.COM 72011173SJonathan.Adams@Sun.COM CALLB_CPR_SAFE_BEGIN(&cprinfo); 72111173SJonathan.Adams@Sun.COM while (spa->spa_proc_state == SPA_PROC_ACTIVE) 72211173SJonathan.Adams@Sun.COM cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 72311173SJonathan.Adams@Sun.COM CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 72411173SJonathan.Adams@Sun.COM 72511173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 72611173SJonathan.Adams@Sun.COM spa->spa_proc_state = SPA_PROC_GONE; 72711173SJonathan.Adams@Sun.COM spa->spa_proc = &p0; 72811173SJonathan.Adams@Sun.COM cv_broadcast(&spa->spa_proc_cv); 72911173SJonathan.Adams@Sun.COM CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 73011173SJonathan.Adams@Sun.COM 73111173SJonathan.Adams@Sun.COM mutex_enter(&curproc->p_lock); 73211173SJonathan.Adams@Sun.COM lwp_exit(); 73311173SJonathan.Adams@Sun.COM } 73411173SJonathan.Adams@Sun.COM #endif 73511173SJonathan.Adams@Sun.COM 736789Sahrens /* 
737789Sahrens * Activate an uninitialized pool. 738789Sahrens */ 739789Sahrens static void 7408241SJeff.Bonwick@Sun.COM spa_activate(spa_t *spa, int mode) 741789Sahrens { 742789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 743789Sahrens 744789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 7458241SJeff.Bonwick@Sun.COM spa->spa_mode = mode; 746789Sahrens 74710594SGeorge.Wilson@Sun.COM spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 74810594SGeorge.Wilson@Sun.COM spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 749789Sahrens 75011173SJonathan.Adams@Sun.COM /* Try to create a covering process */ 75111173SJonathan.Adams@Sun.COM mutex_enter(&spa->spa_proc_lock); 75211173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 75311173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc == &p0); 75411173SJonathan.Adams@Sun.COM spa->spa_did = 0; 75511173SJonathan.Adams@Sun.COM 75611173SJonathan.Adams@Sun.COM /* Only create a process if we're going to be around a while. 
*/ 75711173SJonathan.Adams@Sun.COM if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 75811173SJonathan.Adams@Sun.COM if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 75911173SJonathan.Adams@Sun.COM NULL, 0) == 0) { 76011173SJonathan.Adams@Sun.COM spa->spa_proc_state = SPA_PROC_CREATED; 76111173SJonathan.Adams@Sun.COM while (spa->spa_proc_state == SPA_PROC_CREATED) { 76211173SJonathan.Adams@Sun.COM cv_wait(&spa->spa_proc_cv, 76311173SJonathan.Adams@Sun.COM &spa->spa_proc_lock); 7649515SJonathan.Adams@Sun.COM } 76511173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 76611173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc != &p0); 76711173SJonathan.Adams@Sun.COM ASSERT(spa->spa_did != 0); 76811173SJonathan.Adams@Sun.COM } else { 76911173SJonathan.Adams@Sun.COM #ifdef _KERNEL 77011173SJonathan.Adams@Sun.COM cmn_err(CE_WARN, 77111173SJonathan.Adams@Sun.COM "Couldn't create process for zfs pool \"%s\"\n", 77211173SJonathan.Adams@Sun.COM spa->spa_name); 77311173SJonathan.Adams@Sun.COM #endif 7747754SJeff.Bonwick@Sun.COM } 775789Sahrens } 77611173SJonathan.Adams@Sun.COM mutex_exit(&spa->spa_proc_lock); 77711173SJonathan.Adams@Sun.COM 77811173SJonathan.Adams@Sun.COM /* If we didn't create a process, we need to create our taskqs. 
*/ 77911173SJonathan.Adams@Sun.COM if (spa->spa_proc == &p0) { 78011173SJonathan.Adams@Sun.COM spa_create_zio_taskqs(spa); 78111173SJonathan.Adams@Sun.COM } 782789Sahrens 7837754SJeff.Bonwick@Sun.COM list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 7847754SJeff.Bonwick@Sun.COM offsetof(vdev_t, vdev_config_dirty_node)); 7857754SJeff.Bonwick@Sun.COM list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 7867754SJeff.Bonwick@Sun.COM offsetof(vdev_t, vdev_state_dirty_node)); 787789Sahrens 788789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 789789Sahrens offsetof(struct vdev, vdev_txg_node)); 7901544Seschrock 7911544Seschrock avl_create(&spa->spa_errlist_scrub, 7921544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 7931544Seschrock offsetof(spa_error_entry_t, se_avl)); 7941544Seschrock avl_create(&spa->spa_errlist_last, 7951544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 7961544Seschrock offsetof(spa_error_entry_t, se_avl)); 797789Sahrens } 798789Sahrens 799789Sahrens /* 800789Sahrens * Opposite of spa_activate(). 
801789Sahrens */ 802789Sahrens static void 803789Sahrens spa_deactivate(spa_t *spa) 804789Sahrens { 805789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 806789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 807789Sahrens ASSERT(spa->spa_root_vdev == NULL); 8089630SJeff.Bonwick@Sun.COM ASSERT(spa->spa_async_zio_root == NULL); 809789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 810789Sahrens 811789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 812789Sahrens 8137754SJeff.Bonwick@Sun.COM list_destroy(&spa->spa_config_dirty_list); 8147754SJeff.Bonwick@Sun.COM list_destroy(&spa->spa_state_dirty_list); 8157754SJeff.Bonwick@Sun.COM 8167754SJeff.Bonwick@Sun.COM for (int t = 0; t < ZIO_TYPES; t++) { 8177754SJeff.Bonwick@Sun.COM for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 81811146SGeorge.Wilson@Sun.COM if (spa->spa_zio_taskq[t][q] != NULL) 81911146SGeorge.Wilson@Sun.COM taskq_destroy(spa->spa_zio_taskq[t][q]); 8207754SJeff.Bonwick@Sun.COM spa->spa_zio_taskq[t][q] = NULL; 8217754SJeff.Bonwick@Sun.COM } 822789Sahrens } 823789Sahrens 824789Sahrens metaslab_class_destroy(spa->spa_normal_class); 825789Sahrens spa->spa_normal_class = NULL; 826789Sahrens 8274527Sperrin metaslab_class_destroy(spa->spa_log_class); 8284527Sperrin spa->spa_log_class = NULL; 8294527Sperrin 8301544Seschrock /* 8311544Seschrock * If this was part of an import or the open otherwise failed, we may 8321544Seschrock * still have errors left in the queues. Empty them just in case. 
8331544Seschrock */ 8341544Seschrock spa_errlog_drain(spa); 8351544Seschrock 8361544Seschrock avl_destroy(&spa->spa_errlist_scrub); 8371544Seschrock avl_destroy(&spa->spa_errlist_last); 8381544Seschrock 839789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 84011173SJonathan.Adams@Sun.COM 84111173SJonathan.Adams@Sun.COM mutex_enter(&spa->spa_proc_lock); 84211173SJonathan.Adams@Sun.COM if (spa->spa_proc_state != SPA_PROC_NONE) { 84311173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 84411173SJonathan.Adams@Sun.COM spa->spa_proc_state = SPA_PROC_DEACTIVATE; 84511173SJonathan.Adams@Sun.COM cv_broadcast(&spa->spa_proc_cv); 84611173SJonathan.Adams@Sun.COM while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 84711173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc != &p0); 84811173SJonathan.Adams@Sun.COM cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 84911173SJonathan.Adams@Sun.COM } 85011173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 85111173SJonathan.Adams@Sun.COM spa->spa_proc_state = SPA_PROC_NONE; 85211173SJonathan.Adams@Sun.COM } 85311173SJonathan.Adams@Sun.COM ASSERT(spa->spa_proc == &p0); 85411173SJonathan.Adams@Sun.COM mutex_exit(&spa->spa_proc_lock); 85511173SJonathan.Adams@Sun.COM 85611173SJonathan.Adams@Sun.COM /* 85711173SJonathan.Adams@Sun.COM * We want to make sure spa_thread() has actually exited the ZFS 85811173SJonathan.Adams@Sun.COM * module, so that the module can't be unloaded out from underneath 85911173SJonathan.Adams@Sun.COM * it. 86011173SJonathan.Adams@Sun.COM */ 86111173SJonathan.Adams@Sun.COM if (spa->spa_did != 0) { 86211173SJonathan.Adams@Sun.COM thread_join(spa->spa_did); 86311173SJonathan.Adams@Sun.COM spa->spa_did = 0; 86411173SJonathan.Adams@Sun.COM } 865789Sahrens } 866789Sahrens 867789Sahrens /* 868789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. 
This 869789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 870789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 871789Sahrens * All vdev validation is done by the vdev_alloc() routine. 872789Sahrens */ 8732082Seschrock static int 8742082Seschrock spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 8752082Seschrock uint_t id, int atype) 876789Sahrens { 877789Sahrens nvlist_t **child; 8789816SGeorge.Wilson@Sun.COM uint_t children; 8792082Seschrock int error; 8802082Seschrock 8812082Seschrock if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 8822082Seschrock return (error); 8832082Seschrock 8842082Seschrock if ((*vdp)->vdev_ops->vdev_op_leaf) 8852082Seschrock return (0); 886789Sahrens 8877754SJeff.Bonwick@Sun.COM error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 8887754SJeff.Bonwick@Sun.COM &child, &children); 8897754SJeff.Bonwick@Sun.COM 8907754SJeff.Bonwick@Sun.COM if (error == ENOENT) 8917754SJeff.Bonwick@Sun.COM return (0); 8927754SJeff.Bonwick@Sun.COM 8937754SJeff.Bonwick@Sun.COM if (error) { 8942082Seschrock vdev_free(*vdp); 8952082Seschrock *vdp = NULL; 8962082Seschrock return (EINVAL); 897789Sahrens } 898789Sahrens 8999816SGeorge.Wilson@Sun.COM for (int c = 0; c < children; c++) { 9002082Seschrock vdev_t *vd; 9012082Seschrock if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 9022082Seschrock atype)) != 0) { 9032082Seschrock vdev_free(*vdp); 9042082Seschrock *vdp = NULL; 9052082Seschrock return (error); 906789Sahrens } 907789Sahrens } 908789Sahrens 9092082Seschrock ASSERT(*vdp != NULL); 9102082Seschrock 9112082Seschrock return (0); 912789Sahrens } 913789Sahrens 914789Sahrens /* 915789Sahrens * Opposite of spa_load(). 
916789Sahrens */ 917789Sahrens static void 918789Sahrens spa_unload(spa_t *spa) 919789Sahrens { 9202082Seschrock int i; 9212082Seschrock 9227754SJeff.Bonwick@Sun.COM ASSERT(MUTEX_HELD(&spa_namespace_lock)); 9237754SJeff.Bonwick@Sun.COM 924789Sahrens /* 9251544Seschrock * Stop async tasks. 9261544Seschrock */ 9271544Seschrock spa_async_suspend(spa); 9281544Seschrock 9291544Seschrock /* 930789Sahrens * Stop syncing. 931789Sahrens */ 932789Sahrens if (spa->spa_sync_on) { 933789Sahrens txg_sync_stop(spa->spa_dsl_pool); 934789Sahrens spa->spa_sync_on = B_FALSE; 935789Sahrens } 936789Sahrens 937789Sahrens /* 9387754SJeff.Bonwick@Sun.COM * Wait for any outstanding async I/O to complete. 939789Sahrens */ 9409234SGeorge.Wilson@Sun.COM if (spa->spa_async_zio_root != NULL) { 9419234SGeorge.Wilson@Sun.COM (void) zio_wait(spa->spa_async_zio_root); 9429234SGeorge.Wilson@Sun.COM spa->spa_async_zio_root = NULL; 9439234SGeorge.Wilson@Sun.COM } 944789Sahrens 945789Sahrens /* 946789Sahrens * Close the dsl pool. 947789Sahrens */ 948789Sahrens if (spa->spa_dsl_pool) { 949789Sahrens dsl_pool_close(spa->spa_dsl_pool); 950789Sahrens spa->spa_dsl_pool = NULL; 95111619SGeorge.Wilson@Sun.COM spa->spa_meta_objset = NULL; 952789Sahrens } 953789Sahrens 95410922SJeff.Bonwick@Sun.COM ddt_unload(spa); 95510922SJeff.Bonwick@Sun.COM 9568241SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9578241SJeff.Bonwick@Sun.COM 9588241SJeff.Bonwick@Sun.COM /* 9598241SJeff.Bonwick@Sun.COM * Drop and purge level 2 cache 9608241SJeff.Bonwick@Sun.COM */ 9618241SJeff.Bonwick@Sun.COM spa_l2cache_drop(spa); 9628241SJeff.Bonwick@Sun.COM 963789Sahrens /* 964789Sahrens * Close all vdevs. 
965789Sahrens */ 9661585Sbonwick if (spa->spa_root_vdev) 967789Sahrens vdev_free(spa->spa_root_vdev); 9681585Sbonwick ASSERT(spa->spa_root_vdev == NULL); 9691544Seschrock 9705450Sbrendan for (i = 0; i < spa->spa_spares.sav_count; i++) 9715450Sbrendan vdev_free(spa->spa_spares.sav_vdevs[i]); 9725450Sbrendan if (spa->spa_spares.sav_vdevs) { 9735450Sbrendan kmem_free(spa->spa_spares.sav_vdevs, 9745450Sbrendan spa->spa_spares.sav_count * sizeof (void *)); 9755450Sbrendan spa->spa_spares.sav_vdevs = NULL; 9765450Sbrendan } 9775450Sbrendan if (spa->spa_spares.sav_config) { 9785450Sbrendan nvlist_free(spa->spa_spares.sav_config); 9795450Sbrendan spa->spa_spares.sav_config = NULL; 9802082Seschrock } 9817377SEric.Schrock@Sun.COM spa->spa_spares.sav_count = 0; 9825450Sbrendan 9835450Sbrendan for (i = 0; i < spa->spa_l2cache.sav_count; i++) 9845450Sbrendan vdev_free(spa->spa_l2cache.sav_vdevs[i]); 9855450Sbrendan if (spa->spa_l2cache.sav_vdevs) { 9865450Sbrendan kmem_free(spa->spa_l2cache.sav_vdevs, 9875450Sbrendan spa->spa_l2cache.sav_count * sizeof (void *)); 9885450Sbrendan spa->spa_l2cache.sav_vdevs = NULL; 9895450Sbrendan } 9905450Sbrendan if (spa->spa_l2cache.sav_config) { 9915450Sbrendan nvlist_free(spa->spa_l2cache.sav_config); 9925450Sbrendan spa->spa_l2cache.sav_config = NULL; 9932082Seschrock } 9947377SEric.Schrock@Sun.COM spa->spa_l2cache.sav_count = 0; 9952082Seschrock 9961544Seschrock spa->spa_async_suspended = 0; 9978241SJeff.Bonwick@Sun.COM 9988241SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 999789Sahrens } 1000789Sahrens 1001789Sahrens /* 10022082Seschrock * Load (or re-load) the current list of vdevs describing the active spares for 10032082Seschrock * this pool. When this is called, we have some form of basic information in 10045450Sbrendan * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 10055450Sbrendan * then re-generate a more complete list including status information. 
10062082Seschrock */ 10072082Seschrock static void 10082082Seschrock spa_load_spares(spa_t *spa) 10092082Seschrock { 10102082Seschrock nvlist_t **spares; 10112082Seschrock uint_t nspares; 10122082Seschrock int i; 10133377Seschrock vdev_t *vd, *tvd; 10142082Seschrock 10157754SJeff.Bonwick@Sun.COM ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 10167754SJeff.Bonwick@Sun.COM 10172082Seschrock /* 10182082Seschrock * First, close and free any existing spare vdevs. 10192082Seschrock */ 10205450Sbrendan for (i = 0; i < spa->spa_spares.sav_count; i++) { 10215450Sbrendan vd = spa->spa_spares.sav_vdevs[i]; 10223377Seschrock 10233377Seschrock /* Undo the call to spa_activate() below */ 10246643Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 10256643Seschrock B_FALSE)) != NULL && tvd->vdev_isspare) 10263377Seschrock spa_spare_remove(tvd); 10273377Seschrock vdev_close(vd); 10283377Seschrock vdev_free(vd); 10292082Seschrock } 10303377Seschrock 10315450Sbrendan if (spa->spa_spares.sav_vdevs) 10325450Sbrendan kmem_free(spa->spa_spares.sav_vdevs, 10335450Sbrendan spa->spa_spares.sav_count * sizeof (void *)); 10345450Sbrendan 10355450Sbrendan if (spa->spa_spares.sav_config == NULL) 10362082Seschrock nspares = 0; 10372082Seschrock else 10385450Sbrendan VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 10392082Seschrock ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 10402082Seschrock 10415450Sbrendan spa->spa_spares.sav_count = (int)nspares; 10425450Sbrendan spa->spa_spares.sav_vdevs = NULL; 10432082Seschrock 10442082Seschrock if (nspares == 0) 10452082Seschrock return; 10462082Seschrock 10472082Seschrock /* 10482082Seschrock * Construct the array of vdevs, opening them to get status in the 10493377Seschrock * process. 
For each spare, there is potentially two different vdev_t 10503377Seschrock * structures associated with it: one in the list of spares (used only 10513377Seschrock * for basic validation purposes) and one in the active vdev 10523377Seschrock * configuration (if it's spared in). During this phase we open and 10533377Seschrock * validate each vdev on the spare list. If the vdev also exists in the 10543377Seschrock * active configuration, then we also mark this vdev as an active spare. 10552082Seschrock */ 10565450Sbrendan spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 10575450Sbrendan KM_SLEEP); 10585450Sbrendan for (i = 0; i < spa->spa_spares.sav_count; i++) { 10592082Seschrock VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 10602082Seschrock VDEV_ALLOC_SPARE) == 0); 10612082Seschrock ASSERT(vd != NULL); 10622082Seschrock 10635450Sbrendan spa->spa_spares.sav_vdevs[i] = vd; 10642082Seschrock 10656643Seschrock if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 10666643Seschrock B_FALSE)) != NULL) { 10673377Seschrock if (!tvd->vdev_isspare) 10683377Seschrock spa_spare_add(tvd); 10693377Seschrock 10703377Seschrock /* 10713377Seschrock * We only mark the spare active if we were successfully 10723377Seschrock * able to load the vdev. Otherwise, importing a pool 10733377Seschrock * with a bad active spare would result in strange 10743377Seschrock * behavior, because multiple pool would think the spare 10753377Seschrock * is actively in use. 10763377Seschrock * 10773377Seschrock * There is a vulnerability here to an equally bizarre 10783377Seschrock * circumstance, where a dead active spare is later 10793377Seschrock * brought back to life (onlined or otherwise). Given 10803377Seschrock * the rarity of this scenario, and the extra complexity 10813377Seschrock * it adds, we ignore the possibility. 
10823377Seschrock */ 10833377Seschrock if (!vdev_is_dead(tvd)) 10843377Seschrock spa_spare_activate(tvd); 10853377Seschrock } 10863377Seschrock 10877754SJeff.Bonwick@Sun.COM vd->vdev_top = vd; 10889425SEric.Schrock@Sun.COM vd->vdev_aux = &spa->spa_spares; 10897754SJeff.Bonwick@Sun.COM 10902082Seschrock if (vdev_open(vd) != 0) 10912082Seschrock continue; 10922082Seschrock 10935450Sbrendan if (vdev_validate_aux(vd) == 0) 10945450Sbrendan spa_spare_add(vd); 10952082Seschrock } 10962082Seschrock 10972082Seschrock /* 10982082Seschrock * Recompute the stashed list of spares, with status information 10992082Seschrock * this time. 11002082Seschrock */ 11015450Sbrendan VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 11022082Seschrock DATA_TYPE_NVLIST_ARRAY) == 0); 11032082Seschrock 11045450Sbrendan spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 11055450Sbrendan KM_SLEEP); 11065450Sbrendan for (i = 0; i < spa->spa_spares.sav_count; i++) 11075450Sbrendan spares[i] = vdev_config_generate(spa, 11085450Sbrendan spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); 11095450Sbrendan VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 11105450Sbrendan ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 11115450Sbrendan for (i = 0; i < spa->spa_spares.sav_count; i++) 11122082Seschrock nvlist_free(spares[i]); 11135450Sbrendan kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 11145450Sbrendan } 11155450Sbrendan 11165450Sbrendan /* 11175450Sbrendan * Load (or re-load) the current list of vdevs describing the active l2cache for 11185450Sbrendan * this pool. When this is called, we have some form of basic information in 11195450Sbrendan * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 11205450Sbrendan * then re-generate a more complete list including status information. 
11215450Sbrendan * Devices which are already active have their details maintained, and are 11225450Sbrendan * not re-opened. 11235450Sbrendan */ 11245450Sbrendan static void 11255450Sbrendan spa_load_l2cache(spa_t *spa) 11265450Sbrendan { 11275450Sbrendan nvlist_t **l2cache; 11285450Sbrendan uint_t nl2cache; 11295450Sbrendan int i, j, oldnvdevs; 11309816SGeorge.Wilson@Sun.COM uint64_t guid; 11315450Sbrendan vdev_t *vd, **oldvdevs, **newvdevs; 11325450Sbrendan spa_aux_vdev_t *sav = &spa->spa_l2cache; 11335450Sbrendan 11347754SJeff.Bonwick@Sun.COM ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 11357754SJeff.Bonwick@Sun.COM 11365450Sbrendan if (sav->sav_config != NULL) { 11375450Sbrendan VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 11385450Sbrendan ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 11395450Sbrendan newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 11405450Sbrendan } else { 11415450Sbrendan nl2cache = 0; 11425450Sbrendan } 11435450Sbrendan 11445450Sbrendan oldvdevs = sav->sav_vdevs; 11455450Sbrendan oldnvdevs = sav->sav_count; 11465450Sbrendan sav->sav_vdevs = NULL; 11475450Sbrendan sav->sav_count = 0; 11485450Sbrendan 11495450Sbrendan /* 11505450Sbrendan * Process new nvlist of vdevs. 11515450Sbrendan */ 11525450Sbrendan for (i = 0; i < nl2cache; i++) { 11535450Sbrendan VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 11545450Sbrendan &guid) == 0); 11555450Sbrendan 11565450Sbrendan newvdevs[i] = NULL; 11575450Sbrendan for (j = 0; j < oldnvdevs; j++) { 11585450Sbrendan vd = oldvdevs[j]; 11595450Sbrendan if (vd != NULL && guid == vd->vdev_guid) { 11605450Sbrendan /* 11615450Sbrendan * Retain previous vdev for add/remove ops. 
11625450Sbrendan */ 11635450Sbrendan newvdevs[i] = vd; 11645450Sbrendan oldvdevs[j] = NULL; 11655450Sbrendan break; 11665450Sbrendan } 11675450Sbrendan } 11685450Sbrendan 11695450Sbrendan if (newvdevs[i] == NULL) { 11705450Sbrendan /* 11715450Sbrendan * Create new vdev 11725450Sbrendan */ 11735450Sbrendan VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 11745450Sbrendan VDEV_ALLOC_L2CACHE) == 0); 11755450Sbrendan ASSERT(vd != NULL); 11765450Sbrendan newvdevs[i] = vd; 11775450Sbrendan 11785450Sbrendan /* 11795450Sbrendan * Commit this vdev as an l2cache device, 11805450Sbrendan * even if it fails to open. 11815450Sbrendan */ 11825450Sbrendan spa_l2cache_add(vd); 11835450Sbrendan 11846643Seschrock vd->vdev_top = vd; 11856643Seschrock vd->vdev_aux = sav; 11866643Seschrock 11876643Seschrock spa_l2cache_activate(vd); 11886643Seschrock 11895450Sbrendan if (vdev_open(vd) != 0) 11905450Sbrendan continue; 11915450Sbrendan 11925450Sbrendan (void) vdev_validate_aux(vd); 11935450Sbrendan 11949816SGeorge.Wilson@Sun.COM if (!vdev_is_dead(vd)) 11959816SGeorge.Wilson@Sun.COM l2arc_add_vdev(spa, vd); 11965450Sbrendan } 11975450Sbrendan } 11985450Sbrendan 11995450Sbrendan /* 12005450Sbrendan * Purge vdevs that were dropped 12015450Sbrendan */ 12025450Sbrendan for (i = 0; i < oldnvdevs; i++) { 12035450Sbrendan uint64_t pool; 12045450Sbrendan 12055450Sbrendan vd = oldvdevs[i]; 12065450Sbrendan if (vd != NULL) { 12078241SJeff.Bonwick@Sun.COM if (spa_l2cache_exists(vd->vdev_guid, &pool) && 12088241SJeff.Bonwick@Sun.COM pool != 0ULL && l2arc_vdev_present(vd)) 12095450Sbrendan l2arc_remove_vdev(vd); 12105450Sbrendan (void) vdev_close(vd); 12115450Sbrendan spa_l2cache_remove(vd); 12125450Sbrendan } 12135450Sbrendan } 12145450Sbrendan 12155450Sbrendan if (oldvdevs) 12165450Sbrendan kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 12175450Sbrendan 12185450Sbrendan if (sav->sav_config == NULL) 12195450Sbrendan goto out; 12205450Sbrendan 12215450Sbrendan sav->sav_vdevs = newvdevs; 
12225450Sbrendan sav->sav_count = (int)nl2cache; 12235450Sbrendan 12245450Sbrendan /* 12255450Sbrendan * Recompute the stashed list of l2cache devices, with status 12265450Sbrendan * information this time. 12275450Sbrendan */ 12285450Sbrendan VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 12295450Sbrendan DATA_TYPE_NVLIST_ARRAY) == 0); 12305450Sbrendan 12315450Sbrendan l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 12325450Sbrendan for (i = 0; i < sav->sav_count; i++) 12335450Sbrendan l2cache[i] = vdev_config_generate(spa, 12345450Sbrendan sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); 12355450Sbrendan VERIFY(nvlist_add_nvlist_array(sav->sav_config, 12365450Sbrendan ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 12375450Sbrendan out: 12385450Sbrendan for (i = 0; i < sav->sav_count; i++) 12395450Sbrendan nvlist_free(l2cache[i]); 12405450Sbrendan if (sav->sav_count) 12415450Sbrendan kmem_free(l2cache, sav->sav_count * sizeof (void *)); 12422082Seschrock } 12432082Seschrock 12442082Seschrock static int 12452082Seschrock load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 12462082Seschrock { 12472082Seschrock dmu_buf_t *db; 12482082Seschrock char *packed = NULL; 12492082Seschrock size_t nvsize = 0; 12502082Seschrock int error; 12512082Seschrock *value = NULL; 12522082Seschrock 12532082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 12542082Seschrock nvsize = *(uint64_t *)db->db_data; 12552082Seschrock dmu_buf_rele(db, FTAG); 12562082Seschrock 12572082Seschrock packed = kmem_alloc(nvsize, KM_SLEEP); 12589512SNeil.Perrin@Sun.COM error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 12599512SNeil.Perrin@Sun.COM DMU_READ_PREFETCH); 12602082Seschrock if (error == 0) 12612082Seschrock error = nvlist_unpack(packed, nvsize, value, 0); 12622082Seschrock kmem_free(packed, nvsize); 12632082Seschrock 12642082Seschrock return (error); 12652082Seschrock } 12662082Seschrock 12672082Seschrock /* 
12684451Seschrock * Checks to see if the given vdev could not be opened, in which case we post a 12694451Seschrock * sysevent to notify the autoreplace code that the device has been removed. 12704451Seschrock */ 12714451Seschrock static void 12724451Seschrock spa_check_removed(vdev_t *vd) 12734451Seschrock { 12749816SGeorge.Wilson@Sun.COM for (int c = 0; c < vd->vdev_children; c++) 12754451Seschrock spa_check_removed(vd->vdev_child[c]); 12764451Seschrock 12774451Seschrock if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 12784451Seschrock zfs_post_autoreplace(vd->vdev_spa, vd); 12794451Seschrock spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 12804451Seschrock } 12814451Seschrock } 12824451Seschrock 12834451Seschrock /* 12849701SGeorge.Wilson@Sun.COM * Load the slog device state from the config object since it's possible 12859701SGeorge.Wilson@Sun.COM * that the label does not contain the most up-to-date information. 12869701SGeorge.Wilson@Sun.COM */ 12879701SGeorge.Wilson@Sun.COM void 128810594SGeorge.Wilson@Sun.COM spa_load_log_state(spa_t *spa, nvlist_t *nv) 12899701SGeorge.Wilson@Sun.COM { 129010594SGeorge.Wilson@Sun.COM vdev_t *ovd, *rvd = spa->spa_root_vdev; 129110594SGeorge.Wilson@Sun.COM 129210594SGeorge.Wilson@Sun.COM /* 129310594SGeorge.Wilson@Sun.COM * Load the original root vdev tree from the passed config. 
129410594SGeorge.Wilson@Sun.COM */ 129510594SGeorge.Wilson@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 129610594SGeorge.Wilson@Sun.COM VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 129710594SGeorge.Wilson@Sun.COM 129810594SGeorge.Wilson@Sun.COM for (int c = 0; c < rvd->vdev_children; c++) { 129910594SGeorge.Wilson@Sun.COM vdev_t *cvd = rvd->vdev_child[c]; 130010594SGeorge.Wilson@Sun.COM if (cvd->vdev_islog) 130110594SGeorge.Wilson@Sun.COM vdev_load_log_state(cvd, ovd->vdev_child[c]); 13029701SGeorge.Wilson@Sun.COM } 130310594SGeorge.Wilson@Sun.COM vdev_free(ovd); 130410594SGeorge.Wilson@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 13059701SGeorge.Wilson@Sun.COM } 13069701SGeorge.Wilson@Sun.COM 13079701SGeorge.Wilson@Sun.COM /* 13087294Sperrin * Check for missing log devices 13097294Sperrin */ 13107294Sperrin int 13117294Sperrin spa_check_logs(spa_t *spa) 13127294Sperrin { 13137294Sperrin switch (spa->spa_log_state) { 13147294Sperrin case SPA_LOG_MISSING: 13157294Sperrin /* need to recheck in case slog has been restored */ 13167294Sperrin case SPA_LOG_UNKNOWN: 13177294Sperrin if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 13187294Sperrin DS_FIND_CHILDREN)) { 131911422SMark.Musante@Sun.COM spa_set_log_state(spa, SPA_LOG_MISSING); 13207294Sperrin return (1); 13217294Sperrin } 13227294Sperrin break; 13237294Sperrin } 13247294Sperrin return (0); 13257294Sperrin } 13267294Sperrin 132711422SMark.Musante@Sun.COM static boolean_t 132811422SMark.Musante@Sun.COM spa_passivate_log(spa_t *spa) 132911422SMark.Musante@Sun.COM { 133011422SMark.Musante@Sun.COM vdev_t *rvd = spa->spa_root_vdev; 133111422SMark.Musante@Sun.COM boolean_t slog_found = B_FALSE; 133211422SMark.Musante@Sun.COM 133311422SMark.Musante@Sun.COM ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 133411422SMark.Musante@Sun.COM 133511422SMark.Musante@Sun.COM if (!spa_has_slogs(spa)) 133611422SMark.Musante@Sun.COM return (B_FALSE); 
133711422SMark.Musante@Sun.COM 133811422SMark.Musante@Sun.COM for (int c = 0; c < rvd->vdev_children; c++) { 133911422SMark.Musante@Sun.COM vdev_t *tvd = rvd->vdev_child[c]; 134011422SMark.Musante@Sun.COM metaslab_group_t *mg = tvd->vdev_mg; 134111422SMark.Musante@Sun.COM 134211422SMark.Musante@Sun.COM if (tvd->vdev_islog) { 134311422SMark.Musante@Sun.COM metaslab_group_passivate(mg); 134411422SMark.Musante@Sun.COM slog_found = B_TRUE; 134511422SMark.Musante@Sun.COM } 134611422SMark.Musante@Sun.COM } 134711422SMark.Musante@Sun.COM 134811422SMark.Musante@Sun.COM return (slog_found); 134911422SMark.Musante@Sun.COM } 135011422SMark.Musante@Sun.COM 135111422SMark.Musante@Sun.COM static void 135211422SMark.Musante@Sun.COM spa_activate_log(spa_t *spa) 135311422SMark.Musante@Sun.COM { 135411422SMark.Musante@Sun.COM vdev_t *rvd = spa->spa_root_vdev; 135511422SMark.Musante@Sun.COM 135611422SMark.Musante@Sun.COM ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 135711422SMark.Musante@Sun.COM 135811422SMark.Musante@Sun.COM for (int c = 0; c < rvd->vdev_children; c++) { 135911422SMark.Musante@Sun.COM vdev_t *tvd = rvd->vdev_child[c]; 136011422SMark.Musante@Sun.COM metaslab_group_t *mg = tvd->vdev_mg; 136111422SMark.Musante@Sun.COM 136211422SMark.Musante@Sun.COM if (tvd->vdev_islog) 136311422SMark.Musante@Sun.COM metaslab_group_activate(mg); 136411422SMark.Musante@Sun.COM } 136511422SMark.Musante@Sun.COM } 136611422SMark.Musante@Sun.COM 136711422SMark.Musante@Sun.COM int 136811422SMark.Musante@Sun.COM spa_offline_log(spa_t *spa) 136911422SMark.Musante@Sun.COM { 137011422SMark.Musante@Sun.COM int error = 0; 137111422SMark.Musante@Sun.COM 137211422SMark.Musante@Sun.COM if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 137311422SMark.Musante@Sun.COM NULL, DS_FIND_CHILDREN)) == 0) { 137411422SMark.Musante@Sun.COM 137511422SMark.Musante@Sun.COM /* 137611422SMark.Musante@Sun.COM * We successfully offlined the log device, sync out the 137711422SMark.Musante@Sun.COM * 
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

/*
 * Run spa_check_removed() over every device in an aux vdev array
 * (hot spares or l2cache devices).
 */
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	for (int i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

/*
 * Record the highest birth txg seen among successfully claimed log
 * blocks.  Invoked from zil_claim_log_block()'s i/o done callback;
 * spa_load_impl() later syncs up to spa_claim_max_txg so claimed log
 * blocks don't appear to be from the future.
 */
void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	/* failed claims don't advance the max claimed txg */
	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

/*
 * Error totals accumulated while verifying the pool during load.
 * Metadata and data errors are counted separately because the rewind
 * policy allows different limits for each (zrp_maxmeta/zrp_maxdata).
 */
typedef struct spa_load_error {
	uint64_t	sle_meta_count;	/* errors on metadata blocks */
	uint64_t	sle_data_count;	/* errors on plain data blocks */
} spa_load_error_t;

/*
 * Done callback for the verification reads issued by
 * spa_load_verify_cb(): classify any i/o error as metadata or data
 * and free the temporary buffer.
 */
static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		/*
		 * Indirect blocks and metadata object types count as
		 * metadata errors; the intent log is deliberately
		 * counted as data (it is expendable on rewind).
		 */
		if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*
 * traverse_pool() callback: issue an asynchronous, speculative scrub
 * read of each block pointer; errors are tallied by
 * spa_load_verify_done().
 */
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

/*
 * Verify the pool by traversing everything from spa_verify_min_txg
 * forward, counting metadata and data errors.  Returns 0 when the
 * error counts fall within the rewind policy's limits (or the policy
 * requests never to rewind); otherwise EIO or ENXIO.
 */
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	/* root zio parenting all the async verification reads */
	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
	} else {
		/* verification failed; remember how far we got */
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		/* normalize traversal errors other than ENXIO to EIO */
		if (error != ENXIO && error != EIO)
			error = EIO;
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
}

/*
 * Mark the root vdev unopenable with the given aux reason and return
 * the supplied errno; convenience helper for spa_load_impl().
 */
static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}

/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	/* nothing to repair unless a split was in progress */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}

/*
 * Common entry point for loading a pool: sanity-check the config,
 * remember any in-progress split, and hand the real work to
 * spa_load_impl().  On failure (other than EBADF, the hostid-mismatch
 * case reported via cmn_err in spa_load_impl()) an FMA ereport is
 * posted; the final load state is recorded before returning.
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		/* can't import a pool whose guid is already active */
		error = EEXIST;
	} else {
		spa->spa_load_guid = pool_guid;

		/*
		 * Save any in-progress split record so spa_try_repair()
		 * can finish or undo it during the load.
		 */
		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error && error != EBADF)
		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
	spa->spa_load_state = error ?
	    SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
 */
static int
spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t config_cache_txg = spa->spa_config_txg;
	int orig_mode = spa->spa_mode;
	int parse;

	/*
	 * If this is an untrusted config, access the pool in read-only mode.
	 * This prevents things like resilvering recently removed devices.
	 */
	if (!mosconfig)
		spa->spa_mode = FREAD;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	spa->spa_load_state = state;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
		return (EINVAL);

	/* split-off pools are parsed with the split allocation type */
	parse = (type == SPA_IMPORT_EXISTING ?
168011422SMark.Musante@Sun.COM VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 16812174Seschrock 1682789Sahrens /* 16839234SGeorge.Wilson@Sun.COM * Create "The Godfather" zio to hold all async IOs 16849234SGeorge.Wilson@Sun.COM */ 16859630SJeff.Bonwick@Sun.COM spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 16869630SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 16879234SGeorge.Wilson@Sun.COM 16889234SGeorge.Wilson@Sun.COM /* 16892082Seschrock * Parse the configuration into a vdev tree. We explicitly set the 16902082Seschrock * value that will be returned by spa_version() since parsing the 16912082Seschrock * configuration requires knowing the version number. 1692789Sahrens */ 16937754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 169411422SMark.Musante@Sun.COM error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 16957754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 1696789Sahrens 16972082Seschrock if (error != 0) 169811422SMark.Musante@Sun.COM return (error); 1699789Sahrens 17001585Sbonwick ASSERT(spa->spa_root_vdev == rvd); 170111422SMark.Musante@Sun.COM 170211422SMark.Musante@Sun.COM if (type != SPA_IMPORT_ASSEMBLE) { 170311422SMark.Musante@Sun.COM ASSERT(spa_guid(spa) == pool_guid); 170411422SMark.Musante@Sun.COM } 1705789Sahrens 1706789Sahrens /* 1707789Sahrens * Try to open all vdevs, loading each label in the process. 1708789Sahrens */ 17097754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 17104070Smc142369 error = vdev_open(rvd); 17117754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 17124070Smc142369 if (error != 0) 171311422SMark.Musante@Sun.COM return (error); 1714789Sahrens 1715789Sahrens /* 17169276SMark.Musante@Sun.COM * We need to validate the vdev labels against the configuration that 17179276SMark.Musante@Sun.COM * we have in hand, which is dependent on the setting of mosconfig. 
If 17189276SMark.Musante@Sun.COM * mosconfig is true then we're validating the vdev labels based on 171911422SMark.Musante@Sun.COM * that config. Otherwise, we're validating against the cached config 17209276SMark.Musante@Sun.COM * (zpool.cache) that was read when we loaded the zfs module, and then 17219276SMark.Musante@Sun.COM * later we will recursively call spa_load() and validate against 17229276SMark.Musante@Sun.COM * the vdev config. 172311422SMark.Musante@Sun.COM * 172411422SMark.Musante@Sun.COM * If we're assembling a new pool that's been split off from an 172511422SMark.Musante@Sun.COM * existing pool, the labels haven't yet been updated so we skip 172611422SMark.Musante@Sun.COM * validation for now. 17271986Seschrock */ 172811422SMark.Musante@Sun.COM if (type != SPA_IMPORT_ASSEMBLE) { 172911422SMark.Musante@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 173011422SMark.Musante@Sun.COM error = vdev_validate(rvd); 173111422SMark.Musante@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 173211422SMark.Musante@Sun.COM 173311422SMark.Musante@Sun.COM if (error != 0) 173411422SMark.Musante@Sun.COM return (error); 173511422SMark.Musante@Sun.COM 173611422SMark.Musante@Sun.COM if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 173711422SMark.Musante@Sun.COM return (ENXIO); 17381986Seschrock } 17391986Seschrock 17401986Seschrock /* 1741789Sahrens * Find the best uberblock. 1742789Sahrens */ 17437754SJeff.Bonwick@Sun.COM vdev_uberblock_load(NULL, rvd, ub); 1744789Sahrens 1745789Sahrens /* 1746789Sahrens * If we weren't able to find a single valid uberblock, return failure. 1747789Sahrens */ 174811422SMark.Musante@Sun.COM if (ub->ub_txg == 0) 174911422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 17501544Seschrock 17511544Seschrock /* 17521544Seschrock * If the pool is newer than the code, we can't open it. 
17531544Seschrock */ 175411422SMark.Musante@Sun.COM if (ub->ub_version > SPA_VERSION) 175511422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1756789Sahrens 1757789Sahrens /* 1758789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 1759789Sahrens * incomplete configuration. 1760789Sahrens */ 176111422SMark.Musante@Sun.COM if (mosconfig && type != SPA_IMPORT_ASSEMBLE && 176211422SMark.Musante@Sun.COM rvd->vdev_guid_sum != ub->ub_guid_sum) 176311422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 176411422SMark.Musante@Sun.COM 176511422SMark.Musante@Sun.COM if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 176611422SMark.Musante@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 176711422SMark.Musante@Sun.COM spa_try_repair(spa, config); 176811422SMark.Musante@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 176911422SMark.Musante@Sun.COM nvlist_free(spa->spa_config_splitting); 177011422SMark.Musante@Sun.COM spa->spa_config_splitting = NULL; 1771789Sahrens } 1772789Sahrens 1773789Sahrens /* 1774789Sahrens * Initialize internal SPA structures. 1775789Sahrens */ 1776789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 1777789Sahrens spa->spa_ubsync = spa->spa_uberblock; 177810921STim.Haley@Sun.COM spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 177911727SVictor.Latushkin@Sun.COM TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 178010921STim.Haley@Sun.COM spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
178110921STim.Haley@Sun.COM spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 178210922SJeff.Bonwick@Sun.COM spa->spa_claim_max_txg = spa->spa_first_txg; 178310922SJeff.Bonwick@Sun.COM 17841544Seschrock error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 178511422SMark.Musante@Sun.COM if (error) 178611422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1787789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1788789Sahrens 178911422SMark.Musante@Sun.COM if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 179011422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 179111422SMark.Musante@Sun.COM 1792789Sahrens if (!mosconfig) { 17933975Sek110237 uint64_t hostid; 179411810SMark.Musante@Sun.COM nvlist_t *policy = NULL, *nvconfig; 179511810SMark.Musante@Sun.COM 179611810SMark.Musante@Sun.COM if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 179711810SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 17982082Seschrock 179910594SGeorge.Wilson@Sun.COM if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 18007706SLin.Ling@Sun.COM ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 18013975Sek110237 char *hostname; 18023975Sek110237 unsigned long myhostid = 0; 18033975Sek110237 180410594SGeorge.Wilson@Sun.COM VERIFY(nvlist_lookup_string(nvconfig, 18053975Sek110237 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 18063975Sek110237 18078662SJordan.Vaughan@Sun.com #ifdef _KERNEL 18088662SJordan.Vaughan@Sun.com myhostid = zone_get_hostid(NULL); 18098662SJordan.Vaughan@Sun.com #else /* _KERNEL */ 18108662SJordan.Vaughan@Sun.com /* 18118662SJordan.Vaughan@Sun.com * We're emulating the system's hostid in userland, so 18128662SJordan.Vaughan@Sun.com * we can't use zone_get_hostid(). 
18138662SJordan.Vaughan@Sun.com */ 18143975Sek110237 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 18158662SJordan.Vaughan@Sun.com #endif /* _KERNEL */ 18164178Slling if (hostid != 0 && myhostid != 0 && 18178662SJordan.Vaughan@Sun.com hostid != myhostid) { 181811810SMark.Musante@Sun.COM nvlist_free(nvconfig); 18193975Sek110237 cmn_err(CE_WARN, "pool '%s' could not be " 18203975Sek110237 "loaded as it was last accessed by " 18217706SLin.Ling@Sun.COM "another system (host: %s hostid: 0x%lx). " 18223975Sek110237 "See: http://www.sun.com/msg/ZFS-8000-EY", 18237754SJeff.Bonwick@Sun.COM spa_name(spa), hostname, 18243975Sek110237 (unsigned long)hostid); 182511422SMark.Musante@Sun.COM return (EBADF); 18263975Sek110237 } 18273975Sek110237 } 182811727SVictor.Latushkin@Sun.COM if (nvlist_lookup_nvlist(spa->spa_config, 182911727SVictor.Latushkin@Sun.COM ZPOOL_REWIND_POLICY, &policy) == 0) 183011727SVictor.Latushkin@Sun.COM VERIFY(nvlist_add_nvlist(nvconfig, 183111727SVictor.Latushkin@Sun.COM ZPOOL_REWIND_POLICY, policy) == 0); 18323975Sek110237 183310594SGeorge.Wilson@Sun.COM spa_config_set(spa, nvconfig); 1834789Sahrens spa_unload(spa); 1835789Sahrens spa_deactivate(spa); 18368241SJeff.Bonwick@Sun.COM spa_activate(spa, orig_mode); 1837789Sahrens 183811422SMark.Musante@Sun.COM return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 18391544Seschrock } 18401544Seschrock 184111422SMark.Musante@Sun.COM if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, 184211422SMark.Musante@Sun.COM &spa->spa_deferred_bplist_obj) != 0) 184311422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1844789Sahrens 18451544Seschrock /* 18462082Seschrock * Load the bit that tells us to use the new accounting function 18472082Seschrock * (raid-z deflation). If we have an older pool, this will not 18482082Seschrock * be present. 
18492082Seschrock */ 185011422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 185111422SMark.Musante@Sun.COM if (error != 0 && error != ENOENT) 185211422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 18532082Seschrock 18542082Seschrock /* 18551544Seschrock * Load the persistent error log. If we have an older pool, this will 18561544Seschrock * not be present. 18571544Seschrock */ 185811422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 185911422SMark.Musante@Sun.COM if (error != 0 && error != ENOENT) 186011422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 186111422SMark.Musante@Sun.COM 186211422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 186311422SMark.Musante@Sun.COM &spa->spa_errlog_scrub); 186411422SMark.Musante@Sun.COM if (error != 0 && error != ENOENT) 186511422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1866789Sahrens 1867789Sahrens /* 18682926Sek110237 * Load the history object. If we have an older pool, this 18692926Sek110237 * will not be present. 18702926Sek110237 */ 187111422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 187211422SMark.Musante@Sun.COM if (error != 0 && error != ENOENT) 187311422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 187411422SMark.Musante@Sun.COM 187511422SMark.Musante@Sun.COM /* 187611422SMark.Musante@Sun.COM * If we're assembling the pool from the split-off vdevs of 187711422SMark.Musante@Sun.COM * an existing pool, we don't want to attach the spares & cache 187811422SMark.Musante@Sun.COM * devices. 187911422SMark.Musante@Sun.COM */ 18802926Sek110237 18812926Sek110237 /* 18822082Seschrock * Load any hot spares for this pool. 
18832082Seschrock */ 188411422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 188511422SMark.Musante@Sun.COM if (error != 0 && error != ENOENT) 188611422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 188711422SMark.Musante@Sun.COM if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 18884577Sahrens ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 18895450Sbrendan if (load_nvlist(spa, spa->spa_spares.sav_object, 189011422SMark.Musante@Sun.COM &spa->spa_spares.sav_config) != 0) 189111422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 18922082Seschrock 18937754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 18942082Seschrock spa_load_spares(spa); 18957754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 189611422SMark.Musante@Sun.COM } else if (error == 0) { 189711422SMark.Musante@Sun.COM spa->spa_spares.sav_sync = B_TRUE; 18982082Seschrock } 18992082Seschrock 19005450Sbrendan /* 19015450Sbrendan * Load any level 2 ARC devices for this pool. 
19025450Sbrendan */ 190311422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 19045450Sbrendan &spa->spa_l2cache.sav_object); 190511422SMark.Musante@Sun.COM if (error != 0 && error != ENOENT) 190611422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 190711422SMark.Musante@Sun.COM if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 19085450Sbrendan ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 19095450Sbrendan if (load_nvlist(spa, spa->spa_l2cache.sav_object, 191011422SMark.Musante@Sun.COM &spa->spa_l2cache.sav_config) != 0) 191111422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 19125450Sbrendan 19137754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 19145450Sbrendan spa_load_l2cache(spa); 19157754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 191611422SMark.Musante@Sun.COM } else if (error == 0) { 191711422SMark.Musante@Sun.COM spa->spa_l2cache.sav_sync = B_TRUE; 19185450Sbrendan } 19195450Sbrendan 19205094Slling spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 19214543Smarks 192211422SMark.Musante@Sun.COM error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 192311422SMark.Musante@Sun.COM if (error && error != ENOENT) 192411422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 19253912Slling 19263912Slling if (error == 0) { 192711422SMark.Musante@Sun.COM uint64_t autoreplace; 192811422SMark.Musante@Sun.COM 192911422SMark.Musante@Sun.COM spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 193011422SMark.Musante@Sun.COM spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 193111422SMark.Musante@Sun.COM spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 193211422SMark.Musante@Sun.COM spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 193311422SMark.Musante@Sun.COM spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 
193411422SMark.Musante@Sun.COM spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 193511422SMark.Musante@Sun.COM &spa->spa_dedup_ditto); 193611422SMark.Musante@Sun.COM 193710672SEric.Schrock@Sun.COM spa->spa_autoreplace = (autoreplace != 0); 19383912Slling } 19393912Slling 19402082Seschrock /* 19414451Seschrock * If the 'autoreplace' property is set, then post a resource notifying 19424451Seschrock * the ZFS DE that it should not issue any faults for unopenable 19434451Seschrock * devices. We also iterate over the vdevs, and post a sysevent for any 19444451Seschrock * unopenable vdevs so that the normal autoreplace handler can take 19454451Seschrock * over. 19464451Seschrock */ 194710672SEric.Schrock@Sun.COM if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 19484451Seschrock spa_check_removed(spa->spa_root_vdev); 194910672SEric.Schrock@Sun.COM /* 195010672SEric.Schrock@Sun.COM * For the import case, this is done in spa_import(), because 195110672SEric.Schrock@Sun.COM * at this point we're using the spare definitions from 195210672SEric.Schrock@Sun.COM * the MOS config, not necessarily from the userland config. 195310672SEric.Schrock@Sun.COM */ 195410672SEric.Schrock@Sun.COM if (state != SPA_LOAD_IMPORT) { 195510672SEric.Schrock@Sun.COM spa_aux_check_removed(&spa->spa_spares); 195610672SEric.Schrock@Sun.COM spa_aux_check_removed(&spa->spa_l2cache); 195710672SEric.Schrock@Sun.COM } 195810672SEric.Schrock@Sun.COM } 19594451Seschrock 19604451Seschrock /* 19611986Seschrock * Load the vdev state for all toplevel vdevs. 1962789Sahrens */ 19631986Seschrock vdev_load(rvd); 1964789Sahrens 1965789Sahrens /* 1966789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 
1967789Sahrens */ 19687754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1969789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 19707754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 1971789Sahrens 1972789Sahrens /* 1973789Sahrens * Check the state of the root vdev. If it can't be opened, it 1974789Sahrens * indicates one or more toplevel vdevs are faulted. 1975789Sahrens */ 197611422SMark.Musante@Sun.COM if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 197711422SMark.Musante@Sun.COM return (ENXIO); 1978789Sahrens 197910922SJeff.Bonwick@Sun.COM /* 198010922SJeff.Bonwick@Sun.COM * Load the DDTs (dedup tables). 198110922SJeff.Bonwick@Sun.COM */ 198210922SJeff.Bonwick@Sun.COM error = ddt_load(spa); 198311422SMark.Musante@Sun.COM if (error != 0) 198411422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 198510922SJeff.Bonwick@Sun.COM 198610956SGeorge.Wilson@Sun.COM spa_update_dspace(spa); 198710956SGeorge.Wilson@Sun.COM 198810921STim.Haley@Sun.COM if (state != SPA_LOAD_TRYIMPORT) { 198910921STim.Haley@Sun.COM error = spa_load_verify(spa); 199011422SMark.Musante@Sun.COM if (error) 199111422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 199211422SMark.Musante@Sun.COM error)); 199310921STim.Haley@Sun.COM } 199410921STim.Haley@Sun.COM 199510922SJeff.Bonwick@Sun.COM /* 199611422SMark.Musante@Sun.COM * Load the intent log state and check log integrity. If we're 199711422SMark.Musante@Sun.COM * assembling a pool from a split, the log is not transferred over. 
199810922SJeff.Bonwick@Sun.COM */ 199911422SMark.Musante@Sun.COM if (type != SPA_IMPORT_ASSEMBLE) { 200011810SMark.Musante@Sun.COM nvlist_t *nvconfig; 200111810SMark.Musante@Sun.COM 200211810SMark.Musante@Sun.COM if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 200311810SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 200411810SMark.Musante@Sun.COM 200511422SMark.Musante@Sun.COM VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, 200611422SMark.Musante@Sun.COM &nvroot) == 0); 200711422SMark.Musante@Sun.COM spa_load_log_state(spa, nvroot); 200811422SMark.Musante@Sun.COM nvlist_free(nvconfig); 200911422SMark.Musante@Sun.COM 201011422SMark.Musante@Sun.COM if (spa_check_logs(spa)) { 201111422SMark.Musante@Sun.COM *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 201211422SMark.Musante@Sun.COM return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 201311422SMark.Musante@Sun.COM } 201410922SJeff.Bonwick@Sun.COM } 201510922SJeff.Bonwick@Sun.COM 201610921STim.Haley@Sun.COM if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 201710921STim.Haley@Sun.COM spa->spa_load_max_txg == UINT64_MAX)) { 20181635Sbonwick dmu_tx_t *tx; 20191635Sbonwick int need_update = B_FALSE; 20208241SJeff.Bonwick@Sun.COM 20218241SJeff.Bonwick@Sun.COM ASSERT(state != SPA_LOAD_TRYIMPORT); 20221601Sbonwick 20231635Sbonwick /* 20241635Sbonwick * Claim log blocks that haven't been committed yet. 20251635Sbonwick * This must all happen in a single txg. 202610922SJeff.Bonwick@Sun.COM * Note: spa_claim_max_txg is updated by spa_claim_notify(), 202710922SJeff.Bonwick@Sun.COM * invoked from zil_claim_log_block()'s i/o done callback. 202810921STim.Haley@Sun.COM * Price of rollback is that we abandon the log. 
20291635Sbonwick */ 203010922SJeff.Bonwick@Sun.COM spa->spa_claiming = B_TRUE; 203110922SJeff.Bonwick@Sun.COM 20321601Sbonwick tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2033789Sahrens spa_first_txg(spa)); 20347754SJeff.Bonwick@Sun.COM (void) dmu_objset_find(spa_name(spa), 20352417Sahrens zil_claim, tx, DS_FIND_CHILDREN); 2036789Sahrens dmu_tx_commit(tx); 2037789Sahrens 203810922SJeff.Bonwick@Sun.COM spa->spa_claiming = B_FALSE; 203910922SJeff.Bonwick@Sun.COM 204011422SMark.Musante@Sun.COM spa_set_log_state(spa, SPA_LOG_GOOD); 2041789Sahrens spa->spa_sync_on = B_TRUE; 2042789Sahrens txg_sync_start(spa->spa_dsl_pool); 2043789Sahrens 2044789Sahrens /* 204510922SJeff.Bonwick@Sun.COM * Wait for all claims to sync. We sync up to the highest 204610922SJeff.Bonwick@Sun.COM * claimed log block birth time so that claimed log blocks 204710922SJeff.Bonwick@Sun.COM * don't appear to be from the future. spa_claim_max_txg 204810922SJeff.Bonwick@Sun.COM * will have been set for us by either zil_check_log_chain() 204910922SJeff.Bonwick@Sun.COM * (invoked from spa_check_logs()) or zil_claim() above. 2050789Sahrens */ 205110922SJeff.Bonwick@Sun.COM txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 20521585Sbonwick 20531585Sbonwick /* 20541635Sbonwick * If the config cache is stale, or we have uninitialized 20551635Sbonwick * metaslabs (see spa_vdev_add()), then update the config. 205610100SLin.Ling@Sun.COM * 205710100SLin.Ling@Sun.COM * If spa_load_verbatim is true, trust the current 205810100SLin.Ling@Sun.COM * in-core spa_config and update the disk labels. 
20591585Sbonwick */ 20601635Sbonwick if (config_cache_txg != spa->spa_config_txg || 206110921STim.Haley@Sun.COM state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || 206210921STim.Haley@Sun.COM state == SPA_LOAD_RECOVER) 20631635Sbonwick need_update = B_TRUE; 20641635Sbonwick 20658241SJeff.Bonwick@Sun.COM for (int c = 0; c < rvd->vdev_children; c++) 20661635Sbonwick if (rvd->vdev_child[c]->vdev_ms_array == 0) 20671635Sbonwick need_update = B_TRUE; 20681585Sbonwick 20691585Sbonwick /* 20701635Sbonwick * Update the config cache asychronously in case we're the 20711635Sbonwick * root pool, in which case the config cache isn't writable yet. 20721585Sbonwick */ 20731635Sbonwick if (need_update) 20741635Sbonwick spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 20758241SJeff.Bonwick@Sun.COM 20768241SJeff.Bonwick@Sun.COM /* 20778241SJeff.Bonwick@Sun.COM * Check all DTLs to see if anything needs resilvering. 20788241SJeff.Bonwick@Sun.COM */ 20798241SJeff.Bonwick@Sun.COM if (vdev_resilver_needed(rvd, NULL, NULL)) 20808241SJeff.Bonwick@Sun.COM spa_async_request(spa, SPA_ASYNC_RESILVER); 208110298SMatthew.Ahrens@Sun.COM 208210298SMatthew.Ahrens@Sun.COM /* 208310298SMatthew.Ahrens@Sun.COM * Delete any inconsistent datasets. 208410298SMatthew.Ahrens@Sun.COM */ 208510298SMatthew.Ahrens@Sun.COM (void) dmu_objset_find(spa_name(spa), 208610298SMatthew.Ahrens@Sun.COM dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 208710342Schris.kirby@sun.com 208810342Schris.kirby@sun.com /* 208910342Schris.kirby@sun.com * Clean up any stale temporary dataset userrefs. 
209010342Schris.kirby@sun.com */ 209110342Schris.kirby@sun.com dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2092789Sahrens } 2093789Sahrens 209411422SMark.Musante@Sun.COM return (0); 2095789Sahrens } 2096789Sahrens 209710921STim.Haley@Sun.COM static int 209810921STim.Haley@Sun.COM spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 209910921STim.Haley@Sun.COM { 210010921STim.Haley@Sun.COM spa_unload(spa); 210110921STim.Haley@Sun.COM spa_deactivate(spa); 210210921STim.Haley@Sun.COM 210310921STim.Haley@Sun.COM spa->spa_load_max_txg--; 210410921STim.Haley@Sun.COM 210510921STim.Haley@Sun.COM spa_activate(spa, spa_mode_global); 210610921STim.Haley@Sun.COM spa_async_suspend(spa); 210710921STim.Haley@Sun.COM 210811422SMark.Musante@Sun.COM return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 210910921STim.Haley@Sun.COM } 211010921STim.Haley@Sun.COM 211110921STim.Haley@Sun.COM static int 211210921STim.Haley@Sun.COM spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 211311727SVictor.Latushkin@Sun.COM uint64_t max_request, int rewind_flags) 211410921STim.Haley@Sun.COM { 211510921STim.Haley@Sun.COM nvlist_t *config = NULL; 211610921STim.Haley@Sun.COM int load_error, rewind_error; 211711727SVictor.Latushkin@Sun.COM uint64_t safe_rewind_txg; 211810921STim.Haley@Sun.COM uint64_t min_txg; 211910921STim.Haley@Sun.COM 212011026STim.Haley@Sun.COM if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 212110921STim.Haley@Sun.COM spa->spa_load_max_txg = spa->spa_load_txg; 212211422SMark.Musante@Sun.COM spa_set_log_state(spa, SPA_LOG_CLEAR); 212311026STim.Haley@Sun.COM } else { 212410921STim.Haley@Sun.COM spa->spa_load_max_txg = max_request; 212511026STim.Haley@Sun.COM } 212610921STim.Haley@Sun.COM 212711422SMark.Musante@Sun.COM load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 212811422SMark.Musante@Sun.COM mosconfig); 212910921STim.Haley@Sun.COM if (load_error == 0) 213010921STim.Haley@Sun.COM return (0); 
213110921STim.Haley@Sun.COM 213210921STim.Haley@Sun.COM if (spa->spa_root_vdev != NULL) 213310921STim.Haley@Sun.COM config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 213410921STim.Haley@Sun.COM 213510921STim.Haley@Sun.COM spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 213610921STim.Haley@Sun.COM spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 213710921STim.Haley@Sun.COM 213811727SVictor.Latushkin@Sun.COM if (rewind_flags & ZPOOL_NEVER_REWIND) { 213910921STim.Haley@Sun.COM nvlist_free(config); 214010921STim.Haley@Sun.COM return (load_error); 214110921STim.Haley@Sun.COM } 214210921STim.Haley@Sun.COM 214310921STim.Haley@Sun.COM /* Price of rolling back is discarding txgs, including log */ 214410921STim.Haley@Sun.COM if (state == SPA_LOAD_RECOVER) 214511422SMark.Musante@Sun.COM spa_set_log_state(spa, SPA_LOG_CLEAR); 214610921STim.Haley@Sun.COM 214711727SVictor.Latushkin@Sun.COM spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 214811727SVictor.Latushkin@Sun.COM safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 214911727SVictor.Latushkin@Sun.COM min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
215011727SVictor.Latushkin@Sun.COM TXG_INITIAL : safe_rewind_txg; 215111727SVictor.Latushkin@Sun.COM 215211727SVictor.Latushkin@Sun.COM /* 215311727SVictor.Latushkin@Sun.COM * Continue as long as we're finding errors, we're still within 215411727SVictor.Latushkin@Sun.COM * the acceptable rewind range, and we're still finding uberblocks 215511727SVictor.Latushkin@Sun.COM */ 215611727SVictor.Latushkin@Sun.COM while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 215711727SVictor.Latushkin@Sun.COM spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 215811727SVictor.Latushkin@Sun.COM if (spa->spa_load_max_txg < safe_rewind_txg) 215910921STim.Haley@Sun.COM spa->spa_extreme_rewind = B_TRUE; 216010921STim.Haley@Sun.COM rewind_error = spa_load_retry(spa, state, mosconfig); 216110921STim.Haley@Sun.COM } 216210921STim.Haley@Sun.COM 216310921STim.Haley@Sun.COM if (config) 216410921STim.Haley@Sun.COM spa_rewind_data_to_nvlist(spa, config); 216510921STim.Haley@Sun.COM 216610921STim.Haley@Sun.COM spa->spa_extreme_rewind = B_FALSE; 216710921STim.Haley@Sun.COM spa->spa_load_max_txg = UINT64_MAX; 216810921STim.Haley@Sun.COM 216910921STim.Haley@Sun.COM if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 217010921STim.Haley@Sun.COM spa_config_set(spa, config); 217110921STim.Haley@Sun.COM 217210921STim.Haley@Sun.COM return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); 217310921STim.Haley@Sun.COM } 217410921STim.Haley@Sun.COM 2175789Sahrens /* 2176789Sahrens * Pool Open/Import 2177789Sahrens * 2178789Sahrens * The import case is identical to an open except that the configuration is sent 2179789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 2180789Sahrens * case of an open, the pool configuration will exist in the 21814451Seschrock * POOL_STATE_UNINITIALIZED state. 
/*
 * Common implementation of pool open and import.  Looks the pool up in the
 * namespace and, if it is still POOL_STATE_UNINITIALIZED, activates and
 * loads it (honoring the rewind policy in nvpolicy, or the one embedded in
 * the cached config when nvpolicy is NULL).  On success takes a reference
 * on behalf of 'tag' and returns the spa in *spapp; if 'config' is
 * non-NULL it also receives a freshly generated (or, on failure, the
 * last-known) config nvlist, which the caller must free.
 *
 * Returns 0 on success, ENOENT if the pool doesn't exist (or its vdevs say
 * it was exported/destroyed), or the spa_load_best() error.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
    nvlist_t **config)
{
	spa_t *spa;
	int error;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and ends
	 * up calling spa_open() again.  The real fix is to figure out how to
	 * avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
		spa_load_state_t state = SPA_LOAD_OPEN;
		zpool_rewind_policy_t policy;

		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
		    &policy);
		if (policy.zrp_request & ZPOOL_DO_REWIND)
			state = SPA_LOAD_RECOVER;

		spa_activate(spa, spa_mode_global);

		/*
		 * If the last open failed and the caller isn't asking for a
		 * rewind, fail fast with the cached error and config rather
		 * than re-attempting the load.
		 */
		if (spa->spa_last_open_failed && (policy.zrp_request &
		    (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) {
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config,
				    config, KM_SLEEP) == 0);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (spa->spa_last_open_failed);
		}

		if (state != SPA_LOAD_RECOVER)
			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;

		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
		    policy.zrp_request);

		if (error == EBADF) {
			/*
			 * If vdev_validate() returns failure (indicated by
			 * EBADF), it indicates that one of the vdevs indicates
			 * that the pool has been exported or destroyed.  If
			 * this is the case, the config cache is out of sync and
			 * we should remove the pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_config_sync(spa, B_TRUE, B_TRUE);
			spa_remove(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}

		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_config)
				VERIFY(nvlist_dup(spa->spa_config, config,
				    KM_SLEEP) == 0);
			spa_unload(spa);
			spa_deactivate(spa);
			spa->spa_last_open_failed = error;
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}
	}

	spa_open_ref(spa, tag);

	if (config != NULL)
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);

	/*
	 * Clear the stale failure/rewind state only when we took the
	 * namespace lock ourselves (i.e. not on recursive calls).
	 */
	if (locked) {
		spa->spa_last_open_failed = 0;
		spa->spa_last_ubsync_txg = 0;
		spa->spa_load_txg = 0;
		mutex_exit(&spa_namespace_lock);
	}

	*spapp = spa;

	return (0);
}
(spa_open_common(name, spapp, tag, policy, config)); 230210921STim.Haley@Sun.COM } 230310921STim.Haley@Sun.COM 230410921STim.Haley@Sun.COM int 2305789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 2306789Sahrens { 230710921STim.Haley@Sun.COM return (spa_open_common(name, spapp, tag, NULL, NULL)); 2308789Sahrens } 2309789Sahrens 23101544Seschrock /* 23111544Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 23121544Seschrock * preventing it from being exported or destroyed. 23131544Seschrock */ 23141544Seschrock spa_t * 23151544Seschrock spa_inject_addref(char *name) 23161544Seschrock { 23171544Seschrock spa_t *spa; 23181544Seschrock 23191544Seschrock mutex_enter(&spa_namespace_lock); 23201544Seschrock if ((spa = spa_lookup(name)) == NULL) { 23211544Seschrock mutex_exit(&spa_namespace_lock); 23221544Seschrock return (NULL); 23231544Seschrock } 23241544Seschrock spa->spa_inject_ref++; 23251544Seschrock mutex_exit(&spa_namespace_lock); 23261544Seschrock 23271544Seschrock return (spa); 23281544Seschrock } 23291544Seschrock 23301544Seschrock void 23311544Seschrock spa_inject_delref(spa_t *spa) 23321544Seschrock { 23331544Seschrock mutex_enter(&spa_namespace_lock); 23341544Seschrock spa->spa_inject_ref--; 23351544Seschrock mutex_exit(&spa_namespace_lock); 23361544Seschrock } 23371544Seschrock 23385450Sbrendan /* 23395450Sbrendan * Add spares device information to the nvlist. 
/*
 * Add spares device information to the nvlist.
 *
 * Copies the spare list from spa_spares.sav_config into the given config's
 * vdev tree, then marks any spare that is currently in use by another pool
 * as CANT_OPEN/SPARED so status reporting reflects reality.  Caller must
 * hold SCL_CONFIG as reader.
 */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
	nvlist_t **spares;
	uint_t i, nspares;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_stat_t *vs;
	uint_t vsc;
	uint64_t pool;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_spares.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
	if (nspares != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		/*
		 * Re-lookup so 'spares' points at the copies now owned by
		 * nvroot; the edits below must land in the output config.
		 */
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

		/*
		 * Go through and find any spares which have since been
		 * repurposed as an active spare.  If this is the case, update
		 * their status appropriately.
		 */
		for (i = 0; i < nspares; i++) {
			VERIFY(nvlist_lookup_uint64(spares[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);
			if (spa_spare_exists(guid, &pool, NULL) &&
			    pool != 0ULL) {
				VERIFY(nvlist_lookup_uint64_array(
				    spares[i], ZPOOL_CONFIG_STATS,
				    (uint64_t **)&vs, &vsc) == 0);
				vs->vs_state = VDEV_STATE_CANT_OPEN;
				vs->vs_aux = VDEV_AUX_SPARED;
			}
		}
	}
}
/*
 * Add l2cache device information to the nvlist, including vdev stats.
 *
 * Copies the cache-device list from spa_l2cache.sav_config into the given
 * config's vdev tree, then refreshes each entry's vdev_stat_t from the
 * live vdev matched by GUID.  Caller must hold SCL_CONFIG as reader.
 */
static void
spa_add_l2cache(spa_t *spa, nvlist_t *config)
{
	nvlist_t **l2cache;
	uint_t i, j, nl2cache;
	nvlist_t *nvroot;
	uint64_t guid;
	vdev_t *vd;
	vdev_stat_t *vs;
	uint_t vsc;

	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));

	if (spa->spa_l2cache.sav_count == 0)
		return;

	VERIFY(nvlist_lookup_nvlist(config,
	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
	if (nl2cache != 0) {
		VERIFY(nvlist_add_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		/*
		 * Re-lookup so 'l2cache' points at the copies now owned by
		 * nvroot; the stats updates below must land in the output.
		 */
		VERIFY(nvlist_lookup_nvlist_array(nvroot,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);

		/*
		 * Update level 2 cache device stats.
		 */
		for (i = 0; i < nl2cache; i++) {
			VERIFY(nvlist_lookup_uint64(l2cache[i],
			    ZPOOL_CONFIG_GUID, &guid) == 0);

			/* Find the live vdev with this GUID. */
			vd = NULL;
			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
				if (guid ==
				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
					vd = spa->spa_l2cache.sav_vdevs[j];
					break;
				}
			}
			ASSERT(vd != NULL);

			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
			vdev_get_stats(vd, vs);
		}
	}
}
/*
 * Return the pool's configuration and statistics in *config (caller frees),
 * and its alternate root (if any) in altroot/buflen.  Works even for pools
 * that fail to open: the error from spa_open_common() is returned, but any
 * last-known config and the altroot are still reported.
 */
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, NULL, config);

	if (spa != NULL) {
		/*
		 * This still leaves a window of inconsistency where the spares
		 * or l2cache devices could change and the config would be
		 * self-inconsistent.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		if (*config != NULL) {
			VERIFY(nvlist_add_uint64(*config,
			    ZPOOL_CONFIG_ERRCOUNT,
			    spa_get_errlog_size(spa)) == 0);

			if (spa_suspended(spa))
				VERIFY(nvlist_add_uint64(*config,
				    ZPOOL_CONFIG_SUSPENDED,
				    spa->spa_failmode) == 0);

			spa_add_spares(spa, *config);
			spa_add_l2cache(spa, *config);
		}
	}

	/*
	 * We want to get the alternate root even for faulted pools, so we cheat
	 * and call spa_lookup() directly.
	 */
	if (altroot) {
		if (spa == NULL) {
			mutex_enter(&spa_namespace_lock);
			spa = spa_lookup(name);
			if (spa)
				spa_altroot(spa, altroot, buflen);
			else
				altroot[0] = '\0';
			/* Borrowed lookup only -- don't let cleanup see it. */
			spa = NULL;
			mutex_exit(&spa_namespace_lock);
		} else {
			spa_altroot(spa, altroot, buflen);
		}
	}

	/* Drop the config lock and reference taken on the open path. */
	if (spa != NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		spa_close(spa, FTAG);
	}

	return (error);
}
/*
 * Validate that the auxiliary device array is well formed.  We must have an
 * array of nvlists, each which describes a valid leaf vdev.  If this is an
 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
 * specified, as long as they are well-formed.
 *
 * 'config' names the nvlist array to validate (spares or l2cache), 'version'
 * is the minimum pool version that supports this device type, and 'label'
 * is the label type written by vdev_label_init().  While validating, the
 * candidate list is published through sav->sav_pending so in-use checks can
 * see it; it is always cleared again before returning.
 *
 * Returns 0 if the list is absent or valid; EINVAL, ENOTSUP, ENOTBLK, or a
 * vdev open/label error otherwise.
 */
static int
spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
    spa_aux_vdev_t *sav, const char *config, uint64_t version,
    vdev_labeltype_t label)
{
	nvlist_t **dev;
	uint_t i, ndev;
	vdev_t *vd;
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * It's acceptable to have no devs specified.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
		return (0);

	if (ndev == 0)
		return (EINVAL);

	/*
	 * Make sure the pool is formatted with a version that supports this
	 * device type.
	 */
	if (spa_version(spa) < version)
		return (ENOTSUP);

	/*
	 * Set the pending device list so we correctly handle device in-use
	 * checking.
	 */
	sav->sav_pending = dev;
	sav->sav_npending = ndev;

	for (i = 0; i < ndev; i++) {
		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
		    mode)) != 0)
			goto out;

		if (!vd->vdev_ops->vdev_op_leaf) {
			vdev_free(vd);
			error = EINVAL;
			goto out;
		}

		/*
		 * The L2ARC currently only supports disk devices in
		 * kernel context.  For user-level testing, we allow it.
		 */
#ifdef _KERNEL
		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
			error = ENOTBLK;
			goto out;
		}
#endif
		vd->vdev_top = vd;

		/* Record the validated GUID back into the caller's nvlist. */
		if ((error = vdev_open(vd)) == 0 &&
		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
			    vd->vdev_guid) == 0);
		}

		vdev_free(vd);

		/*
		 * Open/label failures are fatal except for spare/l2cache
		 * import modes, where a damaged aux device is tolerated.
		 */
		if (error &&
		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
			goto out;
		else
			error = 0;
	}

out:
	/* Always clear the pending list, on success and failure alike. */
	sav->sav_pending = NULL;
	sav->sav_npending = 0;
	return (error);
}
/*
 * Validate both auxiliary device classes (spares, then l2cache) in nvroot.
 * Returns the first validation error, or 0 if both lists are acceptable.
 */
static int
spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
{
	int error;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
	    VDEV_LABEL_SPARE)) != 0) {
		return (error);
	}

	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
	    VDEV_LABEL_L2CACHE));
}

/*
 * Install 'devs' as the auxiliary device list named 'config' in
 * sav->sav_config, appending to any devices already present.  All entries
 * are deep-copied (nvlist_dup), so the caller retains ownership of 'devs'.
 */
static void
spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
    const char *config)
{
	int i;

	if (sav->sav_config != NULL) {
		nvlist_t **olddevs;
		uint_t oldndevs;
		nvlist_t **newdevs;

		/*
		 * Generate new dev list by concatenating with the
		 * current dev list.
		 */
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
		    &olddevs, &oldndevs) == 0);

		newdevs = kmem_alloc(sizeof (void *) *
		    (ndevs + oldndevs), KM_SLEEP);
		for (i = 0; i < oldndevs; i++)
			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
			    KM_SLEEP) == 0);
		for (i = 0; i < ndevs; i++)
			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
			    KM_SLEEP) == 0);

		/* Remove the old array before adding the merged one. */
		VERIFY(nvlist_remove(sav->sav_config, config,
		    DATA_TYPE_NVLIST_ARRAY) == 0);

		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
		    config, newdevs, ndevs + oldndevs) == 0);
		/* nvlist_add copied the entries; free our temporaries. */
		for (i = 0; i < oldndevs + ndevs; i++)
			nvlist_free(newdevs[i]);
		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
	} else {
		/*
		 * Generate a new dev list.
		 */
		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
		    KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
		    devs, ndevs) == 0);
	}
}
26425450Sbrendan */ 26435450Sbrendan VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 26445450Sbrendan KM_SLEEP) == 0); 26455450Sbrendan VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 26465450Sbrendan devs, ndevs) == 0); 26475450Sbrendan } 26485450Sbrendan } 26495450Sbrendan 26505450Sbrendan /* 26515450Sbrendan * Stop and drop level 2 ARC devices 26525450Sbrendan */ 26535450Sbrendan void 26545450Sbrendan spa_l2cache_drop(spa_t *spa) 26555450Sbrendan { 26565450Sbrendan vdev_t *vd; 26575450Sbrendan int i; 26585450Sbrendan spa_aux_vdev_t *sav = &spa->spa_l2cache; 26595450Sbrendan 26605450Sbrendan for (i = 0; i < sav->sav_count; i++) { 26615450Sbrendan uint64_t pool; 26625450Sbrendan 26635450Sbrendan vd = sav->sav_vdevs[i]; 26645450Sbrendan ASSERT(vd != NULL); 26655450Sbrendan 26668241SJeff.Bonwick@Sun.COM if (spa_l2cache_exists(vd->vdev_guid, &pool) && 26678241SJeff.Bonwick@Sun.COM pool != 0ULL && l2arc_vdev_present(vd)) 26685450Sbrendan l2arc_remove_vdev(vd); 26695450Sbrendan if (vd->vdev_isl2cache) 26705450Sbrendan spa_l2cache_remove(vd); 26715450Sbrendan vdev_clear_stats(vd); 26725450Sbrendan (void) vdev_close(vd); 26735450Sbrendan } 26745450Sbrendan } 26755450Sbrendan 26762082Seschrock /* 2677789Sahrens * Pool Creation 2678789Sahrens */ 2679789Sahrens int 26805094Slling spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 26817184Stimh const char *history_str, nvlist_t *zplprops) 2682789Sahrens { 2683789Sahrens spa_t *spa; 26845094Slling char *altroot = NULL; 26851635Sbonwick vdev_t *rvd; 2686789Sahrens dsl_pool_t *dp; 2687789Sahrens dmu_tx_t *tx; 26889816SGeorge.Wilson@Sun.COM int error = 0; 2689789Sahrens uint64_t txg = TXG_INITIAL; 26905450Sbrendan nvlist_t **spares, **l2cache; 26915450Sbrendan uint_t nspares, nl2cache; 26925094Slling uint64_t version; 2693789Sahrens 2694789Sahrens /* 2695789Sahrens * If this pool already exists, return failure. 
2696789Sahrens */ 2697789Sahrens mutex_enter(&spa_namespace_lock); 2698789Sahrens if (spa_lookup(pool) != NULL) { 2699789Sahrens mutex_exit(&spa_namespace_lock); 2700789Sahrens return (EEXIST); 2701789Sahrens } 2702789Sahrens 2703789Sahrens /* 2704789Sahrens * Allocate a new spa_t structure. 2705789Sahrens */ 27065094Slling (void) nvlist_lookup_string(props, 27075094Slling zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 270810921STim.Haley@Sun.COM spa = spa_add(pool, NULL, altroot); 27098241SJeff.Bonwick@Sun.COM spa_activate(spa, spa_mode_global); 2710789Sahrens 27115094Slling if (props && (error = spa_prop_validate(spa, props))) { 27125094Slling spa_deactivate(spa); 27135094Slling spa_remove(spa); 27146643Seschrock mutex_exit(&spa_namespace_lock); 27155094Slling return (error); 27165094Slling } 27175094Slling 27185094Slling if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 27195094Slling &version) != 0) 27205094Slling version = SPA_VERSION; 27215094Slling ASSERT(version <= SPA_VERSION); 272210922SJeff.Bonwick@Sun.COM 272310922SJeff.Bonwick@Sun.COM spa->spa_first_txg = txg; 272410922SJeff.Bonwick@Sun.COM spa->spa_uberblock.ub_txg = txg - 1; 27255094Slling spa->spa_uberblock.ub_version = version; 2726789Sahrens spa->spa_ubsync = spa->spa_uberblock; 2727789Sahrens 27281635Sbonwick /* 27299234SGeorge.Wilson@Sun.COM * Create "The Godfather" zio to hold all async IOs 27309234SGeorge.Wilson@Sun.COM */ 27319630SJeff.Bonwick@Sun.COM spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 27329630SJeff.Bonwick@Sun.COM ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 27339234SGeorge.Wilson@Sun.COM 27349234SGeorge.Wilson@Sun.COM /* 27351635Sbonwick * Create the root vdev. 
27361635Sbonwick */ 27377754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 27381635Sbonwick 27392082Seschrock error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 27402082Seschrock 27412082Seschrock ASSERT(error != 0 || rvd != NULL); 27422082Seschrock ASSERT(error != 0 || spa->spa_root_vdev == rvd); 27432082Seschrock 27445913Sperrin if (error == 0 && !zfs_allocatable_devs(nvroot)) 27451635Sbonwick error = EINVAL; 27462082Seschrock 27472082Seschrock if (error == 0 && 27482082Seschrock (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 27495450Sbrendan (error = spa_validate_aux(spa, nvroot, txg, 27502082Seschrock VDEV_ALLOC_ADD)) == 0) { 27519816SGeorge.Wilson@Sun.COM for (int c = 0; c < rvd->vdev_children; c++) { 27529816SGeorge.Wilson@Sun.COM vdev_metaslab_set_size(rvd->vdev_child[c]); 27539816SGeorge.Wilson@Sun.COM vdev_expand(rvd->vdev_child[c], txg); 27549816SGeorge.Wilson@Sun.COM } 27551635Sbonwick } 27561635Sbonwick 27577754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 2758789Sahrens 27592082Seschrock if (error != 0) { 2760789Sahrens spa_unload(spa); 2761789Sahrens spa_deactivate(spa); 2762789Sahrens spa_remove(spa); 2763789Sahrens mutex_exit(&spa_namespace_lock); 2764789Sahrens return (error); 2765789Sahrens } 2766789Sahrens 27672082Seschrock /* 27682082Seschrock * Get the list of spares, if specified. 
27692082Seschrock */ 27702082Seschrock if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 27712082Seschrock &spares, &nspares) == 0) { 27725450Sbrendan VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 27732082Seschrock KM_SLEEP) == 0); 27745450Sbrendan VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 27752082Seschrock ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 27767754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 27772082Seschrock spa_load_spares(spa); 27787754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 27795450Sbrendan spa->spa_spares.sav_sync = B_TRUE; 27805450Sbrendan } 27815450Sbrendan 27825450Sbrendan /* 27835450Sbrendan * Get the list of level 2 cache devices, if specified. 27845450Sbrendan */ 27855450Sbrendan if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 27865450Sbrendan &l2cache, &nl2cache) == 0) { 27875450Sbrendan VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 27885450Sbrendan NV_UNIQUE_NAME, KM_SLEEP) == 0); 27895450Sbrendan VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 27905450Sbrendan ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 27917754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 27925450Sbrendan spa_load_l2cache(spa); 27937754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 27945450Sbrendan spa->spa_l2cache.sav_sync = B_TRUE; 27952082Seschrock } 27962082Seschrock 27977184Stimh spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2798789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 2799789Sahrens 280010956SGeorge.Wilson@Sun.COM /* 280110956SGeorge.Wilson@Sun.COM * Create DDTs (dedup tables). 
280210956SGeorge.Wilson@Sun.COM */ 280310956SGeorge.Wilson@Sun.COM ddt_create(spa); 280410956SGeorge.Wilson@Sun.COM 280510956SGeorge.Wilson@Sun.COM spa_update_dspace(spa); 280610956SGeorge.Wilson@Sun.COM 2807789Sahrens tx = dmu_tx_create_assigned(dp, txg); 2808789Sahrens 2809789Sahrens /* 2810789Sahrens * Create the pool config object. 2811789Sahrens */ 2812789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 28137497STim.Haley@Sun.COM DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2814789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2815789Sahrens 28161544Seschrock if (zap_add(spa->spa_meta_objset, 2817789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 28181544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 28191544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 28201544Seschrock } 2821789Sahrens 28225094Slling /* Newly created pools with the right version are always deflated. */ 28235094Slling if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 28245094Slling spa->spa_deflate = TRUE; 28255094Slling if (zap_add(spa->spa_meta_objset, 28265094Slling DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 28275094Slling sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 28285094Slling cmn_err(CE_PANIC, "failed to add deflate"); 28295094Slling } 28302082Seschrock } 28312082Seschrock 2832789Sahrens /* 2833789Sahrens * Create the deferred-free bplist object. Turn off compression 2834789Sahrens * because sync-to-convergence takes longer if the blocksize 2835789Sahrens * keeps changing. 
2836789Sahrens */ 283710922SJeff.Bonwick@Sun.COM spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 2838789Sahrens 1 << 14, tx); 283910922SJeff.Bonwick@Sun.COM dmu_object_set_compress(spa->spa_meta_objset, 284010922SJeff.Bonwick@Sun.COM spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); 2841789Sahrens 28421544Seschrock if (zap_add(spa->spa_meta_objset, 2843789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 284410922SJeff.Bonwick@Sun.COM sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { 28451544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 28461544Seschrock } 2847789Sahrens 28482926Sek110237 /* 28492926Sek110237 * Create the pool's history object. 28502926Sek110237 */ 28515094Slling if (version >= SPA_VERSION_ZPOOL_HISTORY) 28525094Slling spa_history_create_obj(spa, tx); 28535094Slling 28545094Slling /* 28555094Slling * Set pool properties. 28565094Slling */ 28575094Slling spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 28585094Slling spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 28595329Sgw25295 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 28609816SGeorge.Wilson@Sun.COM spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 286110922SJeff.Bonwick@Sun.COM 28628525SEric.Schrock@Sun.COM if (props != NULL) { 28638525SEric.Schrock@Sun.COM spa_configfile_set(spa, props, B_FALSE); 28645094Slling spa_sync_props(spa, props, CRED(), tx); 28658525SEric.Schrock@Sun.COM } 28662926Sek110237 2867789Sahrens dmu_tx_commit(tx); 2868789Sahrens 2869789Sahrens spa->spa_sync_on = B_TRUE; 2870789Sahrens txg_sync_start(spa->spa_dsl_pool); 2871789Sahrens 2872789Sahrens /* 2873789Sahrens * We explicitly wait for the first transaction to complete so that our 2874789Sahrens * bean counters are appropriately updated. 
2875789Sahrens */ 2876789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 2877789Sahrens 28786643Seschrock spa_config_sync(spa, B_FALSE, B_TRUE); 2879789Sahrens 28805094Slling if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 28814715Sek110237 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 28829946SMark.Musante@Sun.COM spa_history_log_version(spa, LOG_POOL_CREATE); 28834715Sek110237 28848667SGeorge.Wilson@Sun.COM spa->spa_minref = refcount_count(&spa->spa_refcount); 28858667SGeorge.Wilson@Sun.COM 2886789Sahrens mutex_exit(&spa_namespace_lock); 2887789Sahrens 2888789Sahrens return (0); 2889789Sahrens } 2890789Sahrens 28916423Sgw25295 #ifdef _KERNEL 28926423Sgw25295 /* 28939790SLin.Ling@Sun.COM * Get the root pool information from the root disk, then import the root pool 28949790SLin.Ling@Sun.COM * during the system boot up time. 28956423Sgw25295 */ 28969790SLin.Ling@Sun.COM extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 28979790SLin.Ling@Sun.COM 28989790SLin.Ling@Sun.COM static nvlist_t * 28999790SLin.Ling@Sun.COM spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 29006423Sgw25295 { 29019790SLin.Ling@Sun.COM nvlist_t *config; 29026423Sgw25295 nvlist_t *nvtop, *nvroot; 29036423Sgw25295 uint64_t pgid; 29046423Sgw25295 29059790SLin.Ling@Sun.COM if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 29069790SLin.Ling@Sun.COM return (NULL); 29079790SLin.Ling@Sun.COM 29086423Sgw25295 /* 29096423Sgw25295 * Add this top-level vdev to the child array. 
29106423Sgw25295 */ 29119790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 29129790SLin.Ling@Sun.COM &nvtop) == 0); 29139790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 29149790SLin.Ling@Sun.COM &pgid) == 0); 29159790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 29166423Sgw25295 29176423Sgw25295 /* 29186423Sgw25295 * Put this pool's top-level vdevs into a root vdev. 29196423Sgw25295 */ 29206423Sgw25295 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 29219790SLin.Ling@Sun.COM VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 29229790SLin.Ling@Sun.COM VDEV_TYPE_ROOT) == 0); 29236423Sgw25295 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 29246423Sgw25295 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 29256423Sgw25295 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 29266423Sgw25295 &nvtop, 1) == 0); 29276423Sgw25295 29286423Sgw25295 /* 29296423Sgw25295 * Replace the existing vdev_tree with the new root vdev in 29306423Sgw25295 * this pool's configuration (remove the old, add the new). 29316423Sgw25295 */ 29326423Sgw25295 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 29336423Sgw25295 nvlist_free(nvroot); 29349790SLin.Ling@Sun.COM return (config); 29356423Sgw25295 } 29366423Sgw25295 29376423Sgw25295 /* 29389790SLin.Ling@Sun.COM * Walk the vdev tree and see if we can find a device with "better" 29399790SLin.Ling@Sun.COM * configuration. A configuration is "better" if the label on that 29409790SLin.Ling@Sun.COM * device has a more recent txg. 
29416423Sgw25295 */ 29429790SLin.Ling@Sun.COM static void 29439790SLin.Ling@Sun.COM spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 29447147Staylor { 29459816SGeorge.Wilson@Sun.COM for (int c = 0; c < vd->vdev_children; c++) 29469790SLin.Ling@Sun.COM spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 29479790SLin.Ling@Sun.COM 29489790SLin.Ling@Sun.COM if (vd->vdev_ops->vdev_op_leaf) { 29499790SLin.Ling@Sun.COM nvlist_t *label; 29509790SLin.Ling@Sun.COM uint64_t label_txg; 29519790SLin.Ling@Sun.COM 29529790SLin.Ling@Sun.COM if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 29539790SLin.Ling@Sun.COM &label) != 0) 29549790SLin.Ling@Sun.COM return; 29559790SLin.Ling@Sun.COM 29569790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 29579790SLin.Ling@Sun.COM &label_txg) == 0); 29589790SLin.Ling@Sun.COM 29599790SLin.Ling@Sun.COM /* 29609790SLin.Ling@Sun.COM * Do we have a better boot device? 29619790SLin.Ling@Sun.COM */ 29629790SLin.Ling@Sun.COM if (label_txg > *txg) { 29639790SLin.Ling@Sun.COM *txg = label_txg; 29649790SLin.Ling@Sun.COM *avd = vd; 29657147Staylor } 29669790SLin.Ling@Sun.COM nvlist_free(label); 29677147Staylor } 29687147Staylor } 29697147Staylor 29706423Sgw25295 /* 29716423Sgw25295 * Import a root pool. 29726423Sgw25295 * 29737147Staylor * For x86. devpath_list will consist of devid and/or physpath name of 29747147Staylor * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 29757147Staylor * The GRUB "findroot" command will return the vdev we should boot. 29766423Sgw25295 * 29776423Sgw25295 * For Sparc, devpath_list consists the physpath name of the booting device 29786423Sgw25295 * no matter the rootpool is a single device pool or a mirrored pool. 29796423Sgw25295 * e.g. 
29806423Sgw25295 * "/pci@1f,0/ide@d/disk@0,0:a" 29816423Sgw25295 */ 29826423Sgw25295 int 29837147Staylor spa_import_rootpool(char *devpath, char *devid) 29846423Sgw25295 { 29859790SLin.Ling@Sun.COM spa_t *spa; 29869790SLin.Ling@Sun.COM vdev_t *rvd, *bvd, *avd = NULL; 29879790SLin.Ling@Sun.COM nvlist_t *config, *nvtop; 29889790SLin.Ling@Sun.COM uint64_t guid, txg; 29896423Sgw25295 char *pname; 29906423Sgw25295 int error; 29916423Sgw25295 29926423Sgw25295 /* 29939790SLin.Ling@Sun.COM * Read the label from the boot device and generate a configuration. 29946423Sgw25295 */ 299510822SJack.Meng@Sun.COM config = spa_generate_rootconf(devpath, devid, &guid); 299610822SJack.Meng@Sun.COM #if defined(_OBP) && defined(_KERNEL) 299710822SJack.Meng@Sun.COM if (config == NULL) { 299810822SJack.Meng@Sun.COM if (strstr(devpath, "/iscsi/ssd") != NULL) { 299910822SJack.Meng@Sun.COM /* iscsi boot */ 300010822SJack.Meng@Sun.COM get_iscsi_bootpath_phy(devpath); 300110822SJack.Meng@Sun.COM config = spa_generate_rootconf(devpath, devid, &guid); 300210822SJack.Meng@Sun.COM } 300310822SJack.Meng@Sun.COM } 300410822SJack.Meng@Sun.COM #endif 300510822SJack.Meng@Sun.COM if (config == NULL) { 30069790SLin.Ling@Sun.COM cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 30079790SLin.Ling@Sun.COM devpath); 30089790SLin.Ling@Sun.COM return (EIO); 30099790SLin.Ling@Sun.COM } 30109790SLin.Ling@Sun.COM 30119790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 30129790SLin.Ling@Sun.COM &pname) == 0); 30139790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 30146423Sgw25295 30159425SEric.Schrock@Sun.COM mutex_enter(&spa_namespace_lock); 30169425SEric.Schrock@Sun.COM if ((spa = spa_lookup(pname)) != NULL) { 30179425SEric.Schrock@Sun.COM /* 30189425SEric.Schrock@Sun.COM * Remove the existing root pool from the namespace so that we 30199425SEric.Schrock@Sun.COM * can replace it with the correct config we just read in. 
30209425SEric.Schrock@Sun.COM */ 30219425SEric.Schrock@Sun.COM spa_remove(spa); 30229425SEric.Schrock@Sun.COM } 30239425SEric.Schrock@Sun.COM 302410921STim.Haley@Sun.COM spa = spa_add(pname, config, NULL); 30259425SEric.Schrock@Sun.COM spa->spa_is_root = B_TRUE; 302610100SLin.Ling@Sun.COM spa->spa_load_verbatim = B_TRUE; 30279790SLin.Ling@Sun.COM 30289790SLin.Ling@Sun.COM /* 30299790SLin.Ling@Sun.COM * Build up a vdev tree based on the boot device's label config. 30309790SLin.Ling@Sun.COM */ 30319790SLin.Ling@Sun.COM VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 30329790SLin.Ling@Sun.COM &nvtop) == 0); 30339790SLin.Ling@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 30349790SLin.Ling@Sun.COM error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 30359790SLin.Ling@Sun.COM VDEV_ALLOC_ROOTPOOL); 30369790SLin.Ling@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 30379790SLin.Ling@Sun.COM if (error) { 30389790SLin.Ling@Sun.COM mutex_exit(&spa_namespace_lock); 30399790SLin.Ling@Sun.COM nvlist_free(config); 30409790SLin.Ling@Sun.COM cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 30419790SLin.Ling@Sun.COM pname); 30429790SLin.Ling@Sun.COM return (error); 30439790SLin.Ling@Sun.COM } 30449790SLin.Ling@Sun.COM 30459790SLin.Ling@Sun.COM /* 30469790SLin.Ling@Sun.COM * Get the boot vdev. 30479790SLin.Ling@Sun.COM */ 30489790SLin.Ling@Sun.COM if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 30499790SLin.Ling@Sun.COM cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 30509790SLin.Ling@Sun.COM (u_longlong_t)guid); 30519790SLin.Ling@Sun.COM error = ENOENT; 30529790SLin.Ling@Sun.COM goto out; 30539790SLin.Ling@Sun.COM } 30549790SLin.Ling@Sun.COM 30559790SLin.Ling@Sun.COM /* 30569790SLin.Ling@Sun.COM * Determine if there is a better boot device. 
30579790SLin.Ling@Sun.COM */ 30589790SLin.Ling@Sun.COM avd = bvd; 30599790SLin.Ling@Sun.COM spa_alt_rootvdev(rvd, &avd, &txg); 30609790SLin.Ling@Sun.COM if (avd != bvd) { 30619790SLin.Ling@Sun.COM cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 30629790SLin.Ling@Sun.COM "try booting from '%s'", avd->vdev_path); 30639790SLin.Ling@Sun.COM error = EINVAL; 30649790SLin.Ling@Sun.COM goto out; 30659790SLin.Ling@Sun.COM } 30669790SLin.Ling@Sun.COM 30679790SLin.Ling@Sun.COM /* 30689790SLin.Ling@Sun.COM * If the boot device is part of a spare vdev then ensure that 30699790SLin.Ling@Sun.COM * we're booting off the active spare. 30709790SLin.Ling@Sun.COM */ 30719790SLin.Ling@Sun.COM if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 30729790SLin.Ling@Sun.COM !bvd->vdev_isspare) { 30739790SLin.Ling@Sun.COM cmn_err(CE_NOTE, "The boot device is currently spared. Please " 30749790SLin.Ling@Sun.COM "try booting from '%s'", 30759790SLin.Ling@Sun.COM bvd->vdev_parent->vdev_child[1]->vdev_path); 30769790SLin.Ling@Sun.COM error = EINVAL; 30779790SLin.Ling@Sun.COM goto out; 30789790SLin.Ling@Sun.COM } 30799790SLin.Ling@Sun.COM 30809790SLin.Ling@Sun.COM error = 0; 30819946SMark.Musante@Sun.COM spa_history_log_version(spa, LOG_POOL_IMPORT); 30829790SLin.Ling@Sun.COM out: 30839790SLin.Ling@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 30849790SLin.Ling@Sun.COM vdev_free(rvd); 30859790SLin.Ling@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 30869425SEric.Schrock@Sun.COM mutex_exit(&spa_namespace_lock); 30876423Sgw25295 30889790SLin.Ling@Sun.COM nvlist_free(config); 30896423Sgw25295 return (error); 30906423Sgw25295 } 30919790SLin.Ling@Sun.COM 30926423Sgw25295 #endif 30936423Sgw25295 30946423Sgw25295 /* 30959425SEric.Schrock@Sun.COM * Take a pool and insert it into the namespace as if it had been loaded at 30969425SEric.Schrock@Sun.COM * boot. 
30979425SEric.Schrock@Sun.COM */ 30989425SEric.Schrock@Sun.COM int 30999425SEric.Schrock@Sun.COM spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) 31009425SEric.Schrock@Sun.COM { 31019425SEric.Schrock@Sun.COM spa_t *spa; 31029425SEric.Schrock@Sun.COM char *altroot = NULL; 31039425SEric.Schrock@Sun.COM 31049425SEric.Schrock@Sun.COM mutex_enter(&spa_namespace_lock); 31059425SEric.Schrock@Sun.COM if (spa_lookup(pool) != NULL) { 31069425SEric.Schrock@Sun.COM mutex_exit(&spa_namespace_lock); 31079425SEric.Schrock@Sun.COM return (EEXIST); 31089425SEric.Schrock@Sun.COM } 31099425SEric.Schrock@Sun.COM 31109425SEric.Schrock@Sun.COM (void) nvlist_lookup_string(props, 31119425SEric.Schrock@Sun.COM zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 311210921STim.Haley@Sun.COM spa = spa_add(pool, config, altroot); 311310921STim.Haley@Sun.COM 311410100SLin.Ling@Sun.COM spa->spa_load_verbatim = B_TRUE; 311510000SVictor.Latushkin@Sun.COM 31169425SEric.Schrock@Sun.COM if (props != NULL) 31179425SEric.Schrock@Sun.COM spa_configfile_set(spa, props, B_FALSE); 31189425SEric.Schrock@Sun.COM 31199425SEric.Schrock@Sun.COM spa_config_sync(spa, B_FALSE, B_TRUE); 31209425SEric.Schrock@Sun.COM 31219425SEric.Schrock@Sun.COM mutex_exit(&spa_namespace_lock); 31229946SMark.Musante@Sun.COM spa_history_log_version(spa, LOG_POOL_IMPORT); 31239425SEric.Schrock@Sun.COM 31249425SEric.Schrock@Sun.COM return (0); 31259425SEric.Schrock@Sun.COM } 31269425SEric.Schrock@Sun.COM 31279425SEric.Schrock@Sun.COM /* 31286423Sgw25295 * Import a non-root pool into the system. 
 */
int
spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
{
	spa_t *spa;
	char *altroot = NULL;
	spa_load_state_t state = SPA_LOAD_IMPORT;
	zpool_rewind_policy_t policy;
	int error;
	nvlist_t *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/* Honor any rewind request embedded in the config. */
	zpool_get_rewind_policy(config, &policy);
	if (policy.zrp_request & ZPOOL_DO_REWIND)
		state = SPA_LOAD_RECOVER;

	/*
	 * Create and initialize the spa structure.
	 */
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
	spa = spa_add(pool, config, altroot);
	spa_activate(spa, spa_mode_global);

	/*
	 * Don't start async tasks until we know everything is healthy.
	 */
	spa_async_suspend(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
	 * because the user-supplied config is actually the one to trust when
	 * doing an import.
	 */
	if (state != SPA_LOAD_RECOVER)
		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
	    policy.zrp_request);

	/*
	 * Propagate anything learned about failing or best txgs
	 * back to caller
	 */
	spa_rewind_data_to_nvlist(spa, config);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	/*
	 * Toss any existing sparelist, as it doesn't have any validity
	 * anymore, and conflicts with spa_has_spare().
	 */
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
		spa_load_spares(spa);
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
		spa_load_l2cache(spa);
	}

	/* Validate the aux devices named in the user-supplied config. */
	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
	    &nvroot) == 0);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_SPARE);
	if (error == 0)
		error = spa_validate_aux(spa, nvroot, -1ULL,
		    VDEV_ALLOC_L2CACHE);
	spa_config_exit(spa, SCL_ALL, FTAG);

	if (props != NULL)
		spa_configfile_set(spa, props, B_FALSE);

	/* On any failure so far, tear the pool back down completely. */
	if (error != 0 || (props && spa_writeable(spa) &&
	    (error = spa_prop_set(spa, props)))) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	spa_async_resume(spa);

	/*
	 * Override any spares and level 2 cache devices as specified by
	 * the user, as these may have correct device names/devids, etc.
	 */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
	    &spares, &nspares) == 0) {
		if (spa->spa_spares.sav_config)
			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_spares(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_spares.sav_sync = B_TRUE;
	}
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
	    &l2cache, &nl2cache) == 0) {
		if (spa->spa_l2cache.sav_config)
			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
		else
			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
		spa_load_l2cache(spa);
		spa_config_exit(spa, SCL_ALL, FTAG);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * Check for any removed devices.
	 */
	if (spa->spa_autoreplace) {
		spa_aux_check_removed(&spa->spa_spares);
		spa_aux_check_removed(&spa->spa_l2cache);
	}

	if (spa_writeable(spa)) {
		/*
		 * Update the config cache to include the newly-imported pool.
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, LOG_POOL_IMPORT);

	return (0);
}

/*
 * Probe whether 'tryconfig' describes an importable pool without actually
 * importing it.  Returns the pool's current config (with name/state from
 * the caller's config, plus timestamp, bootfs and aux-device info), or
 * NULL if the config was not even parsable.  Caller frees the result.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				/* Re-root the dataset name under 'poolname'. */
				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/* The temporary spa is always torn down before returning. */
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 *
 * Returns 0 on success; EROFS if the system is read-only; ENOENT if the
 * pool does not exist; EBUSY if there are active references; EXDEV if the
 * pool has an active shared spare and 'force' is not set.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcedly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EXDEV);
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	/* Hand the last known config back to the caller, if requested. */
	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
3524789Sahrens */ 3525789Sahrens int 3526789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3527789Sahrens { 352810594SGeorge.Wilson@Sun.COM uint64_t txg, id; 35298241SJeff.Bonwick@Sun.COM int error; 3530789Sahrens vdev_t *rvd = spa->spa_root_vdev; 35311585Sbonwick vdev_t *vd, *tvd; 35325450Sbrendan nvlist_t **spares, **l2cache; 35335450Sbrendan uint_t nspares, nl2cache; 3534789Sahrens 3535789Sahrens txg = spa_vdev_enter(spa); 3536789Sahrens 35372082Seschrock if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 35382082Seschrock VDEV_ALLOC_ADD)) != 0) 35392082Seschrock return (spa_vdev_exit(spa, NULL, txg, error)); 35402082Seschrock 35417754SJeff.Bonwick@Sun.COM spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3542789Sahrens 35435450Sbrendan if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 35445450Sbrendan &nspares) != 0) 35452082Seschrock nspares = 0; 35462082Seschrock 35475450Sbrendan if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 35485450Sbrendan &nl2cache) != 0) 35495450Sbrendan nl2cache = 0; 35505450Sbrendan 35517754SJeff.Bonwick@Sun.COM if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 35522082Seschrock return (spa_vdev_exit(spa, vd, txg, EINVAL)); 35537754SJeff.Bonwick@Sun.COM 35547754SJeff.Bonwick@Sun.COM if (vd->vdev_children != 0 && 35557754SJeff.Bonwick@Sun.COM (error = vdev_create(vd, txg, B_FALSE)) != 0) 35567754SJeff.Bonwick@Sun.COM return (spa_vdev_exit(spa, vd, txg, error)); 35572082Seschrock 35583377Seschrock /* 35595450Sbrendan * We must validate the spares and l2cache devices after checking the 35605450Sbrendan * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 35613377Seschrock */ 35627754SJeff.Bonwick@Sun.COM if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 35633377Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 35643377Seschrock 35653377Seschrock /* 35663377Seschrock * Transfer each new top-level vdev from vd to rvd. 
35673377Seschrock */ 35688241SJeff.Bonwick@Sun.COM for (int c = 0; c < vd->vdev_children; c++) { 356910594SGeorge.Wilson@Sun.COM 357010594SGeorge.Wilson@Sun.COM /* 357110594SGeorge.Wilson@Sun.COM * Set the vdev id to the first hole, if one exists. 357210594SGeorge.Wilson@Sun.COM */ 357310594SGeorge.Wilson@Sun.COM for (id = 0; id < rvd->vdev_children; id++) { 357410594SGeorge.Wilson@Sun.COM if (rvd->vdev_child[id]->vdev_ishole) { 357510594SGeorge.Wilson@Sun.COM vdev_free(rvd->vdev_child[id]); 357610594SGeorge.Wilson@Sun.COM break; 357710594SGeorge.Wilson@Sun.COM } 357810594SGeorge.Wilson@Sun.COM } 35793377Seschrock tvd = vd->vdev_child[c]; 35803377Seschrock vdev_remove_child(vd, tvd); 358110594SGeorge.Wilson@Sun.COM tvd->vdev_id = id; 35823377Seschrock vdev_add_child(rvd, tvd); 35833377Seschrock vdev_config_dirty(tvd); 35843377Seschrock } 35853377Seschrock 35862082Seschrock if (nspares != 0) { 35875450Sbrendan spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 35885450Sbrendan ZPOOL_CONFIG_SPARES); 35892082Seschrock spa_load_spares(spa); 35905450Sbrendan spa->spa_spares.sav_sync = B_TRUE; 35915450Sbrendan } 35925450Sbrendan 35935450Sbrendan if (nl2cache != 0) { 35945450Sbrendan spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 35955450Sbrendan ZPOOL_CONFIG_L2CACHE); 35965450Sbrendan spa_load_l2cache(spa); 35975450Sbrendan spa->spa_l2cache.sav_sync = B_TRUE; 3598789Sahrens } 3599789Sahrens 3600789Sahrens /* 36011585Sbonwick * We have to be careful when adding new vdevs to an existing pool. 36021585Sbonwick * If other threads start allocating from these vdevs before we 36031585Sbonwick * sync the config cache, and we lose power, then upon reboot we may 36041585Sbonwick * fail to open the pool because there are DVAs that the config cache 36051585Sbonwick * can't translate. 
Therefore, we first add the vdevs without 36061585Sbonwick * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 36071635Sbonwick * and then let spa_config_update() initialize the new metaslabs. 36081585Sbonwick * 36091585Sbonwick * spa_load() checks for added-but-not-initialized vdevs, so that 36101585Sbonwick * if we lose power at any point in this sequence, the remaining 36111585Sbonwick * steps will be completed the next time we load the pool. 3612789Sahrens */ 36131635Sbonwick (void) spa_vdev_exit(spa, vd, txg, 0); 36141585Sbonwick 36151635Sbonwick mutex_enter(&spa_namespace_lock); 36161635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 36171635Sbonwick mutex_exit(&spa_namespace_lock); 3618789Sahrens 36191635Sbonwick return (0); 3620789Sahrens } 3621789Sahrens 3622789Sahrens /* 3623789Sahrens * Attach a device to a mirror. The arguments are the path to any device 3624789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 3625789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 3626789Sahrens * 3627789Sahrens * If 'replacing' is specified, the new device is intended to replace the 3628789Sahrens * existing device; in this case the two devices are made into their own 36294451Seschrock * mirror using the 'replacing' vdev, which is functionally identical to 3630789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 3631789Sahrens * extra rules: you can't attach to it after it's been created, and upon 3632789Sahrens * completion of resilvering, the first disk (the one being replaced) 3633789Sahrens * is automatically detached. 
3634789Sahrens */ 3635789Sahrens int 36361544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3637789Sahrens { 3638789Sahrens uint64_t txg, open_txg; 3639789Sahrens vdev_t *rvd = spa->spa_root_vdev; 3640789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 36412082Seschrock vdev_ops_t *pvops; 36427313SEric.Kustarz@Sun.COM char *oldvdpath, *newvdpath; 36437313SEric.Kustarz@Sun.COM int newvd_isspare; 36447313SEric.Kustarz@Sun.COM int error; 3645789Sahrens 3646789Sahrens txg = spa_vdev_enter(spa); 3647789Sahrens 36486643Seschrock oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3649789Sahrens 3650789Sahrens if (oldvd == NULL) 3651789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3652789Sahrens 36531585Sbonwick if (!oldvd->vdev_ops->vdev_op_leaf) 36541585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 36551585Sbonwick 3656789Sahrens pvd = oldvd->vdev_parent; 3657789Sahrens 36582082Seschrock if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 36594451Seschrock VDEV_ALLOC_ADD)) != 0) 36604451Seschrock return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 36614451Seschrock 36624451Seschrock if (newrootvd->vdev_children != 1) 3663789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3664789Sahrens 3665789Sahrens newvd = newrootvd->vdev_child[0]; 3666789Sahrens 3667789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 3668789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3669789Sahrens 36702082Seschrock if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3671789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 3672789Sahrens 36734527Sperrin /* 36744527Sperrin * Spares can't replace logs 36754527Sperrin */ 36767326SEric.Schrock@Sun.COM if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 36774527Sperrin return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 36784527Sperrin 36792082Seschrock if (!replacing) { 36802082Seschrock /* 36812082Seschrock * For attach, the only 
allowable parent is a mirror or the root 36822082Seschrock * vdev. 36832082Seschrock */ 36842082Seschrock if (pvd->vdev_ops != &vdev_mirror_ops && 36852082Seschrock pvd->vdev_ops != &vdev_root_ops) 36862082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 36872082Seschrock 36882082Seschrock pvops = &vdev_mirror_ops; 36892082Seschrock } else { 36902082Seschrock /* 36912082Seschrock * Active hot spares can only be replaced by inactive hot 36922082Seschrock * spares. 36932082Seschrock */ 36942082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 36952082Seschrock pvd->vdev_child[1] == oldvd && 36962082Seschrock !spa_has_spare(spa, newvd->vdev_guid)) 36972082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 36982082Seschrock 36992082Seschrock /* 37002082Seschrock * If the source is a hot spare, and the parent isn't already a 37012082Seschrock * spare, then we want to create a new hot spare. Otherwise, we 37023377Seschrock * want to create a replacing vdev. The user is not allowed to 37033377Seschrock * attach to a spared vdev child unless the 'isspare' state is 37043377Seschrock * the same (spare replaces spare, non-spare replaces 37053377Seschrock * non-spare). 37062082Seschrock */ 37072082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) 37082082Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 37093377Seschrock else if (pvd->vdev_ops == &vdev_spare_ops && 37103377Seschrock newvd->vdev_isspare != oldvd->vdev_isspare) 37113377Seschrock return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 37122082Seschrock else if (pvd->vdev_ops != &vdev_spare_ops && 37132082Seschrock newvd->vdev_isspare) 37142082Seschrock pvops = &vdev_spare_ops; 37152082Seschrock else 37162082Seschrock pvops = &vdev_replacing_ops; 37172082Seschrock } 37182082Seschrock 37191175Slling /* 37209816SGeorge.Wilson@Sun.COM * Make sure the new device is big enough. 
37211175Slling */ 37229816SGeorge.Wilson@Sun.COM if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3723789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3724789Sahrens 37251732Sbonwick /* 37261732Sbonwick * The new device cannot have a higher alignment requirement 37271732Sbonwick * than the top-level vdev. 37281732Sbonwick */ 37291732Sbonwick if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3730789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3731789Sahrens 3732789Sahrens /* 3733789Sahrens * If this is an in-place replacement, update oldvd's path and devid 3734789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 3735789Sahrens */ 3736789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3737789Sahrens spa_strfree(oldvd->vdev_path); 3738789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3739789Sahrens KM_SLEEP); 3740789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 3741789Sahrens newvd->vdev_path, "old"); 3742789Sahrens if (oldvd->vdev_devid != NULL) { 3743789Sahrens spa_strfree(oldvd->vdev_devid); 3744789Sahrens oldvd->vdev_devid = NULL; 3745789Sahrens } 3746789Sahrens } 3747789Sahrens 3748789Sahrens /* 37492082Seschrock * If the parent is not a mirror, or if we're replacing, insert the new 37502082Seschrock * mirror/replacing/spare vdev above oldvd. 3751789Sahrens */ 3752789Sahrens if (pvd->vdev_ops != pvops) 3753789Sahrens pvd = vdev_add_parent(oldvd, pvops); 3754789Sahrens 3755789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 3756789Sahrens ASSERT(pvd->vdev_ops == pvops); 3757789Sahrens ASSERT(oldvd->vdev_parent == pvd); 3758789Sahrens 3759789Sahrens /* 3760789Sahrens * Extract the new device from its root and add it to pvd. 
3761789Sahrens */ 3762789Sahrens vdev_remove_child(newrootvd, newvd); 3763789Sahrens newvd->vdev_id = pvd->vdev_children; 376410594SGeorge.Wilson@Sun.COM newvd->vdev_crtxg = oldvd->vdev_crtxg; 3765789Sahrens vdev_add_child(pvd, newvd); 3766789Sahrens 3767789Sahrens tvd = newvd->vdev_top; 3768789Sahrens ASSERT(pvd->vdev_top == tvd); 3769789Sahrens ASSERT(tvd->vdev_parent == rvd); 3770789Sahrens 3771789Sahrens vdev_config_dirty(tvd); 3772789Sahrens 3773789Sahrens /* 3774789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 3775789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 3776789Sahrens */ 3777789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 3778789Sahrens 37798241SJeff.Bonwick@Sun.COM vdev_dtl_dirty(newvd, DTL_MISSING, 37808241SJeff.Bonwick@Sun.COM TXG_INITIAL, open_txg - TXG_INITIAL + 1); 3781789Sahrens 37829425SEric.Schrock@Sun.COM if (newvd->vdev_isspare) { 37833377Seschrock spa_spare_activate(newvd); 37849425SEric.Schrock@Sun.COM spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 37859425SEric.Schrock@Sun.COM } 37869425SEric.Schrock@Sun.COM 37877754SJeff.Bonwick@Sun.COM oldvdpath = spa_strdup(oldvd->vdev_path); 37887754SJeff.Bonwick@Sun.COM newvdpath = spa_strdup(newvd->vdev_path); 37897313SEric.Kustarz@Sun.COM newvd_isspare = newvd->vdev_isspare; 37901544Seschrock 3791789Sahrens /* 3792789Sahrens * Mark newvd's DTL dirty in this txg. 3793789Sahrens */ 37941732Sbonwick vdev_dirty(tvd, VDD_DTL, newvd, txg); 3795789Sahrens 3796789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 3797789Sahrens 37989946SMark.Musante@Sun.COM spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, 37999946SMark.Musante@Sun.COM CRED(), "%s vdev=%s %s vdev=%s", 38009946SMark.Musante@Sun.COM replacing && newvd_isspare ? "spare in" : 38019946SMark.Musante@Sun.COM replacing ? "replace" : "attach", newvdpath, 38029946SMark.Musante@Sun.COM replacing ? 
"for" : "to", oldvdpath); 38037313SEric.Kustarz@Sun.COM 38047313SEric.Kustarz@Sun.COM spa_strfree(oldvdpath); 38057313SEric.Kustarz@Sun.COM spa_strfree(newvdpath); 38067313SEric.Kustarz@Sun.COM 3807789Sahrens /* 38087046Sahrens * Kick off a resilver to update newvd. 3809789Sahrens */ 38107046Sahrens VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 3811789Sahrens 3812789Sahrens return (0); 3813789Sahrens } 3814789Sahrens 3815789Sahrens /* 3816789Sahrens * Detach a device from a mirror or replacing vdev. 3817789Sahrens * If 'replace_done' is specified, only detach if the parent 3818789Sahrens * is a replacing vdev. 3819789Sahrens */ 3820789Sahrens int 38218241SJeff.Bonwick@Sun.COM spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3822789Sahrens { 3823789Sahrens uint64_t txg; 38248241SJeff.Bonwick@Sun.COM int error; 3825789Sahrens vdev_t *rvd = spa->spa_root_vdev; 3826789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 38272082Seschrock boolean_t unspare = B_FALSE; 38282082Seschrock uint64_t unspare_guid; 38296673Seschrock size_t len; 383011422SMark.Musante@Sun.COM char *vdpath; 3831789Sahrens 3832789Sahrens txg = spa_vdev_enter(spa); 3833789Sahrens 38346643Seschrock vd = spa_lookup_by_guid(spa, guid, B_FALSE); 3835789Sahrens 3836789Sahrens if (vd == NULL) 3837789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3838789Sahrens 38391585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 38401585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 38411585Sbonwick 3842789Sahrens pvd = vd->vdev_parent; 3843789Sahrens 3844789Sahrens /* 38458241SJeff.Bonwick@Sun.COM * If the parent/child relationship is not as expected, don't do it. 38468241SJeff.Bonwick@Sun.COM * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 38478241SJeff.Bonwick@Sun.COM * vdev that's replacing B with C. The user's intent in replacing 38488241SJeff.Bonwick@Sun.COM * is to go from M(A,B) to M(A,C). 
If the user decides to cancel 38498241SJeff.Bonwick@Sun.COM * the replace by detaching C, the expected behavior is to end up 38508241SJeff.Bonwick@Sun.COM * M(A,B). But suppose that right after deciding to detach C, 38518241SJeff.Bonwick@Sun.COM * the replacement of B completes. We would have M(A,C), and then 38528241SJeff.Bonwick@Sun.COM * ask to detach C, which would leave us with just A -- not what 38538241SJeff.Bonwick@Sun.COM * the user wanted. To prevent this, we make sure that the 38548241SJeff.Bonwick@Sun.COM * parent/child relationship hasn't changed -- in this example, 38558241SJeff.Bonwick@Sun.COM * that C's parent is still the replacing vdev R. 38568241SJeff.Bonwick@Sun.COM */ 38578241SJeff.Bonwick@Sun.COM if (pvd->vdev_guid != pguid && pguid != 0) 38588241SJeff.Bonwick@Sun.COM return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 38598241SJeff.Bonwick@Sun.COM 38608241SJeff.Bonwick@Sun.COM /* 3861789Sahrens * If replace_done is specified, only remove this device if it's 38622082Seschrock * the first child of a replacing vdev. For the 'spare' vdev, either 38632082Seschrock * disk can be removed. 3864789Sahrens */ 38652082Seschrock if (replace_done) { 38662082Seschrock if (pvd->vdev_ops == &vdev_replacing_ops) { 38672082Seschrock if (vd->vdev_id != 0) 38682082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 38692082Seschrock } else if (pvd->vdev_ops != &vdev_spare_ops) { 38702082Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 38712082Seschrock } 38722082Seschrock } 38732082Seschrock 38742082Seschrock ASSERT(pvd->vdev_ops != &vdev_spare_ops || 38754577Sahrens spa_version(spa) >= SPA_VERSION_SPARES); 3876789Sahrens 3877789Sahrens /* 38782082Seschrock * Only mirror, replacing, and spare vdevs support detach. 
3879789Sahrens */ 3880789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 38812082Seschrock pvd->vdev_ops != &vdev_mirror_ops && 38822082Seschrock pvd->vdev_ops != &vdev_spare_ops) 3883789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3884789Sahrens 3885789Sahrens /* 38868241SJeff.Bonwick@Sun.COM * If this device has the only valid copy of some data, 38878241SJeff.Bonwick@Sun.COM * we cannot safely detach it. 3888789Sahrens */ 38898241SJeff.Bonwick@Sun.COM if (vdev_dtl_required(vd)) 3890789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 3891789Sahrens 38928241SJeff.Bonwick@Sun.COM ASSERT(pvd->vdev_children >= 2); 38938241SJeff.Bonwick@Sun.COM 3894789Sahrens /* 38956673Seschrock * If we are detaching the second disk from a replacing vdev, then 38966673Seschrock * check to see if we changed the original vdev's path to have "/old" 38976673Seschrock * at the end in spa_vdev_attach(). If so, undo that change now. 38986673Seschrock */ 38996673Seschrock if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && 39006673Seschrock pvd->vdev_child[0]->vdev_path != NULL && 39016673Seschrock pvd->vdev_child[1]->vdev_path != NULL) { 39026673Seschrock ASSERT(pvd->vdev_child[1] == vd); 39036673Seschrock cvd = pvd->vdev_child[0]; 39046673Seschrock len = strlen(vd->vdev_path); 39056673Seschrock if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 39066673Seschrock strcmp(cvd->vdev_path + len, "/old") == 0) { 39076673Seschrock spa_strfree(cvd->vdev_path); 39086673Seschrock cvd->vdev_path = spa_strdup(vd->vdev_path); 39096673Seschrock } 39106673Seschrock } 39116673Seschrock 39126673Seschrock /* 39132082Seschrock * If we are detaching the original disk from a spare, then it implies 39142082Seschrock * that the spare should become a real disk, and be removed from the 39152082Seschrock * active spare list for the pool. 
39162082Seschrock */ 39172082Seschrock if (pvd->vdev_ops == &vdev_spare_ops && 39188241SJeff.Bonwick@Sun.COM vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) 39192082Seschrock unspare = B_TRUE; 39202082Seschrock 39212082Seschrock /* 3922789Sahrens * Erase the disk labels so the disk can be used for other things. 3923789Sahrens * This must be done after all other error cases are handled, 3924789Sahrens * but before we disembowel vd (so we can still do I/O to it). 3925789Sahrens * But if we can't do it, don't treat the error as fatal -- 3926789Sahrens * it may be that the unwritability of the disk is the reason 3927789Sahrens * it's being detached! 3928789Sahrens */ 39293377Seschrock error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 3930789Sahrens 3931789Sahrens /* 3932789Sahrens * Remove vd from its parent and compact the parent's children. 3933789Sahrens */ 3934789Sahrens vdev_remove_child(pvd, vd); 3935789Sahrens vdev_compact_children(pvd); 3936789Sahrens 3937789Sahrens /* 3938789Sahrens * Remember one of the remaining children so we can get tvd below. 3939789Sahrens */ 3940789Sahrens cvd = pvd->vdev_child[0]; 3941789Sahrens 3942789Sahrens /* 39432082Seschrock * If we need to remove the remaining child from the list of hot spares, 39448241SJeff.Bonwick@Sun.COM * do it now, marking the vdev as no longer a spare in the process. 39458241SJeff.Bonwick@Sun.COM * We must do this before vdev_remove_parent(), because that can 39468241SJeff.Bonwick@Sun.COM * change the GUID if it creates a new toplevel GUID. For a similar 39478241SJeff.Bonwick@Sun.COM * reason, we must remove the spare now, in the same txg as the detach; 39488241SJeff.Bonwick@Sun.COM * otherwise someone could attach a new sibling, change the GUID, and 39498241SJeff.Bonwick@Sun.COM * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 
39502082Seschrock */ 39512082Seschrock if (unspare) { 39522082Seschrock ASSERT(cvd->vdev_isspare); 39533377Seschrock spa_spare_remove(cvd); 39542082Seschrock unspare_guid = cvd->vdev_guid; 39558241SJeff.Bonwick@Sun.COM (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 39562082Seschrock } 39572082Seschrock 39582082Seschrock /* 3959789Sahrens * If the parent mirror/replacing vdev only has one child, 3960789Sahrens * the parent is no longer needed. Remove it from the tree. 3961789Sahrens */ 3962789Sahrens if (pvd->vdev_children == 1) 3963789Sahrens vdev_remove_parent(cvd); 3964789Sahrens 3965789Sahrens /* 3966789Sahrens * We don't set tvd until now because the parent we just removed 3967789Sahrens * may have been the previous top-level vdev. 3968789Sahrens */ 3969789Sahrens tvd = cvd->vdev_top; 3970789Sahrens ASSERT(tvd->vdev_parent == rvd); 3971789Sahrens 3972789Sahrens /* 39733377Seschrock * Reevaluate the parent vdev state. 3974789Sahrens */ 39754451Seschrock vdev_propagate_state(cvd); 3976789Sahrens 3977789Sahrens /* 39789816SGeorge.Wilson@Sun.COM * If the 'autoexpand' property is set on the pool then automatically 39799816SGeorge.Wilson@Sun.COM * try to expand the size of the pool. For example if the device we 39809816SGeorge.Wilson@Sun.COM * just detached was smaller than the others, it may be possible to 39819816SGeorge.Wilson@Sun.COM * add metaslabs (i.e. grow the pool). We need to reopen the vdev 39829816SGeorge.Wilson@Sun.COM * first so that we can obtain the updated sizes of the leaf vdevs. 3983789Sahrens */ 39849816SGeorge.Wilson@Sun.COM if (spa->spa_autoexpand) { 39859816SGeorge.Wilson@Sun.COM vdev_reopen(tvd); 39869816SGeorge.Wilson@Sun.COM vdev_expand(tvd, txg); 39879816SGeorge.Wilson@Sun.COM } 3988789Sahrens 3989789Sahrens vdev_config_dirty(tvd); 3990789Sahrens 3991789Sahrens /* 39923377Seschrock * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 39933377Seschrock * vd->vdev_detached is set and free vd's DTL object in syncing context. 
39943377Seschrock * But first make sure we're not on any *other* txg's DTL list, to 39953377Seschrock * prevent vd from being accessed after it's freed. 3996789Sahrens */ 399711422SMark.Musante@Sun.COM vdpath = spa_strdup(vd->vdev_path); 39988241SJeff.Bonwick@Sun.COM for (int t = 0; t < TXG_SIZE; t++) 3999789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 40001732Sbonwick vd->vdev_detached = B_TRUE; 40011732Sbonwick vdev_dirty(tvd, VDD_DTL, vd, txg); 4002789Sahrens 40034451Seschrock spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 40044451Seschrock 40052082Seschrock error = spa_vdev_exit(spa, vd, txg, 0); 40062082Seschrock 400711422SMark.Musante@Sun.COM spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), 400811422SMark.Musante@Sun.COM "vdev=%s", vdpath); 400911422SMark.Musante@Sun.COM spa_strfree(vdpath); 401011422SMark.Musante@Sun.COM 40112082Seschrock /* 40123377Seschrock * If this was the removal of the original device in a hot spare vdev, 40133377Seschrock * then we want to go through and remove the device from the hot spare 40143377Seschrock * list of every other pool. 
40152082Seschrock */ 40162082Seschrock if (unspare) { 40178241SJeff.Bonwick@Sun.COM spa_t *myspa = spa; 40182082Seschrock spa = NULL; 40192082Seschrock mutex_enter(&spa_namespace_lock); 40202082Seschrock while ((spa = spa_next(spa)) != NULL) { 40212082Seschrock if (spa->spa_state != POOL_STATE_ACTIVE) 40222082Seschrock continue; 40238241SJeff.Bonwick@Sun.COM if (spa == myspa) 40248241SJeff.Bonwick@Sun.COM continue; 40257793SJeff.Bonwick@Sun.COM spa_open_ref(spa, FTAG); 40267793SJeff.Bonwick@Sun.COM mutex_exit(&spa_namespace_lock); 40272082Seschrock (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 40287793SJeff.Bonwick@Sun.COM mutex_enter(&spa_namespace_lock); 40297793SJeff.Bonwick@Sun.COM spa_close(spa, FTAG); 40302082Seschrock } 40312082Seschrock mutex_exit(&spa_namespace_lock); 40322082Seschrock } 40332082Seschrock 40342082Seschrock return (error); 40352082Seschrock } 40362082Seschrock 403711422SMark.Musante@Sun.COM /* 403811422SMark.Musante@Sun.COM * Split a set of devices from their mirrors, and create a new pool from them. 
403911422SMark.Musante@Sun.COM */ 404011422SMark.Musante@Sun.COM int 404111422SMark.Musante@Sun.COM spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 404211422SMark.Musante@Sun.COM nvlist_t *props, boolean_t exp) 404311422SMark.Musante@Sun.COM { 404411422SMark.Musante@Sun.COM int error = 0; 404511422SMark.Musante@Sun.COM uint64_t txg, *glist; 404611422SMark.Musante@Sun.COM spa_t *newspa; 404711422SMark.Musante@Sun.COM uint_t c, children, lastlog; 404811422SMark.Musante@Sun.COM nvlist_t **child, *nvl, *tmp; 404911422SMark.Musante@Sun.COM dmu_tx_t *tx; 405011422SMark.Musante@Sun.COM char *altroot = NULL; 405111422SMark.Musante@Sun.COM vdev_t *rvd, **vml = NULL; /* vdev modify list */ 405211422SMark.Musante@Sun.COM boolean_t activate_slog; 405311422SMark.Musante@Sun.COM 405411422SMark.Musante@Sun.COM if (!spa_writeable(spa)) 405511422SMark.Musante@Sun.COM return (EROFS); 405611422SMark.Musante@Sun.COM 405711422SMark.Musante@Sun.COM txg = spa_vdev_enter(spa); 405811422SMark.Musante@Sun.COM 405911422SMark.Musante@Sun.COM /* clear the log and flush everything up to now */ 406011422SMark.Musante@Sun.COM activate_slog = spa_passivate_log(spa); 406111422SMark.Musante@Sun.COM (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 406211422SMark.Musante@Sun.COM error = spa_offline_log(spa); 406311422SMark.Musante@Sun.COM txg = spa_vdev_config_enter(spa); 406411422SMark.Musante@Sun.COM 406511422SMark.Musante@Sun.COM if (activate_slog) 406611422SMark.Musante@Sun.COM spa_activate_log(spa); 406711422SMark.Musante@Sun.COM 406811422SMark.Musante@Sun.COM if (error != 0) 406911422SMark.Musante@Sun.COM return (spa_vdev_exit(spa, NULL, txg, error)); 407011422SMark.Musante@Sun.COM 407111422SMark.Musante@Sun.COM /* check new spa name before going any further */ 407211422SMark.Musante@Sun.COM if (spa_lookup(newname) != NULL) 407311422SMark.Musante@Sun.COM return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 407411422SMark.Musante@Sun.COM 407511422SMark.Musante@Sun.COM /* 
407611422SMark.Musante@Sun.COM * scan through all the children to ensure they're all mirrors 407711422SMark.Musante@Sun.COM */ 407811422SMark.Musante@Sun.COM if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 407911422SMark.Musante@Sun.COM nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 408011422SMark.Musante@Sun.COM &children) != 0) 408111422SMark.Musante@Sun.COM return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 408211422SMark.Musante@Sun.COM 408311422SMark.Musante@Sun.COM /* first, check to ensure we've got the right child count */ 408411422SMark.Musante@Sun.COM rvd = spa->spa_root_vdev; 408511422SMark.Musante@Sun.COM lastlog = 0; 408611422SMark.Musante@Sun.COM for (c = 0; c < rvd->vdev_children; c++) { 408711422SMark.Musante@Sun.COM vdev_t *vd = rvd->vdev_child[c]; 408811422SMark.Musante@Sun.COM 408911422SMark.Musante@Sun.COM /* don't count the holes & logs as children */ 409011422SMark.Musante@Sun.COM if (vd->vdev_islog || vd->vdev_ishole) { 409111422SMark.Musante@Sun.COM if (lastlog == 0) 409211422SMark.Musante@Sun.COM lastlog = c; 409311422SMark.Musante@Sun.COM continue; 409411422SMark.Musante@Sun.COM } 409511422SMark.Musante@Sun.COM 409611422SMark.Musante@Sun.COM lastlog = 0; 409711422SMark.Musante@Sun.COM } 409811422SMark.Musante@Sun.COM if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 409911422SMark.Musante@Sun.COM return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 410011422SMark.Musante@Sun.COM 410111422SMark.Musante@Sun.COM /* next, ensure no spare or cache devices are part of the split */ 410211422SMark.Musante@Sun.COM if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 410311422SMark.Musante@Sun.COM nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 410411422SMark.Musante@Sun.COM return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 410511422SMark.Musante@Sun.COM 410611422SMark.Musante@Sun.COM vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 410711422SMark.Musante@Sun.COM glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 410811422SMark.Musante@Sun.COM 410911422SMark.Musante@Sun.COM /* then, loop over each vdev and validate it */ 411011422SMark.Musante@Sun.COM for (c = 0; c < children; c++) { 411111422SMark.Musante@Sun.COM uint64_t is_hole = 0; 411211422SMark.Musante@Sun.COM 411311422SMark.Musante@Sun.COM (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 411411422SMark.Musante@Sun.COM &is_hole); 411511422SMark.Musante@Sun.COM 411611422SMark.Musante@Sun.COM if (is_hole != 0) { 411711422SMark.Musante@Sun.COM if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 411811422SMark.Musante@Sun.COM spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 411911422SMark.Musante@Sun.COM continue; 412011422SMark.Musante@Sun.COM } else { 412111422SMark.Musante@Sun.COM error = EINVAL; 412211422SMark.Musante@Sun.COM break; 412311422SMark.Musante@Sun.COM } 412411422SMark.Musante@Sun.COM } 412511422SMark.Musante@Sun.COM 412611422SMark.Musante@Sun.COM /* which disk is going to be split? 
*/ 412711422SMark.Musante@Sun.COM if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 412811422SMark.Musante@Sun.COM &glist[c]) != 0) { 412911422SMark.Musante@Sun.COM error = EINVAL; 413011422SMark.Musante@Sun.COM break; 413111422SMark.Musante@Sun.COM } 413211422SMark.Musante@Sun.COM 413311422SMark.Musante@Sun.COM /* look it up in the spa */ 413411422SMark.Musante@Sun.COM vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 413511422SMark.Musante@Sun.COM if (vml[c] == NULL) { 413611422SMark.Musante@Sun.COM error = ENODEV; 413711422SMark.Musante@Sun.COM break; 413811422SMark.Musante@Sun.COM } 413911422SMark.Musante@Sun.COM 414011422SMark.Musante@Sun.COM /* make sure there's nothing stopping the split */ 414111422SMark.Musante@Sun.COM if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 414211422SMark.Musante@Sun.COM vml[c]->vdev_islog || 414311422SMark.Musante@Sun.COM vml[c]->vdev_ishole || 414411422SMark.Musante@Sun.COM vml[c]->vdev_isspare || 414511422SMark.Musante@Sun.COM vml[c]->vdev_isl2cache || 414611422SMark.Musante@Sun.COM !vdev_writeable(vml[c]) || 414711497SMark.Musante@Sun.COM vml[c]->vdev_children != 0 || 414811422SMark.Musante@Sun.COM vml[c]->vdev_state != VDEV_STATE_HEALTHY || 414911422SMark.Musante@Sun.COM c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 415011422SMark.Musante@Sun.COM error = EINVAL; 415111422SMark.Musante@Sun.COM break; 415211422SMark.Musante@Sun.COM } 415311422SMark.Musante@Sun.COM 415411422SMark.Musante@Sun.COM if (vdev_dtl_required(vml[c])) { 415511422SMark.Musante@Sun.COM error = EBUSY; 415611422SMark.Musante@Sun.COM break; 415711422SMark.Musante@Sun.COM } 415811422SMark.Musante@Sun.COM 415911422SMark.Musante@Sun.COM /* we need certain info from the top level */ 416011422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 416111422SMark.Musante@Sun.COM vml[c]->vdev_top->vdev_ms_array) == 0); 416211422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 
416311422SMark.Musante@Sun.COM vml[c]->vdev_top->vdev_ms_shift) == 0); 416411422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 416511422SMark.Musante@Sun.COM vml[c]->vdev_top->vdev_asize) == 0); 416611422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 416711422SMark.Musante@Sun.COM vml[c]->vdev_top->vdev_ashift) == 0); 416811422SMark.Musante@Sun.COM } 416911422SMark.Musante@Sun.COM 417011422SMark.Musante@Sun.COM if (error != 0) { 417111422SMark.Musante@Sun.COM kmem_free(vml, children * sizeof (vdev_t *)); 417211422SMark.Musante@Sun.COM kmem_free(glist, children * sizeof (uint64_t)); 417311422SMark.Musante@Sun.COM return (spa_vdev_exit(spa, NULL, txg, error)); 417411422SMark.Musante@Sun.COM } 417511422SMark.Musante@Sun.COM 417611422SMark.Musante@Sun.COM /* stop writers from using the disks */ 417711422SMark.Musante@Sun.COM for (c = 0; c < children; c++) { 417811422SMark.Musante@Sun.COM if (vml[c] != NULL) 417911422SMark.Musante@Sun.COM vml[c]->vdev_offline = B_TRUE; 418011422SMark.Musante@Sun.COM } 418111422SMark.Musante@Sun.COM vdev_reopen(spa->spa_root_vdev); 418211422SMark.Musante@Sun.COM 418311422SMark.Musante@Sun.COM /* 418411422SMark.Musante@Sun.COM * Temporarily record the splitting vdevs in the spa config. This 418511422SMark.Musante@Sun.COM * will disappear once the config is regenerated. 
418611422SMark.Musante@Sun.COM */ 418711422SMark.Musante@Sun.COM VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 418811422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 418911422SMark.Musante@Sun.COM glist, children) == 0); 419011422SMark.Musante@Sun.COM kmem_free(glist, children * sizeof (uint64_t)); 419111422SMark.Musante@Sun.COM 419211864SMark.Musante@Sun.COM mutex_enter(&spa->spa_props_lock); 419311422SMark.Musante@Sun.COM VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 419411422SMark.Musante@Sun.COM nvl) == 0); 419511864SMark.Musante@Sun.COM mutex_exit(&spa->spa_props_lock); 419611422SMark.Musante@Sun.COM spa->spa_config_splitting = nvl; 419711422SMark.Musante@Sun.COM vdev_config_dirty(spa->spa_root_vdev); 419811422SMark.Musante@Sun.COM 419911422SMark.Musante@Sun.COM /* configure and create the new pool */ 420011422SMark.Musante@Sun.COM VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 420111422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 420211422SMark.Musante@Sun.COM exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 420311422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 420411422SMark.Musante@Sun.COM spa_version(spa)) == 0); 420511422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 420611422SMark.Musante@Sun.COM spa->spa_config_txg) == 0); 420711422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 420811422SMark.Musante@Sun.COM spa_generate_guid(NULL)) == 0); 420911422SMark.Musante@Sun.COM (void) nvlist_lookup_string(props, 421011422SMark.Musante@Sun.COM zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 421111422SMark.Musante@Sun.COM 421211497SMark.Musante@Sun.COM /* add the new pool to the namespace */ 421311422SMark.Musante@Sun.COM newspa = spa_add(newname, config, altroot); 421411422SMark.Musante@Sun.COM newspa->spa_config_txg = spa->spa_config_txg; 421511422SMark.Musante@Sun.COM spa_set_log_state(newspa, SPA_LOG_CLEAR); 421611422SMark.Musante@Sun.COM 421711422SMark.Musante@Sun.COM /* release the spa config lock, retaining the namespace lock */ 421811422SMark.Musante@Sun.COM spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 421911422SMark.Musante@Sun.COM 422011422SMark.Musante@Sun.COM if (zio_injection_enabled) 422111422SMark.Musante@Sun.COM zio_handle_panic_injection(spa, FTAG, 1); 422211422SMark.Musante@Sun.COM 422311422SMark.Musante@Sun.COM spa_activate(newspa, spa_mode_global); 422411422SMark.Musante@Sun.COM spa_async_suspend(newspa); 422511422SMark.Musante@Sun.COM 422611422SMark.Musante@Sun.COM /* create the new pool from the disks of the original pool */ 422711422SMark.Musante@Sun.COM error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 422811422SMark.Musante@Sun.COM if (error) 422911422SMark.Musante@Sun.COM goto out; 423011422SMark.Musante@Sun.COM 423111422SMark.Musante@Sun.COM /* if that worked, generate a real config for the new pool */ 423211422SMark.Musante@Sun.COM if (newspa->spa_root_vdev != NULL) { 
423311422SMark.Musante@Sun.COM VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 423411422SMark.Musante@Sun.COM NV_UNIQUE_NAME, KM_SLEEP) == 0); 423511422SMark.Musante@Sun.COM VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 423611422SMark.Musante@Sun.COM ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 423711422SMark.Musante@Sun.COM spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 423811422SMark.Musante@Sun.COM B_TRUE)); 423911422SMark.Musante@Sun.COM } 424011422SMark.Musante@Sun.COM 424111422SMark.Musante@Sun.COM /* set the props */ 424211422SMark.Musante@Sun.COM if (props != NULL) { 424311422SMark.Musante@Sun.COM spa_configfile_set(newspa, props, B_FALSE); 424411422SMark.Musante@Sun.COM error = spa_prop_set(newspa, props); 424511422SMark.Musante@Sun.COM if (error) 424611422SMark.Musante@Sun.COM goto out; 424711422SMark.Musante@Sun.COM } 424811422SMark.Musante@Sun.COM 424911422SMark.Musante@Sun.COM /* flush everything */ 425011422SMark.Musante@Sun.COM txg = spa_vdev_config_enter(newspa); 425111422SMark.Musante@Sun.COM vdev_config_dirty(newspa->spa_root_vdev); 425211422SMark.Musante@Sun.COM (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 425311422SMark.Musante@Sun.COM 425411422SMark.Musante@Sun.COM if (zio_injection_enabled) 425511422SMark.Musante@Sun.COM zio_handle_panic_injection(spa, FTAG, 2); 425611422SMark.Musante@Sun.COM 425711422SMark.Musante@Sun.COM spa_async_resume(newspa); 425811422SMark.Musante@Sun.COM 425911422SMark.Musante@Sun.COM /* finally, update the original pool's config */ 426011422SMark.Musante@Sun.COM txg = spa_vdev_config_enter(spa); 426111422SMark.Musante@Sun.COM tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 426211422SMark.Musante@Sun.COM error = dmu_tx_assign(tx, TXG_WAIT); 426311422SMark.Musante@Sun.COM if (error != 0) 426411422SMark.Musante@Sun.COM dmu_tx_abort(tx); 426511422SMark.Musante@Sun.COM for (c = 0; c < children; c++) { 426611422SMark.Musante@Sun.COM if (vml[c] != NULL) { 
426711422SMark.Musante@Sun.COM vdev_split(vml[c]); 426811422SMark.Musante@Sun.COM if (error == 0) 426911422SMark.Musante@Sun.COM spa_history_internal_log(LOG_POOL_VDEV_DETACH, 427011422SMark.Musante@Sun.COM spa, tx, CRED(), "vdev=%s", 427111422SMark.Musante@Sun.COM vml[c]->vdev_path); 427211422SMark.Musante@Sun.COM vdev_free(vml[c]); 427311422SMark.Musante@Sun.COM } 427411422SMark.Musante@Sun.COM } 427511422SMark.Musante@Sun.COM vdev_config_dirty(spa->spa_root_vdev); 427611422SMark.Musante@Sun.COM spa->spa_config_splitting = NULL; 427711422SMark.Musante@Sun.COM nvlist_free(nvl); 427811422SMark.Musante@Sun.COM if (error == 0) 427911422SMark.Musante@Sun.COM dmu_tx_commit(tx); 428011422SMark.Musante@Sun.COM (void) spa_vdev_exit(spa, NULL, txg, 0); 428111422SMark.Musante@Sun.COM 428211422SMark.Musante@Sun.COM if (zio_injection_enabled) 428311422SMark.Musante@Sun.COM zio_handle_panic_injection(spa, FTAG, 3); 428411422SMark.Musante@Sun.COM 428511422SMark.Musante@Sun.COM /* split is complete; log a history record */ 428611422SMark.Musante@Sun.COM spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), 428711422SMark.Musante@Sun.COM "split new pool %s from pool %s", newname, spa_name(spa)); 428811422SMark.Musante@Sun.COM 428911422SMark.Musante@Sun.COM kmem_free(vml, children * sizeof (vdev_t *)); 429011422SMark.Musante@Sun.COM 429111422SMark.Musante@Sun.COM /* if we're not going to mount the filesystems in userland, export */ 429211422SMark.Musante@Sun.COM if (exp) 429311422SMark.Musante@Sun.COM error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 429411422SMark.Musante@Sun.COM B_FALSE, B_FALSE); 429511422SMark.Musante@Sun.COM 429611422SMark.Musante@Sun.COM return (error); 429711422SMark.Musante@Sun.COM 429811422SMark.Musante@Sun.COM out: 429911422SMark.Musante@Sun.COM spa_unload(newspa); 430011422SMark.Musante@Sun.COM spa_deactivate(newspa); 430111422SMark.Musante@Sun.COM spa_remove(newspa); 430211422SMark.Musante@Sun.COM 430311422SMark.Musante@Sun.COM txg 
= spa_vdev_config_enter(spa); 430411864SMark.Musante@Sun.COM 430511864SMark.Musante@Sun.COM /* re-online all offlined disks */ 430611864SMark.Musante@Sun.COM for (c = 0; c < children; c++) { 430711864SMark.Musante@Sun.COM if (vml[c] != NULL) 430811864SMark.Musante@Sun.COM vml[c]->vdev_offline = B_FALSE; 430911864SMark.Musante@Sun.COM } 431011864SMark.Musante@Sun.COM vdev_reopen(spa->spa_root_vdev); 431111864SMark.Musante@Sun.COM 431211422SMark.Musante@Sun.COM nvlist_free(spa->spa_config_splitting); 431311422SMark.Musante@Sun.COM spa->spa_config_splitting = NULL; 431411497SMark.Musante@Sun.COM (void) spa_vdev_exit(spa, NULL, txg, error); 431511422SMark.Musante@Sun.COM 431611422SMark.Musante@Sun.COM kmem_free(vml, children * sizeof (vdev_t *)); 431711422SMark.Musante@Sun.COM return (error); 431811422SMark.Musante@Sun.COM } 431911422SMark.Musante@Sun.COM 43207754SJeff.Bonwick@Sun.COM static nvlist_t * 43217754SJeff.Bonwick@Sun.COM spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 43222082Seschrock { 43237754SJeff.Bonwick@Sun.COM for (int i = 0; i < count; i++) { 43247754SJeff.Bonwick@Sun.COM uint64_t guid; 43257754SJeff.Bonwick@Sun.COM 43267754SJeff.Bonwick@Sun.COM VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 43277754SJeff.Bonwick@Sun.COM &guid) == 0); 43287754SJeff.Bonwick@Sun.COM 43297754SJeff.Bonwick@Sun.COM if (guid == target_guid) 43307754SJeff.Bonwick@Sun.COM return (nvpp[i]); 43312082Seschrock } 43322082Seschrock 43337754SJeff.Bonwick@Sun.COM return (NULL); 43345450Sbrendan } 43355450Sbrendan 43367754SJeff.Bonwick@Sun.COM static void 43377754SJeff.Bonwick@Sun.COM spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 43387754SJeff.Bonwick@Sun.COM nvlist_t *dev_to_remove) 43395450Sbrendan { 43407754SJeff.Bonwick@Sun.COM nvlist_t **newdev = NULL; 43417754SJeff.Bonwick@Sun.COM 43427754SJeff.Bonwick@Sun.COM if (count > 1) 43437754SJeff.Bonwick@Sun.COM newdev = kmem_alloc((count - 1) * sizeof (void *), 
KM_SLEEP); 43447754SJeff.Bonwick@Sun.COM 43457754SJeff.Bonwick@Sun.COM for (int i = 0, j = 0; i < count; i++) { 43467754SJeff.Bonwick@Sun.COM if (dev[i] == dev_to_remove) 43477754SJeff.Bonwick@Sun.COM continue; 43487754SJeff.Bonwick@Sun.COM VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 43495450Sbrendan } 43505450Sbrendan 43517754SJeff.Bonwick@Sun.COM VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 43527754SJeff.Bonwick@Sun.COM VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 43537754SJeff.Bonwick@Sun.COM 43547754SJeff.Bonwick@Sun.COM for (int i = 0; i < count - 1; i++) 43557754SJeff.Bonwick@Sun.COM nvlist_free(newdev[i]); 43567754SJeff.Bonwick@Sun.COM 43577754SJeff.Bonwick@Sun.COM if (count > 1) 43587754SJeff.Bonwick@Sun.COM kmem_free(newdev, (count - 1) * sizeof (void *)); 43595450Sbrendan } 43605450Sbrendan 43615450Sbrendan /* 436210594SGeorge.Wilson@Sun.COM * Removing a device from the vdev namespace requires several steps 436310594SGeorge.Wilson@Sun.COM * and can take a significant amount of time. As a result we use 436410594SGeorge.Wilson@Sun.COM * the spa_vdev_config_[enter/exit] functions which allow us to 436510594SGeorge.Wilson@Sun.COM * grab and release the spa_config_lock while still holding the namespace 436610594SGeorge.Wilson@Sun.COM * lock. During each step the configuration is synced out. 436710594SGeorge.Wilson@Sun.COM */ 436810594SGeorge.Wilson@Sun.COM 436910594SGeorge.Wilson@Sun.COM /* 437010594SGeorge.Wilson@Sun.COM * Evacuate the device. 
437110594SGeorge.Wilson@Sun.COM */ 437210594SGeorge.Wilson@Sun.COM int 437310594SGeorge.Wilson@Sun.COM spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 437410594SGeorge.Wilson@Sun.COM { 437510974SJeff.Bonwick@Sun.COM int error = 0; 437610594SGeorge.Wilson@Sun.COM uint64_t txg; 437710594SGeorge.Wilson@Sun.COM 437810594SGeorge.Wilson@Sun.COM ASSERT(MUTEX_HELD(&spa_namespace_lock)); 437910594SGeorge.Wilson@Sun.COM ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 438010922SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 438110594SGeorge.Wilson@Sun.COM 438210594SGeorge.Wilson@Sun.COM /* 438310594SGeorge.Wilson@Sun.COM * Evacuate the device. We don't hold the config lock as writer 438410594SGeorge.Wilson@Sun.COM * since we need to do I/O but we do keep the 438510594SGeorge.Wilson@Sun.COM * spa_namespace_lock held. Once this completes the device 438610594SGeorge.Wilson@Sun.COM * should no longer have any blocks allocated on it. 438710594SGeorge.Wilson@Sun.COM */ 438810594SGeorge.Wilson@Sun.COM if (vd->vdev_islog) { 438910974SJeff.Bonwick@Sun.COM error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 439010974SJeff.Bonwick@Sun.COM NULL, DS_FIND_CHILDREN); 439110974SJeff.Bonwick@Sun.COM } else { 439210974SJeff.Bonwick@Sun.COM error = ENOTSUP; /* until we have bp rewrite */ 439310594SGeorge.Wilson@Sun.COM } 439410594SGeorge.Wilson@Sun.COM 439510974SJeff.Bonwick@Sun.COM txg_wait_synced(spa_get_dsl(spa), 0); 439610974SJeff.Bonwick@Sun.COM 439710974SJeff.Bonwick@Sun.COM if (error) 439810974SJeff.Bonwick@Sun.COM return (error); 439910974SJeff.Bonwick@Sun.COM 440010594SGeorge.Wilson@Sun.COM /* 440110974SJeff.Bonwick@Sun.COM * The evacuation succeeded. Remove any remaining MOS metadata 440210974SJeff.Bonwick@Sun.COM * associated with this vdev, and wait for these changes to sync. 
440310594SGeorge.Wilson@Sun.COM */ 440410594SGeorge.Wilson@Sun.COM txg = spa_vdev_config_enter(spa); 440510594SGeorge.Wilson@Sun.COM vd->vdev_removing = B_TRUE; 440610594SGeorge.Wilson@Sun.COM vdev_dirty(vd, 0, NULL, txg); 440710594SGeorge.Wilson@Sun.COM vdev_config_dirty(vd); 440810594SGeorge.Wilson@Sun.COM spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 440910594SGeorge.Wilson@Sun.COM 441010594SGeorge.Wilson@Sun.COM return (0); 441110594SGeorge.Wilson@Sun.COM } 441210594SGeorge.Wilson@Sun.COM 441310594SGeorge.Wilson@Sun.COM /* 441410594SGeorge.Wilson@Sun.COM * Complete the removal by cleaning up the namespace. 441510594SGeorge.Wilson@Sun.COM */ 441610594SGeorge.Wilson@Sun.COM void 441710974SJeff.Bonwick@Sun.COM spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 441810594SGeorge.Wilson@Sun.COM { 441910594SGeorge.Wilson@Sun.COM vdev_t *rvd = spa->spa_root_vdev; 442010594SGeorge.Wilson@Sun.COM uint64_t id = vd->vdev_id; 442110594SGeorge.Wilson@Sun.COM boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 442210594SGeorge.Wilson@Sun.COM 442310594SGeorge.Wilson@Sun.COM ASSERT(MUTEX_HELD(&spa_namespace_lock)); 442410594SGeorge.Wilson@Sun.COM ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 442510922SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 442610594SGeorge.Wilson@Sun.COM 442710594SGeorge.Wilson@Sun.COM (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 442810922SJeff.Bonwick@Sun.COM 442910922SJeff.Bonwick@Sun.COM if (list_link_active(&vd->vdev_state_dirty_node)) 443010922SJeff.Bonwick@Sun.COM vdev_state_clean(vd); 443110922SJeff.Bonwick@Sun.COM if (list_link_active(&vd->vdev_config_dirty_node)) 443210922SJeff.Bonwick@Sun.COM vdev_config_clean(vd); 443310922SJeff.Bonwick@Sun.COM 443410594SGeorge.Wilson@Sun.COM vdev_free(vd); 443510594SGeorge.Wilson@Sun.COM 443610594SGeorge.Wilson@Sun.COM if (last_vdev) { 443710594SGeorge.Wilson@Sun.COM vdev_compact_children(rvd); 443810594SGeorge.Wilson@Sun.COM } else { 443910594SGeorge.Wilson@Sun.COM vd = 
vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 444010594SGeorge.Wilson@Sun.COM vdev_add_child(rvd, vd); 444110594SGeorge.Wilson@Sun.COM } 444210594SGeorge.Wilson@Sun.COM vdev_config_dirty(rvd); 444310594SGeorge.Wilson@Sun.COM 444410594SGeorge.Wilson@Sun.COM /* 444510594SGeorge.Wilson@Sun.COM * Reassess the health of our root vdev. 444610594SGeorge.Wilson@Sun.COM */ 444710594SGeorge.Wilson@Sun.COM vdev_reopen(rvd); 444810594SGeorge.Wilson@Sun.COM } 444910594SGeorge.Wilson@Sun.COM 445010594SGeorge.Wilson@Sun.COM /* 44515450Sbrendan * Remove a device from the pool. Currently, this supports removing only hot 445210594SGeorge.Wilson@Sun.COM * spares, slogs, and level 2 ARC devices. 44535450Sbrendan */ 44545450Sbrendan int 44555450Sbrendan spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 44565450Sbrendan { 44575450Sbrendan vdev_t *vd; 445810974SJeff.Bonwick@Sun.COM metaslab_group_t *mg; 44597754SJeff.Bonwick@Sun.COM nvlist_t **spares, **l2cache, *nv; 446010594SGeorge.Wilson@Sun.COM uint64_t txg = 0; 44615450Sbrendan uint_t nspares, nl2cache; 44625450Sbrendan int error = 0; 44638241SJeff.Bonwick@Sun.COM boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 44648241SJeff.Bonwick@Sun.COM 44658241SJeff.Bonwick@Sun.COM if (!locked) 44668241SJeff.Bonwick@Sun.COM txg = spa_vdev_enter(spa); 44675450Sbrendan 44686643Seschrock vd = spa_lookup_by_guid(spa, guid, B_FALSE); 44695450Sbrendan 44705450Sbrendan if (spa->spa_spares.sav_vdevs != NULL && 44715450Sbrendan nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 44727754SJeff.Bonwick@Sun.COM ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 44737754SJeff.Bonwick@Sun.COM (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 44747754SJeff.Bonwick@Sun.COM /* 44757754SJeff.Bonwick@Sun.COM * Only remove the hot spare if it's not currently in use 44767754SJeff.Bonwick@Sun.COM * in this pool. 
44777754SJeff.Bonwick@Sun.COM */ 44787754SJeff.Bonwick@Sun.COM if (vd == NULL || unspare) { 44797754SJeff.Bonwick@Sun.COM spa_vdev_remove_aux(spa->spa_spares.sav_config, 44807754SJeff.Bonwick@Sun.COM ZPOOL_CONFIG_SPARES, spares, nspares, nv); 44817754SJeff.Bonwick@Sun.COM spa_load_spares(spa); 44827754SJeff.Bonwick@Sun.COM spa->spa_spares.sav_sync = B_TRUE; 44837754SJeff.Bonwick@Sun.COM } else { 44847754SJeff.Bonwick@Sun.COM error = EBUSY; 44857754SJeff.Bonwick@Sun.COM } 44867754SJeff.Bonwick@Sun.COM } else if (spa->spa_l2cache.sav_vdevs != NULL && 44875450Sbrendan nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 44887754SJeff.Bonwick@Sun.COM ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 44897754SJeff.Bonwick@Sun.COM (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 44907754SJeff.Bonwick@Sun.COM /* 44917754SJeff.Bonwick@Sun.COM * Cache devices can always be removed. 44927754SJeff.Bonwick@Sun.COM */ 44937754SJeff.Bonwick@Sun.COM spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 44947754SJeff.Bonwick@Sun.COM ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 44955450Sbrendan spa_load_l2cache(spa); 44965450Sbrendan spa->spa_l2cache.sav_sync = B_TRUE; 449710594SGeorge.Wilson@Sun.COM } else if (vd != NULL && vd->vdev_islog) { 449810594SGeorge.Wilson@Sun.COM ASSERT(!locked); 449910922SJeff.Bonwick@Sun.COM ASSERT(vd == vd->vdev_top); 450010594SGeorge.Wilson@Sun.COM 450110594SGeorge.Wilson@Sun.COM /* 450210594SGeorge.Wilson@Sun.COM * XXX - Once we have bp-rewrite this should 450310594SGeorge.Wilson@Sun.COM * become the common case. 450410594SGeorge.Wilson@Sun.COM */ 450510594SGeorge.Wilson@Sun.COM 450610974SJeff.Bonwick@Sun.COM mg = vd->vdev_mg; 450710974SJeff.Bonwick@Sun.COM 450810594SGeorge.Wilson@Sun.COM /* 450910974SJeff.Bonwick@Sun.COM * Stop allocating from this vdev. 
451010594SGeorge.Wilson@Sun.COM */ 451110974SJeff.Bonwick@Sun.COM metaslab_group_passivate(mg); 451210594SGeorge.Wilson@Sun.COM 451310922SJeff.Bonwick@Sun.COM /* 451410922SJeff.Bonwick@Sun.COM * Wait for the youngest allocations and frees to sync, 451510922SJeff.Bonwick@Sun.COM * and then wait for the deferral of those frees to finish. 451610922SJeff.Bonwick@Sun.COM */ 451710922SJeff.Bonwick@Sun.COM spa_vdev_config_exit(spa, NULL, 451810922SJeff.Bonwick@Sun.COM txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 451910922SJeff.Bonwick@Sun.COM 452010974SJeff.Bonwick@Sun.COM /* 452110974SJeff.Bonwick@Sun.COM * Attempt to evacuate the vdev. 452210974SJeff.Bonwick@Sun.COM */ 452310974SJeff.Bonwick@Sun.COM error = spa_vdev_remove_evacuate(spa, vd); 452410974SJeff.Bonwick@Sun.COM 452510594SGeorge.Wilson@Sun.COM txg = spa_vdev_config_enter(spa); 452610594SGeorge.Wilson@Sun.COM 452710974SJeff.Bonwick@Sun.COM /* 452810974SJeff.Bonwick@Sun.COM * If we couldn't evacuate the vdev, unwind. 452910974SJeff.Bonwick@Sun.COM */ 453010974SJeff.Bonwick@Sun.COM if (error) { 453110974SJeff.Bonwick@Sun.COM metaslab_group_activate(mg); 453210974SJeff.Bonwick@Sun.COM return (spa_vdev_exit(spa, NULL, txg, error)); 453310974SJeff.Bonwick@Sun.COM } 453410974SJeff.Bonwick@Sun.COM 453510974SJeff.Bonwick@Sun.COM /* 453610974SJeff.Bonwick@Sun.COM * Clean up the vdev namespace. 453710974SJeff.Bonwick@Sun.COM */ 453810974SJeff.Bonwick@Sun.COM spa_vdev_remove_from_namespace(spa, vd); 453910594SGeorge.Wilson@Sun.COM 45407754SJeff.Bonwick@Sun.COM } else if (vd != NULL) { 45417754SJeff.Bonwick@Sun.COM /* 45427754SJeff.Bonwick@Sun.COM * Normal vdevs cannot be removed (yet). 45437754SJeff.Bonwick@Sun.COM */ 45447754SJeff.Bonwick@Sun.COM error = ENOTSUP; 45457754SJeff.Bonwick@Sun.COM } else { 45467754SJeff.Bonwick@Sun.COM /* 45477754SJeff.Bonwick@Sun.COM * There is no vdev of any kind with the specified guid. 
45487754SJeff.Bonwick@Sun.COM */ 45497754SJeff.Bonwick@Sun.COM error = ENOENT; 45505450Sbrendan } 45512082Seschrock 45528241SJeff.Bonwick@Sun.COM if (!locked) 45538241SJeff.Bonwick@Sun.COM return (spa_vdev_exit(spa, NULL, txg, error)); 45548241SJeff.Bonwick@Sun.COM 45558241SJeff.Bonwick@Sun.COM return (error); 4556789Sahrens } 4557789Sahrens 4558789Sahrens /* 45594451Seschrock * Find any device that's done replacing, or a vdev marked 'unspare' that's 45604451Seschrock * current spared, so we can detach it. 4561789Sahrens */ 45621544Seschrock static vdev_t * 45634451Seschrock spa_vdev_resilver_done_hunt(vdev_t *vd) 4564789Sahrens { 45651544Seschrock vdev_t *newvd, *oldvd; 45669816SGeorge.Wilson@Sun.COM 45679816SGeorge.Wilson@Sun.COM for (int c = 0; c < vd->vdev_children; c++) { 45684451Seschrock oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 45691544Seschrock if (oldvd != NULL) 45701544Seschrock return (oldvd); 45711544Seschrock } 4572789Sahrens 45734451Seschrock /* 45744451Seschrock * Check for a completed replacement. 45754451Seschrock */ 4576789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 45771544Seschrock oldvd = vd->vdev_child[0]; 45781544Seschrock newvd = vd->vdev_child[1]; 4579789Sahrens 45808241SJeff.Bonwick@Sun.COM if (vdev_dtl_empty(newvd, DTL_MISSING) && 458111820SVictor.Latushkin@Sun.COM vdev_dtl_empty(newvd, DTL_OUTAGE) && 45828241SJeff.Bonwick@Sun.COM !vdev_dtl_required(oldvd)) 45831544Seschrock return (oldvd); 45841544Seschrock } 4585789Sahrens 45864451Seschrock /* 45874451Seschrock * Check for a completed resilver with the 'unspare' flag set. 
45884451Seschrock */ 45894451Seschrock if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { 45904451Seschrock newvd = vd->vdev_child[0]; 45914451Seschrock oldvd = vd->vdev_child[1]; 45924451Seschrock 45934451Seschrock if (newvd->vdev_unspare && 45948241SJeff.Bonwick@Sun.COM vdev_dtl_empty(newvd, DTL_MISSING) && 459511820SVictor.Latushkin@Sun.COM vdev_dtl_empty(newvd, DTL_OUTAGE) && 45968241SJeff.Bonwick@Sun.COM !vdev_dtl_required(oldvd)) { 45974451Seschrock newvd->vdev_unspare = 0; 45984451Seschrock return (oldvd); 45994451Seschrock } 46004451Seschrock } 46014451Seschrock 46021544Seschrock return (NULL); 4603789Sahrens } 4604789Sahrens 46051544Seschrock static void 46064451Seschrock spa_vdev_resilver_done(spa_t *spa) 4607789Sahrens { 46088241SJeff.Bonwick@Sun.COM vdev_t *vd, *pvd, *ppvd; 46098241SJeff.Bonwick@Sun.COM uint64_t guid, sguid, pguid, ppguid; 46108241SJeff.Bonwick@Sun.COM 46118241SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4612789Sahrens 46134451Seschrock while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 46148241SJeff.Bonwick@Sun.COM pvd = vd->vdev_parent; 46158241SJeff.Bonwick@Sun.COM ppvd = pvd->vdev_parent; 46161544Seschrock guid = vd->vdev_guid; 46178241SJeff.Bonwick@Sun.COM pguid = pvd->vdev_guid; 46188241SJeff.Bonwick@Sun.COM ppguid = ppvd->vdev_guid; 46198241SJeff.Bonwick@Sun.COM sguid = 0; 46202082Seschrock /* 46212082Seschrock * If we have just finished replacing a hot spared device, then 46222082Seschrock * we need to detach the parent's first child (the original hot 46232082Seschrock * spare) as well. 
46242082Seschrock */ 46258241SJeff.Bonwick@Sun.COM if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { 46262082Seschrock ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 46278241SJeff.Bonwick@Sun.COM ASSERT(ppvd->vdev_children == 2); 46288241SJeff.Bonwick@Sun.COM sguid = ppvd->vdev_child[1]->vdev_guid; 46292082Seschrock } 46308241SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 46318241SJeff.Bonwick@Sun.COM if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 46321544Seschrock return; 46338241SJeff.Bonwick@Sun.COM if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 46342082Seschrock return; 46358241SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4636789Sahrens } 4637789Sahrens 46388241SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_ALL, FTAG); 4639789Sahrens } 4640789Sahrens 4641789Sahrens /* 464211041SEric.Taylor@Sun.COM * Update the stored path or FRU for this vdev. 46431354Seschrock */ 46441354Seschrock int 46459425SEric.Schrock@Sun.COM spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 46469425SEric.Schrock@Sun.COM boolean_t ispath) 46471354Seschrock { 46486643Seschrock vdev_t *vd; 464911817SGeorge.Wilson@Sun.COM boolean_t sync = B_FALSE; 465011041SEric.Taylor@Sun.COM 465111041SEric.Taylor@Sun.COM spa_vdev_state_enter(spa, SCL_ALL); 46521354Seschrock 46539425SEric.Schrock@Sun.COM if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 465411041SEric.Taylor@Sun.COM return (spa_vdev_state_exit(spa, NULL, ENOENT)); 46551354Seschrock 46561585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 465711041SEric.Taylor@Sun.COM return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 46581585Sbonwick 46599425SEric.Schrock@Sun.COM if (ispath) { 466011817SGeorge.Wilson@Sun.COM if (strcmp(value, vd->vdev_path) != 0) { 466111817SGeorge.Wilson@Sun.COM spa_strfree(vd->vdev_path); 466211817SGeorge.Wilson@Sun.COM vd->vdev_path = spa_strdup(value); 466311817SGeorge.Wilson@Sun.COM sync = B_TRUE; 466411817SGeorge.Wilson@Sun.COM } 
46659425SEric.Schrock@Sun.COM } else { 466611817SGeorge.Wilson@Sun.COM if (vd->vdev_fru == NULL) { 466711817SGeorge.Wilson@Sun.COM vd->vdev_fru = spa_strdup(value); 466811817SGeorge.Wilson@Sun.COM sync = B_TRUE; 466911817SGeorge.Wilson@Sun.COM } else if (strcmp(value, vd->vdev_fru) != 0) { 46709425SEric.Schrock@Sun.COM spa_strfree(vd->vdev_fru); 467111817SGeorge.Wilson@Sun.COM vd->vdev_fru = spa_strdup(value); 467211817SGeorge.Wilson@Sun.COM sync = B_TRUE; 467311817SGeorge.Wilson@Sun.COM } 46749425SEric.Schrock@Sun.COM } 46751354Seschrock 467611817SGeorge.Wilson@Sun.COM return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 46771354Seschrock } 46781354Seschrock 46799425SEric.Schrock@Sun.COM int 46809425SEric.Schrock@Sun.COM spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 46819425SEric.Schrock@Sun.COM { 46829425SEric.Schrock@Sun.COM return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 46839425SEric.Schrock@Sun.COM } 46849425SEric.Schrock@Sun.COM 46859425SEric.Schrock@Sun.COM int 46869425SEric.Schrock@Sun.COM spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 46879425SEric.Schrock@Sun.COM { 46889425SEric.Schrock@Sun.COM return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 46899425SEric.Schrock@Sun.COM } 46909425SEric.Schrock@Sun.COM 46911354Seschrock /* 4692789Sahrens * ========================================================================== 4693789Sahrens * SPA Scrubbing 4694789Sahrens * ========================================================================== 4695789Sahrens */ 4696789Sahrens 46977046Sahrens int 46987046Sahrens spa_scrub(spa_t *spa, pool_scrub_type_t type) 4699789Sahrens { 47007754SJeff.Bonwick@Sun.COM ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 47014808Sek110237 4702789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 4703789Sahrens return (ENOTSUP); 4704789Sahrens 4705789Sahrens /* 47067046Sahrens * If a resilver was requested, but there is no DTL on a 47077046Sahrens * writeable leaf device, we 
have nothing to do. 4708789Sahrens */ 47097046Sahrens if (type == POOL_SCRUB_RESILVER && 47107046Sahrens !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 47117046Sahrens spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 47121544Seschrock return (0); 47131544Seschrock } 4714789Sahrens 47157046Sahrens if (type == POOL_SCRUB_EVERYTHING && 47167046Sahrens spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && 47177046Sahrens spa->spa_dsl_pool->dp_scrub_isresilver) 47187046Sahrens return (EBUSY); 47197046Sahrens 47207046Sahrens if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { 47217046Sahrens return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); 47227046Sahrens } else if (type == POOL_SCRUB_NONE) { 47237046Sahrens return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); 47241544Seschrock } else { 47257046Sahrens return (EINVAL); 47261544Seschrock } 4727789Sahrens } 4728789Sahrens 47291544Seschrock /* 47301544Seschrock * ========================================================================== 47311544Seschrock * SPA async task processing 47321544Seschrock * ========================================================================== 47331544Seschrock */ 47341544Seschrock 47351544Seschrock static void 47364451Seschrock spa_async_remove(spa_t *spa, vdev_t *vd) 4737789Sahrens { 47387361SBrendan.Gregg@Sun.COM if (vd->vdev_remove_wanted) { 4739*12247SGeorge.Wilson@Sun.COM vd->vdev_remove_wanted = B_FALSE; 4740*12247SGeorge.Wilson@Sun.COM vd->vdev_delayed_close = B_FALSE; 47417361SBrendan.Gregg@Sun.COM vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 474210575SEric.Schrock@Sun.COM 474310575SEric.Schrock@Sun.COM /* 474410575SEric.Schrock@Sun.COM * We want to clear the stats, but we don't want to do a full 474510575SEric.Schrock@Sun.COM * vdev_clear() as that will cause us to throw away 474610575SEric.Schrock@Sun.COM * degraded/faulted state as well as attempt to reopen the 474710575SEric.Schrock@Sun.COM * device, all of which is a waste. 
474810575SEric.Schrock@Sun.COM */ 474910575SEric.Schrock@Sun.COM vd->vdev_stat.vs_read_errors = 0; 475010575SEric.Schrock@Sun.COM vd->vdev_stat.vs_write_errors = 0; 475110575SEric.Schrock@Sun.COM vd->vdev_stat.vs_checksum_errors = 0; 475210575SEric.Schrock@Sun.COM 47537754SJeff.Bonwick@Sun.COM vdev_state_dirty(vd->vdev_top); 47541544Seschrock } 47557361SBrendan.Gregg@Sun.COM 47567754SJeff.Bonwick@Sun.COM for (int c = 0; c < vd->vdev_children; c++) 47577361SBrendan.Gregg@Sun.COM spa_async_remove(spa, vd->vdev_child[c]); 47581544Seschrock } 47591544Seschrock 47601544Seschrock static void 47617754SJeff.Bonwick@Sun.COM spa_async_probe(spa_t *spa, vdev_t *vd) 47627754SJeff.Bonwick@Sun.COM { 47637754SJeff.Bonwick@Sun.COM if (vd->vdev_probe_wanted) { 4764*12247SGeorge.Wilson@Sun.COM vd->vdev_probe_wanted = B_FALSE; 47657754SJeff.Bonwick@Sun.COM vdev_reopen(vd); /* vdev_open() does the actual probe */ 47667754SJeff.Bonwick@Sun.COM } 47677754SJeff.Bonwick@Sun.COM 47687754SJeff.Bonwick@Sun.COM for (int c = 0; c < vd->vdev_children; c++) 47697754SJeff.Bonwick@Sun.COM spa_async_probe(spa, vd->vdev_child[c]); 47707754SJeff.Bonwick@Sun.COM } 47717754SJeff.Bonwick@Sun.COM 47727754SJeff.Bonwick@Sun.COM static void 47739816SGeorge.Wilson@Sun.COM spa_async_autoexpand(spa_t *spa, vdev_t *vd) 47749816SGeorge.Wilson@Sun.COM { 47759816SGeorge.Wilson@Sun.COM sysevent_id_t eid; 47769816SGeorge.Wilson@Sun.COM nvlist_t *attr; 47779816SGeorge.Wilson@Sun.COM char *physpath; 47789816SGeorge.Wilson@Sun.COM 47799816SGeorge.Wilson@Sun.COM if (!spa->spa_autoexpand) 47809816SGeorge.Wilson@Sun.COM return; 47819816SGeorge.Wilson@Sun.COM 47829816SGeorge.Wilson@Sun.COM for (int c = 0; c < vd->vdev_children; c++) { 47839816SGeorge.Wilson@Sun.COM vdev_t *cvd = vd->vdev_child[c]; 47849816SGeorge.Wilson@Sun.COM spa_async_autoexpand(spa, cvd); 47859816SGeorge.Wilson@Sun.COM } 47869816SGeorge.Wilson@Sun.COM 47879816SGeorge.Wilson@Sun.COM if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 
47889816SGeorge.Wilson@Sun.COM return; 47899816SGeorge.Wilson@Sun.COM 47909816SGeorge.Wilson@Sun.COM physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 47919816SGeorge.Wilson@Sun.COM (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 47929816SGeorge.Wilson@Sun.COM 47939816SGeorge.Wilson@Sun.COM VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 47949816SGeorge.Wilson@Sun.COM VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 47959816SGeorge.Wilson@Sun.COM 47969816SGeorge.Wilson@Sun.COM (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 47979816SGeorge.Wilson@Sun.COM ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 47989816SGeorge.Wilson@Sun.COM 47999816SGeorge.Wilson@Sun.COM nvlist_free(attr); 48009816SGeorge.Wilson@Sun.COM kmem_free(physpath, MAXPATHLEN); 48019816SGeorge.Wilson@Sun.COM } 48029816SGeorge.Wilson@Sun.COM 48039816SGeorge.Wilson@Sun.COM static void 48041544Seschrock spa_async_thread(spa_t *spa) 48051544Seschrock { 48067754SJeff.Bonwick@Sun.COM int tasks; 48071544Seschrock 48081544Seschrock ASSERT(spa->spa_sync_on); 4809789Sahrens 48101544Seschrock mutex_enter(&spa->spa_async_lock); 48111544Seschrock tasks = spa->spa_async_tasks; 48121544Seschrock spa->spa_async_tasks = 0; 48131544Seschrock mutex_exit(&spa->spa_async_lock); 48141544Seschrock 48151544Seschrock /* 48161635Sbonwick * See if the config needs to be updated. 
48171635Sbonwick */ 48181635Sbonwick if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 481910922SJeff.Bonwick@Sun.COM uint64_t old_space, new_space; 48209816SGeorge.Wilson@Sun.COM 48211635Sbonwick mutex_enter(&spa_namespace_lock); 482210922SJeff.Bonwick@Sun.COM old_space = metaslab_class_get_space(spa_normal_class(spa)); 48231635Sbonwick spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 482410922SJeff.Bonwick@Sun.COM new_space = metaslab_class_get_space(spa_normal_class(spa)); 48251635Sbonwick mutex_exit(&spa_namespace_lock); 48269816SGeorge.Wilson@Sun.COM 48279816SGeorge.Wilson@Sun.COM /* 48289816SGeorge.Wilson@Sun.COM * If the pool grew as a result of the config update, 48299816SGeorge.Wilson@Sun.COM * then log an internal history event. 48309816SGeorge.Wilson@Sun.COM */ 483110922SJeff.Bonwick@Sun.COM if (new_space != old_space) { 48329946SMark.Musante@Sun.COM spa_history_internal_log(LOG_POOL_VDEV_ONLINE, 48339946SMark.Musante@Sun.COM spa, NULL, CRED(), 48349946SMark.Musante@Sun.COM "pool '%s' size: %llu(+%llu)", 483510922SJeff.Bonwick@Sun.COM spa_name(spa), new_space, new_space - old_space); 48369816SGeorge.Wilson@Sun.COM } 48371635Sbonwick } 48381635Sbonwick 48391635Sbonwick /* 48404451Seschrock * See if any devices need to be marked REMOVED. 
48411544Seschrock */ 48427754SJeff.Bonwick@Sun.COM if (tasks & SPA_ASYNC_REMOVE) { 484310685SGeorge.Wilson@Sun.COM spa_vdev_state_enter(spa, SCL_NONE); 48444451Seschrock spa_async_remove(spa, spa->spa_root_vdev); 48457754SJeff.Bonwick@Sun.COM for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 48467361SBrendan.Gregg@Sun.COM spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 48477754SJeff.Bonwick@Sun.COM for (int i = 0; i < spa->spa_spares.sav_count; i++) 48487361SBrendan.Gregg@Sun.COM spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 48497754SJeff.Bonwick@Sun.COM (void) spa_vdev_state_exit(spa, NULL, 0); 48507754SJeff.Bonwick@Sun.COM } 48517754SJeff.Bonwick@Sun.COM 48529816SGeorge.Wilson@Sun.COM if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 48539816SGeorge.Wilson@Sun.COM spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 48549816SGeorge.Wilson@Sun.COM spa_async_autoexpand(spa, spa->spa_root_vdev); 48559816SGeorge.Wilson@Sun.COM spa_config_exit(spa, SCL_CONFIG, FTAG); 48569816SGeorge.Wilson@Sun.COM } 48579816SGeorge.Wilson@Sun.COM 48587754SJeff.Bonwick@Sun.COM /* 48597754SJeff.Bonwick@Sun.COM * See if any devices need to be probed. 48607754SJeff.Bonwick@Sun.COM */ 48617754SJeff.Bonwick@Sun.COM if (tasks & SPA_ASYNC_PROBE) { 486210685SGeorge.Wilson@Sun.COM spa_vdev_state_enter(spa, SCL_NONE); 48637754SJeff.Bonwick@Sun.COM spa_async_probe(spa, spa->spa_root_vdev); 48647754SJeff.Bonwick@Sun.COM (void) spa_vdev_state_exit(spa, NULL, 0); 48654451Seschrock } 48661544Seschrock 48671544Seschrock /* 48681544Seschrock * If any devices are done replacing, detach them. 48691544Seschrock */ 48704451Seschrock if (tasks & SPA_ASYNC_RESILVER_DONE) 48714451Seschrock spa_vdev_resilver_done(spa); 4872789Sahrens 48731544Seschrock /* 48741544Seschrock * Kick off a resilver. 
48751544Seschrock */ 48767046Sahrens if (tasks & SPA_ASYNC_RESILVER) 48777046Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); 48781544Seschrock 48791544Seschrock /* 48801544Seschrock * Let the world know that we're done. 48811544Seschrock */ 48821544Seschrock mutex_enter(&spa->spa_async_lock); 48831544Seschrock spa->spa_async_thread = NULL; 48841544Seschrock cv_broadcast(&spa->spa_async_cv); 48851544Seschrock mutex_exit(&spa->spa_async_lock); 48861544Seschrock thread_exit(); 48871544Seschrock } 48881544Seschrock 48891544Seschrock void 48901544Seschrock spa_async_suspend(spa_t *spa) 48911544Seschrock { 48921544Seschrock mutex_enter(&spa->spa_async_lock); 48931544Seschrock spa->spa_async_suspended++; 48941544Seschrock while (spa->spa_async_thread != NULL) 48951544Seschrock cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 48961544Seschrock mutex_exit(&spa->spa_async_lock); 48971544Seschrock } 48981544Seschrock 48991544Seschrock void 49001544Seschrock spa_async_resume(spa_t *spa) 49011544Seschrock { 49021544Seschrock mutex_enter(&spa->spa_async_lock); 49031544Seschrock ASSERT(spa->spa_async_suspended != 0); 49041544Seschrock spa->spa_async_suspended--; 49051544Seschrock mutex_exit(&spa->spa_async_lock); 49061544Seschrock } 49071544Seschrock 49081544Seschrock static void 49091544Seschrock spa_async_dispatch(spa_t *spa) 49101544Seschrock { 49111544Seschrock mutex_enter(&spa->spa_async_lock); 49121544Seschrock if (spa->spa_async_tasks && !spa->spa_async_suspended && 49131635Sbonwick spa->spa_async_thread == NULL && 49141635Sbonwick rootdir != NULL && !vn_is_readonly(rootdir)) 49151544Seschrock spa->spa_async_thread = thread_create(NULL, 0, 49161544Seschrock spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 49171544Seschrock mutex_exit(&spa->spa_async_lock); 49181544Seschrock } 49191544Seschrock 49201544Seschrock void 49211544Seschrock spa_async_request(spa_t *spa, int task) 49221544Seschrock { 49231544Seschrock mutex_enter(&spa->spa_async_lock); 
49241544Seschrock spa->spa_async_tasks |= task; 49251544Seschrock mutex_exit(&spa->spa_async_lock); 4926789Sahrens } 4927789Sahrens 4928789Sahrens /* 4929789Sahrens * ========================================================================== 4930789Sahrens * SPA syncing routines 4931789Sahrens * ========================================================================== 4932789Sahrens */ 4933789Sahrens static void 493410922SJeff.Bonwick@Sun.COM spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg) 4935789Sahrens { 4936789Sahrens blkptr_t blk; 4937789Sahrens uint64_t itor = 0; 4938789Sahrens uint8_t c = 1; 4939789Sahrens 49407754SJeff.Bonwick@Sun.COM while (bplist_iterate(bpl, &itor, &blk) == 0) { 49417754SJeff.Bonwick@Sun.COM ASSERT(blk.blk_birth < txg); 494210922SJeff.Bonwick@Sun.COM zio_free(spa, txg, &blk); 49437754SJeff.Bonwick@Sun.COM } 4944789Sahrens 4945789Sahrens bplist_vacate(bpl, tx); 4946789Sahrens 4947789Sahrens /* 4948789Sahrens * Pre-dirty the first block so we sync to convergence faster. 4949789Sahrens * (Usually only the first block is needed.) 
4950789Sahrens */ 495110922SJeff.Bonwick@Sun.COM dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx); 495210922SJeff.Bonwick@Sun.COM } 495310922SJeff.Bonwick@Sun.COM 495410922SJeff.Bonwick@Sun.COM static void 495510922SJeff.Bonwick@Sun.COM spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 495610922SJeff.Bonwick@Sun.COM { 495710922SJeff.Bonwick@Sun.COM zio_t *zio = arg; 495810922SJeff.Bonwick@Sun.COM 495910922SJeff.Bonwick@Sun.COM zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 496010922SJeff.Bonwick@Sun.COM zio->io_flags)); 4961789Sahrens } 4962789Sahrens 4963789Sahrens static void 49642082Seschrock spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 49652082Seschrock { 49662082Seschrock char *packed = NULL; 49677497STim.Haley@Sun.COM size_t bufsize; 49682082Seschrock size_t nvsize = 0; 49692082Seschrock dmu_buf_t *db; 49702082Seschrock 49712082Seschrock VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 49722082Seschrock 49737497STim.Haley@Sun.COM /* 49747497STim.Haley@Sun.COM * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 49757497STim.Haley@Sun.COM * information. This avoids the dbuf_will_dirty() path and 49767497STim.Haley@Sun.COM * saves us a pre-read to get data we don't actually care about. 
49777497STim.Haley@Sun.COM */ 49787497STim.Haley@Sun.COM bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 49797497STim.Haley@Sun.COM packed = kmem_alloc(bufsize, KM_SLEEP); 49802082Seschrock 49812082Seschrock VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 49822082Seschrock KM_SLEEP) == 0); 49837497STim.Haley@Sun.COM bzero(packed + nvsize, bufsize - nvsize); 49847497STim.Haley@Sun.COM 49857497STim.Haley@Sun.COM dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 49867497STim.Haley@Sun.COM 49877497STim.Haley@Sun.COM kmem_free(packed, bufsize); 49882082Seschrock 49892082Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 49902082Seschrock dmu_buf_will_dirty(db, tx); 49912082Seschrock *(uint64_t *)db->db_data = nvsize; 49922082Seschrock dmu_buf_rele(db, FTAG); 49932082Seschrock } 49942082Seschrock 49952082Seschrock static void 49965450Sbrendan spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 49975450Sbrendan const char *config, const char *entry) 49982082Seschrock { 49992082Seschrock nvlist_t *nvroot; 50005450Sbrendan nvlist_t **list; 50012082Seschrock int i; 50022082Seschrock 50035450Sbrendan if (!sav->sav_sync) 50042082Seschrock return; 50052082Seschrock 50062082Seschrock /* 50075450Sbrendan * Update the MOS nvlist describing the list of available devices. 50085450Sbrendan * spa_validate_aux() will have already made sure this nvlist is 50094451Seschrock * valid and the vdevs are labeled appropriately. 
50102082Seschrock */ 50115450Sbrendan if (sav->sav_object == 0) { 50125450Sbrendan sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 50135450Sbrendan DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 50145450Sbrendan sizeof (uint64_t), tx); 50152082Seschrock VERIFY(zap_update(spa->spa_meta_objset, 50165450Sbrendan DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 50175450Sbrendan &sav->sav_object, tx) == 0); 50182082Seschrock } 50192082Seschrock 50202082Seschrock VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 50215450Sbrendan if (sav->sav_count == 0) { 50225450Sbrendan VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 50232082Seschrock } else { 50245450Sbrendan list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 50255450Sbrendan for (i = 0; i < sav->sav_count; i++) 50265450Sbrendan list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 50275450Sbrendan B_FALSE, B_FALSE, B_TRUE); 50285450Sbrendan VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 50295450Sbrendan sav->sav_count) == 0); 50305450Sbrendan for (i = 0; i < sav->sav_count; i++) 50315450Sbrendan nvlist_free(list[i]); 50325450Sbrendan kmem_free(list, sav->sav_count * sizeof (void *)); 50332082Seschrock } 50342082Seschrock 50355450Sbrendan spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 50362926Sek110237 nvlist_free(nvroot); 50372082Seschrock 50385450Sbrendan sav->sav_sync = B_FALSE; 50392082Seschrock } 50402082Seschrock 50412082Seschrock static void 5042789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5043789Sahrens { 5044789Sahrens nvlist_t *config; 5045789Sahrens 50467754SJeff.Bonwick@Sun.COM if (list_is_empty(&spa->spa_config_dirty_list)) 5047789Sahrens return; 5048789Sahrens 50497754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 50507754SJeff.Bonwick@Sun.COM 50517754SJeff.Bonwick@Sun.COM config = spa_config_generate(spa, spa->spa_root_vdev, 50527754SJeff.Bonwick@Sun.COM dmu_tx_get_txg(tx), B_FALSE); 
50537754SJeff.Bonwick@Sun.COM 50547754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_STATE, FTAG); 5055789Sahrens 50561635Sbonwick if (spa->spa_config_syncing) 50571635Sbonwick nvlist_free(spa->spa_config_syncing); 50581635Sbonwick spa->spa_config_syncing = config; 5059789Sahrens 50602082Seschrock spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5061789Sahrens } 5062789Sahrens 50635094Slling /* 50645094Slling * Set zpool properties. 50655094Slling */ 50663912Slling static void 50674543Smarks spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) 50683912Slling { 50693912Slling spa_t *spa = arg1; 50705094Slling objset_t *mos = spa->spa_meta_objset; 50713912Slling nvlist_t *nvp = arg2; 50725094Slling nvpair_t *elem; 50734451Seschrock uint64_t intval; 50746643Seschrock char *strval; 50755094Slling zpool_prop_t prop; 50765094Slling const char *propname; 50775094Slling zprop_type_t proptype; 50785094Slling 50797754SJeff.Bonwick@Sun.COM mutex_enter(&spa->spa_props_lock); 50807754SJeff.Bonwick@Sun.COM 50815094Slling elem = NULL; 50825094Slling while ((elem = nvlist_next_nvpair(nvp, elem))) { 50835094Slling switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 50845094Slling case ZPOOL_PROP_VERSION: 50855094Slling /* 50865094Slling * Only set version for non-zpool-creation cases 50875094Slling * (set/import). spa_create() needs special care 50885094Slling * for version setting. 50895094Slling */ 50905094Slling if (tx->tx_txg != TXG_INITIAL) { 50915094Slling VERIFY(nvpair_value_uint64(elem, 50925094Slling &intval) == 0); 50935094Slling ASSERT(intval <= SPA_VERSION); 50945094Slling ASSERT(intval >= spa_version(spa)); 50955094Slling spa->spa_uberblock.ub_version = intval; 50965094Slling vdev_config_dirty(spa->spa_root_vdev); 50975094Slling } 50985094Slling break; 50995094Slling 51005094Slling case ZPOOL_PROP_ALTROOT: 51015094Slling /* 51025094Slling * 'altroot' is a non-persistent property. 
It should 51035094Slling * have been set temporarily at creation or import time. 51045094Slling */ 51055094Slling ASSERT(spa->spa_root != NULL); 51065094Slling break; 51075094Slling 51085363Seschrock case ZPOOL_PROP_CACHEFILE: 51095094Slling /* 51108525SEric.Schrock@Sun.COM * 'cachefile' is also a non-persisitent property. 51115094Slling */ 51124543Smarks break; 51135094Slling default: 51145094Slling /* 51155094Slling * Set pool property values in the poolprops mos object. 51165094Slling */ 51175094Slling if (spa->spa_pool_props_object == 0) { 51185094Slling VERIFY((spa->spa_pool_props_object = 51195094Slling zap_create(mos, DMU_OT_POOL_PROPS, 51205094Slling DMU_OT_NONE, 0, tx)) > 0); 51215094Slling 51225094Slling VERIFY(zap_update(mos, 51235094Slling DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 51245094Slling 8, 1, &spa->spa_pool_props_object, tx) 51255094Slling == 0); 51265094Slling } 51275094Slling 51285094Slling /* normalize the property name */ 51295094Slling propname = zpool_prop_to_name(prop); 51305094Slling proptype = zpool_prop_get_type(prop); 51315094Slling 51325094Slling if (nvpair_type(elem) == DATA_TYPE_STRING) { 51335094Slling ASSERT(proptype == PROP_TYPE_STRING); 51345094Slling VERIFY(nvpair_value_string(elem, &strval) == 0); 51355094Slling VERIFY(zap_update(mos, 51365094Slling spa->spa_pool_props_object, propname, 51375094Slling 1, strlen(strval) + 1, strval, tx) == 0); 51385094Slling 51395094Slling } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 51405094Slling VERIFY(nvpair_value_uint64(elem, &intval) == 0); 51415094Slling 51425094Slling if (proptype == PROP_TYPE_INDEX) { 51435094Slling const char *unused; 51445094Slling VERIFY(zpool_prop_index_to_string( 51455094Slling prop, intval, &unused) == 0); 51465094Slling } 51475094Slling VERIFY(zap_update(mos, 51485094Slling spa->spa_pool_props_object, propname, 51495094Slling 8, 1, &intval, tx) == 0); 51505094Slling } else { 51515094Slling ASSERT(0); /* not allowed */ 51525094Slling } 51535094Slling 
51545329Sgw25295 switch (prop) { 51555329Sgw25295 case ZPOOL_PROP_DELEGATION: 51565094Slling spa->spa_delegation = intval; 51575329Sgw25295 break; 51585329Sgw25295 case ZPOOL_PROP_BOOTFS: 51595094Slling spa->spa_bootfs = intval; 51605329Sgw25295 break; 51615329Sgw25295 case ZPOOL_PROP_FAILUREMODE: 51625329Sgw25295 spa->spa_failmode = intval; 51635329Sgw25295 break; 51649816SGeorge.Wilson@Sun.COM case ZPOOL_PROP_AUTOEXPAND: 51659816SGeorge.Wilson@Sun.COM spa->spa_autoexpand = intval; 51669816SGeorge.Wilson@Sun.COM spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 51679816SGeorge.Wilson@Sun.COM break; 516810922SJeff.Bonwick@Sun.COM case ZPOOL_PROP_DEDUPDITTO: 516910922SJeff.Bonwick@Sun.COM spa->spa_dedup_ditto = intval; 517010922SJeff.Bonwick@Sun.COM break; 51715329Sgw25295 default: 51725329Sgw25295 break; 51735329Sgw25295 } 51743912Slling } 51755094Slling 51765094Slling /* log internal history if this is not a zpool create */ 51775094Slling if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 51785094Slling tx->tx_txg != TXG_INITIAL) { 51795094Slling spa_history_internal_log(LOG_POOL_PROPSET, 51805094Slling spa, tx, cr, "%s %lld %s", 51817754SJeff.Bonwick@Sun.COM nvpair_name(elem), intval, spa_name(spa)); 51825094Slling } 51833912Slling } 51847754SJeff.Bonwick@Sun.COM 51857754SJeff.Bonwick@Sun.COM mutex_exit(&spa->spa_props_lock); 51863912Slling } 51873912Slling 5188789Sahrens /* 5189789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 5190789Sahrens * part of the process, so we iterate until it converges. 
5191789Sahrens */ 5192789Sahrens void 5193789Sahrens spa_sync(spa_t *spa, uint64_t txg) 5194789Sahrens { 5195789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 5196789Sahrens objset_t *mos = spa->spa_meta_objset; 519710922SJeff.Bonwick@Sun.COM bplist_t *defer_bpl = &spa->spa_deferred_bplist; 519810922SJeff.Bonwick@Sun.COM bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 51991635Sbonwick vdev_t *rvd = spa->spa_root_vdev; 5200789Sahrens vdev_t *vd; 5201789Sahrens dmu_tx_t *tx; 52027754SJeff.Bonwick@Sun.COM int error; 5203789Sahrens 5204789Sahrens /* 5205789Sahrens * Lock out configuration changes. 5206789Sahrens */ 52077754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5208789Sahrens 5209789Sahrens spa->spa_syncing_txg = txg; 5210789Sahrens spa->spa_sync_pass = 0; 5211789Sahrens 52127754SJeff.Bonwick@Sun.COM /* 52137754SJeff.Bonwick@Sun.COM * If there are any pending vdev state changes, convert them 52147754SJeff.Bonwick@Sun.COM * into config changes that go out with this transaction group. 52157754SJeff.Bonwick@Sun.COM */ 52167754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 52178241SJeff.Bonwick@Sun.COM while (list_head(&spa->spa_state_dirty_list) != NULL) { 52188241SJeff.Bonwick@Sun.COM /* 52198241SJeff.Bonwick@Sun.COM * We need the write lock here because, for aux vdevs, 52208241SJeff.Bonwick@Sun.COM * calling vdev_config_dirty() modifies sav_config. 52218241SJeff.Bonwick@Sun.COM * This is ugly and will become unnecessary when we 52228241SJeff.Bonwick@Sun.COM * eliminate the aux vdev wart by integrating all vdevs 52238241SJeff.Bonwick@Sun.COM * into the root vdev tree. 
52248241SJeff.Bonwick@Sun.COM */ 52258241SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 52268241SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 52278241SJeff.Bonwick@Sun.COM while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 52288241SJeff.Bonwick@Sun.COM vdev_state_clean(vd); 52298241SJeff.Bonwick@Sun.COM vdev_config_dirty(vd); 52308241SJeff.Bonwick@Sun.COM } 52318241SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 52328241SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 52337754SJeff.Bonwick@Sun.COM } 52347754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_STATE, FTAG); 52357754SJeff.Bonwick@Sun.COM 523610922SJeff.Bonwick@Sun.COM VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); 5237789Sahrens 52382082Seschrock tx = dmu_tx_create_assigned(dp, txg); 52392082Seschrock 52402082Seschrock /* 52414577Sahrens * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 52422082Seschrock * set spa_deflate if we have no raid-z vdevs. 
52432082Seschrock */ 52444577Sahrens if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 52454577Sahrens spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 52462082Seschrock int i; 52472082Seschrock 52482082Seschrock for (i = 0; i < rvd->vdev_children; i++) { 52492082Seschrock vd = rvd->vdev_child[i]; 52502082Seschrock if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 52512082Seschrock break; 52522082Seschrock } 52532082Seschrock if (i == rvd->vdev_children) { 52542082Seschrock spa->spa_deflate = TRUE; 52552082Seschrock VERIFY(0 == zap_add(spa->spa_meta_objset, 52562082Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 52572082Seschrock sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 52582082Seschrock } 52592082Seschrock } 52602082Seschrock 52617046Sahrens if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 52627046Sahrens spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 52637046Sahrens dsl_pool_create_origin(dp, tx); 52647046Sahrens 52657046Sahrens /* Keeping the origin open increases spa_minref */ 52667046Sahrens spa->spa_minref += 3; 52677046Sahrens } 52687046Sahrens 52697046Sahrens if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 52707046Sahrens spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 52717046Sahrens dsl_pool_upgrade_clones(dp, tx); 52727046Sahrens } 52737046Sahrens 5274789Sahrens /* 5275789Sahrens * If anything has changed in this txg, push the deferred frees 5276789Sahrens * from the previous txg. If not, leave them alone so that we 5277789Sahrens * don't generate work on an otherwise idle system. 5278789Sahrens */ 5279789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 52802329Sek110237 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 52812329Sek110237 !txg_list_empty(&dp->dp_sync_tasks, txg)) 528210922SJeff.Bonwick@Sun.COM spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); 5283789Sahrens 5284789Sahrens /* 5285789Sahrens * Iterate to convergence. 
5286789Sahrens */ 5287789Sahrens do { 528810922SJeff.Bonwick@Sun.COM int pass = ++spa->spa_sync_pass; 5289789Sahrens 5290789Sahrens spa_sync_config_object(spa, tx); 52915450Sbrendan spa_sync_aux_dev(spa, &spa->spa_spares, tx, 52925450Sbrendan ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 52935450Sbrendan spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 52945450Sbrendan ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 52951544Seschrock spa_errlog_sync(spa, txg); 5296789Sahrens dsl_pool_sync(dp, txg); 5297789Sahrens 529810922SJeff.Bonwick@Sun.COM if (pass <= SYNC_PASS_DEFERRED_FREE) { 529910922SJeff.Bonwick@Sun.COM zio_t *zio = zio_root(spa, NULL, NULL, 0); 530010922SJeff.Bonwick@Sun.COM bplist_sync(free_bpl, spa_sync_free, zio, tx); 530110922SJeff.Bonwick@Sun.COM VERIFY(zio_wait(zio) == 0); 530210922SJeff.Bonwick@Sun.COM } else { 530310922SJeff.Bonwick@Sun.COM bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); 5304789Sahrens } 5305789Sahrens 530610922SJeff.Bonwick@Sun.COM ddt_sync(spa, txg); 530710922SJeff.Bonwick@Sun.COM 530811619SGeorge.Wilson@Sun.COM mutex_enter(&spa->spa_scrub_lock); 530911619SGeorge.Wilson@Sun.COM while (spa->spa_scrub_inflight > 0) 531011619SGeorge.Wilson@Sun.COM cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 531111619SGeorge.Wilson@Sun.COM mutex_exit(&spa->spa_scrub_lock); 531211619SGeorge.Wilson@Sun.COM 531310922SJeff.Bonwick@Sun.COM while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 531410922SJeff.Bonwick@Sun.COM vdev_sync(vd, txg); 531510922SJeff.Bonwick@Sun.COM 531610922SJeff.Bonwick@Sun.COM } while (dmu_objset_is_dirty(mos, txg)); 531710922SJeff.Bonwick@Sun.COM 531811932SGeorge.Wilson@Sun.COM ASSERT(list_is_empty(&free_bpl->bpl_queue)); 531910922SJeff.Bonwick@Sun.COM 532010922SJeff.Bonwick@Sun.COM bplist_close(defer_bpl); 5321789Sahrens 5322789Sahrens /* 5323789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 5324789Sahrens * to commit the transaction group. 
53251635Sbonwick * 53265688Sbonwick * If there are no dirty vdevs, we sync the uberblock to a few 53275688Sbonwick * random top-level vdevs that are known to be visible in the 53287754SJeff.Bonwick@Sun.COM * config cache (see spa_vdev_add() for a complete description). 53297754SJeff.Bonwick@Sun.COM * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5330789Sahrens */ 53317754SJeff.Bonwick@Sun.COM for (;;) { 53327754SJeff.Bonwick@Sun.COM /* 53337754SJeff.Bonwick@Sun.COM * We hold SCL_STATE to prevent vdev open/close/etc. 53347754SJeff.Bonwick@Sun.COM * while we're attempting to write the vdev labels. 53357754SJeff.Bonwick@Sun.COM */ 53367754SJeff.Bonwick@Sun.COM spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 53377754SJeff.Bonwick@Sun.COM 53387754SJeff.Bonwick@Sun.COM if (list_is_empty(&spa->spa_config_dirty_list)) { 53397754SJeff.Bonwick@Sun.COM vdev_t *svd[SPA_DVAS_PER_BP]; 53407754SJeff.Bonwick@Sun.COM int svdcount = 0; 53417754SJeff.Bonwick@Sun.COM int children = rvd->vdev_children; 53427754SJeff.Bonwick@Sun.COM int c0 = spa_get_random(children); 53439816SGeorge.Wilson@Sun.COM 53449816SGeorge.Wilson@Sun.COM for (int c = 0; c < children; c++) { 53457754SJeff.Bonwick@Sun.COM vd = rvd->vdev_child[(c0 + c) % children]; 53467754SJeff.Bonwick@Sun.COM if (vd->vdev_ms_array == 0 || vd->vdev_islog) 53477754SJeff.Bonwick@Sun.COM continue; 53487754SJeff.Bonwick@Sun.COM svd[svdcount++] = vd; 53497754SJeff.Bonwick@Sun.COM if (svdcount == SPA_DVAS_PER_BP) 53507754SJeff.Bonwick@Sun.COM break; 53517754SJeff.Bonwick@Sun.COM } 53529725SEric.Schrock@Sun.COM error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 53539725SEric.Schrock@Sun.COM if (error != 0) 53549725SEric.Schrock@Sun.COM error = vdev_config_sync(svd, svdcount, txg, 53559725SEric.Schrock@Sun.COM B_TRUE); 53567754SJeff.Bonwick@Sun.COM } else { 53577754SJeff.Bonwick@Sun.COM error = vdev_config_sync(rvd->vdev_child, 53589725SEric.Schrock@Sun.COM rvd->vdev_children, txg, B_FALSE); 
53599725SEric.Schrock@Sun.COM if (error != 0) 53609725SEric.Schrock@Sun.COM error = vdev_config_sync(rvd->vdev_child, 53619725SEric.Schrock@Sun.COM rvd->vdev_children, txg, B_TRUE); 53621635Sbonwick } 53637754SJeff.Bonwick@Sun.COM 53647754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_STATE, FTAG); 53657754SJeff.Bonwick@Sun.COM 53667754SJeff.Bonwick@Sun.COM if (error == 0) 53677754SJeff.Bonwick@Sun.COM break; 53687754SJeff.Bonwick@Sun.COM zio_suspend(spa, NULL); 53697754SJeff.Bonwick@Sun.COM zio_resume_wait(spa); 53701635Sbonwick } 53712082Seschrock dmu_tx_commit(tx); 53722082Seschrock 53731635Sbonwick /* 53741635Sbonwick * Clear the dirty config list. 53751635Sbonwick */ 53767754SJeff.Bonwick@Sun.COM while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 53771635Sbonwick vdev_config_clean(vd); 53781635Sbonwick 53791635Sbonwick /* 53801635Sbonwick * Now that the new config has synced transactionally, 53811635Sbonwick * let it become visible to the config cache. 53821635Sbonwick */ 53831635Sbonwick if (spa->spa_config_syncing != NULL) { 53841635Sbonwick spa_config_set(spa, spa->spa_config_syncing); 53851635Sbonwick spa->spa_config_txg = txg; 53861635Sbonwick spa->spa_config_syncing = NULL; 53871635Sbonwick } 5388789Sahrens 5389789Sahrens spa->spa_ubsync = spa->spa_uberblock; 5390789Sahrens 539110922SJeff.Bonwick@Sun.COM dsl_pool_sync_done(dp, txg); 5392789Sahrens 5393789Sahrens /* 5394789Sahrens * Update usable space statistics. 5395789Sahrens */ 5396789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5397789Sahrens vdev_sync_done(vd, txg); 5398789Sahrens 539910956SGeorge.Wilson@Sun.COM spa_update_dspace(spa); 540010956SGeorge.Wilson@Sun.COM 5401789Sahrens /* 5402789Sahrens * It had better be the case that we didn't dirty anything 54032082Seschrock * since vdev_config_sync(). 
5404789Sahrens */ 5405789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5406789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5407789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 540811932SGeorge.Wilson@Sun.COM ASSERT(list_is_empty(&defer_bpl->bpl_queue)); 540911932SGeorge.Wilson@Sun.COM ASSERT(list_is_empty(&free_bpl->bpl_queue)); 541010922SJeff.Bonwick@Sun.COM 541110922SJeff.Bonwick@Sun.COM spa->spa_sync_pass = 0; 5412789Sahrens 54137754SJeff.Bonwick@Sun.COM spa_config_exit(spa, SCL_CONFIG, FTAG); 54141544Seschrock 541510921STim.Haley@Sun.COM spa_handle_ignored_writes(spa); 541610921STim.Haley@Sun.COM 54171544Seschrock /* 54181544Seschrock * If any async tasks have been requested, kick them off. 54191544Seschrock */ 54201544Seschrock spa_async_dispatch(spa); 5421789Sahrens } 5422789Sahrens 5423789Sahrens /* 5424789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 5425789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 5426789Sahrens * sync. 
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;

	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		/*
		 * Skip pools that aren't active or whose I/O is currently
		 * suspended; they have nothing we can sync right now.
		 */
		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
			continue;
		/*
		 * Hold a reference so the spa can't be removed, then drop
		 * spa_namespace_lock while we wait for the sync to complete.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		/* Tear down any remaining in-core state before removal. */
		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * Look up a vdev by guid.  The pool's primary vdev tree is searched first;
 * if 'aux' is set, the L2ARC cache and hot-spare auxiliary vdev lists are
 * searched as well.  Returns NULL if no matching vdev is found.
 */
vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

/*
 * Upgrade the pool to 'version': bump the in-core uberblock version, dirty
 * the root vdev's config, and wait for the change to be synced to disk.
 * The requested version must not be older than the pool's current version
 * (see the ASSERTs below).
 */
void
spa_upgrade(spa_t *spa, uint64_t version)
{
	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
	ASSERT(version >= spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	/* Don't return until the new version is safely on disk. */
	txg_wait_synced(spa_get_dsl(spa), 0);
}

/*
 * Return B_TRUE if 'guid' matches one of this pool's configured hot spares,
 * or one of the spares currently pending addition (sav_pending).
 */
boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Check if a pool has an active shared spare device.
 * Note: reference count of an active spare is 2, as a spare and as a replace
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
#ifdef _KERNEL
	sysevent_t *ev;
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;
	sysevent_id_t eid;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);

	/* Always attach the pool name and guid. */
	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	/* If a vdev was supplied, attach its guid and (if set) its path. */
	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	/*
	 * On a successful attach the attribute list is owned by the event,
	 * so clear 'attr' to keep the cleanup below from freeing it twice.
	 */
	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

	(void) log_sysevent(ev, SE_SLEEP, &eid);

done:
	/* Free the attribute list only if it was never attached. */
	if (attr)
		sysevent_free_attr(attr);
	sysevent_free(ev);
#endif
}