1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 51485Slling * Common Development and Distribution License (the "License"). 61485Slling * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 212082Seschrock 22789Sahrens /* 236523Sek110237 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens #include <sys/zfs_context.h> 301544Seschrock #include <sys/fm/fs/zfs.h> 31789Sahrens #include <sys/spa.h> 32789Sahrens #include <sys/spa_impl.h> 33789Sahrens #include <sys/dmu.h> 34789Sahrens #include <sys/dmu_tx.h> 35789Sahrens #include <sys/vdev_impl.h> 36789Sahrens #include <sys/uberblock_impl.h> 37789Sahrens #include <sys/metaslab.h> 38789Sahrens #include <sys/metaslab_impl.h> 39789Sahrens #include <sys/space_map.h> 40789Sahrens #include <sys/zio.h> 41789Sahrens #include <sys/zap.h> 42789Sahrens #include <sys/fs/zfs.h> 436643Seschrock #include <sys/arc.h> 44789Sahrens 45789Sahrens /* 46789Sahrens * Virtual device management. 47789Sahrens */ 48789Sahrens 49789Sahrens static vdev_ops_t *vdev_ops_table[] = { 50789Sahrens &vdev_root_ops, 51789Sahrens &vdev_raidz_ops, 52789Sahrens &vdev_mirror_ops, 53789Sahrens &vdev_replacing_ops, 542082Seschrock &vdev_spare_ops, 55789Sahrens &vdev_disk_ops, 56789Sahrens &vdev_file_ops, 57789Sahrens &vdev_missing_ops, 58789Sahrens NULL 59789Sahrens }; 60789Sahrens 61*7046Sahrens /* maximum scrub/resilver I/O queue per leaf vdev */ 62*7046Sahrens int zfs_scrub_limit = 10; 633697Smishra 64789Sahrens /* 65789Sahrens * Given a vdev type, return the appropriate ops vector. 66789Sahrens */ 67789Sahrens static vdev_ops_t * 68789Sahrens vdev_getops(const char *type) 69789Sahrens { 70789Sahrens vdev_ops_t *ops, **opspp; 71789Sahrens 72789Sahrens for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 73789Sahrens if (strcmp(ops->vdev_op_type, type) == 0) 74789Sahrens break; 75789Sahrens 76789Sahrens return (ops); 77789Sahrens } 78789Sahrens 79789Sahrens /* 80789Sahrens * Default asize function: return the MAX of psize with the asize of 81789Sahrens * all children. This is what's used by anything other than RAID-Z. 82789Sahrens */ 83789Sahrens uint64_t 84789Sahrens vdev_default_asize(vdev_t *vd, uint64_t psize) 85789Sahrens { 861732Sbonwick uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 87789Sahrens uint64_t csize; 88789Sahrens uint64_t c; 89789Sahrens 90789Sahrens for (c = 0; c < vd->vdev_children; c++) { 91789Sahrens csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 92789Sahrens asize = MAX(asize, csize); 93789Sahrens } 94789Sahrens 95789Sahrens return (asize); 96789Sahrens } 97789Sahrens 981175Slling /* 991175Slling * Get the replaceable or attachable device size. 1001175Slling * If the parent is a mirror or raidz, the replaceable size is the minimum 1011175Slling * psize of all its children. For the rest, just return our own psize. 1021175Slling * 1031175Slling * e.g. 1041175Slling * psize rsize 1051175Slling * root - - 1061175Slling * mirror/raidz - - 1071175Slling * disk1 20g 20g 1081175Slling * disk2 40g 20g 1091175Slling * disk3 80g 80g 1101175Slling */ 1111175Slling uint64_t 1121175Slling vdev_get_rsize(vdev_t *vd) 1131175Slling { 1141175Slling vdev_t *pvd, *cvd; 1151175Slling uint64_t c, rsize; 1161175Slling 1171175Slling pvd = vd->vdev_parent; 1181175Slling 1191175Slling /* 1201175Slling * If our parent is NULL or the root, just return our own psize. 1211175Slling */ 1221175Slling if (pvd == NULL || pvd->vdev_parent == NULL) 1231175Slling return (vd->vdev_psize); 1241175Slling 1251175Slling rsize = 0; 1261175Slling 1271175Slling for (c = 0; c < pvd->vdev_children; c++) { 1281175Slling cvd = pvd->vdev_child[c]; 1291175Slling rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; 1301175Slling } 1311175Slling 1321175Slling return (rsize); 1331175Slling } 1341175Slling 135789Sahrens vdev_t * 136789Sahrens vdev_lookup_top(spa_t *spa, uint64_t vdev) 137789Sahrens { 138789Sahrens vdev_t *rvd = spa->spa_root_vdev; 139789Sahrens 140*7046Sahrens ASSERT(spa_config_held(spa, RW_READER)); 1415530Sbonwick 142*7046Sahrens if (vdev < rvd->vdev_children) { 143*7046Sahrens ASSERT(rvd->vdev_child[vdev] != NULL); 144789Sahrens return (rvd->vdev_child[vdev]); 145*7046Sahrens } 146789Sahrens 147789Sahrens return (NULL); 148789Sahrens } 149789Sahrens 150789Sahrens vdev_t * 151789Sahrens vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 152789Sahrens { 153789Sahrens int c; 154789Sahrens vdev_t *mvd; 155789Sahrens 1561585Sbonwick if (vd->vdev_guid == guid) 157789Sahrens return (vd); 158789Sahrens 159789Sahrens for (c = 0; c < vd->vdev_children; c++) 160789Sahrens if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 161789Sahrens NULL) 162789Sahrens return (mvd); 163789Sahrens 164789Sahrens return (NULL); 165789Sahrens } 166789Sahrens 167789Sahrens void 168789Sahrens vdev_add_child(vdev_t *pvd, vdev_t *cvd) 169789Sahrens { 170789Sahrens size_t oldsize, newsize; 171789Sahrens uint64_t id = cvd->vdev_id; 172789Sahrens vdev_t **newchild; 173789Sahrens 174789Sahrens ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 175789Sahrens ASSERT(cvd->vdev_parent == NULL); 176789Sahrens 177789Sahrens cvd->vdev_parent = pvd; 178789Sahrens 179789Sahrens if (pvd == NULL) 180789Sahrens return; 181789Sahrens 182789Sahrens ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 183789Sahrens 184789Sahrens oldsize = pvd->vdev_children * sizeof (vdev_t *); 185789Sahrens pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 186789Sahrens newsize = pvd->vdev_children * sizeof (vdev_t *); 187789Sahrens 188789Sahrens newchild = kmem_zalloc(newsize, KM_SLEEP); 189789Sahrens if (pvd->vdev_child != NULL) { 190789Sahrens bcopy(pvd->vdev_child, newchild, oldsize); 191789Sahrens kmem_free(pvd->vdev_child, oldsize); 192789Sahrens } 193789Sahrens 194789Sahrens pvd->vdev_child = newchild; 195789Sahrens pvd->vdev_child[id] = cvd; 196789Sahrens 197789Sahrens cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 198789Sahrens ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 199789Sahrens 200789Sahrens /* 201789Sahrens * Walk up all ancestors to update guid sum. 202789Sahrens */ 203789Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 204789Sahrens pvd->vdev_guid_sum += cvd->vdev_guid_sum; 2053697Smishra 2063697Smishra if (cvd->vdev_ops->vdev_op_leaf) 2073697Smishra cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 208789Sahrens } 209789Sahrens 210789Sahrens void 211789Sahrens vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 212789Sahrens { 213789Sahrens int c; 214789Sahrens uint_t id = cvd->vdev_id; 215789Sahrens 216789Sahrens ASSERT(cvd->vdev_parent == pvd); 217789Sahrens 218789Sahrens if (pvd == NULL) 219789Sahrens return; 220789Sahrens 221789Sahrens ASSERT(id < pvd->vdev_children); 222789Sahrens ASSERT(pvd->vdev_child[id] == cvd); 223789Sahrens 224789Sahrens pvd->vdev_child[id] = NULL; 225789Sahrens cvd->vdev_parent = NULL; 226789Sahrens 227789Sahrens for (c = 0; c < pvd->vdev_children; c++) 228789Sahrens if (pvd->vdev_child[c]) 229789Sahrens break; 230789Sahrens 231789Sahrens if (c == pvd->vdev_children) { 232789Sahrens kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 233789Sahrens pvd->vdev_child = NULL; 234789Sahrens pvd->vdev_children = 0; 235789Sahrens } 236789Sahrens 237789Sahrens /* 238789Sahrens * Walk up all ancestors to update guid sum. 239789Sahrens */ 240789Sahrens for (; pvd != NULL; pvd = pvd->vdev_parent) 241789Sahrens pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 2423697Smishra 2433697Smishra if (cvd->vdev_ops->vdev_op_leaf) 2443697Smishra cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 245789Sahrens } 246789Sahrens 247789Sahrens /* 248789Sahrens * Remove any holes in the child array. 249789Sahrens */ 250789Sahrens void 251789Sahrens vdev_compact_children(vdev_t *pvd) 252789Sahrens { 253789Sahrens vdev_t **newchild, *cvd; 254789Sahrens int oldc = pvd->vdev_children; 255789Sahrens int newc, c; 256789Sahrens 257789Sahrens ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); 258789Sahrens 259789Sahrens for (c = newc = 0; c < oldc; c++) 260789Sahrens if (pvd->vdev_child[c]) 261789Sahrens newc++; 262789Sahrens 263789Sahrens newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 264789Sahrens 265789Sahrens for (c = newc = 0; c < oldc; c++) { 266789Sahrens if ((cvd = pvd->vdev_child[c]) != NULL) { 267789Sahrens newchild[newc] = cvd; 268789Sahrens cvd->vdev_id = newc++; 269789Sahrens } 270789Sahrens } 271789Sahrens 272789Sahrens kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 273789Sahrens pvd->vdev_child = newchild; 274789Sahrens pvd->vdev_children = newc; 275789Sahrens } 276789Sahrens 277789Sahrens /* 278789Sahrens * Allocate and minimally initialize a vdev_t. 279789Sahrens */ 280789Sahrens static vdev_t * 281789Sahrens vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 282789Sahrens { 283789Sahrens vdev_t *vd; 284789Sahrens 2851585Sbonwick vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 2861585Sbonwick 2871585Sbonwick if (spa->spa_root_vdev == NULL) { 2881585Sbonwick ASSERT(ops == &vdev_root_ops); 2891585Sbonwick spa->spa_root_vdev = vd; 2901585Sbonwick } 291789Sahrens 2921585Sbonwick if (guid == 0) { 2931585Sbonwick if (spa->spa_root_vdev == vd) { 2941585Sbonwick /* 2951585Sbonwick * The root vdev's guid will also be the pool guid, 2961585Sbonwick * which must be unique among all pools. 2971585Sbonwick */ 2981585Sbonwick while (guid == 0 || spa_guid_exists(guid, 0)) 2991585Sbonwick guid = spa_get_random(-1ULL); 3001585Sbonwick } else { 3011585Sbonwick /* 3021585Sbonwick * Any other vdev's guid must be unique within the pool. 3031585Sbonwick */ 3041585Sbonwick while (guid == 0 || 3051585Sbonwick spa_guid_exists(spa_guid(spa), guid)) 3061585Sbonwick guid = spa_get_random(-1ULL); 3071585Sbonwick } 3081585Sbonwick ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 3091585Sbonwick } 310789Sahrens 311789Sahrens vd->vdev_spa = spa; 312789Sahrens vd->vdev_id = id; 313789Sahrens vd->vdev_guid = guid; 314789Sahrens vd->vdev_guid_sum = guid; 315789Sahrens vd->vdev_ops = ops; 316789Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 317789Sahrens 318789Sahrens mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 3192856Snd150628 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 320789Sahrens space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); 321789Sahrens space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); 322789Sahrens txg_list_create(&vd->vdev_ms_list, 323789Sahrens offsetof(struct metaslab, ms_txg_node)); 324789Sahrens txg_list_create(&vd->vdev_dtl_list, 325789Sahrens offsetof(struct vdev, vdev_dtl_node)); 326789Sahrens vd->vdev_stat.vs_timestamp = gethrtime(); 3274451Seschrock vdev_queue_init(vd); 3284451Seschrock vdev_cache_init(vd); 329789Sahrens 330789Sahrens return (vd); 331789Sahrens } 332789Sahrens 333789Sahrens /* 334789Sahrens * Allocate a new vdev. The 'alloctype' is used to control whether we are 335789Sahrens * creating a new vdev or loading an existing one - the behavior is slightly 336789Sahrens * different for each case. 337789Sahrens */ 3382082Seschrock int 3392082Seschrock vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 3402082Seschrock int alloctype) 341789Sahrens { 342789Sahrens vdev_ops_t *ops; 343789Sahrens char *type; 3444527Sperrin uint64_t guid = 0, islog, nparity; 345789Sahrens vdev_t *vd; 346789Sahrens 347789Sahrens ASSERT(spa_config_held(spa, RW_WRITER)); 348789Sahrens 349789Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 3502082Seschrock return (EINVAL); 351789Sahrens 352789Sahrens if ((ops = vdev_getops(type)) == NULL) 3532082Seschrock return (EINVAL); 354789Sahrens 355789Sahrens /* 356789Sahrens * If this is a load, get the vdev guid from the nvlist. 357789Sahrens * Otherwise, vdev_alloc_common() will generate one for us. 358789Sahrens */ 359789Sahrens if (alloctype == VDEV_ALLOC_LOAD) { 360789Sahrens uint64_t label_id; 361789Sahrens 362789Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 363789Sahrens label_id != id) 3642082Seschrock return (EINVAL); 365789Sahrens 366789Sahrens if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 3672082Seschrock return (EINVAL); 3682082Seschrock } else if (alloctype == VDEV_ALLOC_SPARE) { 3692082Seschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 3702082Seschrock return (EINVAL); 3715450Sbrendan } else if (alloctype == VDEV_ALLOC_L2CACHE) { 3725450Sbrendan if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 3735450Sbrendan return (EINVAL); 374789Sahrens } 375789Sahrens 3762082Seschrock /* 3772082Seschrock * The first allocated vdev must be of type 'root'. 3782082Seschrock */ 3792082Seschrock if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 3802082Seschrock return (EINVAL); 3812082Seschrock 3824527Sperrin /* 3834527Sperrin * Determine whether we're a log vdev. 3844527Sperrin */ 3854527Sperrin islog = 0; 3864527Sperrin (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 3875094Slling if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 3884527Sperrin return (ENOTSUP); 3894527Sperrin 3904527Sperrin /* 3914527Sperrin * Set the nparity property for RAID-Z vdevs. 3924527Sperrin */ 3934527Sperrin nparity = -1ULL; 3944527Sperrin if (ops == &vdev_raidz_ops) { 3954527Sperrin if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 3964527Sperrin &nparity) == 0) { 3974527Sperrin /* 3984527Sperrin * Currently, we can only support 2 parity devices. 3994527Sperrin */ 4004527Sperrin if (nparity == 0 || nparity > 2) 4014527Sperrin return (EINVAL); 4024527Sperrin /* 4034527Sperrin * Older versions can only support 1 parity device. 4044527Sperrin */ 4054527Sperrin if (nparity == 2 && 4064577Sahrens spa_version(spa) < SPA_VERSION_RAID6) 4074527Sperrin return (ENOTSUP); 4084527Sperrin } else { 4094527Sperrin /* 4104527Sperrin * We require the parity to be specified for SPAs that 4114527Sperrin * support multiple parity levels. 4124527Sperrin */ 4134577Sahrens if (spa_version(spa) >= SPA_VERSION_RAID6) 4144527Sperrin return (EINVAL); 4154527Sperrin /* 4164527Sperrin * Otherwise, we default to 1 parity device for RAID-Z. 4174527Sperrin */ 4184527Sperrin nparity = 1; 4194527Sperrin } 4204527Sperrin } else { 4214527Sperrin nparity = 0; 4224527Sperrin } 4234527Sperrin ASSERT(nparity != -1ULL); 4244527Sperrin 425789Sahrens vd = vdev_alloc_common(spa, id, guid, ops); 426789Sahrens 4274527Sperrin vd->vdev_islog = islog; 4284527Sperrin vd->vdev_nparity = nparity; 4294527Sperrin 430789Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 431789Sahrens vd->vdev_path = spa_strdup(vd->vdev_path); 432789Sahrens if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 433789Sahrens vd->vdev_devid = spa_strdup(vd->vdev_devid); 4344451Seschrock if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 4354451Seschrock &vd->vdev_physpath) == 0) 4364451Seschrock vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 437789Sahrens 438789Sahrens /* 4391171Seschrock * Set the whole_disk property. If it's not specified, leave the value 4401171Seschrock * as -1. 4411171Seschrock */ 4421171Seschrock if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 4431171Seschrock &vd->vdev_wholedisk) != 0) 4441171Seschrock vd->vdev_wholedisk = -1ULL; 4451171Seschrock 4461171Seschrock /* 4471544Seschrock * Look for the 'not present' flag. This will only be set if the device 4481544Seschrock * was not present at the time of import. 4491544Seschrock */ 4506643Seschrock if (!spa->spa_import_faulted) 4516643Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 4526643Seschrock &vd->vdev_not_present); 4531544Seschrock 4541544Seschrock /* 4551732Sbonwick * Get the alignment requirement. 4561732Sbonwick */ 4571732Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 4581732Sbonwick 4591732Sbonwick /* 460789Sahrens * If we're a top-level vdev, try to load the allocation parameters. 461789Sahrens */ 462789Sahrens if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { 463789Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 464789Sahrens &vd->vdev_ms_array); 465789Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 466789Sahrens &vd->vdev_ms_shift); 467789Sahrens (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 468789Sahrens &vd->vdev_asize); 469789Sahrens } 470789Sahrens 471789Sahrens /* 4724451Seschrock * If we're a leaf vdev, try to load the DTL object and other state. 473789Sahrens */ 4746643Seschrock if (vd->vdev_ops->vdev_op_leaf && 4756643Seschrock (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { 4766643Seschrock if (alloctype == VDEV_ALLOC_LOAD) { 4776643Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 4786643Seschrock &vd->vdev_dtl.smo_object); 4796643Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 4806643Seschrock &vd->vdev_unspare); 4816643Seschrock } 4821732Sbonwick (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 4831732Sbonwick &vd->vdev_offline); 4846643Seschrock 4854451Seschrock /* 4864451Seschrock * When importing a pool, we want to ignore the persistent fault 4874451Seschrock * state, as the diagnosis made on another system may not be 4884451Seschrock * valid in the current context. 4894451Seschrock */ 4904451Seschrock if (spa->spa_load_state == SPA_LOAD_OPEN) { 4914451Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 4924451Seschrock &vd->vdev_faulted); 4934451Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 4944451Seschrock &vd->vdev_degraded); 4954451Seschrock (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 4964451Seschrock &vd->vdev_removed); 4974451Seschrock } 498789Sahrens } 499789Sahrens 500789Sahrens /* 501789Sahrens * Add ourselves to the parent's list of children. 502789Sahrens */ 503789Sahrens vdev_add_child(parent, vd); 504789Sahrens 5052082Seschrock *vdp = vd; 5062082Seschrock 5072082Seschrock return (0); 508789Sahrens } 509789Sahrens 510789Sahrens void 511789Sahrens vdev_free(vdev_t *vd) 512789Sahrens { 513789Sahrens int c; 5144451Seschrock spa_t *spa = vd->vdev_spa; 515789Sahrens 516789Sahrens /* 517789Sahrens * vdev_free() implies closing the vdev first. This is simpler than 518789Sahrens * trying to ensure complicated semantics for all callers. 519789Sahrens */ 520789Sahrens vdev_close(vd); 521789Sahrens 5224451Seschrock 5231732Sbonwick ASSERT(!list_link_active(&vd->vdev_dirty_node)); 524789Sahrens 525789Sahrens /* 526789Sahrens * Free all children. 527789Sahrens */ 528789Sahrens for (c = 0; c < vd->vdev_children; c++) 529789Sahrens vdev_free(vd->vdev_child[c]); 530789Sahrens 531789Sahrens ASSERT(vd->vdev_child == NULL); 532789Sahrens ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 533789Sahrens 534789Sahrens /* 535789Sahrens * Discard allocation state. 536789Sahrens */ 537789Sahrens if (vd == vd->vdev_top) 538789Sahrens vdev_metaslab_fini(vd); 539789Sahrens 540789Sahrens ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 5412082Seschrock ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 542789Sahrens ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 543789Sahrens 544789Sahrens /* 545789Sahrens * Remove this vdev from its parent's child list. 546789Sahrens */ 547789Sahrens vdev_remove_child(vd->vdev_parent, vd); 548789Sahrens 549789Sahrens ASSERT(vd->vdev_parent == NULL); 550789Sahrens 5514451Seschrock /* 5524451Seschrock * Clean up vdev structure. 5534451Seschrock */ 5544451Seschrock vdev_queue_fini(vd); 5554451Seschrock vdev_cache_fini(vd); 5564451Seschrock 5574451Seschrock if (vd->vdev_path) 5584451Seschrock spa_strfree(vd->vdev_path); 5594451Seschrock if (vd->vdev_devid) 5604451Seschrock spa_strfree(vd->vdev_devid); 5614451Seschrock if (vd->vdev_physpath) 5624451Seschrock spa_strfree(vd->vdev_physpath); 5634451Seschrock 5644451Seschrock if (vd->vdev_isspare) 5654451Seschrock spa_spare_remove(vd); 5665450Sbrendan if (vd->vdev_isl2cache) 5675450Sbrendan spa_l2cache_remove(vd); 5684451Seschrock 5694451Seschrock txg_list_destroy(&vd->vdev_ms_list); 5704451Seschrock txg_list_destroy(&vd->vdev_dtl_list); 5714451Seschrock mutex_enter(&vd->vdev_dtl_lock); 5724451Seschrock space_map_unload(&vd->vdev_dtl_map); 5734451Seschrock space_map_destroy(&vd->vdev_dtl_map); 5744451Seschrock space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 5754451Seschrock space_map_destroy(&vd->vdev_dtl_scrub); 5764451Seschrock mutex_exit(&vd->vdev_dtl_lock); 5774451Seschrock mutex_destroy(&vd->vdev_dtl_lock); 5784451Seschrock mutex_destroy(&vd->vdev_stat_lock); 5794451Seschrock 5804451Seschrock if (vd == spa->spa_root_vdev) 5814451Seschrock spa->spa_root_vdev = NULL; 5824451Seschrock 5834451Seschrock kmem_free(vd, sizeof (vdev_t)); 584789Sahrens } 585789Sahrens 586789Sahrens /* 587789Sahrens * Transfer top-level vdev state from svd to tvd. 588789Sahrens */ 589789Sahrens static void 590789Sahrens vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 591789Sahrens { 592789Sahrens spa_t *spa = svd->vdev_spa; 593789Sahrens metaslab_t *msp; 594789Sahrens vdev_t *vd; 595789Sahrens int t; 596789Sahrens 597789Sahrens ASSERT(tvd == tvd->vdev_top); 598789Sahrens 599789Sahrens tvd->vdev_ms_array = svd->vdev_ms_array; 600789Sahrens tvd->vdev_ms_shift = svd->vdev_ms_shift; 601789Sahrens tvd->vdev_ms_count = svd->vdev_ms_count; 602789Sahrens 603789Sahrens svd->vdev_ms_array = 0; 604789Sahrens svd->vdev_ms_shift = 0; 605789Sahrens svd->vdev_ms_count = 0; 606789Sahrens 607789Sahrens tvd->vdev_mg = svd->vdev_mg; 608789Sahrens tvd->vdev_ms = svd->vdev_ms; 609789Sahrens 610789Sahrens svd->vdev_mg = NULL; 611789Sahrens svd->vdev_ms = NULL; 6121732Sbonwick 6131732Sbonwick if (tvd->vdev_mg != NULL) 6141732Sbonwick tvd->vdev_mg->mg_vd = tvd; 615789Sahrens 616789Sahrens tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 617789Sahrens tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 6182082Seschrock tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 619789Sahrens 620789Sahrens svd->vdev_stat.vs_alloc = 0; 621789Sahrens svd->vdev_stat.vs_space = 0; 6222082Seschrock svd->vdev_stat.vs_dspace = 0; 623789Sahrens 624789Sahrens for (t = 0; t < TXG_SIZE; t++) { 625789Sahrens while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 626789Sahrens (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 627789Sahrens while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 628789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 629789Sahrens if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 630789Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 631789Sahrens } 632789Sahrens 6331732Sbonwick if (list_link_active(&svd->vdev_dirty_node)) { 634789Sahrens vdev_config_clean(svd); 635789Sahrens vdev_config_dirty(tvd); 636789Sahrens } 637789Sahrens 6382082Seschrock tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 6392082Seschrock svd->vdev_deflate_ratio = 0; 6404527Sperrin 6414527Sperrin tvd->vdev_islog = svd->vdev_islog; 6424527Sperrin svd->vdev_islog = 0; 643789Sahrens } 644789Sahrens 645789Sahrens static void 646789Sahrens vdev_top_update(vdev_t *tvd, vdev_t *vd) 647789Sahrens { 648789Sahrens int c; 649789Sahrens 650789Sahrens if (vd == NULL) 651789Sahrens return; 652789Sahrens 653789Sahrens vd->vdev_top = tvd; 654789Sahrens 655789Sahrens for (c = 0; c < vd->vdev_children; c++) 656789Sahrens vdev_top_update(tvd, vd->vdev_child[c]); 657789Sahrens } 658789Sahrens 659789Sahrens /* 660789Sahrens * Add a mirror/replacing vdev above an existing vdev. 661789Sahrens */ 662789Sahrens vdev_t * 663789Sahrens vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 664789Sahrens { 665789Sahrens spa_t *spa = cvd->vdev_spa; 666789Sahrens vdev_t *pvd = cvd->vdev_parent; 667789Sahrens vdev_t *mvd; 668789Sahrens 669789Sahrens ASSERT(spa_config_held(spa, RW_WRITER)); 670789Sahrens 671789Sahrens mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 6721732Sbonwick 6731732Sbonwick mvd->vdev_asize = cvd->vdev_asize; 6741732Sbonwick mvd->vdev_ashift = cvd->vdev_ashift; 6751732Sbonwick mvd->vdev_state = cvd->vdev_state; 6761732Sbonwick 677789Sahrens vdev_remove_child(pvd, cvd); 678789Sahrens vdev_add_child(pvd, mvd); 679789Sahrens cvd->vdev_id = mvd->vdev_children; 680789Sahrens vdev_add_child(mvd, cvd); 681789Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 682789Sahrens 683789Sahrens if (mvd == mvd->vdev_top) 684789Sahrens vdev_top_transfer(cvd, mvd); 685789Sahrens 686789Sahrens return (mvd); 687789Sahrens } 688789Sahrens 689789Sahrens /* 690789Sahrens * Remove a 1-way mirror/replacing vdev from the tree. 691789Sahrens */ 692789Sahrens void 693789Sahrens vdev_remove_parent(vdev_t *cvd) 694789Sahrens { 695789Sahrens vdev_t *mvd = cvd->vdev_parent; 696789Sahrens vdev_t *pvd = mvd->vdev_parent; 697789Sahrens 698789Sahrens ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 699789Sahrens 700789Sahrens ASSERT(mvd->vdev_children == 1); 701789Sahrens ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 7022082Seschrock mvd->vdev_ops == &vdev_replacing_ops || 7032082Seschrock mvd->vdev_ops == &vdev_spare_ops); 7041732Sbonwick cvd->vdev_ashift = mvd->vdev_ashift; 705789Sahrens 706789Sahrens vdev_remove_child(mvd, cvd); 707789Sahrens vdev_remove_child(pvd, mvd); 708789Sahrens cvd->vdev_id = mvd->vdev_id; 709789Sahrens vdev_add_child(pvd, cvd); 7102082Seschrock /* 7112082Seschrock * If we created a new toplevel vdev, then we need to change the child's 7122082Seschrock * vdev GUID to match the old toplevel vdev. Otherwise, we could have 7132082Seschrock * detached an offline device, and when we go to import the pool we'll 7142082Seschrock * think we have two toplevel vdevs, instead of a different version of 7152082Seschrock * the same toplevel vdev. 7162082Seschrock */ 7172082Seschrock if (cvd->vdev_top == cvd) { 7182082Seschrock pvd->vdev_guid_sum -= cvd->vdev_guid; 7192082Seschrock cvd->vdev_guid_sum -= cvd->vdev_guid; 7202082Seschrock cvd->vdev_guid = mvd->vdev_guid; 7212082Seschrock cvd->vdev_guid_sum += mvd->vdev_guid; 7222082Seschrock pvd->vdev_guid_sum += cvd->vdev_guid; 7232082Seschrock } 724789Sahrens vdev_top_update(cvd->vdev_top, cvd->vdev_top); 725789Sahrens 726789Sahrens if (cvd == cvd->vdev_top) 727789Sahrens vdev_top_transfer(mvd, cvd); 728789Sahrens 729789Sahrens ASSERT(mvd->vdev_children == 0); 730789Sahrens vdev_free(mvd); 731789Sahrens } 732789Sahrens 7331544Seschrock int 734789Sahrens vdev_metaslab_init(vdev_t *vd, uint64_t txg) 735789Sahrens { 736789Sahrens spa_t *spa = vd->vdev_spa; 7371732Sbonwick objset_t *mos = spa->spa_meta_objset; 7384527Sperrin metaslab_class_t *mc; 7391732Sbonwick uint64_t m; 740789Sahrens uint64_t oldc = vd->vdev_ms_count; 741789Sahrens uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 7421732Sbonwick metaslab_t **mspp; 7431732Sbonwick int error; 744789Sahrens 7451585Sbonwick if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ 7461585Sbonwick return (0); 7471585Sbonwick 748789Sahrens dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); 749789Sahrens 750789Sahrens ASSERT(oldc <= newc); 751789Sahrens 7524527Sperrin if (vd->vdev_islog) 7534527Sperrin mc = spa->spa_log_class; 7544527Sperrin else 7554527Sperrin mc = spa->spa_normal_class; 7564527Sperrin 7571732Sbonwick if (vd->vdev_mg == NULL) 7581732Sbonwick vd->vdev_mg = metaslab_group_create(mc, vd); 7591732Sbonwick 7601732Sbonwick mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 7611732Sbonwick 7621732Sbonwick if (oldc != 0) { 7631732Sbonwick bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 7641732Sbonwick kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 7651732Sbonwick } 7661732Sbonwick 7671732Sbonwick vd->vdev_ms = mspp; 768789Sahrens vd->vdev_ms_count = newc; 769789Sahrens 7701732Sbonwick for (m = oldc; m < newc; m++) { 7711732Sbonwick space_map_obj_t smo = { 0, 0, 0 }; 772789Sahrens if (txg == 0) { 7731732Sbonwick uint64_t object = 0; 7741732Sbonwick error = dmu_read(mos, vd->vdev_ms_array, 7751732Sbonwick m * sizeof (uint64_t), sizeof (uint64_t), &object); 7761732Sbonwick if (error) 7771732Sbonwick return (error); 7781732Sbonwick if (object != 0) { 7791732Sbonwick dmu_buf_t *db; 7801732Sbonwick error = dmu_bonus_hold(mos, object, FTAG, &db); 7811732Sbonwick if (error) 7821732Sbonwick return (error); 7834944Smaybee ASSERT3U(db->db_size, >=, sizeof (smo)); 7844944Smaybee bcopy(db->db_data, &smo, sizeof (smo)); 7851732Sbonwick ASSERT3U(smo.smo_object, ==, object); 7861544Seschrock dmu_buf_rele(db, FTAG); 787789Sahrens } 788789Sahrens } 7891732Sbonwick vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 7901732Sbonwick m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 791789Sahrens } 792789Sahrens 7931544Seschrock return (0); 794789Sahrens } 795789Sahrens 796789Sahrens void 797789Sahrens vdev_metaslab_fini(vdev_t *vd) 798789Sahrens { 799789Sahrens uint64_t m; 800789Sahrens uint64_t count = vd->vdev_ms_count; 801789Sahrens 802789Sahrens if (vd->vdev_ms != NULL) { 803789Sahrens for (m = 0; m < count; m++) 8041732Sbonwick if (vd->vdev_ms[m] != NULL) 8051732Sbonwick metaslab_fini(vd->vdev_ms[m]); 806789Sahrens kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 807789Sahrens vd->vdev_ms = NULL; 808789Sahrens } 809789Sahrens } 810789Sahrens 8115329Sgw25295 int 8125329Sgw25295 vdev_probe(vdev_t *vd) 8135329Sgw25295 { 8145329Sgw25295 if (vd == NULL) 8155329Sgw25295 return (EINVAL); 8165329Sgw25295 8175329Sgw25295 /* 8185329Sgw25295 * Right now we only support status checks on the leaf vdevs. 8195329Sgw25295 */ 8205329Sgw25295 if (vd->vdev_ops->vdev_op_leaf) 8215329Sgw25295 return (vd->vdev_ops->vdev_op_probe(vd)); 8225329Sgw25295 8235329Sgw25295 return (0); 8245329Sgw25295 } 8255329Sgw25295 826789Sahrens /* 827789Sahrens * Prepare a virtual device for access. 828789Sahrens */ 829789Sahrens int 830789Sahrens vdev_open(vdev_t *vd) 831789Sahrens { 832789Sahrens int error; 833789Sahrens int c; 834789Sahrens uint64_t osize = 0; 835789Sahrens uint64_t asize, psize; 8361732Sbonwick uint64_t ashift = 0; 837789Sahrens 838789Sahrens ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 839789Sahrens vd->vdev_state == VDEV_STATE_CANT_OPEN || 840789Sahrens vd->vdev_state == VDEV_STATE_OFFLINE); 841789Sahrens 842789Sahrens if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) 843789Sahrens vd->vdev_fault_arg >>= 1; 844789Sahrens else 845789Sahrens vd->vdev_fault_mode = VDEV_FAULT_NONE; 846789Sahrens 847789Sahrens vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 848789Sahrens 8494451Seschrock if (!vd->vdev_removed && vd->vdev_faulted) { 8504451Seschrock ASSERT(vd->vdev_children == 0); 8514451Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 8524451Seschrock VDEV_AUX_ERR_EXCEEDED); 8534451Seschrock return (ENXIO); 8544451Seschrock } else if (vd->vdev_offline) { 855789Sahrens ASSERT(vd->vdev_children == 0); 8561544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 857789Sahrens return (ENXIO); 858789Sahrens } 859789Sahrens 860789Sahrens error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); 861789Sahrens 8621544Seschrock if (zio_injection_enabled && error == 0) 8631544Seschrock error = zio_handle_device_injection(vd, ENXIO); 8641544Seschrock 8654451Seschrock if (error) { 8664451Seschrock if (vd->vdev_removed && 8674451Seschrock vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 8684451Seschrock vd->vdev_removed = B_FALSE; 869789Sahrens 8701544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 871789Sahrens vd->vdev_stat.vs_aux); 872789Sahrens return (error); 873789Sahrens } 874789Sahrens 8754451Seschrock vd->vdev_removed = B_FALSE; 8764451Seschrock 8774451Seschrock if (vd->vdev_degraded) { 8784451Seschrock ASSERT(vd->vdev_children == 0); 8794451Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 8804451Seschrock VDEV_AUX_ERR_EXCEEDED); 8814451Seschrock } else { 8824451Seschrock vd->vdev_state = VDEV_STATE_HEALTHY; 8834451Seschrock } 884789Sahrens 885789Sahrens for (c = 0; c < vd->vdev_children; c++) 8861544Seschrock if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 8871544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 8881544Seschrock VDEV_AUX_NONE); 8891544Seschrock break; 8901544Seschrock } 891789Sahrens 892789Sahrens osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 893789Sahrens 894789Sahrens if (vd->vdev_children == 0) { 895789Sahrens if (osize < SPA_MINDEVSIZE) { 8961544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 8971544Seschrock VDEV_AUX_TOO_SMALL); 898789Sahrens return (EOVERFLOW); 899789Sahrens } 900789Sahrens psize = osize; 901789Sahrens asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 902789Sahrens } else { 9031732Sbonwick if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 904789Sahrens (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 9051544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 9061544Seschrock VDEV_AUX_TOO_SMALL); 907789Sahrens return (EOVERFLOW); 908789Sahrens } 909789Sahrens psize = 0; 910789Sahrens asize = osize; 911789Sahrens } 912789Sahrens 913789Sahrens vd->vdev_psize = psize; 914789Sahrens 915789Sahrens if (vd->vdev_asize == 0) { 916789Sahrens /* 917789Sahrens * This is the first-ever open, so use the computed values. 9181732Sbonwick * For testing purposes, a higher ashift can be requested. 919789Sahrens */ 920789Sahrens vd->vdev_asize = asize; 9211732Sbonwick vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 922789Sahrens } else { 923789Sahrens /* 924789Sahrens * Make sure the alignment requirement hasn't increased. 925789Sahrens */ 9261732Sbonwick if (ashift > vd->vdev_top->vdev_ashift) { 9271544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 9281544Seschrock VDEV_AUX_BAD_LABEL); 929789Sahrens return (EINVAL); 930789Sahrens } 931789Sahrens 932789Sahrens /* 933789Sahrens * Make sure the device hasn't shrunk. 934789Sahrens */ 935789Sahrens if (asize < vd->vdev_asize) { 9361544Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 9371544Seschrock VDEV_AUX_BAD_LABEL); 938789Sahrens return (EINVAL); 939789Sahrens } 940789Sahrens 941789Sahrens /* 942789Sahrens * If all children are healthy and the asize has increased, 943789Sahrens * then we've experienced dynamic LUN growth. 944789Sahrens */ 945789Sahrens if (vd->vdev_state == VDEV_STATE_HEALTHY && 946789Sahrens asize > vd->vdev_asize) { 947789Sahrens vd->vdev_asize = asize; 948789Sahrens } 949789Sahrens } 950789Sahrens 9511544Seschrock /* 9525329Sgw25295 * Ensure we can issue some IO before declaring the 9535329Sgw25295 * vdev open for business. 9545329Sgw25295 */ 9555329Sgw25295 error = vdev_probe(vd); 9565329Sgw25295 if (error) { 9575329Sgw25295 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 9585329Sgw25295 VDEV_AUX_OPEN_FAILED); 9595329Sgw25295 return (error); 9605329Sgw25295 } 9615329Sgw25295 9625329Sgw25295 /* 9632082Seschrock * If this is a top-level vdev, compute the raidz-deflation 9642082Seschrock * ratio. Note, we hard-code in 128k (1<<17) because it is the 9652082Seschrock * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE 9662082Seschrock * changes, this algorithm must never change, or we will 9672082Seschrock * inconsistently account for existing bp's. 9682082Seschrock */ 9692082Seschrock if (vd->vdev_top == vd) { 9702082Seschrock vd->vdev_deflate_ratio = (1<<17) / 9712082Seschrock (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); 9722082Seschrock } 9732082Seschrock 974*7046Sahrens /* 975*7046Sahrens * If a leaf vdev has a DTL, and seems healthy, then kick off a 976*7046Sahrens * resilver. But don't do this if we are doing a reopen for a 977*7046Sahrens * scrub, since this would just restart the scrub we are already 978*7046Sahrens * doing. 979*7046Sahrens */ 980*7046Sahrens if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { 981*7046Sahrens mutex_enter(&vd->vdev_dtl_lock); 982*7046Sahrens if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) 983*7046Sahrens spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); 984*7046Sahrens mutex_exit(&vd->vdev_dtl_lock); 985*7046Sahrens } 986*7046Sahrens 987789Sahrens return (0); 988789Sahrens } 989789Sahrens 990789Sahrens /* 9911986Seschrock * Called once the vdevs are all opened, this routine validates the label 9921986Seschrock * contents. This needs to be done before vdev_load() so that we don't 9934451Seschrock * inadvertently do repair I/Os to the wrong device. 9941986Seschrock * 9951986Seschrock * This function will only return failure if one of the vdevs indicates that it 9961986Seschrock * has since been destroyed or exported. This is only possible if 9971986Seschrock * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 9981986Seschrock * will be updated but the function will return 0. 9991986Seschrock */ 10001986Seschrock int 10011986Seschrock vdev_validate(vdev_t *vd) 10021986Seschrock { 10031986Seschrock spa_t *spa = vd->vdev_spa; 10041986Seschrock int c; 10051986Seschrock nvlist_t *label; 10061986Seschrock uint64_t guid; 10071986Seschrock uint64_t state; 10081986Seschrock 10091986Seschrock for (c = 0; c < vd->vdev_children; c++) 10101986Seschrock if (vdev_validate(vd->vdev_child[c]) != 0) 10114070Smc142369 return (EBADF); 10121986Seschrock 10132174Seschrock /* 10142174Seschrock * If the device has already failed, or was marked offline, don't do 10152174Seschrock * any further validation. Otherwise, label I/O will fail and we will 10162174Seschrock * overwrite the previous state. 10172174Seschrock */ 10182174Seschrock if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { 10191986Seschrock 10201986Seschrock if ((label = vdev_label_read_config(vd)) == NULL) { 10211986Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 10221986Seschrock VDEV_AUX_BAD_LABEL); 10231986Seschrock return (0); 10241986Seschrock } 10251986Seschrock 10261986Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, 10271986Seschrock &guid) != 0 || guid != spa_guid(spa)) { 10281986Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 10291986Seschrock VDEV_AUX_CORRUPT_DATA); 10301986Seschrock nvlist_free(label); 10311986Seschrock return (0); 10321986Seschrock } 10331986Seschrock 10341986Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 10351986Seschrock &guid) != 0 || guid != vd->vdev_guid) { 10361986Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 10371986Seschrock VDEV_AUX_CORRUPT_DATA); 10381986Seschrock nvlist_free(label); 10391986Seschrock return (0); 10401986Seschrock } 10411986Seschrock 10421986Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 10431986Seschrock &state) != 0) { 10441986Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 10451986Seschrock VDEV_AUX_CORRUPT_DATA); 10461986Seschrock nvlist_free(label); 10471986Seschrock return (0); 10481986Seschrock } 10491986Seschrock 10501986Seschrock nvlist_free(label); 10511986Seschrock 10521986Seschrock if (spa->spa_load_state == SPA_LOAD_OPEN && 10531986Seschrock state != POOL_STATE_ACTIVE) 10544070Smc142369 return (EBADF); 10556976Seschrock 10566976Seschrock /* 10576976Seschrock * If we were able to open and validate a vdev that was 10586976Seschrock * previously marked permanently unavailable, clear that state 10596976Seschrock * now. 10606976Seschrock */ 10616976Seschrock if (vd->vdev_not_present) 10626976Seschrock vd->vdev_not_present = 0; 10631986Seschrock } 10641986Seschrock 10651986Seschrock return (0); 10661986Seschrock } 10671986Seschrock 10681986Seschrock /* 1069789Sahrens * Close a virtual device. 1070789Sahrens */ 1071789Sahrens void 1072789Sahrens vdev_close(vdev_t *vd) 1073789Sahrens { 1074789Sahrens vd->vdev_ops->vdev_op_close(vd); 1075789Sahrens 10764451Seschrock vdev_cache_purge(vd); 1077789Sahrens 10781986Seschrock /* 10791986Seschrock * We record the previous state before we close it, so that if we are 10801986Seschrock * doing a reopen(), we don't generate FMA ereports if we notice that 10811986Seschrock * it's still faulted. 10821986Seschrock */ 10831986Seschrock vd->vdev_prevstate = vd->vdev_state; 10841986Seschrock 1085789Sahrens if (vd->vdev_offline) 1086789Sahrens vd->vdev_state = VDEV_STATE_OFFLINE; 1087789Sahrens else 1088789Sahrens vd->vdev_state = VDEV_STATE_CLOSED; 10891544Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1090789Sahrens } 1091789Sahrens 1092789Sahrens void 10931544Seschrock vdev_reopen(vdev_t *vd) 1094789Sahrens { 10951544Seschrock spa_t *spa = vd->vdev_spa; 1096789Sahrens 10971544Seschrock ASSERT(spa_config_held(spa, RW_WRITER)); 10981544Seschrock 1099789Sahrens vdev_close(vd); 1100789Sahrens (void) vdev_open(vd); 1101789Sahrens 1102789Sahrens /* 11033377Seschrock * Call vdev_validate() here to make sure we have the same device. 11043377Seschrock * Otherwise, a device with an invalid label could be successfully 11053377Seschrock * opened in response to vdev_reopen(). 11063377Seschrock */ 11076643Seschrock if (vd->vdev_aux) { 11086643Seschrock (void) vdev_validate_aux(vd); 11096643Seschrock if (!vdev_is_dead(vd) && 11106643Seschrock !l2arc_vdev_present(vd)) { 11116643Seschrock uint64_t size = vdev_get_rsize(vd); 11126643Seschrock l2arc_add_vdev(spa, vd, 11136643Seschrock VDEV_LABEL_START_SIZE, 11146643Seschrock size - VDEV_LABEL_START_SIZE); 11156643Seschrock } 11166643Seschrock } else { 11176643Seschrock (void) vdev_validate(vd); 11186643Seschrock } 11193377Seschrock 11203377Seschrock /* 11214451Seschrock * Reassess parent vdev's health. 1122789Sahrens */ 11234451Seschrock vdev_propagate_state(vd); 1124789Sahrens } 1125789Sahrens 1126789Sahrens int 11272082Seschrock vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1128789Sahrens { 1129789Sahrens int error; 1130789Sahrens 1131789Sahrens /* 1132789Sahrens * Normally, partial opens (e.g. of a mirror) are allowed. 1133789Sahrens * For a create, however, we want to fail the request if 1134789Sahrens * there are any components we can't open. 1135789Sahrens */ 1136789Sahrens error = vdev_open(vd); 1137789Sahrens 1138789Sahrens if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1139789Sahrens vdev_close(vd); 1140789Sahrens return (error ? error : ENXIO); 1141789Sahrens } 1142789Sahrens 1143789Sahrens /* 1144789Sahrens * Recursively initialize all labels. 1145789Sahrens */ 11463377Seschrock if ((error = vdev_label_init(vd, txg, isreplacing ? 11473377Seschrock VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1148789Sahrens vdev_close(vd); 1149789Sahrens return (error); 1150789Sahrens } 1151789Sahrens 1152789Sahrens return (0); 1153789Sahrens } 1154789Sahrens 1155789Sahrens /* 1156789Sahrens * The is the latter half of vdev_create(). It is distinct because it 1157789Sahrens * involves initiating transactions in order to do metaslab creation. 1158789Sahrens * For creation, we want to try to create all vdevs at once and then undo it 1159789Sahrens * if anything fails; this is much harder if we have pending transactions. 1160789Sahrens */ 11611585Sbonwick void 1162789Sahrens vdev_init(vdev_t *vd, uint64_t txg) 1163789Sahrens { 1164789Sahrens /* 1165789Sahrens * Aim for roughly 200 metaslabs per vdev. 1166789Sahrens */ 1167789Sahrens vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); 1168789Sahrens vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1169789Sahrens 1170789Sahrens /* 11711585Sbonwick * Initialize the vdev's metaslabs. This can't fail because 11721585Sbonwick * there's nothing to read when creating all new metaslabs. 1173789Sahrens */ 11741585Sbonwick VERIFY(vdev_metaslab_init(vd, txg) == 0); 1175789Sahrens } 1176789Sahrens 1177789Sahrens void 11781732Sbonwick vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1179789Sahrens { 11801732Sbonwick ASSERT(vd == vd->vdev_top); 11811732Sbonwick ASSERT(ISP2(flags)); 1182789Sahrens 11831732Sbonwick if (flags & VDD_METASLAB) 11841732Sbonwick (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 11851732Sbonwick 11861732Sbonwick if (flags & VDD_DTL) 11871732Sbonwick (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 11881732Sbonwick 11891732Sbonwick (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1190789Sahrens } 1191789Sahrens 1192789Sahrens void 1193789Sahrens vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) 1194789Sahrens { 1195789Sahrens mutex_enter(sm->sm_lock); 1196789Sahrens if (!space_map_contains(sm, txg, size)) 1197789Sahrens space_map_add(sm, txg, size); 1198789Sahrens mutex_exit(sm->sm_lock); 1199789Sahrens } 1200789Sahrens 1201789Sahrens int 1202789Sahrens vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) 1203789Sahrens { 1204789Sahrens int dirty; 1205789Sahrens 1206789Sahrens /* 1207789Sahrens * Quick test without the lock -- covers the common case that 1208789Sahrens * there are no dirty time segments. 1209789Sahrens */ 1210789Sahrens if (sm->sm_space == 0) 1211789Sahrens return (0); 1212789Sahrens 1213789Sahrens mutex_enter(sm->sm_lock); 1214789Sahrens dirty = space_map_contains(sm, txg, size); 1215789Sahrens mutex_exit(sm->sm_lock); 1216789Sahrens 1217789Sahrens return (dirty); 1218789Sahrens } 1219789Sahrens 1220789Sahrens /* 1221789Sahrens * Reassess DTLs after a config change or scrub completion. 1222789Sahrens */ 1223789Sahrens void 1224789Sahrens vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1225789Sahrens { 12261544Seschrock spa_t *spa = vd->vdev_spa; 1227789Sahrens int c; 1228789Sahrens 1229*7046Sahrens ASSERT(spa_config_held(spa, RW_READER)); 1230789Sahrens 1231789Sahrens if (vd->vdev_children == 0) { 1232789Sahrens mutex_enter(&vd->vdev_dtl_lock); 1233*7046Sahrens if (scrub_txg != 0 && 1234*7046Sahrens (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { 1235*7046Sahrens /* XXX should check scrub_done? */ 1236*7046Sahrens /* 1237*7046Sahrens * We completed a scrub up to scrub_txg. If we 1238*7046Sahrens * did it without rebooting, then the scrub dtl 1239*7046Sahrens * will be valid, so excise the old region and 1240*7046Sahrens * fold in the scrub dtl. Otherwise, leave the 1241*7046Sahrens * dtl as-is if there was an error. 1242*7046Sahrens */ 1243789Sahrens space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); 1244789Sahrens space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); 1245789Sahrens } 1246789Sahrens if (scrub_done) 1247789Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1248789Sahrens mutex_exit(&vd->vdev_dtl_lock); 1249*7046Sahrens 12501732Sbonwick if (txg != 0) 12511732Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1252789Sahrens return; 1253789Sahrens } 1254789Sahrens 12551544Seschrock /* 12561544Seschrock * Make sure the DTLs are always correct under the scrub lock. 12571544Seschrock */ 12581544Seschrock if (vd == spa->spa_root_vdev) 12591544Seschrock mutex_enter(&spa->spa_scrub_lock); 12601544Seschrock 1261789Sahrens mutex_enter(&vd->vdev_dtl_lock); 1262789Sahrens space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); 1263789Sahrens space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1264789Sahrens mutex_exit(&vd->vdev_dtl_lock); 1265789Sahrens 1266789Sahrens for (c = 0; c < vd->vdev_children; c++) { 1267789Sahrens vdev_t *cvd = vd->vdev_child[c]; 1268789Sahrens vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); 1269789Sahrens mutex_enter(&vd->vdev_dtl_lock); 1270789Sahrens space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); 1271789Sahrens space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); 1272789Sahrens mutex_exit(&vd->vdev_dtl_lock); 1273789Sahrens } 12741544Seschrock 12751544Seschrock if (vd == spa->spa_root_vdev) 12761544Seschrock mutex_exit(&spa->spa_scrub_lock); 1277789Sahrens } 1278789Sahrens 1279789Sahrens static int 1280789Sahrens vdev_dtl_load(vdev_t *vd) 1281789Sahrens { 1282789Sahrens spa_t *spa = vd->vdev_spa; 1283789Sahrens space_map_obj_t *smo = &vd->vdev_dtl; 12841732Sbonwick objset_t *mos = spa->spa_meta_objset; 1285789Sahrens dmu_buf_t *db; 1286789Sahrens int error; 1287789Sahrens 1288789Sahrens ASSERT(vd->vdev_children == 0); 1289789Sahrens 1290789Sahrens if (smo->smo_object == 0) 1291789Sahrens return (0); 1292789Sahrens 12931732Sbonwick if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) 12941544Seschrock return (error); 12951732Sbonwick 12964944Smaybee ASSERT3U(db->db_size, >=, sizeof (*smo)); 12974944Smaybee bcopy(db->db_data, smo, sizeof (*smo)); 12981544Seschrock dmu_buf_rele(db, FTAG); 1299789Sahrens 1300789Sahrens mutex_enter(&vd->vdev_dtl_lock); 13011732Sbonwick error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); 1302789Sahrens mutex_exit(&vd->vdev_dtl_lock); 1303789Sahrens 1304789Sahrens return (error); 1305789Sahrens } 1306789Sahrens 1307789Sahrens void 1308789Sahrens vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1309789Sahrens { 1310789Sahrens spa_t *spa = vd->vdev_spa; 1311789Sahrens space_map_obj_t *smo = &vd->vdev_dtl; 1312789Sahrens space_map_t *sm = &vd->vdev_dtl_map; 13131732Sbonwick objset_t *mos = spa->spa_meta_objset; 1314789Sahrens space_map_t smsync; 1315789Sahrens kmutex_t smlock; 1316789Sahrens dmu_buf_t *db; 1317789Sahrens dmu_tx_t *tx; 1318789Sahrens 1319789Sahrens dprintf("%s in txg %llu pass %d\n", 1320789Sahrens vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1321789Sahrens 1322789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1323789Sahrens 1324789Sahrens if (vd->vdev_detached) { 1325789Sahrens if (smo->smo_object != 0) { 13261732Sbonwick int err = dmu_object_free(mos, smo->smo_object, tx); 1327789Sahrens ASSERT3U(err, ==, 0); 1328789Sahrens smo->smo_object = 0; 1329789Sahrens } 1330789Sahrens dmu_tx_commit(tx); 13311732Sbonwick dprintf("detach %s committed in txg %llu\n", 13321732Sbonwick vdev_description(vd), txg); 1333789Sahrens return; 1334789Sahrens } 1335789Sahrens 1336789Sahrens if (smo->smo_object == 0) { 1337789Sahrens ASSERT(smo->smo_objsize == 0); 1338789Sahrens ASSERT(smo->smo_alloc == 0); 13391732Sbonwick smo->smo_object = dmu_object_alloc(mos, 1340789Sahrens DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1341789Sahrens DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1342789Sahrens ASSERT(smo->smo_object != 0); 1343789Sahrens vdev_config_dirty(vd->vdev_top); 1344789Sahrens } 1345789Sahrens 1346789Sahrens mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); 1347789Sahrens 1348789Sahrens space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1349789Sahrens &smlock); 1350789Sahrens 1351789Sahrens mutex_enter(&smlock); 1352789Sahrens 1353789Sahrens mutex_enter(&vd->vdev_dtl_lock); 13541732Sbonwick space_map_walk(sm, space_map_add, &smsync); 1355789Sahrens mutex_exit(&vd->vdev_dtl_lock); 1356789Sahrens 13571732Sbonwick space_map_truncate(smo, mos, tx); 13581732Sbonwick space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1359789Sahrens 1360789Sahrens space_map_destroy(&smsync); 1361789Sahrens 1362789Sahrens mutex_exit(&smlock); 1363789Sahrens mutex_destroy(&smlock); 1364789Sahrens 13651732Sbonwick VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1366789Sahrens dmu_buf_will_dirty(db, tx); 13674944Smaybee ASSERT3U(db->db_size, >=, sizeof (*smo)); 13684944Smaybee bcopy(smo, db->db_data, sizeof (*smo)); 13691544Seschrock dmu_buf_rele(db, FTAG); 1370789Sahrens 1371789Sahrens dmu_tx_commit(tx); 1372789Sahrens } 1373789Sahrens 1374*7046Sahrens /* 1375*7046Sahrens * Determine if resilver is needed, and if so the txg range. 1376*7046Sahrens */ 1377*7046Sahrens boolean_t 1378*7046Sahrens vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 1379*7046Sahrens { 1380*7046Sahrens boolean_t needed = B_FALSE; 1381*7046Sahrens uint64_t thismin = UINT64_MAX; 1382*7046Sahrens uint64_t thismax = 0; 1383*7046Sahrens 1384*7046Sahrens if (vd->vdev_children == 0) { 1385*7046Sahrens mutex_enter(&vd->vdev_dtl_lock); 1386*7046Sahrens if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { 1387*7046Sahrens space_seg_t *ss; 1388*7046Sahrens 1389*7046Sahrens ss = avl_first(&vd->vdev_dtl_map.sm_root); 1390*7046Sahrens thismin = ss->ss_start - 1; 1391*7046Sahrens ss = avl_last(&vd->vdev_dtl_map.sm_root); 1392*7046Sahrens thismax = ss->ss_end; 1393*7046Sahrens needed = B_TRUE; 1394*7046Sahrens } 1395*7046Sahrens mutex_exit(&vd->vdev_dtl_lock); 1396*7046Sahrens } else { 1397*7046Sahrens int c; 1398*7046Sahrens for (c = 0; c < vd->vdev_children; c++) { 1399*7046Sahrens vdev_t *cvd = vd->vdev_child[c]; 1400*7046Sahrens uint64_t cmin, cmax; 1401*7046Sahrens 1402*7046Sahrens if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 1403*7046Sahrens thismin = MIN(thismin, cmin); 1404*7046Sahrens thismax = MAX(thismax, cmax); 1405*7046Sahrens needed = B_TRUE; 1406*7046Sahrens } 1407*7046Sahrens } 1408*7046Sahrens } 1409*7046Sahrens 1410*7046Sahrens if (needed && minp) { 1411*7046Sahrens *minp = thismin; 1412*7046Sahrens *maxp = thismax; 1413*7046Sahrens } 1414*7046Sahrens return (needed); 1415*7046Sahrens } 1416*7046Sahrens 14171986Seschrock void 14181544Seschrock vdev_load(vdev_t *vd) 1419789Sahrens { 14201986Seschrock int c; 1421789Sahrens 1422789Sahrens /* 1423789Sahrens * Recursively load all children. 1424789Sahrens */ 1425789Sahrens for (c = 0; c < vd->vdev_children; c++) 14261986Seschrock vdev_load(vd->vdev_child[c]); 1427789Sahrens 1428789Sahrens /* 14291585Sbonwick * If this is a top-level vdev, initialize its metaslabs. 1430789Sahrens */ 14311986Seschrock if (vd == vd->vdev_top && 14321986Seschrock (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 14331986Seschrock vdev_metaslab_init(vd, 0) != 0)) 14341986Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 14351986Seschrock VDEV_AUX_CORRUPT_DATA); 1436789Sahrens 1437789Sahrens /* 1438789Sahrens * If this is a leaf vdev, load its DTL. 1439789Sahrens */ 14401986Seschrock if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 14411986Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 14421986Seschrock VDEV_AUX_CORRUPT_DATA); 1443789Sahrens } 1444789Sahrens 14452082Seschrock /* 14465450Sbrendan * The special vdev case is used for hot spares and l2cache devices. Its 14475450Sbrendan * sole purpose it to set the vdev state for the associated vdev. To do this, 14485450Sbrendan * we make sure that we can open the underlying device, then try to read the 14495450Sbrendan * label, and make sure that the label is sane and that it hasn't been 14505450Sbrendan * repurposed to another pool. 14512082Seschrock */ 14522082Seschrock int 14535450Sbrendan vdev_validate_aux(vdev_t *vd) 14542082Seschrock { 14552082Seschrock nvlist_t *label; 14562082Seschrock uint64_t guid, version; 14572082Seschrock uint64_t state; 14582082Seschrock 14596643Seschrock if (vdev_is_dead(vd)) 14606643Seschrock return (0); 14616643Seschrock 14622082Seschrock if ((label = vdev_label_read_config(vd)) == NULL) { 14632082Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 14642082Seschrock VDEV_AUX_CORRUPT_DATA); 14652082Seschrock return (-1); 14662082Seschrock } 14672082Seschrock 14682082Seschrock if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 14694577Sahrens version > SPA_VERSION || 14702082Seschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 14712082Seschrock guid != vd->vdev_guid || 14722082Seschrock nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 14732082Seschrock vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 14742082Seschrock VDEV_AUX_CORRUPT_DATA); 14752082Seschrock nvlist_free(label); 14762082Seschrock return (-1); 14772082Seschrock } 14782082Seschrock 14792082Seschrock /* 14802082Seschrock * We don't actually check the pool state here. If it's in fact in 14812082Seschrock * use by another pool, we update this fact on the fly when requested. 14822082Seschrock */ 14832082Seschrock nvlist_free(label); 14842082Seschrock return (0); 14852082Seschrock } 14862082Seschrock 1487789Sahrens void 1488789Sahrens vdev_sync_done(vdev_t *vd, uint64_t txg) 1489789Sahrens { 1490789Sahrens metaslab_t *msp; 1491789Sahrens 1492789Sahrens dprintf("%s txg %llu\n", vdev_description(vd), txg); 1493789Sahrens 1494789Sahrens while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 1495789Sahrens metaslab_sync_done(msp, txg); 1496789Sahrens } 1497789Sahrens 1498789Sahrens void 1499789Sahrens vdev_sync(vdev_t *vd, uint64_t txg) 1500789Sahrens { 1501789Sahrens spa_t *spa = vd->vdev_spa; 1502789Sahrens vdev_t *lvd; 1503789Sahrens metaslab_t *msp; 15041732Sbonwick dmu_tx_t *tx; 1505789Sahrens 1506789Sahrens dprintf("%s txg %llu pass %d\n", 1507789Sahrens vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1508789Sahrens 15091732Sbonwick if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 15101732Sbonwick ASSERT(vd == vd->vdev_top); 15111732Sbonwick tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 15121732Sbonwick vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 15131732Sbonwick DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 15141732Sbonwick ASSERT(vd->vdev_ms_array != 0); 15151732Sbonwick vdev_config_dirty(vd); 15161732Sbonwick dmu_tx_commit(tx); 15171732Sbonwick } 1518789Sahrens 15191732Sbonwick while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 1520789Sahrens metaslab_sync(msp, txg); 15211732Sbonwick (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 15221732Sbonwick } 1523789Sahrens 1524789Sahrens while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 1525789Sahrens vdev_dtl_sync(lvd, txg); 1526789Sahrens 1527789Sahrens (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 1528789Sahrens } 1529789Sahrens 1530789Sahrens uint64_t 1531789Sahrens vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 1532789Sahrens { 1533789Sahrens return (vd->vdev_ops->vdev_op_asize(vd, psize)); 1534789Sahrens } 1535789Sahrens 1536789Sahrens const char * 1537789Sahrens vdev_description(vdev_t *vd) 1538789Sahrens { 1539789Sahrens if (vd == NULL || vd->vdev_ops == NULL) 1540789Sahrens return ("<unknown>"); 1541789Sahrens 1542789Sahrens if (vd->vdev_path != NULL) 1543789Sahrens return (vd->vdev_path); 1544789Sahrens 1545789Sahrens if (vd->vdev_parent == NULL) 1546789Sahrens return (spa_name(vd->vdev_spa)); 1547789Sahrens 1548789Sahrens return (vd->vdev_ops->vdev_op_type); 1549789Sahrens } 1550789Sahrens 15514451Seschrock /* 15524451Seschrock * Mark the given vdev faulted. A faulted vdev behaves as if the device could 15534451Seschrock * not be opened, and no I/O is attempted. 15544451Seschrock */ 1555789Sahrens int 15564451Seschrock vdev_fault(spa_t *spa, uint64_t guid) 15574451Seschrock { 15586643Seschrock vdev_t *vd; 15594451Seschrock uint64_t txg; 15604451Seschrock 15615329Sgw25295 /* 15625329Sgw25295 * Disregard a vdev fault request if the pool has 15635329Sgw25295 * experienced a complete failure. 15645329Sgw25295 * 15655329Sgw25295 * XXX - We do this here so that we don't hold the 15665329Sgw25295 * spa_namespace_lock in the event that we can't get 15675329Sgw25295 * the RW_WRITER spa_config_lock. 15685329Sgw25295 */ 15695329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 15705329Sgw25295 return (EIO); 15715329Sgw25295 15724451Seschrock txg = spa_vdev_enter(spa); 15734451Seschrock 15746643Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 15754451Seschrock return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 15764451Seschrock if (!vd->vdev_ops->vdev_op_leaf) 15774451Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 15784451Seschrock 15794451Seschrock /* 15804451Seschrock * Faulted state takes precedence over degraded. 15814451Seschrock */ 15824451Seschrock vd->vdev_faulted = 1ULL; 15834451Seschrock vd->vdev_degraded = 0ULL; 15844451Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, 15854451Seschrock VDEV_AUX_ERR_EXCEEDED); 15864451Seschrock 15874451Seschrock /* 15884451Seschrock * If marking the vdev as faulted cause the toplevel vdev to become 15894451Seschrock * unavailable, then back off and simply mark the vdev as degraded 15904451Seschrock * instead. 15914451Seschrock */ 15926643Seschrock if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { 15934451Seschrock vd->vdev_degraded = 1ULL; 15944451Seschrock vd->vdev_faulted = 0ULL; 15954451Seschrock 15964451Seschrock /* 15974451Seschrock * If we reopen the device and it's not dead, only then do we 15984451Seschrock * mark it degraded. 15994451Seschrock */ 16004451Seschrock vdev_reopen(vd); 16014451Seschrock 16025329Sgw25295 if (vdev_readable(vd)) { 16034451Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 16044451Seschrock VDEV_AUX_ERR_EXCEEDED); 16054451Seschrock } 16064451Seschrock } 16074451Seschrock 16084451Seschrock vdev_config_dirty(vd->vdev_top); 16094451Seschrock 16104451Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 16114451Seschrock 16124451Seschrock return (0); 16134451Seschrock } 16144451Seschrock 16154451Seschrock /* 16164451Seschrock * Mark the given vdev degraded. A degraded vdev is purely an indication to the 16174451Seschrock * user that something is wrong. The vdev continues to operate as normal as far 16184451Seschrock * as I/O is concerned. 16194451Seschrock */ 16204451Seschrock int 16214451Seschrock vdev_degrade(spa_t *spa, uint64_t guid) 16224451Seschrock { 16236643Seschrock vdev_t *vd; 16244451Seschrock uint64_t txg; 16254451Seschrock 16265329Sgw25295 /* 16275329Sgw25295 * Disregard a vdev fault request if the pool has 16285329Sgw25295 * experienced a complete failure. 16295329Sgw25295 * 16305329Sgw25295 * XXX - We do this here so that we don't hold the 16315329Sgw25295 * spa_namespace_lock in the event that we can't get 16325329Sgw25295 * the RW_WRITER spa_config_lock. 16335329Sgw25295 */ 16345329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 16355329Sgw25295 return (EIO); 16365329Sgw25295 16374451Seschrock txg = spa_vdev_enter(spa); 16384451Seschrock 16396643Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 16404451Seschrock return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 16414451Seschrock if (!vd->vdev_ops->vdev_op_leaf) 16424451Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 16434451Seschrock 16444451Seschrock /* 16454451Seschrock * If the vdev is already faulted, then don't do anything. 16464451Seschrock */ 16474451Seschrock if (vd->vdev_faulted || vd->vdev_degraded) { 16484451Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 16494451Seschrock return (0); 16504451Seschrock } 16514451Seschrock 16524451Seschrock vd->vdev_degraded = 1ULL; 16534451Seschrock if (!vdev_is_dead(vd)) 16544451Seschrock vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 16554451Seschrock VDEV_AUX_ERR_EXCEEDED); 16564451Seschrock vdev_config_dirty(vd->vdev_top); 16574451Seschrock 16584451Seschrock (void) spa_vdev_exit(spa, NULL, txg, 0); 16594451Seschrock 16604451Seschrock return (0); 16614451Seschrock } 16624451Seschrock 16634451Seschrock /* 16644451Seschrock * Online the given vdev. If 'unspare' is set, it implies two things. First, 16654451Seschrock * any attached spare device should be detached when the device finishes 16664451Seschrock * resilvering. Second, the online should be treated like a 'test' online case, 16674451Seschrock * so no FMA events are generated if the device fails to open. 16684451Seschrock */ 16694451Seschrock int 16704451Seschrock vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, 16714451Seschrock vdev_state_t *newstate) 1672789Sahrens { 16736643Seschrock vdev_t *vd; 16741485Slling uint64_t txg; 1675789Sahrens 16765329Sgw25295 /* 16775329Sgw25295 * Disregard a vdev fault request if the pool has 16785329Sgw25295 * experienced a complete failure. 16795329Sgw25295 * 16805329Sgw25295 * XXX - We do this here so that we don't hold the 16815329Sgw25295 * spa_namespace_lock in the event that we can't get 16825329Sgw25295 * the RW_WRITER spa_config_lock. 16835329Sgw25295 */ 16845329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 16855329Sgw25295 return (EIO); 16865329Sgw25295 16871485Slling txg = spa_vdev_enter(spa); 16881485Slling 16896643Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 16901485Slling return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1691789Sahrens 16921585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 16931585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 16941585Sbonwick 1695789Sahrens vd->vdev_offline = B_FALSE; 16961485Slling vd->vdev_tmpoffline = B_FALSE; 16974451Seschrock vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? 16984451Seschrock B_TRUE : B_FALSE; 16994451Seschrock vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? 17004451Seschrock B_TRUE : B_FALSE; 17011544Seschrock vdev_reopen(vd->vdev_top); 17024451Seschrock vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 17034451Seschrock 17044451Seschrock if (newstate) 17054451Seschrock *newstate = vd->vdev_state; 17064451Seschrock if ((flags & ZFS_ONLINE_UNSPARE) && 17074451Seschrock !vdev_is_dead(vd) && vd->vdev_parent && 17084451Seschrock vd->vdev_parent->vdev_ops == &vdev_spare_ops && 17094451Seschrock vd->vdev_parent->vdev_child[0] == vd) 17104451Seschrock vd->vdev_unspare = B_TRUE; 1711789Sahrens 17121485Slling vdev_config_dirty(vd->vdev_top); 17131485Slling 17141485Slling (void) spa_vdev_exit(spa, NULL, txg, 0); 1715789Sahrens 17164451Seschrock /* 17174451Seschrock * Must hold spa_namespace_lock in order to post resilver sysevent 17184451Seschrock * w/pool name. 17194451Seschrock */ 17204451Seschrock mutex_enter(&spa_namespace_lock); 1721*7046Sahrens VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); 17224451Seschrock mutex_exit(&spa_namespace_lock); 1723789Sahrens 1724789Sahrens return (0); 1725789Sahrens } 1726789Sahrens 1727789Sahrens int 17284451Seschrock vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 1729789Sahrens { 17306643Seschrock vdev_t *vd; 17311485Slling uint64_t txg; 1732789Sahrens 17335329Sgw25295 /* 17345329Sgw25295 * Disregard a vdev fault request if the pool has 17355329Sgw25295 * experienced a complete failure. 17365329Sgw25295 * 17375329Sgw25295 * XXX - We do this here so that we don't hold the 17385329Sgw25295 * spa_namespace_lock in the event that we can't get 17395329Sgw25295 * the RW_WRITER spa_config_lock. 17405329Sgw25295 */ 17415329Sgw25295 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 17425329Sgw25295 return (EIO); 17435329Sgw25295 17441485Slling txg = spa_vdev_enter(spa); 1745789Sahrens 17466643Seschrock if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 17471485Slling return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1748789Sahrens 17491585Sbonwick if (!vd->vdev_ops->vdev_op_leaf) 17501585Sbonwick return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 17511585Sbonwick 1752789Sahrens /* 17531732Sbonwick * If the device isn't already offline, try to offline it. 1754789Sahrens */ 17551732Sbonwick if (!vd->vdev_offline) { 17561732Sbonwick /* 17571732Sbonwick * If this device's top-level vdev has a non-empty DTL, 17581732Sbonwick * don't allow the device to be offlined. 17591732Sbonwick * 17601732Sbonwick * XXX -- make this more precise by allowing the offline 17611732Sbonwick * as long as the remaining devices don't have any DTL holes. 17621732Sbonwick */ 17631732Sbonwick if (vd->vdev_top->vdev_dtl_map.sm_space != 0) 17641732Sbonwick return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1765789Sahrens 17661732Sbonwick /* 17671732Sbonwick * Offline this device and reopen its top-level vdev. 17681732Sbonwick * If this action results in the top-level vdev becoming 17691732Sbonwick * unusable, undo it and fail the request. 17701732Sbonwick */ 17711732Sbonwick vd->vdev_offline = B_TRUE; 17721544Seschrock vdev_reopen(vd->vdev_top); 17736643Seschrock if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { 17741732Sbonwick vd->vdev_offline = B_FALSE; 17751732Sbonwick vdev_reopen(vd->vdev_top); 17761732Sbonwick return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 17771732Sbonwick } 1778789Sahrens } 1779789Sahrens 17804451Seschrock vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? 17814451Seschrock B_TRUE : B_FALSE; 17821732Sbonwick 17831732Sbonwick vdev_config_dirty(vd->vdev_top); 17841485Slling 17851485Slling return (spa_vdev_exit(spa, NULL, txg, 0)); 1786789Sahrens } 1787789Sahrens 17881544Seschrock /* 17891544Seschrock * Clear the error counts associated with this vdev. Unlike vdev_online() and 17901544Seschrock * vdev_offline(), we assume the spa config is locked. We also clear all 17911544Seschrock * children. If 'vd' is NULL, then the user wants to clear all vdevs. 17925329Sgw25295 * If reopen is specified then attempt to reopen the vdev if the vdev is 17935329Sgw25295 * faulted or degraded. 17941544Seschrock */ 17951544Seschrock void 17965329Sgw25295 vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted) 1797789Sahrens { 17981544Seschrock int c; 1799789Sahrens 18001544Seschrock if (vd == NULL) 18011544Seschrock vd = spa->spa_root_vdev; 1802789Sahrens 18031544Seschrock vd->vdev_stat.vs_read_errors = 0; 18041544Seschrock vd->vdev_stat.vs_write_errors = 0; 18051544Seschrock vd->vdev_stat.vs_checksum_errors = 0; 18065329Sgw25295 vd->vdev_is_failing = B_FALSE; 1807789Sahrens 18081544Seschrock for (c = 0; c < vd->vdev_children; c++) 18095329Sgw25295 vdev_clear(spa, vd->vdev_child[c], reopen_wanted); 18104451Seschrock 18114451Seschrock /* 18126959Sek110237 * If we're in the FAULTED state or have experienced failed I/O, then 18136959Sek110237 * clear the persistent state and attempt to reopen the device. We 18146959Sek110237 * also mark the vdev config dirty, so that the new faulted state is 18156959Sek110237 * written out to disk. 18164451Seschrock */ 18176959Sek110237 if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded || 18186959Sek110237 vd->vdev_stat.vs_aux == VDEV_AUX_IO_FAILURE)) { 18196959Sek110237 boolean_t resilver = (vd->vdev_faulted || vd->vdev_degraded); 18206959Sek110237 18214451Seschrock vd->vdev_faulted = vd->vdev_degraded = 0; 18224451Seschrock vdev_reopen(vd); 18234451Seschrock vdev_config_dirty(vd->vdev_top); 18244451Seschrock 18256959Sek110237 if (resilver && vd->vdev_aux == NULL && !vdev_is_dead(vd)) 18264808Sek110237 spa_async_request(spa, SPA_ASYNC_RESILVER); 18274451Seschrock 18284451Seschrock spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 18294451Seschrock } 1830789Sahrens } 1831789Sahrens 1832789Sahrens int 18335329Sgw25295 vdev_readable(vdev_t *vd) 18345329Sgw25295 { 18355329Sgw25295 /* XXPOLICY */ 18365329Sgw25295 return (!vdev_is_dead(vd)); 18375329Sgw25295 } 18385329Sgw25295 18395329Sgw25295 int 18405329Sgw25295 vdev_writeable(vdev_t *vd) 18415329Sgw25295 { 18425369Sgw25295 return (!vdev_is_dead(vd) && !vd->vdev_is_failing); 18435329Sgw25295 } 18445329Sgw25295 18455329Sgw25295 int 1846789Sahrens vdev_is_dead(vdev_t *vd) 1847789Sahrens { 18486523Sek110237 /* 18496523Sek110237 * If the vdev experienced I/O failures, then the vdev is marked 18506523Sek110237 * as faulted (VDEV_STATE_FAULTED) for status output and FMA; however, 18516523Sek110237 * we need to allow access to the vdev for resumed I/Os (see 18526523Sek110237 * zio_vdev_resume_io() ). 18536523Sek110237 */ 18546523Sek110237 return (vd->vdev_state < VDEV_STATE_DEGRADED && 18556523Sek110237 vd->vdev_stat.vs_aux != VDEV_AUX_IO_FAILURE); 1856789Sahrens } 1857789Sahrens 1858789Sahrens int 1859789Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio) 1860789Sahrens { 1861789Sahrens int error = 0; 1862789Sahrens 1863789Sahrens if (vd->vdev_fault_mode == VDEV_FAULT_NONE) 1864789Sahrens return (0); 1865789Sahrens 1866789Sahrens if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) 1867789Sahrens return (0); 1868789Sahrens 1869789Sahrens switch (vd->vdev_fault_mode) { 1870789Sahrens case VDEV_FAULT_RANDOM: 1871789Sahrens if (spa_get_random(vd->vdev_fault_arg) == 0) 1872789Sahrens error = EIO; 1873789Sahrens break; 1874789Sahrens 1875789Sahrens case VDEV_FAULT_COUNT: 1876789Sahrens if ((int64_t)--vd->vdev_fault_arg <= 0) 1877789Sahrens vd->vdev_fault_mode = VDEV_FAULT_NONE; 1878789Sahrens error = EIO; 1879789Sahrens break; 1880789Sahrens } 1881789Sahrens 1882789Sahrens return (error); 1883789Sahrens } 1884789Sahrens 1885789Sahrens /* 1886789Sahrens * Get statistics for the given vdev. 1887789Sahrens */ 1888789Sahrens void 1889789Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 1890789Sahrens { 1891789Sahrens vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 1892789Sahrens int c, t; 1893789Sahrens 1894789Sahrens mutex_enter(&vd->vdev_stat_lock); 1895789Sahrens bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 1896*7046Sahrens vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; 1897789Sahrens vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 1898789Sahrens vs->vs_state = vd->vdev_state; 18991175Slling vs->vs_rsize = vdev_get_rsize(vd); 1900789Sahrens mutex_exit(&vd->vdev_stat_lock); 1901789Sahrens 1902789Sahrens /* 1903789Sahrens * If we're getting stats on the root vdev, aggregate the I/O counts 1904789Sahrens * over all top-level vdevs (i.e. the direct children of the root). 1905789Sahrens */ 1906789Sahrens if (vd == rvd) { 1907789Sahrens for (c = 0; c < rvd->vdev_children; c++) { 1908789Sahrens vdev_t *cvd = rvd->vdev_child[c]; 1909789Sahrens vdev_stat_t *cvs = &cvd->vdev_stat; 1910789Sahrens 1911789Sahrens mutex_enter(&vd->vdev_stat_lock); 1912789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 1913789Sahrens vs->vs_ops[t] += cvs->vs_ops[t]; 1914789Sahrens vs->vs_bytes[t] += cvs->vs_bytes[t]; 1915789Sahrens } 1916789Sahrens vs->vs_read_errors += cvs->vs_read_errors; 1917789Sahrens vs->vs_write_errors += cvs->vs_write_errors; 1918789Sahrens vs->vs_checksum_errors += cvs->vs_checksum_errors; 1919789Sahrens vs->vs_scrub_examined += cvs->vs_scrub_examined; 1920789Sahrens mutex_exit(&vd->vdev_stat_lock); 1921789Sahrens } 1922789Sahrens } 1923789Sahrens } 1924789Sahrens 1925789Sahrens void 19265450Sbrendan vdev_clear_stats(vdev_t *vd) 19275450Sbrendan { 19285450Sbrendan mutex_enter(&vd->vdev_stat_lock); 19295450Sbrendan vd->vdev_stat.vs_space = 0; 19305450Sbrendan vd->vdev_stat.vs_dspace = 0; 19315450Sbrendan vd->vdev_stat.vs_alloc = 0; 19325450Sbrendan mutex_exit(&vd->vdev_stat_lock); 19335450Sbrendan } 19345450Sbrendan 19355450Sbrendan void 1936789Sahrens vdev_stat_update(zio_t *zio) 1937789Sahrens { 1938789Sahrens vdev_t *vd = zio->io_vd; 1939789Sahrens vdev_t *pvd; 1940789Sahrens uint64_t txg = zio->io_txg; 1941789Sahrens vdev_stat_t *vs = &vd->vdev_stat; 1942789Sahrens zio_type_t type = zio->io_type; 1943789Sahrens int flags = zio->io_flags; 1944789Sahrens 1945789Sahrens if (zio->io_error == 0) { 1946789Sahrens if (!(flags & ZIO_FLAG_IO_BYPASS)) { 1947789Sahrens mutex_enter(&vd->vdev_stat_lock); 1948789Sahrens vs->vs_ops[type]++; 1949789Sahrens vs->vs_bytes[type] += zio->io_size; 1950789Sahrens mutex_exit(&vd->vdev_stat_lock); 1951789Sahrens } 1952789Sahrens if ((flags & ZIO_FLAG_IO_REPAIR) && 1953789Sahrens zio->io_delegate_list == NULL) { 1954789Sahrens mutex_enter(&vd->vdev_stat_lock); 19551807Sbonwick if (flags & ZIO_FLAG_SCRUB_THREAD) 1956789Sahrens vs->vs_scrub_repaired += zio->io_size; 1957789Sahrens else 1958789Sahrens vs->vs_self_healed += zio->io_size; 1959789Sahrens mutex_exit(&vd->vdev_stat_lock); 1960789Sahrens } 1961789Sahrens return; 1962789Sahrens } 1963789Sahrens 1964789Sahrens if (flags & ZIO_FLAG_SPECULATIVE) 1965789Sahrens return; 1966789Sahrens 19675329Sgw25295 if (vdev_readable(vd)) { 1968789Sahrens mutex_enter(&vd->vdev_stat_lock); 1969789Sahrens if (type == ZIO_TYPE_READ) { 1970789Sahrens if (zio->io_error == ECKSUM) 1971789Sahrens vs->vs_checksum_errors++; 1972789Sahrens else 1973789Sahrens vs->vs_read_errors++; 1974789Sahrens } 1975789Sahrens if (type == ZIO_TYPE_WRITE) 1976789Sahrens vs->vs_write_errors++; 1977789Sahrens mutex_exit(&vd->vdev_stat_lock); 1978789Sahrens } 1979789Sahrens 1980789Sahrens if (type == ZIO_TYPE_WRITE) { 1981789Sahrens if (txg == 0 || vd->vdev_children != 0) 1982789Sahrens return; 19831807Sbonwick if (flags & ZIO_FLAG_SCRUB_THREAD) { 1984789Sahrens ASSERT(flags & ZIO_FLAG_IO_REPAIR); 1985789Sahrens for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1986789Sahrens vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); 1987789Sahrens } 1988789Sahrens if (!(flags & ZIO_FLAG_IO_REPAIR)) { 1989789Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) 1990789Sahrens return; 19911732Sbonwick vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1992789Sahrens for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1993789Sahrens vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); 1994789Sahrens } 1995789Sahrens } 1996789Sahrens } 1997789Sahrens 1998789Sahrens void 1999789Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) 2000789Sahrens { 2001789Sahrens int c; 2002789Sahrens vdev_stat_t *vs = &vd->vdev_stat; 2003789Sahrens 2004789Sahrens for (c = 0; c < vd->vdev_children; c++) 2005789Sahrens vdev_scrub_stat_update(vd->vdev_child[c], type, complete); 2006789Sahrens 2007789Sahrens mutex_enter(&vd->vdev_stat_lock); 2008789Sahrens 2009789Sahrens if (type == POOL_SCRUB_NONE) { 2010789Sahrens /* 2011789Sahrens * Update completion and end time. Leave everything else alone 2012789Sahrens * so we can report what happened during the previous scrub. 2013789Sahrens */ 2014789Sahrens vs->vs_scrub_complete = complete; 2015789Sahrens vs->vs_scrub_end = gethrestime_sec(); 2016789Sahrens } else { 2017789Sahrens vs->vs_scrub_type = type; 2018789Sahrens vs->vs_scrub_complete = 0; 2019789Sahrens vs->vs_scrub_examined = 0; 2020789Sahrens vs->vs_scrub_repaired = 0; 2021789Sahrens vs->vs_scrub_start = gethrestime_sec(); 2022789Sahrens vs->vs_scrub_end = 0; 2023789Sahrens } 2024789Sahrens 2025789Sahrens mutex_exit(&vd->vdev_stat_lock); 2026789Sahrens } 2027789Sahrens 2028789Sahrens /* 2029789Sahrens * Update the in-core space usage stats for this vdev and the root vdev. 2030789Sahrens */ 2031789Sahrens void 20325450Sbrendan vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, 20335450Sbrendan boolean_t update_root) 2034789Sahrens { 20354527Sperrin int64_t dspace_delta = space_delta; 20364527Sperrin spa_t *spa = vd->vdev_spa; 20374527Sperrin vdev_t *rvd = spa->spa_root_vdev; 20384527Sperrin 2039789Sahrens ASSERT(vd == vd->vdev_top); 20404527Sperrin 20414527Sperrin /* 20424527Sperrin * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 20434527Sperrin * factor. We must calculate this here and not at the root vdev 20444527Sperrin * because the root vdev's psize-to-asize is simply the max of its 20454527Sperrin * childrens', thus not accurate enough for us. 20464527Sperrin */ 20474527Sperrin ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 20484527Sperrin dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 20494527Sperrin vd->vdev_deflate_ratio; 2050789Sahrens 20514527Sperrin mutex_enter(&vd->vdev_stat_lock); 20524527Sperrin vd->vdev_stat.vs_space += space_delta; 20534527Sperrin vd->vdev_stat.vs_alloc += alloc_delta; 20544527Sperrin vd->vdev_stat.vs_dspace += dspace_delta; 20554527Sperrin mutex_exit(&vd->vdev_stat_lock); 20562082Seschrock 20575450Sbrendan if (update_root) { 20585450Sbrendan ASSERT(rvd == vd->vdev_parent); 20595450Sbrendan ASSERT(vd->vdev_ms_count != 0); 20604527Sperrin 20615450Sbrendan /* 20625450Sbrendan * Don't count non-normal (e.g. intent log) space as part of 20635450Sbrendan * the pool's capacity. 20645450Sbrendan */ 20655450Sbrendan if (vd->vdev_mg->mg_class != spa->spa_normal_class) 20665450Sbrendan return; 20675450Sbrendan 20685450Sbrendan mutex_enter(&rvd->vdev_stat_lock); 20695450Sbrendan rvd->vdev_stat.vs_space += space_delta; 20705450Sbrendan rvd->vdev_stat.vs_alloc += alloc_delta; 20715450Sbrendan rvd->vdev_stat.vs_dspace += dspace_delta; 20725450Sbrendan mutex_exit(&rvd->vdev_stat_lock); 20735450Sbrendan } 2074789Sahrens } 2075789Sahrens 2076789Sahrens /* 2077789Sahrens * Mark a top-level vdev's config as dirty, placing it on the dirty list 2078789Sahrens * so that it will be written out next time the vdev configuration is synced. 2079789Sahrens * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 2080789Sahrens */ 2081789Sahrens void 2082789Sahrens vdev_config_dirty(vdev_t *vd) 2083789Sahrens { 2084789Sahrens spa_t *spa = vd->vdev_spa; 2085789Sahrens vdev_t *rvd = spa->spa_root_vdev; 2086789Sahrens int c; 2087789Sahrens 20881601Sbonwick /* 20896643Seschrock * If this is an aux vdev (as with l2cache devices), then we update the 20906643Seschrock * vdev config manually and set the sync flag. 20916643Seschrock */ 20926643Seschrock if (vd->vdev_aux != NULL) { 20936643Seschrock spa_aux_vdev_t *sav = vd->vdev_aux; 20946643Seschrock nvlist_t **aux; 20956643Seschrock uint_t naux; 20966643Seschrock 20976643Seschrock for (c = 0; c < sav->sav_count; c++) { 20986643Seschrock if (sav->sav_vdevs[c] == vd) 20996643Seschrock break; 21006643Seschrock } 21016643Seschrock 21026643Seschrock ASSERT(c < sav->sav_count); 21036643Seschrock sav->sav_sync = B_TRUE; 21046643Seschrock 21056643Seschrock VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 21066643Seschrock ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); 21076643Seschrock 21086643Seschrock ASSERT(c < naux); 21096643Seschrock 21106643Seschrock /* 21116643Seschrock * Setting the nvlist in the middle if the array is a little 21126643Seschrock * sketchy, but it will work. 21136643Seschrock */ 21146643Seschrock nvlist_free(aux[c]); 21156643Seschrock aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); 21166643Seschrock 21176643Seschrock return; 21186643Seschrock } 21196643Seschrock 21206643Seschrock /* 21211601Sbonwick * The dirty list is protected by the config lock. The caller must 21221601Sbonwick * either hold the config lock as writer, or must be the sync thread 21231601Sbonwick * (which holds the lock as reader). There's only one sync thread, 21241601Sbonwick * so this is sufficient to ensure mutual exclusion. 21251601Sbonwick */ 21261601Sbonwick ASSERT(spa_config_held(spa, RW_WRITER) || 21271601Sbonwick dsl_pool_sync_context(spa_get_dsl(spa))); 21281601Sbonwick 2129789Sahrens if (vd == rvd) { 2130789Sahrens for (c = 0; c < rvd->vdev_children; c++) 2131789Sahrens vdev_config_dirty(rvd->vdev_child[c]); 2132789Sahrens } else { 2133789Sahrens ASSERT(vd == vd->vdev_top); 2134789Sahrens 21351732Sbonwick if (!list_link_active(&vd->vdev_dirty_node)) 2136789Sahrens list_insert_head(&spa->spa_dirty_list, vd); 2137789Sahrens } 2138789Sahrens } 2139789Sahrens 2140789Sahrens void 2141789Sahrens vdev_config_clean(vdev_t *vd) 2142789Sahrens { 21431601Sbonwick spa_t *spa = vd->vdev_spa; 21441601Sbonwick 21451601Sbonwick ASSERT(spa_config_held(spa, RW_WRITER) || 21461601Sbonwick dsl_pool_sync_context(spa_get_dsl(spa))); 21471601Sbonwick 21481732Sbonwick ASSERT(list_link_active(&vd->vdev_dirty_node)); 21491601Sbonwick list_remove(&spa->spa_dirty_list, vd); 2150789Sahrens } 2151789Sahrens 21526523Sek110237 /* 21536523Sek110237 * Propagate vdev state up from children to parent. 21546523Sek110237 */ 21551775Sbillm void 21561775Sbillm vdev_propagate_state(vdev_t *vd) 21571775Sbillm { 21581775Sbillm vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 21591775Sbillm int degraded = 0, faulted = 0; 21601775Sbillm int corrupted = 0; 21611775Sbillm int c; 21621775Sbillm vdev_t *child; 21631775Sbillm 21644451Seschrock if (vd->vdev_children > 0) { 21654451Seschrock for (c = 0; c < vd->vdev_children; c++) { 21664451Seschrock child = vd->vdev_child[c]; 21676976Seschrock 21686976Seschrock if ((vdev_is_dead(child) && !vdev_readable(child)) || 21696976Seschrock child->vdev_stat.vs_aux == VDEV_AUX_IO_FAILURE) { 21706976Seschrock /* 21716976Seschrock * Root special: if there is a top-level log 21726976Seschrock * device, treat the root vdev as if it were 21736976Seschrock * degraded. 21746976Seschrock */ 21756976Seschrock if (child->vdev_islog && vd == rvd) 21766976Seschrock degraded++; 21776976Seschrock else 21786976Seschrock faulted++; 21796976Seschrock } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 21804451Seschrock degraded++; 21816976Seschrock } 21824451Seschrock 21834451Seschrock if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 21844451Seschrock corrupted++; 21854451Seschrock } 21861775Sbillm 21874451Seschrock vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 21884451Seschrock 21894451Seschrock /* 21904451Seschrock * Root special: if there is a toplevel vdev that cannot be 21914451Seschrock * opened due to corrupted metadata, then propagate the root 21924451Seschrock * vdev's aux state as 'corrupt' rather than 'insufficient 21934451Seschrock * replicas'. 21944451Seschrock */ 21954451Seschrock if (corrupted && vd == rvd && 21964451Seschrock rvd->vdev_state == VDEV_STATE_CANT_OPEN) 21974451Seschrock vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 21984451Seschrock VDEV_AUX_CORRUPT_DATA); 21991775Sbillm } 22001775Sbillm 22016976Seschrock if (vd->vdev_parent) 22024451Seschrock vdev_propagate_state(vd->vdev_parent); 22031775Sbillm } 22041775Sbillm 2205789Sahrens /* 22061544Seschrock * Set a vdev's state. If this is during an open, we don't update the parent 22071544Seschrock * state, because we're in the process of opening children depth-first. 22081544Seschrock * Otherwise, we propagate the change to the parent. 22091544Seschrock * 22101544Seschrock * If this routine places a device in a faulted state, an appropriate ereport is 22111544Seschrock * generated. 2212789Sahrens */ 2213789Sahrens void 22141544Seschrock vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 2215789Sahrens { 22161986Seschrock uint64_t save_state; 22176643Seschrock spa_t *spa = vd->vdev_spa; 22181544Seschrock 22191544Seschrock if (state == vd->vdev_state) { 22201544Seschrock vd->vdev_stat.vs_aux = aux; 2221789Sahrens return; 22221544Seschrock } 22231544Seschrock 22241986Seschrock save_state = vd->vdev_state; 2225789Sahrens 2226789Sahrens vd->vdev_state = state; 2227789Sahrens vd->vdev_stat.vs_aux = aux; 2228789Sahrens 22294451Seschrock /* 22304451Seschrock * If we are setting the vdev state to anything but an open state, then 22314451Seschrock * always close the underlying device. Otherwise, we keep accessible 22324451Seschrock * but invalid devices open forever. We don't call vdev_close() itself, 22334451Seschrock * because that implies some extra checks (offline, etc) that we don't 22344451Seschrock * want here. This is limited to leaf devices, because otherwise 22354451Seschrock * closing the device will affect other children. 22364451Seschrock */ 22375329Sgw25295 if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf) 22384451Seschrock vd->vdev_ops->vdev_op_close(vd); 22394451Seschrock 22404451Seschrock if (vd->vdev_removed && 22414451Seschrock state == VDEV_STATE_CANT_OPEN && 22424451Seschrock (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 22434451Seschrock /* 22444451Seschrock * If the previous state is set to VDEV_STATE_REMOVED, then this 22454451Seschrock * device was previously marked removed and someone attempted to 22464451Seschrock * reopen it. If this failed due to a nonexistent device, then 22474451Seschrock * keep the device in the REMOVED state. We also let this be if 22484451Seschrock * it is one of our special test online cases, which is only 22494451Seschrock * attempting to online the device and shouldn't generate an FMA 22504451Seschrock * fault. 22514451Seschrock */ 22524451Seschrock vd->vdev_state = VDEV_STATE_REMOVED; 22534451Seschrock vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 22544451Seschrock } else if (state == VDEV_STATE_REMOVED) { 22554451Seschrock /* 22564451Seschrock * Indicate to the ZFS DE that this device has been removed, and 22574451Seschrock * any recent errors should be ignored. 22584451Seschrock */ 22596643Seschrock zfs_post_remove(spa, vd); 22604451Seschrock vd->vdev_removed = B_TRUE; 22614451Seschrock } else if (state == VDEV_STATE_CANT_OPEN) { 22621544Seschrock /* 22631544Seschrock * If we fail to open a vdev during an import, we mark it as 22641544Seschrock * "not available", which signifies that it was never there to 22651544Seschrock * begin with. Failure to open such a device is not considered 22661544Seschrock * an error. 22671544Seschrock */ 22686643Seschrock if (spa->spa_load_state == SPA_LOAD_IMPORT && 22696643Seschrock !spa->spa_import_faulted && 22701986Seschrock vd->vdev_ops->vdev_op_leaf) 22711986Seschrock vd->vdev_not_present = 1; 22721986Seschrock 22731986Seschrock /* 22741986Seschrock * Post the appropriate ereport. If the 'prevstate' field is 22751986Seschrock * set to something other than VDEV_STATE_UNKNOWN, it indicates 22761986Seschrock * that this is part of a vdev_reopen(). In this case, we don't 22771986Seschrock * want to post the ereport if the device was already in the 22781986Seschrock * CANT_OPEN state beforehand. 22794451Seschrock * 22804451Seschrock * If the 'checkremove' flag is set, then this is an attempt to 22814451Seschrock * online the device in response to an insertion event. If we 22824451Seschrock * hit this case, then we have detected an insertion event for a 22834451Seschrock * faulted or offline device that wasn't in the removed state. 22844451Seschrock * In this scenario, we don't post an ereport because we are 22854451Seschrock * about to replace the device, or attempt an online with 22864451Seschrock * vdev_forcefault, which will generate the fault for us. 22871986Seschrock */ 22884451Seschrock if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 22894451Seschrock !vd->vdev_not_present && !vd->vdev_checkremove && 22906643Seschrock vd != spa->spa_root_vdev) { 22911544Seschrock const char *class; 22921544Seschrock 22931544Seschrock switch (aux) { 22941544Seschrock case VDEV_AUX_OPEN_FAILED: 22951544Seschrock class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 22961544Seschrock break; 22971544Seschrock case VDEV_AUX_CORRUPT_DATA: 22981544Seschrock class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 22991544Seschrock break; 23001544Seschrock case VDEV_AUX_NO_REPLICAS: 23011544Seschrock class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 23021544Seschrock break; 23031544Seschrock case VDEV_AUX_BAD_GUID_SUM: 23041544Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 23051544Seschrock break; 23061544Seschrock case VDEV_AUX_TOO_SMALL: 23071544Seschrock class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 23081544Seschrock break; 23091544Seschrock case VDEV_AUX_BAD_LABEL: 23101544Seschrock class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 23111544Seschrock break; 23121544Seschrock default: 23131544Seschrock class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 23141544Seschrock } 23151544Seschrock 23166643Seschrock zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 23171544Seschrock } 23184451Seschrock 23194451Seschrock /* Erase any notion of persistent removed state */ 23204451Seschrock vd->vdev_removed = B_FALSE; 23214451Seschrock } else { 23224451Seschrock vd->vdev_removed = B_FALSE; 23231544Seschrock } 23241544Seschrock 23254451Seschrock if (!isopen) 23264451Seschrock vdev_propagate_state(vd); 2327789Sahrens } 23287042Sgw25295 23297042Sgw25295 /* 23307042Sgw25295 * Check the vdev configuration to ensure that it's capable of supporting 23317042Sgw25295 * a root pool. Currently, we do not support RAID-Z or partial configuration. 23327042Sgw25295 * In addition, only a single top-level vdev is allowed and none of the leaves 23337042Sgw25295 * can be wholedisks. 23347042Sgw25295 */ 23357042Sgw25295 boolean_t 23367042Sgw25295 vdev_is_bootable(vdev_t *vd) 23377042Sgw25295 { 23387042Sgw25295 int c; 23397042Sgw25295 23407042Sgw25295 if (!vd->vdev_ops->vdev_op_leaf) { 23417042Sgw25295 char *vdev_type = vd->vdev_ops->vdev_op_type; 23427042Sgw25295 23437042Sgw25295 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 23447042Sgw25295 vd->vdev_children > 1) { 23457042Sgw25295 return (B_FALSE); 23467042Sgw25295 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 23477042Sgw25295 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 23487042Sgw25295 return (B_FALSE); 23497042Sgw25295 } 23507042Sgw25295 } else if (vd->vdev_wholedisk == 1) { 23517042Sgw25295 return (B_FALSE); 23527042Sgw25295 } 23537042Sgw25295 23547042Sgw25295 for (c = 0; c < vd->vdev_children; c++) { 23557042Sgw25295 if (!vdev_is_bootable(vd->vdev_child[c])) 23567042Sgw25295 return (B_FALSE); 23577042Sgw25295 } 23587042Sgw25295 return (B_TRUE); 23597042Sgw25295 } 2360