/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/* maximum scrub/resilver I/O queue */
int zfs_scrub_limit = 70;

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
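 * (RAID-Z supplies its own vdev_op_asize method to account for parity.)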
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the replaceable or attachable device size.
 * If the parent is a mirror or raidz, the replaceable size is the minimum
 * psize of all its children. For the rest, just return our own psize.
 *
 * e.g.
 *			psize	rsize
 * root			-	-
 *	mirror/raidz	-	-
 *	    disk1	20g	20g
 *	    disk2	40g	20g
 *	disk3		80g	80g
 */
uint64_t
vdev_get_rsize(vdev_t *vd)
{
	vdev_t *pvd, *cvd;
	uint64_t c, rsize;

	pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL or the root, just return our own psize.
	 */
	if (pvd == NULL || pvd->vdev_parent == NULL)
		return (vd->vdev_psize);

	rsize = 0;

	/*
	 * The MIN(rsize - 1, ...) + 1 form makes the initial rsize of 0
	 * wrap to UINT64_MAX, so the first child's psize always wins.
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		cvd = pvd->vdev_child[c];
		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
	}

	return (rsize);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, RW_READER) ||
	    curthread == spa->spa_scrub_thread);

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
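		/* Copy the old child array into the new one, then free it. */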
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
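 * If no guid is supplied, a unique one is generated below: unique among
 * all pools for the root vdev (whose guid doubles as the pool guid), and
 * unique within the pool for any other vdev.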
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
	}

	if (guid == 0) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			while (guid == 0 || spa_guid_exists(guid, 0))
				guid = spa_get_random(-1ULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			while (guid == 0 ||
			    spa_guid_exists(spa_guid(spa), guid))
				guid = spa_get_random(-1ULL);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (EINVAL);

	if ((ops = vdev_getops(type)) == NULL)
		return (EINVAL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
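	 * Spares and level 2 cache devices likewise supply their guid
	 * in the nvlist.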
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (EINVAL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (EINVAL);

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (ENOTSUP);

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			/*
			 * Currently, we can only support 2 parity devices.
			 */
			if (nparity == 0 || nparity > 2)
				return (EINVAL);
			/*
			 * Older versions can only support 1 parity device.
			 */
			if (nparity == 2 &&
			    spa_version(spa) < SPA_VERSION_RAID6)
				return (ENOTSUP);
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAID6)
				return (EINVAL);
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
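	 * A value of -1 lets consumers distinguish 'unknown' from an
	 * explicit B_FALSE.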
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
		    &vd->vdev_unspare);
		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.
		 */
		if (spa->spa_load_state == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

void
vdev_free(vdev_t *vd)
{
	int c;
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_dirty_node));

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
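	 * (Metaslabs exist only on top-level vdevs, hence the check below.)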
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_unload(&vd->vdev_dtl_map);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
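 * This is done when a mirror/replacing vdev is spliced in above a top-level
 * vdev or removed from above one; see vdev_add_parent() and
 * vdev_remove_parent().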
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
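 * The new vdev adopts the child's id, asize, ashift, and state.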
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	/*
	 * If we created a new toplevel vdev, then we need to change the child's
	 * vdev GUID to match the old toplevel vdev.  Otherwise, we could have
	 * detached an offline device, and when we go to import the pool we'll
	 * think we have two toplevel vdevs, instead of a different version of
	 * the same toplevel vdev.
	 */
	if (cvd->vdev_top == cvd) {
		pvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid = mvd->vdev_guid;
		cvd->vdev_guid_sum += mvd->vdev_guid;
		pvd->vdev_guid_sum += cvd->vdev_guid;
	}
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	metaslab_class_t *mc;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
		return (0);

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	if (vd->vdev_islog)
		mc = spa->spa_log_class;
	else
		mc = spa->spa_normal_class;

	if (vd->vdev_mg == NULL)
		vd->vdev_mg = metaslab_group_create(mc, vd);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

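/*
 * Check device status via the leaf type's vdev_op_probe method.
 */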
int
vdev_probe(vdev_t *vd)
{
	if (vd == NULL)
		return (EINVAL);

	/*
	 * Right now we only support status checks on the leaf vdevs.
	 */
	if (vd->vdev_ops->vdev_op_leaf)
		return (vd->vdev_ops->vdev_op_probe(vd));

	return (0);
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (ENXIO);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vd->vdev_state = VDEV_STATE_HEALTHY;
	}

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			vd->vdev_asize = asize;
		}
	}

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	error = vdev_probe(vd);
	if (error) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_OPEN_FAILED);
		return (error);
	}

	/*
	 * If this is a top-level vdev, compute the raidz-deflation
	 * ratio.  Note, we hard-code in 128k (1<<17) because it is the
	 * current "typical" blocksize.  Even if SPA_MAXBLOCKSIZE
	 * changes, this algorithm must never change, or we will
	 * inconsistently account for existing bp's.
	 */
	if (vd->vdev_top == vd) {
		vd->vdev_deflate_ratio = (1<<17) /
		    (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
	}

	/*
	 * This allows the ZFS DE to close cases appropriately.  If a device
	 * goes away and later returns, we want to close the associated case.
	 * But it's not enough to simply post this only when a device goes from
	 * CANT_OPEN -> HEALTHY.  If we reboot the system and the device is
	 * back, we also need to close the case (otherwise we will try to replay
	 * it).  So we have to post this notifier every time.  Since this only
	 * occurs during pool open or error recovery, this should not be an
	 * issue.
	 */
	zfs_post_ok(vd->vdev_spa, vd);

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int c;
	nvlist_t *label;
	uint64_t guid;
	uint64_t state;

	for (c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (EBADF);

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {

		if ((label = vdev_label_read_config(vd)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != vd->vdev_guid) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		if (spa->spa_load_state == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);
	}

	/*
	 * If we were able to open and validate a vdev that was previously
	 * marked permanently unavailable, clear that state now.
	 */
	if (vd->vdev_not_present)
		vd->vdev_not_present = 0;

	return (0);
}

/*
 * Close a virtual device.
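 * This also purges the vdev cache and records the pre-close state, so that
 * vdev_reopen() can tell whether a fault predates the reopen.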
 */
void
vdev_close(vdev_t *vd)
{
	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER));

	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	(void) vdev_validate(vd);

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.  This can't fail because
	 * there's nothing to read when creating all new metaslabs.
	 */
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
}

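/*
 * Mark the given metaslab or DTL (arg) as dirty on this top-level vdev
 * in the given txg.
 */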
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	int c;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Make sure the DTLs are always correct under the scrub lock.
	 */
	if (vd == spa->spa_root_vdev)
		mutex_enter(&spa->spa_scrub_lock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}

	if (vd == spa->spa_root_vdev)
		mutex_exit(&spa->spa_scrub_lock);
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		dprintf("detach %s committed in txg %llu\n",
		    vdev_description(vd), txg);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

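	/*
	 * Snapshot the DTL into a private space map under its own lock,
	 * then write the snapshot out to the MOS.
	 */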
	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

void
vdev_load(vdev_t *vd)
{
	int c;

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > SPA_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

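/*
 * Sync this vdev's dirty metaslabs and DTLs for the given txg,
 * allocating the metaslab array object on first use.
 */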
void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
	    VDEV_AUX_ERR_EXCEEDED);

	/*
	 * If marking the vdev as faulted causes the toplevel vdev to become
	 * unavailable, then back off and simply mark the vdev as degraded
	 * instead.
	 */
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(vd);

		if (vdev_readable(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_ERR_EXCEEDED);
		}
	}

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If the vdev is already faulted or degraded, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded) {
		(void) spa_vdev_exit(spa, NULL, txg, 0);
		return (0);
	}

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}

/*
 * Online the given vdev. If 'unspare' is set, it implies two things. First,
 * any attached spare device should be detached when the device finishes
 * resilvering. Second, the online should be treated like a 'test' online
 * case, so no FMA events are generated if the device fails to open.
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev online request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
	    B_TRUE : B_FALSE;
	vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
	    B_TRUE : B_FALSE;
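
	/*
	 * Reopen the top-level vdev so that this leaf's state is
	 * re-evaluated along with that of its siblings; the checkremove
	 * and forcefault hints apply only for the duration of this reopen.
	 */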
	vdev_reopen(vd->vdev_top);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	/*
	 * Must hold spa_namespace_lock in order to post resilver sysevent
	 * w/pool name.
	 */
	mutex_enter(&spa_namespace_lock);
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev offline request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device's top-level vdev has a non-empty DTL,
		 * don't allow the device to be offlined.
		 *
		 * XXX -- make this more precise by allowing the offline
		 * as long as the remaining devices don't have any DTL holes.
		 */
		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If this action results in the top-level vdev becoming
		 * unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(vd->vdev_top);
		if (vdev_is_dead(vd->vdev_top)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(vd->vdev_top);
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));
		}
	}

	vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
	    B_TRUE : B_FALSE;
	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * Clear the error counts associated with this vdev. Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked. We also clear all
 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
 * If 'reopen_wanted' is set, then attempt to reopen the vdev if the vdev is
 * faulted or degraded.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted)
{
	int c;

	if (vd == NULL)
		vd = spa->spa_root_vdev;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;
	vd->vdev_is_failing = B_FALSE;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c], reopen_wanted);

	/*
	 * If we're in the FAULTED or DEGRADED state, then clear the
	 * persistent state and attempt to reopen the device. We also mark
	 * the vdev config dirty, so that the new state is written out
	 * to disk.
	 */
	if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) {
		vd->vdev_faulted = vd->vdev_degraded = 0;
		vdev_reopen(vd);
		vdev_config_dirty(vd->vdev_top);

		if (vd->vdev_faulted)
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}
}

int
vdev_readable(vdev_t *vd)
{
	/* XXPOLICY */
	return (!vdev_is_dead(vd));
}

int
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_is_failing);
}

int
vdev_is_dead(vdev_t *vd)
{
	/*
	 * If the vdev experienced I/O failures, then the vdev is marked
	 * as faulted (VDEV_STATE_FAULTED) for status output and FMA; however,
	 * we need to allow access to the vdev for resumed I/Os (see
	 * zio_vdev_resume_io()).
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED &&
	    vd->vdev_stat.vs_aux != VDEV_AUX_IO_FAILURE);
}

int
vdev_error_inject(vdev_t *vd, zio_t *zio)
{
	int error = 0;

	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
		return (0);

	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
		return (0);

	switch (vd->vdev_fault_mode) {
	case VDEV_FAULT_RANDOM:
		if (spa_get_random(vd->vdev_fault_arg) == 0)
			error = EIO;
		break;

	case VDEV_FAULT_COUNT:
		if ((int64_t)--vd->vdev_fault_arg <= 0)
			vd->vdev_fault_mode = VDEV_FAULT_NONE;
		error = EIO;
		break;
	}

	return (error);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c, t;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_rsize(vd);
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
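	 * Each child's counters are read under the root vdev's stat lock
	 * rather than the child's own, so the aggregated totals are a
	 * point-in-time snapshot that may slightly lag I/Os in flight.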
	 */
	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			vs->vs_read_errors += cvs->vs_read_errors;
			vs->vs_write_errors += cvs->vs_write_errors;
			vs->vs_checksum_errors += cvs->vs_checksum_errors;
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			vs->vs_scrub_errors += cvs->vs_scrub_errors;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update I/O and error statistics for the vdev this zio targeted, and
 * record any failed non-repair write in the DTLs of the vdev and all
 * of its ancestors.
 */
void
vdev_stat_update(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	if (zio->io_error == 0) {
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_ops[type]++;
			vs->vs_bytes[type] += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		if ((flags & ZIO_FLAG_IO_REPAIR) &&
		    zio->io_delegate_list == NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & ZIO_FLAG_SCRUB_THREAD)
				vs->vs_scrub_repaired += zio->io_size;
			else
				vs->vs_self_healed += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	if (vdev_readable(vd)) {
		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE)
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (type == ZIO_TYPE_WRITE) {
		if (txg == 0 || vd->vdev_children != 0)
			return;
		if (flags & ZIO_FLAG_SCRUB_THREAD) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
		}
		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
				return;
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
		}
	}
}

void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	int c;
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time. Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_errors = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
    boolean_t update_root)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z)
	 * space-expansion factor. We must calculate this here and not at
	 * the root vdev because the root vdev's psize-to-asize is simply
	 * the max of its children's, thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	if (update_root) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		/*
		 * Don't count non-normal (e.g. intent log) space as part of
		 * the pool's capacity.
		 */
		if (vd->vdev_mg->mg_class != spa->spa_normal_class)
			return;

		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	/*
	 * The dirty list is protected by the config lock. The caller must
	 * either hold the config lock as writer, or must be the sync thread
	 * (which holds the lock as reader). There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_dirty_node))
			list_insert_head(&spa->spa_dirty_list, vd);
	}
}

void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	ASSERT(list_link_active(&vd->vdev_dirty_node));
	list_remove(&spa->spa_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	int c;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];
			if (vdev_is_dead(child) && !vdev_readable(child))
				faulted++;
			else if (child->vdev_stat.vs_aux ==
			    VDEV_AUX_IO_FAILURE)
				faulted++;
			else if (child->vdev_state <= VDEV_STATE_DEGRADED)
				degraded++;

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent && !vd->vdev_islog)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state. If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport
 * is generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;

	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state,
	 * then always close the underlying device. Otherwise, we keep
	 * accessible but invalid devices open forever. We don't call
	 * vdev_close() itself, because that implies some extra checks
	 * (offline, etc) that we don't want here. This is limited to leaf
	 * devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then
		 * this device was previously marked removed and someone
		 * attempted to reopen it. If this failed due to a nonexistent
		 * device, then keep the device in the REMOVED state. We also
		 * let this be if it is one of our special test online cases,
		 * which is only attempting to online the device and shouldn't
		 * generate an FMA fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		/*
		 * Indicate to the ZFS DE that this device has been removed,
		 * and any recent errors should be ignored.
		 */
		zfs_post_remove(vd->vdev_spa, vd);
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import, we mark it as
		 * "not available", which signifies that it was never there to
		 * begin with. Failure to open such a device is not considered
		 * an error.
		 */
		if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport. If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen(). In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event. If we
		 * hit this case, then we have detected an insertion event for
		 * a faulted or offline device that wasn't in the removed
		 * state. In this scenario, we don't post an ereport because we
		 * are about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != vd->vdev_spa->spa_root_vdev) {
			const char *class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, vd->vdev_spa,
			    vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	if (!isopen)
		vdev_propagate_state(vd);
}
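
/*
 * Illustrative sketch (not part of the original file): how a caller, such
 * as an ioctl handler, might dispatch the state-change entry points above
 * given a pool and a leaf vdev guid. The 'request' and 'flags' values here
 * are hypothetical inputs; the entry points themselves return 0 on success,
 * ENODEV if the guid is unknown, and ENOTSUP if it names an interior vdev.
 *
 *	vdev_state_t newstate;
 *	int error;
 *
 *	switch (request) {
 *	case VDEV_STATE_FAULTED:
 *		error = vdev_fault(spa, guid);
 *		break;
 *	case VDEV_STATE_DEGRADED:
 *		error = vdev_degrade(spa, guid);
 *		break;
 *	case VDEV_STATE_ONLINE:
 *		error = vdev_online(spa, guid, flags, &newstate);
 *		break;
 *	case VDEV_STATE_OFFLINE:
 *		error = vdev_offline(spa, guid, flags);
 *		break;
 *	default:
 *		error = EINVAL;
 *	}
 */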