/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/* maximum scrub/resilver I/O queue */
int zfs_scrub_limit = 70;

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
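/*
 * Illustration (not part of the original source): with a top-level
 * vdev_ashift of 9 (512-byte sectors), P2ROUNDUP(1000, 1ULL << 9)
 * rounds a 1000-byte psize up to 1024.  Each child is then asked for
 * its own asize of that psize and the MAX is taken, so an interior
 * vdev's asize is governed by its largest-overhead child.
 */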
/*
 * Get the replaceable or attachable device size.
 * If the parent is a mirror or raidz, the replaceable size is the minimum
 * psize of all its children.  For the rest, just return our own psize.
 *
 * e.g.
 *			psize	rsize
 * root			-	-
 * mirror/raidz		-	-
 *     disk1		20g	20g
 *     disk2		40g	20g
 * disk3		80g	80g
 */
uint64_t
vdev_get_rsize(vdev_t *vd)
{
	vdev_t *pvd, *cvd;
	uint64_t c, rsize;

	pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL or the root, just return our own psize.
	 */
	if (pvd == NULL || pvd->vdev_parent == NULL)
		return (vd->vdev_psize);

	rsize = 0;

	for (c = 0; c < pvd->vdev_children; c++) {
		cvd = pvd->vdev_child[c];
		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
	}

	return (rsize);
}
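/*
 * Note on the MIN(rsize - 1, ...) + 1 idiom above (clarifying comment,
 * not in the original): rsize starts at 0, and the unsigned subtraction
 * makes 0 - 1 wrap to UINT64_MAX, so the first child's psize always
 * wins the MIN.  The idiom treats 0 as "no minimum seen yet" without a
 * special case; by the same arithmetic, a child psize of 0 wraps too
 * and is ignored rather than dragging the minimum to zero.
 */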
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
	}

	if (guid == 0) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			while (guid == 0 || spa_guid_exists(guid, 0))
				guid = spa_get_random(-1ULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			while (guid == 0 ||
			    spa_guid_exists(spa_guid(spa), guid))
				guid = spa_get_random(-1ULL);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}
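/*
 * Invariant worth noting (clarifying comment, not in the original): a
 * freshly allocated vdev starts with vdev_guid_sum == vdev_guid, and
 * vdev_add_child()/vdev_remove_child() fold the child's guid sum into
 * (or out of) every ancestor.  So for any vdev v,
 *
 *	v->vdev_guid_sum == v->vdev_guid + sum of children's guid sums,
 *
 * which is why vdev_free() below can assert vdev_guid_sum == vdev_guid
 * once all children are gone.  The same sum is recorded in the
 * uberblock (ub_guid_sum) so that an import can detect missing or
 * extra devices.
 */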
/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (EINVAL);

	if ((ops = vdev_getops(type)) == NULL)
		return (EINVAL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (EINVAL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (EINVAL);

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (ENOTSUP);

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			/*
			 * Currently, we can only support 2 parity devices.
			 */
			if (nparity == 0 || nparity > 2)
				return (EINVAL);
			/*
			 * Older versions can only support 1 parity device.
			 */
			if (nparity == 2 &&
			    spa_version(spa) < SPA_VERSION_RAID6)
				return (ENOTSUP);
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAID6)
				return (EINVAL);
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);
	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
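	/*
	 * For orientation (illustrative, not part of the original source;
	 * the values shown are made up): a leaf disk vdev arrives here as
	 * an nvlist along the lines of
	 *
	 *	type="disk"  id=0  guid=<64-bit guid>
	 *	path="/dev/dsk/c0t0d0s0"  whole_disk=1  ashift=9
	 *
	 * while a raidz top-level vdev carries type="raidz", nparity, and
	 * a children array of such nvlists that our caller walks.  The
	 * keys correspond to the ZPOOL_CONFIG_* lookups above.
	 */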
	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
		    &vd->vdev_unspare);
		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.
		 */
		if (spa->spa_load_state == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

void
vdev_free(vdev_t *vd)
{
	int c;
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_dirty_node));

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_unload(&vd->vdev_dtl_map);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}
/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}
/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	/*
	 * If we created a new toplevel vdev, then we need to change the child's
	 * vdev GUID to match the old toplevel vdev.  Otherwise, we could have
	 * detached an offline device, and when we go to import the pool we'll
	 * think we have two toplevel vdevs, instead of a different version of
	 * the same toplevel vdev.
	 */
	if (cvd->vdev_top == cvd) {
		pvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid = mvd->vdev_guid;
		cvd->vdev_guid_sum += mvd->vdev_guid;
		pvd->vdev_guid_sum += cvd->vdev_guid;
	}
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}
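/*
 * For orientation (clarifying note, not in the original): these two
 * helpers are the tree surgery behind attach and detach.  Attaching a
 * new disk to an existing one interposes a 'replacing' (or 'mirror')
 * vdev via vdev_add_parent() and hangs both disks under it; once the
 * resilver finishes and the old disk is detached, vdev_remove_parent()
 * collapses the now 1-way interior vdev back out of the tree.
 */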
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	metaslab_class_t *mc;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
		return (0);

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	if (vd->vdev_islog)
		mc = spa->spa_log_class;
	else
		mc = spa->spa_normal_class;

	if (vd->vdev_mg == NULL)
		vd->vdev_mg = metaslab_group_create(mc, vd);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}
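/*
 * Layout note (clarifying comment, not in the original): vdev_ms_array
 * is a DMU object in the MOS holding one uint64_t per metaslab -- the
 * object number of that metaslab's space map -- which is why the
 * dmu_read() above indexes it at m * sizeof (uint64_t).  The space map
 * header itself (space_map_obj_t: smo_object, smo_objsize, smo_alloc)
 * lives in that object's bonus buffer, fetched via dmu_bonus_hold().
 */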
int
vdev_probe(vdev_t *vd)
{
	if (vd == NULL)
		return (EINVAL);

	/*
	 * Right now we only support status checks on the leaf vdevs.
	 */
	if (vd->vdev_ops->vdev_op_leaf)
		return (vd->vdev_ops->vdev_op_probe(vd));

	return (0);
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (ENXIO);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vd->vdev_state = VDEV_STATE_HEALTHY;
	}

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;
	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			vd->vdev_asize = asize;
		}
	}

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	error = vdev_probe(vd);
	if (error) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_OPEN_FAILED);
		return (error);
	}

	/*
	 * If this is a top-level vdev, compute the raidz-deflation
	 * ratio.  Note, we hard-code in 128k (1<<17) because it is the
	 * current "typical" blocksize.  Even if SPA_MAXBLOCKSIZE
	 * changes, this algorithm must never change, or we will
	 * inconsistently account for existing bp's.
	 */
	if (vd->vdev_top == vd) {
		vd->vdev_deflate_ratio = (1<<17) /
		    (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
	}

	/*
	 * This allows the ZFS DE to close cases appropriately.  If a device
	 * goes away and later returns, we want to close the associated case.
	 * But it's not enough to simply post this only when a device goes from
	 * CANT_OPEN -> HEALTHY.  If we reboot the system and the device is
	 * back, we also need to close the case (otherwise we will try to replay
	 * it).  So we have to post this notifier every time.  Since this only
	 * occurs during pool open or error recovery, this should not be an
	 * issue.
	 */
	zfs_post_ok(vd->vdev_spa, vd);

	return (0);
}
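/*
 * Worked example for the deflate ratio computed in vdev_open() above
 * (illustration, not part of the original source): for a plain disk or
 * mirror a 128K psize maps to a 128K asize, so with SPA_MINBLOCKSHIFT
 * of 9 the ratio is 131072 / (131072 >> 9) = 512.  A raidz vdev
 * returns a larger asize for the same psize (parity overhead), giving
 * a smaller ratio; dividing by it later "deflates" raw space into the
 * space a typical 128K block actually delivers.
 */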
/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int c;
	nvlist_t *label;
	uint64_t guid;
	uint64_t state;

	for (c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (EBADF);

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {

		if ((label = vdev_label_read_config(vd)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != vd->vdev_guid) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		if (spa->spa_load_state == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);
	}

	/*
	 * If we were able to open and validate a vdev that was previously
	 * marked permanently unavailable, clear that state now.
	 */
	if (vd->vdev_not_present)
		vd->vdev_not_present = 0;

	return (0);
}
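/*
 * Clarifying note (not in the original): the label mismatches above set
 * the vdev's state to CANT_OPEN but still return 0, so one stale or
 * foreign disk degrades the pool open rather than aborting it.  The
 * EBADF return is reserved for the case the block comment describes --
 * a label proving the pool itself was since destroyed or exported.
 */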
/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER));

	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	(void) vdev_validate(vd);

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.  This can't fail because
	 * there's nothing to read when creating all new metaslabs.
	 */
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
}
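/*
 * Worked example for the sizing above (illustration, not part of the
 * original; assumes highbit() returns the 1-based position of the most
 * significant set bit): a 1 TB vdev gives asize / 200 ~= 5.5 GB, whose
 * high bit is bit 33, so ms_shift = 33 and each metaslab spans 8 GB --
 * 128 metaslabs in all, the "roughly 200" the comment aims for.  The
 * MAX() with SPA_MAXBLOCKSHIFT (17) keeps tiny vdevs from creating
 * metaslabs smaller than a single maximum-sized 128K block.
 */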
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}
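/*
 * Clarifying note (not in the original): a DTL ("dirty time log") is a
 * space map whose domain is transaction groups rather than bytes, so
 * vdev_dtl_dirty(sm, txg, size) marks txgs [txg, txg + size) as missed
 * writes.  A resilver walks exactly the txgs recorded here, which is
 * why a brief outage repairs quickly instead of copying the whole disk.
 */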
/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	int c;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Make sure the DTLs are always correct under the scrub lock.
	 */
	if (vd == spa->spa_root_vdev)
		mutex_enter(&spa->spa_scrub_lock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}

	if (vd == spa->spa_root_vdev)
		mutex_exit(&spa->spa_scrub_lock);
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		dprintf("detach %s committed in txg %llu\n",
		    vdev_description(vd), txg);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

void
vdev_load(vdev_t *vd)
{
	int c;

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}
/*
 * This special case of vdev_spare() is used for hot spares.  Its sole
 * purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_spare(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > SPA_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	spa_spare_add(vd);

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}
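/*
 * Clarifying note (not in the original): the txg_list_add() calls with
 * TXG_CLEAN(txg) in vdev_sync() above re-queue each synced metaslab
 * (and the vdev itself) on the "clean" list for this txg, so that once
 * the txg commits, vdev_sync_done() can pull them back off and run
 * metaslab_sync_done().  The lists ping-pong between sync and done
 * rather than being rebuilt each transaction group.
 */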
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

void
vdev_io_start(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
}

void
vdev_io_done(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
	    VDEV_AUX_ERR_EXCEEDED);

	/*
	 * If marking the vdev as faulted causes the toplevel vdev to become
	 * unavailable, then back off and simply mark the vdev as degraded
	 * instead.
	 */
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(vd);

		if (vdev_readable(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_ERR_EXCEEDED);
		}
	}

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
15524451Seschrock /*
15534451Seschrock  * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
15544451Seschrock  * user that something is wrong.  The vdev continues to operate as normal as far
15554451Seschrock  * as I/O is concerned.
15564451Seschrock  */
15574451Seschrock int
15584451Seschrock vdev_degrade(spa_t *spa, uint64_t guid)
15594451Seschrock {
15604451Seschrock         vdev_t *rvd, *vd;
15614451Seschrock         uint64_t txg;
15624451Seschrock 
1563*5329Sgw25295         /*
1564*5329Sgw25295          * Disregard a vdev degrade request if the pool has
1565*5329Sgw25295          * experienced a complete failure.
1566*5329Sgw25295          *
1567*5329Sgw25295          * XXX - We do this here so that we don't hold the
1568*5329Sgw25295          * spa_namespace_lock in the event that we can't get
1569*5329Sgw25295          * the RW_WRITER spa_config_lock.
1570*5329Sgw25295          */
1571*5329Sgw25295         if (spa_state(spa) == POOL_STATE_IO_FAILURE)
1572*5329Sgw25295                 return (EIO);
1573*5329Sgw25295 
15744451Seschrock         txg = spa_vdev_enter(spa);
15754451Seschrock 
15764451Seschrock         rvd = spa->spa_root_vdev;
15774451Seschrock 
15784451Seschrock         if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
15794451Seschrock                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
15804451Seschrock         if (!vd->vdev_ops->vdev_op_leaf)
15814451Seschrock                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
15824451Seschrock 
15834451Seschrock         /*
15844451Seschrock          * If the vdev is already faulted or degraded, don't do anything.
15854451Seschrock          */
15864451Seschrock         if (vd->vdev_faulted || vd->vdev_degraded) {
15874451Seschrock                 (void) spa_vdev_exit(spa, NULL, txg, 0);
15884451Seschrock                 return (0);
15894451Seschrock         }
15904451Seschrock 
15914451Seschrock         vd->vdev_degraded = 1ULL;
15924451Seschrock         if (!vdev_is_dead(vd))
15934451Seschrock                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
15944451Seschrock                     VDEV_AUX_ERR_EXCEEDED);
15954451Seschrock         vdev_config_dirty(vd->vdev_top);
15964451Seschrock 
15974451Seschrock         (void) spa_vdev_exit(spa, NULL, txg, 0);
15984451Seschrock 
15994451Seschrock         return (0);
16004451Seschrock }
16014451Seschrock 
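/*
 * Editor's note (summary of the code above, not original source):
 * vdev_fault() and vdev_degrade() share one pattern -- check for complete
 * pool failure before taking any locks, spa_vdev_enter() to grab the config
 * lock and pick a txg, mutate the leaf's state, dirty the top-level vdev's
 * config, then spa_vdev_exit() to sync the change.  ENODEV and ENOTSUP are
 * returned through that same exit path so the lock is always dropped.
 */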
16024451Seschrock /*
16034451Seschrock  * Online the given vdev.  If 'ZFS_ONLINE_UNSPARE' is set, it implies two
16044451Seschrock  * things.  First, any attached spare device should be detached when the device
16054451Seschrock  * finishes resilvering.  Second, the online should be treated like a 'test'
16064451Seschrock  * online case, so no FMA events are generated if the device fails to open.
16074451Seschrock  */
16084451Seschrock int
16094451Seschrock vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
16104451Seschrock     vdev_state_t *newstate)
1611789Sahrens {
16121485Slling         vdev_t *rvd, *vd;
16131485Slling         uint64_t txg;
1614789Sahrens 
1615*5329Sgw25295         /*
1616*5329Sgw25295          * Disregard a vdev online request if the pool has
1617*5329Sgw25295          * experienced a complete failure.
1618*5329Sgw25295          *
1619*5329Sgw25295          * XXX - We do this here so that we don't hold the
1620*5329Sgw25295          * spa_namespace_lock in the event that we can't get
1621*5329Sgw25295          * the RW_WRITER spa_config_lock.
1622*5329Sgw25295          */
1623*5329Sgw25295         if (spa_state(spa) == POOL_STATE_IO_FAILURE)
1624*5329Sgw25295                 return (EIO);
1625*5329Sgw25295 
16261485Slling         txg = spa_vdev_enter(spa);
16271485Slling 
16281485Slling         rvd = spa->spa_root_vdev;
16291585Sbonwick 
16301544Seschrock         if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
16311485Slling                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1632789Sahrens 
16331585Sbonwick         if (!vd->vdev_ops->vdev_op_leaf)
16341585Sbonwick                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
16351585Sbonwick 
1636789Sahrens         vd->vdev_offline = B_FALSE;
16371485Slling         vd->vdev_tmpoffline = B_FALSE;
16384451Seschrock         vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
16394451Seschrock             B_TRUE : B_FALSE;
16404451Seschrock         vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
16414451Seschrock             B_TRUE : B_FALSE;
16421544Seschrock         vdev_reopen(vd->vdev_top);
16434451Seschrock         vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
16444451Seschrock 
16454451Seschrock         if (newstate)
16464451Seschrock                 *newstate = vd->vdev_state;
16474451Seschrock         if ((flags & ZFS_ONLINE_UNSPARE) &&
16484451Seschrock             !vdev_is_dead(vd) && vd->vdev_parent &&
16494451Seschrock             vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
16504451Seschrock             vd->vdev_parent->vdev_child[0] == vd)
16514451Seschrock                 vd->vdev_unspare = B_TRUE;
1652789Sahrens 
16531485Slling         vdev_config_dirty(vd->vdev_top);
16541485Slling 
16551485Slling         (void) spa_vdev_exit(spa, NULL, txg, 0);
1656789Sahrens 
16574451Seschrock         /*
16584451Seschrock          * Must hold spa_namespace_lock in order to post the resilver
16594451Seschrock          * sysevent with the pool name.
16604451Seschrock          */
16614451Seschrock         mutex_enter(&spa_namespace_lock);
1662789Sahrens         VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
16634451Seschrock         mutex_exit(&spa_namespace_lock);
1664789Sahrens 
1665789Sahrens         return (0);
1666789Sahrens }
1667789Sahrens 
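/*
 * Illustrative usage (editor's addition; the flags are the ZFS_ONLINE_*
 * values tested above): a caller onlining a just-inserted disk without
 * generating FMA faults, and detaching any spare once the resilver
 * completes, might pass:
 *
 *         vdev_state_t newstate;
 *         int error = vdev_online(spa, guid,
 *             ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate);
 */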
1668789Sahrens int
16694451Seschrock vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
1670789Sahrens {
16711485Slling         vdev_t *rvd, *vd;
16721485Slling         uint64_t txg;
1673789Sahrens 
1674*5329Sgw25295         /*
1675*5329Sgw25295          * Disregard a vdev offline request if the pool has
1676*5329Sgw25295          * experienced a complete failure.
1677*5329Sgw25295          *
1678*5329Sgw25295          * XXX - We do this here so that we don't hold the
1679*5329Sgw25295          * spa_namespace_lock in the event that we can't get
1680*5329Sgw25295          * the RW_WRITER spa_config_lock.
1681*5329Sgw25295          */
1682*5329Sgw25295         if (spa_state(spa) == POOL_STATE_IO_FAILURE)
1683*5329Sgw25295                 return (EIO);
1684*5329Sgw25295 
16851485Slling         txg = spa_vdev_enter(spa);
1686789Sahrens 
16871485Slling         rvd = spa->spa_root_vdev;
16881585Sbonwick 
16891544Seschrock         if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
16901485Slling                 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1691789Sahrens 
16921585Sbonwick         if (!vd->vdev_ops->vdev_op_leaf)
16931585Sbonwick                 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
16941585Sbonwick 
1695789Sahrens         /*
16961732Sbonwick          * If the device isn't already offline, try to offline it.
1697789Sahrens          */
16981732Sbonwick         if (!vd->vdev_offline) {
16991732Sbonwick                 /*
17001732Sbonwick                  * If this device's top-level vdev has a non-empty DTL,
17011732Sbonwick                  * don't allow the device to be offlined.
17021732Sbonwick                  *
17031732Sbonwick                  * XXX -- make this more precise by allowing the offline
17041732Sbonwick                  * as long as the remaining devices don't have any DTL holes.
17051732Sbonwick                  */
17061732Sbonwick                 if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
17071732Sbonwick                         return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1708789Sahrens 
17091732Sbonwick                 /*
17101732Sbonwick                  * Offline this device and reopen its top-level vdev.
17111732Sbonwick                  * If this action results in the top-level vdev becoming
17121732Sbonwick                  * unusable, undo it and fail the request.
17131732Sbonwick                  */
17141732Sbonwick                 vd->vdev_offline = B_TRUE;
17151544Seschrock                 vdev_reopen(vd->vdev_top);
17161732Sbonwick                 if (vdev_is_dead(vd->vdev_top)) {
17171732Sbonwick                         vd->vdev_offline = B_FALSE;
17181732Sbonwick                         vdev_reopen(vd->vdev_top);
17191732Sbonwick                         return (spa_vdev_exit(spa, NULL, txg, EBUSY));
17201732Sbonwick                 }
1721789Sahrens         }
1722789Sahrens 
17234451Seschrock         vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
17244451Seschrock             B_TRUE : B_FALSE;
17251732Sbonwick 
17261732Sbonwick         vdev_config_dirty(vd->vdev_top);
17271485Slling 
17281485Slling         return (spa_vdev_exit(spa, NULL, txg, 0));
1729789Sahrens }
1730789Sahrens 
17311544Seschrock /*
17321544Seschrock  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
17331544Seschrock  * vdev_offline(), we assume the spa config is locked.  We also clear all
17341544Seschrock  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
1735*5329Sgw25295  * If 'reopen_wanted' is set, attempt to reopen the vdev if it is currently
1736*5329Sgw25295  * faulted or degraded.
17371544Seschrock  */
17381544Seschrock void
1739*5329Sgw25295 vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted)
1740789Sahrens {
17411544Seschrock         int c;
1742789Sahrens 
17431544Seschrock         if (vd == NULL)
17441544Seschrock                 vd = spa->spa_root_vdev;
1745789Sahrens 
17461544Seschrock         vd->vdev_stat.vs_read_errors = 0;
17471544Seschrock         vd->vdev_stat.vs_write_errors = 0;
17481544Seschrock         vd->vdev_stat.vs_checksum_errors = 0;
1749*5329Sgw25295         vd->vdev_is_failing = B_FALSE;
1750789Sahrens 
17511544Seschrock         for (c = 0; c < vd->vdev_children; c++)
1752*5329Sgw25295                 vdev_clear(spa, vd->vdev_child[c], reopen_wanted);
17534451Seschrock 
17544451Seschrock         /*
17554451Seschrock          * If we're in the FAULTED state, then clear the persistent state and
17564451Seschrock          * attempt to reopen the device.  We also mark the vdev config dirty,
17574451Seschrock          * so that the new state is written out to disk.
17584451Seschrock          */
1759*5329Sgw25295         if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded)) {
17604451Seschrock                 vd->vdev_faulted = vd->vdev_degraded = 0;
17614451Seschrock                 vdev_reopen(vd);
17624451Seschrock                 vdev_config_dirty(vd->vdev_top);
17634451Seschrock 
17644451Seschrock                 if (vd->vdev_faulted)
17654808Sek110237                         spa_async_request(spa, SPA_ASYNC_RESILVER);
17664451Seschrock 
17674451Seschrock                 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
17684451Seschrock         }
1769789Sahrens }
1770789Sahrens 
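/*
 * Illustrative usage (editor's addition): clearing error counts on the
 * entire tree, and reopening any FAULTED or DEGRADED devices along the
 * way -- roughly what a pool-wide 'zpool clear' request does:
 *
 *         vdev_clear(spa, NULL, B_TRUE);
 */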
1771789Sahrens int
1772*5329Sgw25295 vdev_readable(vdev_t *vd)
1773*5329Sgw25295 {
1774*5329Sgw25295         /* XXPOLICY */
1775*5329Sgw25295         return (!vdev_is_dead(vd));
1776*5329Sgw25295 }
1777*5329Sgw25295 
1778*5329Sgw25295 int
1779*5329Sgw25295 vdev_writeable(vdev_t *vd)
1780*5329Sgw25295 {
1781*5329Sgw25295         return (vd->vdev_ops->vdev_op_leaf ?
1782*5329Sgw25295             !vd->vdev_is_failing : !vdev_is_dead(vd));
1783*5329Sgw25295 }
1784*5329Sgw25295 
1785*5329Sgw25295 int
1786789Sahrens vdev_is_dead(vdev_t *vd)
1787789Sahrens {
17884451Seschrock         return (vd->vdev_state < VDEV_STATE_DEGRADED);
1789789Sahrens }
1790789Sahrens 
1791789Sahrens int
1792789Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio)
1793789Sahrens {
1794789Sahrens         int error = 0;
1795789Sahrens 
1796789Sahrens         if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1797789Sahrens                 return (0);
1798789Sahrens 
1799789Sahrens         if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1800789Sahrens                 return (0);
1801789Sahrens 
1802789Sahrens         switch (vd->vdev_fault_mode) {
1803789Sahrens         case VDEV_FAULT_RANDOM:
1804789Sahrens                 if (spa_get_random(vd->vdev_fault_arg) == 0)
1805789Sahrens                         error = EIO;
1806789Sahrens                 break;
1807789Sahrens 
1808789Sahrens         case VDEV_FAULT_COUNT:
1809789Sahrens                 if ((int64_t)--vd->vdev_fault_arg <= 0)
1810789Sahrens                         vd->vdev_fault_mode = VDEV_FAULT_NONE;
1811789Sahrens                 error = EIO;
1812789Sahrens                 break;
1813789Sahrens         }
1814789Sahrens 
1815789Sahrens         return (error);
1816789Sahrens }
1817789Sahrens 
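/*
 * Worked example (editor's addition, derived from the cases above): with
 * vdev_fault_mode == VDEV_FAULT_RANDOM and vdev_fault_arg == 100,
 * spa_get_random(100) returns 0 for roughly 1 in 100 calls, so about 1% of
 * matching I/Os fail with EIO.  With VDEV_FAULT_COUNT and
 * vdev_fault_arg == 3, exactly the next three matching I/Os fail, and the
 * third failure resets the mode to VDEV_FAULT_NONE.
 */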
1818789Sahrens /*
1819789Sahrens  * Get statistics for the given vdev.
1820789Sahrens  */
1821789Sahrens void
1822789Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1823789Sahrens {
1824789Sahrens         vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1825789Sahrens         int c, t;
1826789Sahrens 
1827789Sahrens         mutex_enter(&vd->vdev_stat_lock);
1828789Sahrens         bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1829789Sahrens         vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1830789Sahrens         vs->vs_state = vd->vdev_state;
18311175Slling         vs->vs_rsize = vdev_get_rsize(vd);
1832789Sahrens         mutex_exit(&vd->vdev_stat_lock);
1833789Sahrens 
1834789Sahrens         /*
1835789Sahrens          * If we're getting stats on the root vdev, aggregate the I/O counts
1836789Sahrens          * over all top-level vdevs (i.e. the direct children of the root).
1837789Sahrens          */
1838789Sahrens         if (vd == rvd) {
1839789Sahrens                 for (c = 0; c < rvd->vdev_children; c++) {
1840789Sahrens                         vdev_t *cvd = rvd->vdev_child[c];
1841789Sahrens                         vdev_stat_t *cvs = &cvd->vdev_stat;
1842789Sahrens 
1843789Sahrens                         mutex_enter(&vd->vdev_stat_lock);
1844789Sahrens                         for (t = 0; t < ZIO_TYPES; t++) {
1845789Sahrens                                 vs->vs_ops[t] += cvs->vs_ops[t];
1846789Sahrens                                 vs->vs_bytes[t] += cvs->vs_bytes[t];
1847789Sahrens                         }
1848789Sahrens                         vs->vs_read_errors += cvs->vs_read_errors;
1849789Sahrens                         vs->vs_write_errors += cvs->vs_write_errors;
1850789Sahrens                         vs->vs_checksum_errors += cvs->vs_checksum_errors;
1851789Sahrens                         vs->vs_scrub_examined += cvs->vs_scrub_examined;
1852789Sahrens                         vs->vs_scrub_errors += cvs->vs_scrub_errors;
1853789Sahrens                         mutex_exit(&vd->vdev_stat_lock);
1854789Sahrens                 }
1855789Sahrens         }
1856789Sahrens }
1857789Sahrens 
1858789Sahrens void
1859789Sahrens vdev_stat_update(zio_t *zio)
1860789Sahrens {
1861789Sahrens         vdev_t *vd = zio->io_vd;
1862789Sahrens         vdev_t *pvd;
1863789Sahrens         uint64_t txg = zio->io_txg;
1864789Sahrens         vdev_stat_t *vs = &vd->vdev_stat;
1865789Sahrens         zio_type_t type = zio->io_type;
1866789Sahrens         int flags = zio->io_flags;
1867789Sahrens 
1868789Sahrens         if (zio->io_error == 0) {
1869789Sahrens                 if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1870789Sahrens                         mutex_enter(&vd->vdev_stat_lock);
1871789Sahrens                         vs->vs_ops[type]++;
1872789Sahrens                         vs->vs_bytes[type] += zio->io_size;
1873789Sahrens                         mutex_exit(&vd->vdev_stat_lock);
1874789Sahrens                 }
1875789Sahrens                 if ((flags & ZIO_FLAG_IO_REPAIR) &&
1876789Sahrens                     zio->io_delegate_list == NULL) {
1877789Sahrens                         mutex_enter(&vd->vdev_stat_lock);
18781807Sbonwick                         if (flags & ZIO_FLAG_SCRUB_THREAD)
1879789Sahrens                                 vs->vs_scrub_repaired += zio->io_size;
1880789Sahrens                         else
1881789Sahrens                                 vs->vs_self_healed += zio->io_size;
1882789Sahrens                         mutex_exit(&vd->vdev_stat_lock);
1883789Sahrens                 }
1884789Sahrens                 return;
1885789Sahrens         }
1886789Sahrens 
1887789Sahrens         if (flags & ZIO_FLAG_SPECULATIVE)
1888789Sahrens                 return;
1889789Sahrens 
1890*5329Sgw25295         if (vdev_readable(vd)) {
1891789Sahrens                 mutex_enter(&vd->vdev_stat_lock);
1892789Sahrens                 if (type == ZIO_TYPE_READ) {
1893789Sahrens                         if (zio->io_error == ECKSUM)
1894789Sahrens                                 vs->vs_checksum_errors++;
1895789Sahrens                         else
1896789Sahrens                                 vs->vs_read_errors++;
1897789Sahrens                 }
1898789Sahrens                 if (type == ZIO_TYPE_WRITE)
1899789Sahrens                         vs->vs_write_errors++;
1900789Sahrens                 mutex_exit(&vd->vdev_stat_lock);
1901789Sahrens         }
1902789Sahrens 
1903789Sahrens         if (type == ZIO_TYPE_WRITE) {
1904789Sahrens                 if (txg == 0 || vd->vdev_children != 0)
1905789Sahrens                         return;
19061807Sbonwick                 if (flags & ZIO_FLAG_SCRUB_THREAD) {
1907789Sahrens                         ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1908789Sahrens                         for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1909789Sahrens                                 vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1910789Sahrens                 }
1911789Sahrens                 if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1912789Sahrens                         if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1913789Sahrens                                 return;
19141732Sbonwick                         vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1915789Sahrens                         for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1916789Sahrens                                 vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1917789Sahrens                 }
1918789Sahrens         }
1919789Sahrens }
1920789Sahrens 
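/*
 * Editor's note (summary of the error path above, not original source):
 * a failed ordinary write of txg T dirties the one-txg range [T, T+1) in
 * vdev_dtl_map of the leaf and of every ancestor via
 * vdev_dtl_dirty(..., txg, 1), marking data that must be resilvered; a
 * failed scrub-thread repair write instead dirties vdev_dtl_scrub the
 * same way.
 */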
1921789Sahrens void
1922789Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1923789Sahrens {
1924789Sahrens         int c;
1925789Sahrens         vdev_stat_t *vs = &vd->vdev_stat;
1926789Sahrens 
1927789Sahrens         for (c = 0; c < vd->vdev_children; c++)
1928789Sahrens                 vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1929789Sahrens 
1930789Sahrens         mutex_enter(&vd->vdev_stat_lock);
1931789Sahrens 
1932789Sahrens         if (type == POOL_SCRUB_NONE) {
1933789Sahrens                 /*
1934789Sahrens                  * Update completion and end time.  Leave everything else alone
1935789Sahrens                  * so we can report what happened during the previous scrub.
1936789Sahrens                  */
1937789Sahrens                 vs->vs_scrub_complete = complete;
1938789Sahrens                 vs->vs_scrub_end = gethrestime_sec();
1939789Sahrens         } else {
1940789Sahrens                 vs->vs_scrub_type = type;
1941789Sahrens                 vs->vs_scrub_complete = 0;
1942789Sahrens                 vs->vs_scrub_examined = 0;
1943789Sahrens                 vs->vs_scrub_repaired = 0;
1944789Sahrens                 vs->vs_scrub_errors = 0;
1945789Sahrens                 vs->vs_scrub_start = gethrestime_sec();
1946789Sahrens                 vs->vs_scrub_end = 0;
1947789Sahrens         }
1948789Sahrens 
1949789Sahrens         mutex_exit(&vd->vdev_stat_lock);
1950789Sahrens }
1951789Sahrens 
1952789Sahrens /*
1953789Sahrens  * Update the in-core space usage stats for this vdev and the root vdev.
1954789Sahrens  */
1955789Sahrens void
19562082Seschrock vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
1957789Sahrens {
19584527Sperrin         int64_t dspace_delta = space_delta;
19594527Sperrin         spa_t *spa = vd->vdev_spa;
19604527Sperrin         vdev_t *rvd = spa->spa_root_vdev;
19614527Sperrin 
1962789Sahrens         ASSERT(vd == vd->vdev_top);
19634527Sperrin         ASSERT(rvd == vd->vdev_parent);
19644527Sperrin         ASSERT(vd->vdev_ms_count != 0);
19654527Sperrin 
19664527Sperrin         /*
19674527Sperrin          * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
19684527Sperrin          * factor.  We must calculate this here and not at the root vdev
19694527Sperrin          * because the root vdev's psize-to-asize is simply the max of its
19704527Sperrin          * children's, thus not accurate enough for us.
19714527Sperrin          */
19724527Sperrin         ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
19734527Sperrin         dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
19744527Sperrin             vd->vdev_deflate_ratio;
1975789Sahrens 
19764527Sperrin         mutex_enter(&vd->vdev_stat_lock);
19774527Sperrin         vd->vdev_stat.vs_space += space_delta;
19784527Sperrin         vd->vdev_stat.vs_alloc += alloc_delta;
19794527Sperrin         vd->vdev_stat.vs_dspace += dspace_delta;
19804527Sperrin         mutex_exit(&vd->vdev_stat_lock);
19812082Seschrock 
19824527Sperrin         /*
19834527Sperrin          * Don't count non-normal (e.g. intent log) space as part of
19844527Sperrin          * the pool's capacity.
19854527Sperrin          */
19864527Sperrin         if (vd->vdev_mg->mg_class != spa->spa_normal_class)
19874527Sperrin                 return;
19884527Sperrin 
19894527Sperrin         mutex_enter(&rvd->vdev_stat_lock);
19904527Sperrin         rvd->vdev_stat.vs_space += space_delta;
19914527Sperrin         rvd->vdev_stat.vs_alloc += alloc_delta;
19924527Sperrin         rvd->vdev_stat.vs_dspace += dspace_delta;
19934527Sperrin         mutex_exit(&rvd->vdev_stat_lock);
1994789Sahrens }
1995789Sahrens 
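/*
 * Worked example (editor's addition; assumes the usual definition of
 * vdev_deflate_ratio, i.e. a value of 512 when asize == psize): allocating
 * space_delta = 1M on a plain mirror gives
 *
 *         dspace_delta = (1M >> SPA_MINBLOCKSHIFT) * 512 = 1M,
 *
 * while a 4+1 RAID-Z, whose asize is 5/4 of psize, has a ratio near 409,
 * so the same 1M of raw allocation deflates to roughly 800K of
 * user-visible space.
 */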
1996789Sahrens /*
1997789Sahrens  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1998789Sahrens  * so that it will be written out next time the vdev configuration is synced.
1999789Sahrens  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
2000789Sahrens  */
2001789Sahrens void
2002789Sahrens vdev_config_dirty(vdev_t *vd)
2003789Sahrens {
2004789Sahrens         spa_t *spa = vd->vdev_spa;
2005789Sahrens         vdev_t *rvd = spa->spa_root_vdev;
2006789Sahrens         int c;
2007789Sahrens 
20081601Sbonwick         /*
20091601Sbonwick          * The dirty list is protected by the config lock.  The caller must
20101601Sbonwick          * either hold the config lock as writer, or must be the sync thread
20111601Sbonwick          * (which holds the lock as reader).  There's only one sync thread,
20121601Sbonwick          * so this is sufficient to ensure mutual exclusion.
20131601Sbonwick          */
20141601Sbonwick         ASSERT(spa_config_held(spa, RW_WRITER) ||
20151601Sbonwick             dsl_pool_sync_context(spa_get_dsl(spa)));
20161601Sbonwick 
2017789Sahrens         if (vd == rvd) {
2018789Sahrens                 for (c = 0; c < rvd->vdev_children; c++)
2019789Sahrens                         vdev_config_dirty(rvd->vdev_child[c]);
2020789Sahrens         } else {
2021789Sahrens                 ASSERT(vd == vd->vdev_top);
2022789Sahrens 
20231732Sbonwick                 if (!list_link_active(&vd->vdev_dirty_node))
2024789Sahrens                         list_insert_head(&spa->spa_dirty_list, vd);
2025789Sahrens         }
2026789Sahrens }
2027789Sahrens 
2028789Sahrens void
2029789Sahrens vdev_config_clean(vdev_t *vd)
2030789Sahrens {
20311601Sbonwick         spa_t *spa = vd->vdev_spa;
20321601Sbonwick 
20331601Sbonwick         ASSERT(spa_config_held(spa, RW_WRITER) ||
20341601Sbonwick             dsl_pool_sync_context(spa_get_dsl(spa)));
20351601Sbonwick 
20361732Sbonwick         ASSERT(list_link_active(&vd->vdev_dirty_node));
20371601Sbonwick         list_remove(&spa->spa_dirty_list, vd);
2038789Sahrens }
2039789Sahrens 
20401775Sbillm void
20411775Sbillm vdev_propagate_state(vdev_t *vd)
20421775Sbillm {
20431775Sbillm         vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
20441775Sbillm         int degraded = 0, faulted = 0;
20451775Sbillm         int corrupted = 0;
20461775Sbillm         int c;
20471775Sbillm         vdev_t *child;
20481775Sbillm 
20494451Seschrock         if (vd->vdev_children > 0) {
20504451Seschrock                 for (c = 0; c < vd->vdev_children; c++) {
20514451Seschrock                         child = vd->vdev_child[c];
2052*5329Sgw25295                         if (vdev_is_dead(child) && !vdev_readable(child))
20534451Seschrock                                 faulted++;
2054*5329Sgw25295                         else if (child->vdev_state <= VDEV_STATE_DEGRADED)
20554451Seschrock                                 degraded++;
20564451Seschrock 
20574451Seschrock                         if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
20584451Seschrock                                 corrupted++;
20594451Seschrock                 }
20601775Sbillm 
20614451Seschrock                 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
20624451Seschrock 
20634451Seschrock                 /*
20644451Seschrock                  * Root special: if there is a top-level vdev that cannot be
20654451Seschrock                  * opened due to corrupted metadata, then propagate the root
20664451Seschrock                  * vdev's aux state as 'corrupt' rather than 'insufficient
20674451Seschrock                  * replicas'.
20684451Seschrock                  */
20694451Seschrock                 if (corrupted && vd == rvd &&
20704451Seschrock                     rvd->vdev_state == VDEV_STATE_CANT_OPEN)
20714451Seschrock                         vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
20724451Seschrock                             VDEV_AUX_CORRUPT_DATA);
20731775Sbillm         }
20741775Sbillm 
20754527Sperrin         if (vd->vdev_parent && !vd->vdev_islog)
20764451Seschrock                 vdev_propagate_state(vd->vdev_parent);
20771775Sbillm }
20781775Sbillm 
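/*
 * Illustrative example (editor's addition): for a two-way mirror with one
 * dead child and one healthy child, the loop above computes faulted == 1
 * and degraded == 0; the mirror's state_change op can then keep the
 * top-level vdev readable but DEGRADED.  Only when no readable children
 * remain does it drop to CANT_OPEN for lack of replicas.
 */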
2079789Sahrens /*
20801544Seschrock  * Set a vdev's state.  If this is during an open, we don't update the parent
20811544Seschrock  * state, because we're in the process of opening children depth-first.
20821544Seschrock  * Otherwise, we propagate the change to the parent.
20831544Seschrock  *
20841544Seschrock  * If this routine places a device in a faulted state, an appropriate ereport is
20851544Seschrock  * generated.
2086789Sahrens  */
2087789Sahrens void
20881544Seschrock vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
2089789Sahrens {
20901986Seschrock         uint64_t save_state;
20911544Seschrock 
20921544Seschrock         if (state == vd->vdev_state) {
20931544Seschrock                 vd->vdev_stat.vs_aux = aux;
2094789Sahrens                 return;
20951544Seschrock         }
20961544Seschrock 
20971986Seschrock         save_state = vd->vdev_state;
2098789Sahrens 
2099789Sahrens         vd->vdev_state = state;
2100789Sahrens         vd->vdev_stat.vs_aux = aux;
2101789Sahrens 
21024451Seschrock         /*
21034451Seschrock          * If we are setting the vdev state to anything but an open state, then
21044451Seschrock          * always close the underlying device.  Otherwise, we keep accessible
21054451Seschrock          * but invalid devices open forever.  We don't call vdev_close() itself,
21064451Seschrock          * because that implies some extra checks (offline, etc) that we don't
21074451Seschrock          * want here.  This is limited to leaf devices, because otherwise
21084451Seschrock          * closing the device will affect other children.
21094451Seschrock          */
2110*5329Sgw25295         if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf)
21114451Seschrock                 vd->vdev_ops->vdev_op_close(vd);
21124451Seschrock 
21134451Seschrock         if (vd->vdev_removed &&
21144451Seschrock             state == VDEV_STATE_CANT_OPEN &&
21154451Seschrock             (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
21164451Seschrock                 /*
21174451Seschrock                  * If the previous state is set to VDEV_STATE_REMOVED, then this
21184451Seschrock                  * device was previously marked removed and someone attempted to
21194451Seschrock                  * reopen it.  If this failed due to a nonexistent device, then
21204451Seschrock                  * keep the device in the REMOVED state.  We do the same for our
21214451Seschrock                  * special test online cases, which only attempt to online the
21224451Seschrock                  * device and shouldn't generate an FMA fault.
21234451Seschrock                  */
21244451Seschrock                 vd->vdev_state = VDEV_STATE_REMOVED;
21254451Seschrock                 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
21264451Seschrock         } else if (state == VDEV_STATE_REMOVED) {
21274451Seschrock                 /*
21284451Seschrock                  * Indicate to the ZFS DE that this device has been removed, and
21294451Seschrock                  * any recent errors should be ignored.
21304451Seschrock                  */
21314451Seschrock                 zfs_post_remove(vd->vdev_spa, vd);
21324451Seschrock                 vd->vdev_removed = B_TRUE;
21334451Seschrock         } else if (state == VDEV_STATE_CANT_OPEN) {
21341544Seschrock                 /*
21351544Seschrock                  * If we fail to open a vdev during an import, we mark it as
21361544Seschrock                  * "not available", which signifies that it was never there to
21371544Seschrock                  * begin with.  Failure to open such a device is not considered
21381544Seschrock                  * an error.
21391544Seschrock                  */
21401986Seschrock                 if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
21411986Seschrock                     vd->vdev_ops->vdev_op_leaf)
21421986Seschrock                         vd->vdev_not_present = 1;
21431986Seschrock 
21441986Seschrock                 /*
21451986Seschrock                  * Post the appropriate ereport.  If the 'prevstate' field is
21461986Seschrock                  * set to something other than VDEV_STATE_UNKNOWN, it indicates
21471986Seschrock                  * that this is part of a vdev_reopen().  In this case, we don't
21481986Seschrock                  * want to post the ereport if the device was already in the
21491986Seschrock                  * CANT_OPEN state beforehand.
21504451Seschrock                  *
21514451Seschrock                  * If the 'checkremove' flag is set, then this is an attempt to
21524451Seschrock                  * online the device in response to an insertion event.  If we
21534451Seschrock                  * hit this case, then we have detected an insertion event for a
21544451Seschrock                  * faulted or offline device that wasn't in the removed state.
21554451Seschrock                  * In this scenario, we don't post an ereport because we are
21564451Seschrock                  * about to replace the device, or attempt an online with
21574451Seschrock                  * vdev_forcefault, which will generate the fault for us.
21581986Seschrock                  */
21604451Seschrock                 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
21614451Seschrock                     !vd->vdev_not_present && !vd->vdev_checkremove &&
21621544Seschrock                     vd != vd->vdev_spa->spa_root_vdev) {
21631544Seschrock                         const char *class;
21641544Seschrock 
21651544Seschrock                         switch (aux) {
21661544Seschrock                         case VDEV_AUX_OPEN_FAILED:
21671544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
21681544Seschrock                                 break;
21691544Seschrock                         case VDEV_AUX_CORRUPT_DATA:
21701544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
21711544Seschrock                                 break;
21721544Seschrock                         case VDEV_AUX_NO_REPLICAS:
21731544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
21741544Seschrock                                 break;
21751544Seschrock                         case VDEV_AUX_BAD_GUID_SUM:
21761544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
21771544Seschrock                                 break;
21781544Seschrock                         case VDEV_AUX_TOO_SMALL:
21791544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
21801544Seschrock                                 break;
21811544Seschrock                         case VDEV_AUX_BAD_LABEL:
21821544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
21831544Seschrock                                 break;
21841544Seschrock                         default:
21851544Seschrock                                 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
21861544Seschrock                         }
21871544Seschrock 
21881544Seschrock                         zfs_ereport_post(class, vd->vdev_spa,
21891986Seschrock                             vd, NULL, save_state, 0);
21901544Seschrock                 }
21914451Seschrock 
21924451Seschrock                 /* Erase any notion of persistent removed state */
21934451Seschrock                 vd->vdev_removed = B_FALSE;
21944451Seschrock         } else {
21954451Seschrock                 vd->vdev_removed = B_FALSE;
21961544Seschrock         }
21971544Seschrock 
21984451Seschrock         if (!isopen)
21994451Seschrock                 vdev_propagate_state(vd);
2200789Sahrens }
2201
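/*
 * Illustrative sketch (editor's addition): a leaf whose device node has
 * disappeared typically arrives here as
 *
 *         vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 *             VDEV_AUX_OPEN_FAILED);
 *
 * which, unless the failure was expected (a missing device during import,
 * a 'checkremove' test online, or a repeat of the previous CANT_OPEN
 * state), posts FM_EREPORT_ZFS_DEVICE_OPEN_FAILED and then propagates the
 * new state up toward the root vdev.
 */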