/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/* maximum scrub/resilver I/O queue */
int zfs_scrub_limit = 70;

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
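
/*
 * For example, with a top-level vdev_ashift of 12 (4 KB sectors),
 * P2ROUNDUP(10000, 4096) rounds a 10000-byte psize up to 12288; a mirror
 * then returns the largest such value among its children.
 */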

/*
 * Get the replaceable or attachable device size.
 * If the parent is a mirror or raidz, the replaceable size is the minimum
 * psize of all its children. For the rest, just return our own psize.
 *
 * e.g.
 *			psize	rsize
 * root			-	-
 * mirror/raidz		-	-
 *	    disk1	20g	20g
 *	    disk2	40g	20g
 *	    disk3	80g	80g
 */
uint64_t
vdev_get_rsize(vdev_t *vd)
{
	vdev_t *pvd, *cvd;
	uint64_t c, rsize;

	pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL or the root, just return our own psize.
	 */
	if (pvd == NULL || pvd->vdev_parent == NULL)
		return (vd->vdev_psize);

	rsize = 0;

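	/*
	 * Take the minimum psize of all children.  Since rsize starts at
	 * zero, rsize - 1 wraps to UINT64_MAX on the first iteration, so
	 * MIN() simply selects the first child's psize; the -1/+1 dance
	 * also makes a child psize of 0 (unknown) leave the result alone.
	 */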
	for (c = 0; c < pvd->vdev_children; c++) {
		cvd = pvd->vdev_child[c];
		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
	}

	return (rsize);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}
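
/*
 * Note: vdev_guid_sum is maintained as the sum of this vdev's guid and the
 * guids of all its descendants.  The uberblock records the root vdev's guid
 * sum, which lets a later import detect a missing or substituted device.
 */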

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
	}

	if (guid == 0) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			while (guid == 0 || spa_guid_exists(guid, 0))
				guid = spa_get_random(-1ULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			while (guid == 0 ||
			    spa_guid_exists(spa_guid(spa), guid))
				guid = spa_get_random(-1ULL);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}
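
/*
 * A note on the guid loops above: spa_get_random(-1ULL) draws a uniform
 * 64-bit value, so the retry loops terminate almost immediately; they only
 * re-roll on a draw of 0 or a collision with an already-known guid.
 */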

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (EINVAL);

	if ((ops = vdev_getops(type)) == NULL)
		return (EINVAL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (EINVAL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (EINVAL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &vd->vdev_nparity) == 0) {
			/*
			 * Currently, we can only support 2 parity devices.
			 */
			if (vd->vdev_nparity > 2)
				return (EINVAL);
			/*
			 * Older versions can only support 1 parity device.
			 */
			if (vd->vdev_nparity == 2 &&
			    spa_version(spa) < ZFS_VERSION_RAID6)
				return (ENOTSUP);
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= ZFS_VERSION_RAID6)
				return (EINVAL);

			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			vd->vdev_nparity = 1;
		}
	} else {
		vd->vdev_nparity = 0;
	}

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
		    &vd->vdev_unspare);
		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.
		 */
		if (spa->spa_load_state == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}
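
/*
 * Illustrative sketch of the nvlist vdev_alloc() consumes when loading a
 * raidz top-level vdev (field names come from the lookups above; the
 * values are made up):
 *
 *	type='raidz' id=0 guid=<guid> nparity=1 ashift=9 asize=<bytes>
 *	metaslab_array=<object#> metaslab_shift=<shift>
 *
 * Leaf children additionally carry path, devid, phys_path, DTL, etc.
 */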

void
vdev_free(vdev_t *vd)
{
	int c;
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_dirty_node));

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_unload(&vd->vdev_dtl_map);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;
}
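
/*
 * vdev_top_transfer() is used by vdev_add_parent() and vdev_remove_parent()
 * below: when an interposing mirror/replacing vdev becomes (or ceases to be)
 * the top-level vdev, the metaslab state and space accounting must move
 * with the top-level role.
 */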

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	/*
	 * If we created a new toplevel vdev, then we need to change the child's
	 * vdev GUID to match the old toplevel vdev.  Otherwise, we could have
	 * detached an offline device, and when we go to import the pool we'll
	 * think we have two toplevel vdevs, instead of a different version of
	 * the same toplevel vdev.
	 */
	if (cvd->vdev_top == cvd) {
		pvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid = mvd->vdev_guid;
		cvd->vdev_guid_sum += mvd->vdev_guid;
		pvd->vdev_guid_sum += cvd->vdev_guid;
	}
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	metaslab_class_t *mc = spa_metaslab_class_select(spa);
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
		return (0);

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	if (vd->vdev_mg == NULL)
		vd->vdev_mg = metaslab_group_create(mc, vd);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, ==, sizeof (smo));
				bcopy(db->db_data, &smo, db->db_size);
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}
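
/*
 * Note that vdev_metaslab_init() is also re-run when a top-level vdev
 * grows: only the new metaslabs (oldc through newc - 1) are created, and
 * on the load path (txg == 0) each one's space map header is first read
 * back from the MOS.
 */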

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (ENXIO);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vd->vdev_state = VDEV_STATE_HEALTHY;
	}

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			vd->vdev_asize = asize;
		}
	}

	/*
	 * If this is a top-level vdev, compute the raidz-deflation
	 * ratio.  Note, we hard-code in 128k (1<<17) because it is the
	 * current "typical" blocksize.  Even if SPA_MAXBLOCKSIZE
	 * changes, this algorithm must never change, or we will
	 * inconsistently account for existing bp's.
	 */
	if (vd->vdev_top == vd) {
		vd->vdev_deflate_ratio = (1<<17) /
		    (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
	}
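
	/*
	 * Worked example: for a plain disk or mirror with ashift <= 17,
	 * vdev_psize_to_asize(vd, 1<<17) is just 1<<17, so the ratio is
	 * 131072 / (131072 >> 9) == 512.  A raidz vdev reports a larger
	 * asize for the same psize (it includes parity sectors), giving a
	 * smaller ratio and hence "deflated" space accounting.
	 */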

	/*
	 * This allows the ZFS DE to close cases appropriately.  If a device
	 * goes away and later returns, we want to close the associated case.
	 * But it's not enough to simply post this only when a device goes from
	 * CANT_OPEN -> HEALTHY.  If we reboot the system and the device is
	 * back, we also need to close the case (otherwise we will try to replay
	 * it).  So we have to post this notifier every time.  Since this only
	 * occurs during pool open or error recovery, this should not be an
	 * issue.
	 */
	zfs_post_ok(vd->vdev_spa, vd);

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int c;
	nvlist_t *label;
	uint64_t guid;
	uint64_t state;

	for (c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (EBADF);

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {

		if ((label = vdev_label_read_config(vd)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != vd->vdev_guid) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		if (spa->spa_load_state == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);
	}

	/*
	 * If we were able to open and validate a vdev that was previously
	 * marked permanently unavailable, clear that state now.
	 */
	if (vd->vdev_not_present)
		vd->vdev_not_present = 0;

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER));

	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	(void) vdev_validate(vd);

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.  This can't fail because
	 * there's nothing to read when creating all new metaslabs.
	 */
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
}
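
/*
 * Illustrative numbers: a 1 TB (2^40 byte) vdev gives
 * highbit(2^40 / 200) == 33, so ms_shift is 33 and the vdev is carved
 * into 128 metaslabs of 8 GB each.
 */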

void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}
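
/*
 * Background: a DTL (dirty time log) is a space map whose "offsets" are
 * txgs; it records the ranges of txgs during which this vdev may have
 * missed writes (e.g. while offline or faulted).  Resilvering walks these
 * maps so that only data born in those txgs has to be repaired.
 */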

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	int c;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Make sure the DTLs are always correct under the scrub lock.
	 */
	if (vd == spa->spa_root_vdev)
		mutex_enter(&spa->spa_scrub_lock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}

	if (vd == spa->spa_root_vdev)
		mutex_exit(&spa->spa_scrub_lock);
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(db->db_data, smo, db->db_size);
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}
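
/*
 * vdev_dtl_sync() below rewrites the whole DTL each time: the in-core map
 * is copied into a private snapshot (smsync) under the DTL lock, the
 * on-disk space map object is truncated, and the snapshot is synced out.
 */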

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		dprintf("detach %s committed in txg %llu\n",
		    vdev_description(vd), txg);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(smo, db->db_data, db->db_size);
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

void
vdev_load(vdev_t *vd)
{
	int c;

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * This special case of vdev_spare() is used for hot spares.  Its sole
 * purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read
 * the label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_spare(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > ZFS_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	spa_spare_add(vd);

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}
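
/*
 * Note the TXG_CLEAN hand-off above: everything synced in txg is re-added
 * under TXG_CLEAN(txg) so that vdev_sync_done() can run
 * metaslab_sync_done() on the same metaslabs once the txg commits.
 */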

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

void
vdev_io_start(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
}

void
vdev_io_done(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
	    VDEV_AUX_ERR_EXCEEDED);

	/*
	 * If marking the vdev as faulted causes the toplevel vdev to become
	 * unavailable, then back off and simply mark the vdev as degraded
	 * instead.
	 */
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(vd);

		if (!vdev_is_dead(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_ERR_EXCEEDED);
		}
	}

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}
/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication
 * to the user that something is wrong.  The vdev continues to operate as
 * normal as far as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If the vdev is already faulted or degraded, don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded) {
		(void) spa_vdev_exit(spa, NULL, txg, 0);
		return (0);
	}

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}

/*
 * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
 * any attached spare device should be detached when the device finishes
 * resilvering.  Second, the online should be treated like a 'test' online
 * case, so no FMA events are generated if the device fails to open.
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
	    B_TRUE : B_FALSE;
	vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
	    B_TRUE : B_FALSE;
	vdev_reopen(vd->vdev_top);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	/*
	 * Must hold spa_namespace_lock in order to post the resilver
	 * sysevent with the pool name.
	 */
	mutex_enter(&spa_namespace_lock);
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
	mutex_exit(&spa_namespace_lock);

	return (0);
}
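/*
 * Illustrative only (not in the original source): a caller onlining a
 * device in response to a hotplug insertion might combine the flag bits
 * and then inspect the resulting state, e.g.:
 *
 *	vdev_state_t newstate;
 *
 *	(void) vdev_online(spa, guid,
 *	    ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate);
 *	if (newstate != VDEV_STATE_HEALTHY)
 *		// device came back, but not fully healthy
 */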
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *rvd, *vd;
	uint64_t txg;

	txg = spa_vdev_enter(spa);

	rvd = spa->spa_root_vdev;

	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device's top-level vdev has a non-empty DTL,
		 * don't allow the device to be offlined.
		 *
		 * XXX -- make this more precise by allowing the offline
		 * as long as the remaining devices don't have any DTL holes.
		 */
		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If this action results in the top-level vdev becoming
		 * unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(vd->vdev_top);
		if (vdev_is_dead(vd->vdev_top)) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(vd->vdev_top);
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));
		}
	}

	vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
	    B_TRUE : B_FALSE;

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}
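/*
 * Illustrative only (not in the original source): offlining fails with
 * EBUSY in two distinct cases -- when the top-level vdev still has dirty
 * time log (DTL) entries to resilver, and when taking the device away
 * would leave the top-level vdev unusable:
 *
 *	if (vdev_offline(spa, guid, 0) == EBUSY)
 *		// either unresilvered DTL data, or last usable copy
 */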
/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		vd = spa->spa_root_vdev;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c]);

	/*
	 * If we're in the FAULTED state, then clear the persistent state and
	 * attempt to reopen the device.  We also mark the vdev config dirty,
	 * so that the new faulted state is written out to disk.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded) {
		vd->vdev_faulted = vd->vdev_degraded = 0;
		vdev_reopen(vd);
		vdev_config_dirty(vd->vdev_top);

		if (vd->vdev_faulted)
			VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER,
			    B_TRUE) == 0);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}
}

int
vdev_is_dead(vdev_t *vd)
{
	return (vd->vdev_state < VDEV_STATE_DEGRADED);
}

int
vdev_error_inject(vdev_t *vd, zio_t *zio)
{
	int error = 0;

	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
		return (0);

	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
		return (0);

	switch (vd->vdev_fault_mode) {
	case VDEV_FAULT_RANDOM:
		if (spa_get_random(vd->vdev_fault_arg) == 0)
			error = EIO;
		break;

	case VDEV_FAULT_COUNT:
		if ((int64_t)--vd->vdev_fault_arg <= 0)
			vd->vdev_fault_mode = VDEV_FAULT_NONE;
		error = EIO;
		break;
	}

	return (error);
}
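/*
 * Illustrative only (not in the original source): the two injection modes
 * behave differently.  VDEV_FAULT_RANDOM fails roughly one in
 * 'vdev_fault_arg' matching I/Os (spa_get_random(n) yields 0 with
 * probability 1/n), while VDEV_FAULT_COUNT fails every matching I/O and
 * disarms itself once the countdown in 'vdev_fault_arg' reaches zero.
 */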
/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c, t;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_rsize(vd);
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			vs->vs_read_errors += cvs->vs_read_errors;
			vs->vs_write_errors += cvs->vs_write_errors;
			vs->vs_checksum_errors += cvs->vs_checksum_errors;
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			vs->vs_scrub_errors += cvs->vs_scrub_errors;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

void
vdev_stat_update(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	if (zio->io_error == 0) {
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_ops[type]++;
			vs->vs_bytes[type] += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		if ((flags & ZIO_FLAG_IO_REPAIR) &&
		    zio->io_delegate_list == NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & ZIO_FLAG_SCRUB_THREAD)
				vs->vs_scrub_repaired += zio->io_size;
			else
				vs->vs_self_healed += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	if (!vdev_is_dead(vd)) {
		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE)
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (type == ZIO_TYPE_WRITE) {
		if (txg == 0 || vd->vdev_children != 0)
			return;
		if (flags & ZIO_FLAG_SCRUB_THREAD) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
		}
		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
				return;
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
		}
	}
}
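/*
 * Illustrative only (not in the original source): on a failed non-repair
 * write, the loop above marks the single-txg range [txg, txg + 1) in the
 * DTL (dirty time log) of the leaf and of every ancestor, recording that
 * this device is missing data born in that txg.  A later resilver walks
 * exactly those txg ranges instead of copying the whole device, which is
 * why only leaf writes with a real txg are recorded.
 */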
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	int c;
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_errors = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
{
	int64_t dspace_delta = space_delta;

	ASSERT(vd == vd->vdev_top);

	do {
		if (vd->vdev_ms_count) {
			/*
			 * If this is a top-level vdev, apply the
			 * inverse of its psize-to-asize (ie. RAID-Z)
			 * space-expansion factor.  We must calculate
			 * this here and not at the root vdev because
			 * the root vdev's psize-to-asize is simply the
			 * max of its children's, and thus not accurate
			 * enough for us.
			 */
			ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
			dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
			    vd->vdev_deflate_ratio;
		}

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_space += space_delta;
		vd->vdev_stat.vs_alloc += alloc_delta;
		vd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&vd->vdev_stat_lock);
	} while ((vd = vd->vdev_parent) != NULL);
}
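/*
 * Illustrative only (not in the original source; the numbers are made up):
 * with SPA_MINBLOCKSHIFT == 9, a ratio of 512 means no deflation, since
 * (delta >> 9) * 512 == delta.  A RAID-Z top-level vdev has a smaller
 * ratio because part of each allocation is parity.  For a hypothetical
 * ratio of 384:
 *
 *	space_delta  = 1048576			(1 MB of asize)
 *	dspace_delta = (1048576 >> 9) * 384 = 786432
 *
 * i.e. only three quarters of the allocation counts against user-visible
 * capacity.
 */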
/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	/*
	 * The dirty list is protected by the config lock.  The caller must
	 * either hold the config lock as writer, or must be the sync thread
	 * (which holds the lock as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_dirty_node))
			list_insert_head(&spa->spa_dirty_list, vd);
	}
}

void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	ASSERT(list_link_active(&vd->vdev_dirty_node));
	list_remove(&spa->spa_dirty_list, vd);
}

void
vdev_propagate_state(vdev_t *vd)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	int c;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];
			if (vdev_is_dead(child))
				faulted++;
			else if (child->vdev_state == VDEV_STATE_DEGRADED)
				degraded++;

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}
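/*
 * Illustrative only (not in the original source): vdev_op_state_change()
 * lets each interior vdev type apply its own redundancy rules to the
 * (faulted, degraded) counts gathered above.  A two-way mirror with one
 * dead child, for instance, would report VDEV_STATE_DEGRADED, while a
 * mirror with both children dead would report VDEV_STATE_CANT_OPEN with
 * 'insufficient replicas'; the recursion up vdev_parent then re-evaluates
 * each ancestor in turn.
 */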
/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport
 * is generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;

	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state,
	 * then always close the underlying device.  Otherwise, we keep
	 * accessible but invalid devices open forever.  We don't call
	 * vdev_close() itself, because that implies some extra checks
	 * (offline, etc) that we don't want here.  This is limited to leaf
	 * devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then
		 * this device was previously marked removed and someone
		 * attempted to reopen it.  If this failed due to a
		 * nonexistent device, then keep the device in the REMOVED
		 * state.  We also leave the device in the REMOVED state if
		 * this is one of our special test online cases, which is
		 * only attempting to online the device and shouldn't
		 * generate an FMA fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		/*
		 * Indicate to the ZFS DE that this device has been removed,
		 * and any recent errors should be ignored.
		 */
		zfs_post_remove(vd->vdev_spa, vd);
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import, we mark it as
		 * "not available", which signifies that it was never there to
		 * begin with.  Failure to open such a device is not considered
		 * an error.
		 */
		if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we
		 * don't want to post the ereport if the device was already in
		 * the CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt
		 * to online the device in response to an insertion event.
		 * If we hit this case, then we have detected an insertion
		 * event for a faulted or offline device that wasn't in the
		 * removed state.  In this scenario, we don't post an ereport
		 * because we are about to replace the device, or attempt an
		 * online with vdev_forcefault, which will generate the fault
		 * for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != vd->vdev_spa->spa_root_vdev) {
			const char *class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, vd->vdev_spa, vd, NULL,
			    save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	if (!isopen)
		vdev_propagate_state(vd);
}
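/*
 * Illustrative only (not in the original source): a leaf open path that
 * discovers a missing device might report it as follows; with isopen ==
 * B_TRUE, the parent's state is deliberately left alone until the
 * depth-first open finishes and the caller propagates the change:
 *
 *	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 *	    VDEV_AUX_OPEN_FAILED);
 */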