/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}
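
/*
 * Usage note (illustrative, assuming the conventional type strings
 * such as "mirror" and "disk"): vdev_getops("mirror") walks the
 * table above and returns &vdev_mirror_ops, while an unrecognized
 * type falls off the end of the table and yields NULL.
 */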

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
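
/*
 * Worked example (illustrative only): with vdev_ashift == 9 the
 * allocation unit is 1 << 9 = 512 bytes, so P2ROUNDUP(5000, 512)
 * rounds a 5000-byte psize up to 5120 bytes before taking the MAX
 * over the children's asizes.
 */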

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}
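
/*
 * Invariant worth noting: vdev_guid_sum is a vdev's own guid plus the
 * guid sums of all its descendants, so attaching a subtree whose sum
 * is S bumps every ancestor's guid sum by exactly S.  For example,
 * adding a leaf with guid 7 to a mirror increments the mirror's, the
 * top-level vdev's, and the root's vdev_guid_sum by 7 apiece.
 */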

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	while (guid == 0)
		guid = spa_get_random(-1ULL);

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
	list_create(&vd->vdev_io_pending, sizeof (zio_t),
	    offsetof(zio_t, io_pending));
	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();

	return (vd);
}

/*
 * Free a vdev_t that has been removed from service.
 */
static void
vdev_free_common(vdev_t *vd)
{
	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dirty_lock);
	list_destroy(&vd->vdev_io_pending);
	mutex_destroy(&vd->vdev_io_lock);
	cv_destroy(&vd->vdev_io_cv);

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
vdev_t *
vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (NULL);

	if ((ops = vdev_getops(type)) == NULL)
		return (NULL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (NULL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (NULL);
	}

	vd = vdev_alloc_common(spa, id, guid, ops);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
		    &vd->vdev_ashift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	return (vd);
}
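
/*
 * For reference (a sketch, not an exhaustive list): the nvlist
 * consumed above carries at least ZPOOL_CONFIG_TYPE (e.g. "disk" or
 * "mirror"); a VDEV_ALLOC_LOAD also supplies ZPOOL_CONFIG_ID and
 * ZPOOL_CONFIG_GUID, plus ZPOOL_CONFIG_PATH/ZPOOL_CONFIG_DEVID for
 * leaves and the metaslab, ashift, and asize parameters for
 * top-level vdevs.
 */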

void
vdev_free(vdev_t *vd)
{
	int c;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	/*
	 * It's possible to free a vdev that's been added to the dirty
	 * list when in the middle of spa_vdev_add().  Handle that case
	 * correctly here.
	 */
	if (vd->vdev_is_dirty)
		vdev_config_clean(vd);

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	vdev_free_common(vd);
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_mg->mg_vd = tvd;
	tvd->vdev_ms = svd->vdev_ms;
	tvd->vdev_smo = svd->vdev_smo;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;
	svd->vdev_smo = NULL;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
		svd->vdev_dirty[t] = 0;
	}

	if (svd->vdev_is_dirty) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	ASSERT(svd->vdev_io_retry == NULL);
	ASSERT(list_is_empty(&svd->vdev_io_pending));
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}
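
/*
 * Illustrative before/after for vdev_add_parent(), e.g. when a
 * replacing vdev is interposed during a disk replacement:
 *
 *	before:	top -> disk A
 *	after:	top -> replacing -> disk A
 *
 * The new interior vdev takes over A's vdev_id, asize, ashift, and
 * state; A becomes its only child (child id 0).
 */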

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops);

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

void
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	metaslab_class_t *mc = spa_metaslab_class_select(spa);
	uint64_t c;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	space_map_obj_t *smo = vd->vdev_smo;
	metaslab_t **mspp = vd->vdev_ms;

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
	vd->vdev_ms_count = newc;

	if (vd->vdev_mg == NULL) {
		if (txg == 0) {
			dmu_buf_t *db;
			uint64_t *ms_array;

			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
			    KM_SLEEP);

			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
			    0, newc * sizeof (uint64_t), ms_array);

			for (c = 0; c < newc; c++) {
				if (ms_array[c] == 0)
					continue;
				db = dmu_bonus_hold(spa->spa_meta_objset,
				    ms_array[c]);
				dmu_buf_read(db);
				ASSERT3U(db->db_size, ==, sizeof (*smo));
				bcopy(db->db_data, &vd->vdev_smo[c],
				    db->db_size);
				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
				    ms_array[c]);
				dmu_buf_rele(db);
			}
			kmem_free(ms_array, newc * sizeof (uint64_t));
		}
		vd->vdev_mg = metaslab_group_create(mc, vd);
	}

	for (c = 0; c < oldc; c++) {
		vd->vdev_smo[c] = smo[c];
		vd->vdev_ms[c] = mspp[c];
		mspp[c]->ms_smo = &vd->vdev_smo[c];
	}

	for (c = oldc; c < newc; c++)
		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);

	if (oldc != 0) {
		kmem_free(smo, oldc * sizeof (*smo));
		kmem_free(mspp, oldc * sizeof (*mspp));
	}
}
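
/*
 * Layout note: metaslab c covers the byte range
 * [c << ms_shift, (c + 1) << ms_shift) of the vdev, so growing a
 * vdev just appends metaslabs for the new range -- oldc <= newc and
 * the existing metaslabs are carried over unchanged.
 */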

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}

	if (vd->vdev_smo != NULL) {
		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
		vd->vdev_smo = NULL;
	}
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	vdev_knob_t *vk;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = -1ULL;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);

		*valp = vk->vk_default;
		*valp = MAX(*valp, vk->vk_min);
		*valp = MIN(*valp, vk->vk_max);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_cache_init(vd);
		vdev_queue_init(vd);
		vd->vdev_cache_active = B_TRUE;
	}

	if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
		vd->vdev_state = VDEV_STATE_OFFLINE;
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	dprintf("%s = %d, osize %llu, state = %d\n",
	    vdev_description(vd), error, osize, vd->vdev_state);

	if (error) {
		dprintf("%s in %s failed to open, error %d, aux %d\n",
		    vdev_description(vd),
		    vdev_description(vd->vdev_parent),
		    error,
		    vd->vdev_stat.vs_aux);

		vd->vdev_state = VDEV_STATE_CANT_OPEN;
		return (error);
	}

	vd->vdev_state = VDEV_STATE_HEALTHY;

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
			vd->vdev_state = VDEV_STATE_DEGRADED;

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = ashift;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_ashift) {
			dprintf("%s: ashift grew\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			dprintf("%s: device shrank\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			dprintf("%s: device grew\n", vdev_description(vd));
			vd->vdev_asize = asize;
		}
	}

	return (0);
}
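
/*
 * Size accounting above, in brief: osize is whatever the device
 * reports, rounded down to a multiple of sizeof (vdev_label_t).  For
 * a leaf, psize = osize and asize = osize minus the space reserved
 * for the front and back labels (VDEV_LABEL_START_SIZE +
 * VDEV_LABEL_END_SIZE); interior vdevs have no physical size of
 * their own, so psize = 0 and asize = osize.
 */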

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);

	vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_cache_active) {
		vdev_cache_fini(vd);
		vdev_queue_fini(vd);
		vd->vdev_cache_active = B_FALSE;
	}

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
}

void
vdev_reopen(vdev_t *vd, zio_t **rq)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		ASSERT(rq == NULL);
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_reopen(rvd->vdev_child[c], NULL);
		return;
	}

	/* only valid for top-level vdevs */
	ASSERT3P(vd, ==, vd->vdev_top);

	/*
	 * vdev_state can change when spa_config_lock is held as writer,
	 * or when it's held as reader and we're doing a vdev_reopen().
	 * To handle the latter case, we grab rvd's io_lock to serialize
	 * reopens.  This ensures that there's never more than one vdev
	 * state changer active at a time.
	 */
	mutex_enter(&rvd->vdev_io_lock);

	mutex_enter(&vd->vdev_io_lock);
	while (list_head(&vd->vdev_io_pending) != NULL)
		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
	vdev_close(vd);
	(void) vdev_open(vd);
	if (rq != NULL) {
		*rq = vd->vdev_io_retry;
		vd->vdev_io_retry = NULL;
	}
	mutex_exit(&vd->vdev_io_lock);

	/*
	 * Reassess root vdev's health.
	 */
	rvd->vdev_state = VDEV_STATE_HEALTHY;
	for (c = 0; c < rvd->vdev_children; c++) {
		uint64_t state = rvd->vdev_child[c]->vdev_state;
		rvd->vdev_state = MIN(rvd->vdev_state, state);
	}

	mutex_exit(&rvd->vdev_io_lock);
}

int
vdev_create(vdev_t *vd, uint64_t txg)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.
	 */
	vdev_metaslab_init(vd, txg);
}
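
/*
 * Worked example (illustrative): for a 64GB vdev, asize / 200 is
 * about 343MB; highbit() rounds that up to the next power of two,
 * giving ms_shift = 29 (512MB metaslabs) and thus 64GB >> 29 = 128
 * metaslabs -- in the neighborhood of the 200 we aim for, and never
 * smaller than SPA_MAXBLOCKSHIFT.
 */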

void
vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
{
	vdev_t *tvd = vd->vdev_top;

	mutex_enter(&tvd->vdev_dirty_lock);
	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
		    tvd, txg);
	}
	mutex_exit(&tvd->vdev_dirty_lock);
}

void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}
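
/*
 * A DTL (dirty time log) reuses the space map machinery, but its
 * "offsets" are transaction group numbers rather than byte offsets:
 * a segment [t1, t2) in vdev_dtl_map means the vdev may have missed
 * writes during txgs t1 through t2 - 1 and needs them resilvered.
 * Illustrative: vdev_dtl_dirty(&vd->vdev_dtl_map, txg, 1) records a
 * single missed txg.
 */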

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	int c;

	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0) {
			vdev_t *tvd = vd->vdev_top;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
		}
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_read(db);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(db->db_data, smo, db->db_size);
	dmu_buf_rele(db);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	space_map_t smsync;
	kmutex_t smlock;
	avl_tree_t *t = &sm->sm_root;
	space_seg_t *ss;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(spa->spa_meta_objset,
			    smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
	    0, smo->smo_objsize, tx);

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
	mutex_exit(&vd->vdev_dtl_lock);

	smo->smo_objsize = 0;
	smo->smo_alloc = smsync.sm_space;

	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(smo, db->db_data, db->db_size);
	dmu_buf_rele(db);

	dmu_tx_commit(tx);
}

int
vdev_load(vdev_t *vd, int import)
{
	spa_t *spa = vd->vdev_spa;
	int c, error;
	nvlist_t *label;
	uint64_t guid, state;

	dprintf("loading %s\n", vdev_description(vd));

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
			return (error);

	/*
	 * If this is a leaf vdev, make sure its state agrees with its
	 * disk labels.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {

		if (vdev_is_dead(vd))
			return (0);

		/*
		 * XXX state transitions don't propagate to parent here.
		 * Also, merely setting the state isn't sufficient because
		 * it's not persistent; a vdev_reopen() would make us
		 * forget all about it.
		 */
		if ((label = vdev_label_read_config(vd)) == NULL) {
			dprintf("can't load label config\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			dprintf("bad or missing pool GUID (%llu)\n", guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
		    guid != vd->vdev_guid) {
			dprintf("bad or missing vdev guid (%llu != %llu)\n",
			    guid, vd->vdev_guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		/*
		 * If we find a vdev with a matching pool guid and vdev guid,
		 * but the pool state is not active, it indicates that the user
		 * exported or destroyed the pool without affecting the config
		 * cache (if / was mounted readonly, for example).  In this
		 * case, immediately return EBADF so the caller can remove it
		 * from the config.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state)) {
			dprintf("missing pool state\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (state != POOL_STATE_ACTIVE &&
		    (!import || state != POOL_STATE_EXPORTED)) {
			dprintf("pool state not active (%llu)\n", state);
			nvlist_free(label);
			return (EBADF);
		}

		nvlist_free(label);
	}

	/*
	 * If this is a top-level vdev, make sure its allocation parameters
	 * exist and initialize its metaslabs.
	 */
	if (vd == vd->vdev_top) {

		if (vd->vdev_ms_array == 0 ||
		    vd->vdev_ms_shift == 0 ||
		    vd->vdev_ashift == 0 ||
		    vd->vdev_asize == 0) {
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		vdev_metaslab_init(vd, 0);
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		error = vdev_dtl_load(vd);
		if (error) {
			dprintf("can't load DTL for %s, error %d\n",
			    vdev_description(vd), error);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}
	}

	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

void
vdev_add_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	ASSERT(vd == vd->vdev_top);

	if (vd->vdev_ms_array == 0)
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);

	ASSERT(vd->vdev_ms_array != 0);

	vdev_config_dirty(vd);

	dmu_tx_commit(tx);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
	uint8_t dirty = *dirtyp;

	mutex_enter(&vd->vdev_dirty_lock);
	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
	mutex_exit(&vd->vdev_dirty_lock);

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (dirty & VDD_ADD)
		vdev_add_sync(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
		metaslab_sync(msp, txg);

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}
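
/*
 * Note on the flow above: vdev_sync() drains the metaslab and DTL
 * lists dirtied in txg, then re-adds the vdev to spa_vdev_txg_list
 * under TXG_CLEAN(txg) so that vdev_sync_done() will run for it once
 * the txg commits.
 */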

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

void
vdev_io_start(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
}

void
vdev_io_done(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

int
vdev_online(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("ONLINE: %s\n", vdev_description(vd));

	vd->vdev_offline = B_FALSE;

	/*
	 * Clear the error counts.  The idea is that you expect to see all
	 * zeroes when everything is working, so if you've just onlined a
	 * device, you don't want to keep hearing about errors from before.
	 */
	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	vdev_reopen(vd->vdev_top, NULL);

	spa_config_exit(spa);

	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
vdev_offline(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("OFFLINE: %s\n", vdev_description(vd));

	/*
	 * If this device's top-level vdev has a non-empty DTL,
	 * don't allow the device to be offlined.
	 *
	 * XXX -- we should make this more precise by allowing the offline
	 * as long as the remaining devices don't have any DTL holes.
	 */
	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
		spa_config_exit(spa);
		return (EBUSY);
	}

	/*
	 * Set this device to offline state and reopen its top-level vdev.
	 * If this action results in the top-level vdev becoming unusable,
	 * undo it and fail the request.
	 */
	vd->vdev_offline = B_TRUE;
	vdev_reopen(vd->vdev_top, NULL);
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_offline = B_FALSE;
		vdev_reopen(vd->vdev_top, NULL);
		spa_config_exit(spa);
		return (EBUSY);
	}

	spa_config_exit(spa);

	return (0);
}

int
vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	vd->vdev_fault_mode = mode;
	vd->vdev_fault_mask = mask;
	vd->vdev_fault_arg = arg;

	spa_config_exit(spa);

	return (0);
}

int
vdev_is_dead(vdev_t *vd)
{
	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
}

int
vdev_error_inject(vdev_t *vd, zio_t *zio)
{
	int error = 0;

	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
		return (0);

	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
		return (0);

	switch (vd->vdev_fault_mode) {
	case VDEV_FAULT_RANDOM:
		if (spa_get_random(vd->vdev_fault_arg) == 0)
			error = EIO;
		break;

	case VDEV_FAULT_COUNT:
		if ((int64_t)--vd->vdev_fault_arg <= 0)
			vd->vdev_fault_mode = VDEV_FAULT_NONE;
		error = EIO;
		break;
	}

	if (error != 0) {
		dprintf("returning %d for type %d on %s state %d offset %llx\n",
		    error, zio->io_type, vdev_description(vd),
		    vd->vdev_state, zio->io_offset);
	}

	return (error);
}
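
/*
 * Fault injection semantics, as implemented above: VDEV_FAULT_RANDOM
 * fails a matching I/O with probability 1 in vdev_fault_arg (EIO
 * when spa_get_random(arg) == 0); VDEV_FAULT_COUNT fails the next
 * vdev_fault_arg matching I/Os and then disarms itself.  The fault
 * mask selects which zio types are eligible (1ULL << io_type).
 */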

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c, t;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			vs->vs_read_errors += cvs->vs_read_errors;
			vs->vs_write_errors += cvs->vs_write_errors;
			vs->vs_checksum_errors += cvs->vs_checksum_errors;
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			vs->vs_scrub_errors += cvs->vs_scrub_errors;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

void
vdev_stat_update(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	if (zio->io_error == 0) {
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_ops[type]++;
			vs->vs_bytes[type] += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		if ((flags & ZIO_FLAG_IO_REPAIR) &&
		    zio->io_delegate_list == NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
				vs->vs_scrub_repaired += zio->io_size;
			else
				vs->vs_self_healed += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	if (!vdev_is_dead(vd)) {
		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE)
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (type == ZIO_TYPE_WRITE) {
		if (txg == 0 || vd->vdev_children != 0)
			return;
		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
		}
		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			vdev_t *tvd = vd->vdev_top;
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
				return;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
		}
	}
}

/*
 * Update scrub bookkeeping for this vdev and all of its children:
 * reset the counters when a new scrub or resilver starts, or record
 * completion status when one ends (type == POOL_SCRUB_NONE).
 */
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	int c;
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_errors = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Report checksum errors that a vdev didn't realize it made.
 * This can happen, for example, when RAID-Z combinatorial reconstruction
 * infers that one of its components returned bad data.
 */
void
vdev_checksum_error(zio_t *zio, vdev_t *vd)
{
	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
	    vdev_description(vd));

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
{
	ASSERT(vd == vd->vdev_top);

	do {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_space += space_delta;
		vd->vdev_stat.vs_alloc += alloc_delta;
		mutex_exit(&vd->vdev_stat_lock);
	} while ((vd = vd->vdev_parent) != NULL);
}
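
/*
 * Example (illustrative; the caller and deltas are hypothetical): an
 * allocation of 8K from a top-level vdev 'tvd' would be accounted as
 *
 *	vdev_space_update(tvd, 0, 8192);
 *
 * and the corresponding free as vdev_space_update(tvd, 0, -8192ULL),
 * relying on unsigned wraparound so that the addition subtracts.  The
 * do/while loop above rolls each delta up into the root vdev, keeping
 * pool-wide vs_space/vs_alloc consistent with the sum of its children.
 */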

/*
 * Various knobs to tune a vdev.
 */
static vdev_knob_t vdev_knob[] = {
	{
		"cache_size",
		"size of the read-ahead cache",
		0,
		1ULL << 30,
		10ULL << 20,
		offsetof(struct vdev, vdev_cache.vc_size)
	},
	{
		"cache_bshift",
		"log2 of cache blocksize",
		SPA_MINBLOCKSHIFT,
		SPA_MAXBLOCKSHIFT,
		16,
		offsetof(struct vdev, vdev_cache.vc_bshift)
	},
	{
		"cache_max",
		"largest block size to cache",
		0,
		SPA_MAXBLOCKSIZE,
		1ULL << 14,
		offsetof(struct vdev, vdev_cache.vc_max)
	},
	{
		"min_pending",
		"minimum pending I/Os to the disk",
		1,
		10000,
		2,
		offsetof(struct vdev, vdev_queue.vq_min_pending)
	},
	{
		"max_pending",
		"maximum pending I/Os to the disk",
		1,
		10000,
		35,
		offsetof(struct vdev, vdev_queue.vq_max_pending)
	},
	{
		"agg_limit",
		"maximum size of aggregated I/Os",
		0,
		SPA_MAXBLOCKSIZE,
		SPA_MAXBLOCKSIZE,
		offsetof(struct vdev, vdev_queue.vq_agg_limit)
	},
	{
		"time_shift",
		"deadline = pri + (lbolt >> time_shift)",
		0,
		63,
		4,
		offsetof(struct vdev, vdev_queue.vq_time_shift)
	},
	{
		"ramp_rate",
		"exponential I/O issue ramp-up rate",
		1,
		10000,
		2,
		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
	},
};

/*
 * Iterate over the knob table: passing NULL yields the first entry, and
 * the iterator returns NULL after the last one.
 */
vdev_knob_t *
vdev_knob_next(vdev_knob_t *vk)
{
	if (vk == NULL)
		return (vdev_knob);

	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
		return (NULL);

	return (vk);
}
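
/*
 * Sketch of the intended iteration pattern (the consumer shown here is
 * hypothetical, e.g. code exporting the knobs as tunables):
 *
 *	vdev_knob_t *vk;
 *
 *	for (vk = vdev_knob_next(NULL); vk != NULL;
 *	    vk = vdev_knob_next(vk))
 *		(void) printf("%s: %s\n", vk->vk_name, vk->vk_desc);
 *
 * Each entry supplies min/max/default values plus the offsetof() of the
 * field it controls, so a generic walker can read or set any knob on a
 * given vdev without knowing its type.
 */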

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!vd->vdev_is_dirty) {
			list_insert_head(&spa->spa_dirty_list, vd);
			vd->vdev_is_dirty = B_TRUE;
		}
	}
}

/*
 * Remove a top-level vdev from the dirty list once its configuration
 * has been synced.
 */
void
vdev_config_clean(vdev_t *vd)
{
	ASSERT(vd->vdev_is_dirty);

	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
	vd->vdev_is_dirty = B_FALSE;
}

/*
 * Set a vdev's state, updating any parent's state as well.
 */
void
vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
{
	if (state == vd->vdev_state)
		return;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	if (vd->vdev_parent != NULL) {
		int c;
		int degraded = 0, faulted = 0;
		vdev_t *parent, *child;

		parent = vd->vdev_parent;
		for (c = 0; c < parent->vdev_children; c++) {
			child = parent->vdev_child[c];
			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
				faulted++;
			else if (child->vdev_state == VDEV_STATE_DEGRADED)
				degraded++;
		}

		vd->vdev_parent->vdev_ops->vdev_op_state_change(
		    vd->vdev_parent, faulted, degraded);
	}
}
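
/*
 * Worked example for the state propagation above (illustrative): in a
 * two-way mirror with one child that cannot open, the loop computes
 * faulted = 1 and degraded = 0 over the mirror's children; the mirror's
 * vdev_op_state_change() callback can then mark the mirror DEGRADED
 * rather than FAULTED, since the other child is still healthy, and the
 * same accounting repeats at each level as the state change ripples up.
 */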