/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing
 * a pool.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/fs/zfs.h>
#include <sys/callb.h>

static uint32_t spa_active_count;

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

/*
 * Activate an uninitialized pool.
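 * This sets up the in-core structures the pool needs before it can be
 * loaded or created: the metaslab class, the per-type zio taskqs, the
 * traverse lock, and the dirty-vdev lists.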
 */
static void
spa_activate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;

	spa->spa_normal_class = metaslab_class_create();

	spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
	    4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);

	for (t = 0; t < ZIO_TYPES; t++) {
		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
		    8, maxclsyspri, 50, INT_MAX,
		    TASKQ_PREPOPULATE);
	}

	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);

	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);

	rw_destroy(&spa->spa_traverse_lock);

	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	taskq_destroy(spa->spa_vdev_retry_taskq);
	spa->spa_vdev_retry_taskq = NULL;

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.
 * This will create all the necessary vdevs in the appropriate layout, with
 * each vdev in the CLOSED state.  This will prep the pool before
 * open/creation/import.  All vdev validation is done by the vdev_alloc()
 * routine.
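 * Non-leaf vdevs are built recursively from the nvlist's
 * ZPOOL_CONFIG_CHILDREN array; any allocation failure unwinds the
 * partially built subtree.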
 */
static vdev_t *
spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype)
{
	nvlist_t **child;
	uint_t c, children;
	vdev_t *vd;

	if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL)
		return (NULL);

	if (vd->vdev_ops->vdev_op_leaf)
		return (vd);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {
		vdev_free(vd);
		return (NULL);
	}

	for (c = 0; c < children; c++) {
		if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) {
			vdev_free(vd);
			return (NULL);
		}
	}

	return (vd);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding prefetch I/O to complete.
	 */
	spa_config_enter(spa, RW_WRITER);
	spa_config_exit(spa);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
	}

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev) {
		vdev_free(spa->spa_root_vdev);
		spa->spa_root_vdev = NULL;
	}
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.  The 'readonly' flag will prevent us
 * from writing any updated state to disk, and can be used when testing a pool
 * for import.
 */
static int
spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
{
	int error = 0;
	nvlist_t *nvroot = NULL;
	vdev_t *rvd;
	uberblock_t *ub = &spa->spa_uberblock;
	uint64_t pool_guid;
	zio_t *zio;

	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (EINVAL);

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if (import && spa_guid_exists(pool_guid, 0))
		return (EEXIST);

	/*
	 * Parse the configuration into a vdev tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
	spa_config_exit(spa);

	if (rvd == NULL)
		return (EINVAL);

	spa->spa_root_vdev = rvd;
	ASSERT(spa_guid(spa) == pool_guid);

	/*
	 * Try to open all vdevs, loading each label in the process.
	 */
	if (vdev_open(rvd) != 0)
		return (ENXIO);

	/*
	 * Find the best uberblock.
	 */
	bzero(ub, sizeof (uberblock_t));

	zio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
	vdev_uberblock_load(zio, rvd, ub);
	error = zio_wait(zio);

	/*
	 * If we weren't able to find a single valid uberblock, return failure.
	 */
	if (ub->ub_txg == 0) {
		dprintf("ub_txg is zero\n");
		return (ENXIO);
	}

	/*
	 * If the vdev guid sum doesn't match the uberblock, we have an
	 * incomplete configuration.
	 */
	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
		rvd->vdev_state = VDEV_STATE_CANT_OPEN;
		rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
		dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
		    rvd->vdev_guid_sum, ub->ub_guid_sum);
		return (ENXIO);
	}

	/*
	 * Initialize internal SPA structures.
	 */
	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_ubsync = spa->spa_uberblock;
	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
	spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object) == 0);

	if (!mosconfig) {
		dmu_buf_t *db;
		char *packed = NULL;
		size_t nvsize = 0;
		nvlist_t *newconfig = NULL;

		db = dmu_bonus_hold(spa->spa_meta_objset,
		    spa->spa_config_object);
		dmu_buf_read(db);
		nvsize = *(uint64_t *)db->db_data;
		dmu_buf_rele(db);

		packed = kmem_alloc(nvsize, KM_SLEEP);
		error = dmu_read_canfail(spa->spa_meta_objset,
		    spa->spa_config_object, 0, nvsize, packed);
		if (error == 0)
			error = nvlist_unpack(packed, nvsize, &newconfig, 0);
		kmem_free(packed, nvsize);

		if (error)
			return (ENXIO);

		spa_config_set(spa, newconfig);

		spa_unload(spa);
		spa_deactivate(spa);
		spa_activate(spa);

		return (spa_load(spa, newconfig, readonly, import, B_TRUE));
	}

	VERIFY(zap_lookup(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);

	/*
	 * Load the vdev state for all top level vdevs.
	 */
	if ((error = vdev_load(rvd, import)) != 0)
		return (error);

	/*
	 * Propagate the leaf DTLs we just loaded all the way up the tree.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
	spa_config_exit(spa);

	/*
	 * Check the state of the root vdev.  If it can't be opened, it
	 * indicates one or more toplevel vdevs are faulted.
	 */
	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
		return (ENXIO);

	/*
	 * Claim log blocks that haven't been committed yet, and update all
	 * top-level vdevs to sync any config changes found in vdev_load().
	 * This must all happen in a single txg.
	 */
	if ((spa_mode & FWRITE) && !readonly) {
		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
		    spa_first_txg(spa));
		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
		vdev_config_dirty(rvd);
		dmu_tx_commit(tx);

		spa->spa_sync_on = B_TRUE;
		txg_sync_start(spa->spa_dsl_pool);

		/*
		 * Wait for all claims to sync.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}

	return (0);
}

/*
 * Pool Open/Import
 *
 * The import case is identical to an open except that the configuration is
 * sent down from userland, instead of grabbed from the configuration cache.
 * For the case of an open, the pool configuration will exist in the
 * POOL_STATE_UNINITIALIZED state.
 *
 * The stats information (gen/count/ustats) is used to gather vdev statistics
 * at the same time we open the pool, without having to keep around the spa_t
 * in some ambiguous state.
 */
static int
spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
{
	spa_t *spa;
	int error;
	int loaded = B_FALSE;
	int locked = B_FALSE;

	*spapp = NULL;

	/*
	 * As disgusting as this is, we need to support recursive calls to this
	 * function because dsl_dir_open() is called during spa_load(), and
	 * ends up calling spa_open() again.  The real fix is to figure out how
	 * to avoid dsl_dir_open() calling this in the first place.
	 */
	if (mutex_owner(&spa_namespace_lock) != curthread) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	if ((spa = spa_lookup(pool)) == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}
	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {

		spa_activate(spa);

		error = spa_load(spa, spa->spa_config,
		    B_FALSE, B_FALSE, B_FALSE);

		if (error == EBADF) {
			/*
			 * If vdev_load() returns EBADF, it indicates that one
			 * of the vdevs indicates that the pool has been
			 * exported or destroyed.  If this is the case, the
			 * config cache is out of sync and we should remove the
			 * pool from the namespace.
			 */
			spa_unload(spa);
			spa_deactivate(spa);
			spa_remove(spa);
			spa_config_sync();
			if (locked)
				mutex_exit(&spa_namespace_lock);
			return (ENOENT);
		}
		if (error) {
			/*
			 * We can't open the pool, but we still have useful
			 * information: the state of each vdev after the
			 * attempted vdev_open().  Return this to the user.
			 */
			if (config != NULL && spa->spa_root_vdev != NULL)
				*config = spa_config_generate(spa, NULL, -1ULL,
				    B_TRUE);
			spa_unload(spa);
			spa_deactivate(spa);
			if (locked)
				mutex_exit(&spa_namespace_lock);
			*spapp = NULL;
			return (error);
		}

		loaded = B_TRUE;
	}

	spa_open_ref(spa, tag);
	if (locked)
		mutex_exit(&spa_namespace_lock);

	*spapp = spa;

	if (config != NULL) {
		spa_config_enter(spa, RW_READER);
		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		spa_config_exit(spa);
	}

	/*
	 * If we just loaded the pool, resilver anything that's out of date.
	 */
	if (loaded && (spa_mode & FWRITE))
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
spa_open(const char *name, spa_t **spapp, void *tag)
{
	return (spa_open_common(name, spapp, tag, NULL));
}

int
spa_get_stats(const char *name, nvlist_t **config)
{
	int error;
	spa_t *spa;

	*config = NULL;
	error = spa_open_common(name, &spa, FTAG, config);

	if (spa != NULL)
		spa_close(spa, FTAG);

	return (error);
}

/*
 * Pool Creation
 */
int
spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
{
	spa_t *spa;
	dsl_pool_t *dp;
	dmu_tx_t *tx;
	int error;
	uint64_t txg = TXG_INITIAL;

	/*
	 * If this pool already exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}
	spa = spa_add(pool);

	/*
	 * Allocate a new spa_t structure.
	 */
	spa_activate(spa);

	spa->spa_uberblock.ub_txg = txg - 1;
	spa->spa_ubsync = spa->spa_uberblock;

	error = spa_vdev_add(spa, nvroot);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if (altroot != NULL) {
		spa->spa_root = spa_strdup(altroot);
		atomic_add_32(&spa_active_count, 1);
	}

	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
	spa->spa_meta_objset = dp->dp_meta_objset;

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Create the pool config object.
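	 * It is a packed-nvlist DMU object whose bonus buffer records the
	 * packed size; spa_sync_config_object() rewrites it as the vdev
	 * tree changes.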
	 */
	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
	    DMU_OT_PACKED_NVLIST, 1 << 14,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);

	/*
	 * Create the deferred-free bplist object.  Turn off compression
	 * because sync-to-convergence takes longer if the blocksize
	 * keeps changing.
	 */
	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
	    1 << 14, tx);
	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
	    ZIO_COMPRESS_OFF, tx);

	VERIFY(zap_add(spa->spa_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);

	dmu_tx_commit(tx);

	spa->spa_sync_on = B_TRUE;
	txg_sync_start(spa->spa_dsl_pool);

	/*
	 * We explicitly wait for the first transaction to complete so that our
	 * bean counters are appropriately updated.
	 */
	txg_wait_synced(spa->spa_dsl_pool, txg);

	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Import the given pool into the system.  We set up the necessary spa_t and
 * then call spa_load() to do the dirty work.
 */
int
spa_import(const char *pool, nvlist_t *config, char *altroot)
{
	spa_t *spa;
	int error;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	/*
	 * If a pool with this name exists, return failure.
	 */
	mutex_enter(&spa_namespace_lock);
	if (spa_lookup(pool) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (EEXIST);
	}

	/*
	 * Create and initialize the spa structure.
	 */
	spa = spa_add(pool);
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for
	 * mosconfig so that we don't try to open the pool if the config is
	 * damaged.
	 */
	error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);

	if (error) {
		spa_unload(spa);
		spa_deactivate(spa);
		spa_remove(spa);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	/*
	 * Set the alternate root, if there is one.
	 */
	if (altroot != NULL) {
		atomic_add_32(&spa_active_count, 1);
		spa->spa_root = spa_strdup(altroot);
	}

	/*
	 * Initialize the config based on the in-core state.
	 */
	config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0);

	spa_config_set(spa, config);

	/*
	 * Sync the configuration cache.
	 */
	spa_config_sync();

	mutex_exit(&spa_namespace_lock);

	/*
	 * Resilver anything that's out of date.
	 */
	if (spa_mode & FWRITE)
		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME);

	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	/*
	 * Initialize the spa_t structure.
	 */
	spa_activate(spa);

	/*
	 * Pass off the heavy lifting to spa_load().  We pass TRUE for
	 * mosconfig so we don't try to open the pool if the config is damaged.
	 */
	(void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
	}

	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple.  We make sure
 * there is no more pending I/O and any references to the pool are gone.
 * Then, we update the pool state and sync all the labels to disk, removing
 * the configuration from the cache afterwards.
 */
static int
spa_export_common(char *pool, int new_state)
{
	spa_t *spa;

	if (!(spa_mode & FWRITE))
		return (EROFS);

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (ENOENT);
	}

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		spa_scrub_suspend(spa);
		txg_wait_synced(spa->spa_dsl_pool, 0);

		if (!spa_refcount_zero(spa)) {
			spa_scrub_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (EBUSY);
		}

		/*
		 * Update the pool state.
		 */
		spa->spa_state = new_state;

		spa_scrub_resume(spa);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);

		if (spa->spa_root != NULL)
			atomic_add_32(&spa_active_count, -1);

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		vdev_config_dirty(spa->spa_root_vdev);
	}

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	spa_remove(spa);
	spa_config_sync();
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add capacity to a storage pool.
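 * The nvroot describes one or more new top-level vdevs; they are parsed,
 * created, and grafted onto the pool's root vdev under spa_vdev_enter().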
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg;
	int c, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;

	txg = spa_vdev_enter(spa);

	vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (vd == NULL)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (rvd == NULL)			/* spa_create() */
		spa->spa_root_vdev = rvd = vd;

	if ((error = vdev_create(vd, txg)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each top-level vdev from the temporary root
	 * to the spa's root and initialize its metaslabs.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *tvd = vd->vdev_child[c];
		if (vd != rvd) {
			vdev_remove_child(vd, tvd);
			tvd->vdev_id = rvd->vdev_children;
			vdev_add_child(rvd, tvd);
		}
		vdev_init(tvd, txg);
		vdev_config_dirty(tvd);
	}

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, open_txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops;

	txg = spa_vdev_enter(spa);

	oldvd = vdev_lookup_by_path(rvd, path);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = oldvd->vdev_parent;

	/*
	 * The parent must be a mirror or the root, unless we're replacing;
	 * in that case, the parent can be anything but another replacing vdev.
	 */
	if (pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_root_ops &&
	    (!replacing || pvd->vdev_ops == &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD);

	if (newrootvd == NULL || newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	if (newvd->vdev_psize < oldvd->vdev_psize)
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/*
	 * If the parent is not a mirror, or if we're replacing,
	 * insert the new mirror/replacing vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	open_txg = txg + TXG_CONCURRENT_STATES - 1;

	mutex_enter(&newvd->vdev_dtl_lock);
	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
	    open_txg - TXG_INITIAL + 1);
	mutex_exit(&newvd->vdev_dtl_lock);

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);

	dprintf("attached %s, replacing=%d\n", path, replacing);

	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);

	/*
	 * Kick off a resilver to update newvd.
	 */
	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
{
	uint64_t txg;
	int c, t, error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;

	txg = spa_vdev_enter(spa);

	vd = vdev_lookup_by_path(rvd, path);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (guid != 0 && vd->vdev_guid != guid)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	pvd = vd->vdev_parent;

	/*
	 * If replace_done is specified, only remove this device if it's
	 * the first child of a replacing vdev.
	 */
	if (replace_done &&
	    (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops))
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Only mirror and replacing vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If there's only one replica, you can't detach it.
	 */
	if (pvd->vdev_children <= 1)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * If all siblings have non-empty DTLs, this device may have the only
	 * valid copy of the data, which means we cannot safely detach it.
	 *
	 * XXX -- as in the vdev_offline() case, we really want a more
	 * precise DTL check.
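	 * For now, a sibling is assumed to hold a full copy as long as its
	 * DTL and scrub DTL are both empty, so the loop below only needs to
	 * find one healthy sibling with no dirty space.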
	 */
	for (c = 0; c < pvd->vdev_children; c++) {
		uint64_t dirty;

		cvd = pvd->vdev_child[c];
		if (cvd == vd)
			continue;
		if (vdev_is_dead(cvd))
			continue;
		mutex_enter(&cvd->vdev_dtl_lock);
		dirty = cvd->vdev_dtl_map.sm_space |
		    cvd->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd->vdev_dtl_lock);
		if (!dirty)
			break;
	}
	if (c == pvd->vdev_children)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0);
	if (error)
		dprintf("unable to erase labels on %s\n", vdev_description(vd));

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[0];

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1)
		vdev_remove_parent(cvd);

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reopen this top-level vdev to reassess health after detach.
	 */
	vdev_reopen(tvd, NULL);

	/*
	 * If the device we just detached was smaller than the others,
	 * it may be possible to add metaslabs (i.e. grow the pool).
	 */
	vdev_metaslab_init(tvd, txg);

	/*
	 * Update the config based on the new in-core state.
	 */
	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.
	 * vdev_dtl_sync() will see that vd->vdev_detached is set
	 * and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list,
	 * to prevent vd from being accessed after it's freed.
	 */
	vdev_dirty(tvd, VDD_DTL, txg);
	vd->vdev_detached = B_TRUE;
	for (t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);

	dprintf("detached %s\n", path);

	return (spa_vdev_exit(spa, vd, txg, 0));
}

/*
 * If there are any replacing vdevs that have finished replacing, detach them.
 * We can't hold the config lock across detaches, so we lock the config,
 * build a list of candidates, unlock the config, and try each candidate.
 */
typedef struct vdev_detach_link {
	char		*vdl_path;
	uint64_t	vdl_guid;
	list_node_t	vdl_node;
} vdev_detach_link_t;

static void
spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);

	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
		vdev_t *cvd0 = vd->vdev_child[0];
		vdev_t *cvd1 = vd->vdev_child[1];
		vdev_detach_link_t *vdl;
		int dirty1;

		mutex_enter(&cvd1->vdev_dtl_lock);
		dirty1 = cvd1->vdev_dtl_map.sm_space |
		    cvd1->vdev_dtl_scrub.sm_space;
		mutex_exit(&cvd1->vdev_dtl_lock);

		if (!dirty1) {
			vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
			vdl->vdl_path = spa_strdup(cvd0->vdev_path);
			vdl->vdl_guid = cvd0->vdev_guid;
			list_insert_tail(l, vdl);
		}
	}
}

void
spa_vdev_replace_done(spa_t *spa)
{
	vdev_detach_link_t *vdl;
	list_t vdlist;

	list_create(&vdlist, sizeof (vdev_detach_link_t),
	    offsetof(vdev_detach_link_t, vdl_node));

	spa_config_enter(spa, RW_READER);
	spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
	spa_config_exit(spa);

	while ((vdl = list_head(&vdlist)) != NULL) {
		list_remove(&vdlist, vdl);
		(void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
		    B_TRUE);
		spa_strfree(vdl->vdl_path);
		kmem_free(vdl, sizeof (*vdl));
	}

	list_destroy(&vdlist);
}

/*
 * ==========================================================================
 * SPA Scrubbing
 * ==========================================================================
 */

static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);

static void
spa_scrub_io_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	zio_buf_free(zio->io_data,
	    zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	if (zio->io_error)
		spa->spa_scrub_errors++;
	if (--spa->spa_scrub_inflight == 0)
		cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);

	if (zio->io_error) {
		vdev_t *vd = zio->io_vd;
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_scrub_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

static void
spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
{
	size_t size = BP_GET_LSIZE(bp);
	void *data = zio_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(NULL, spa, bp, data, size,
	    spa_scrub_io_done, NULL, priority, flags));
}

/* ARGSUSED */
static int
spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
{
	blkptr_t *bp = &bc->bc_blkptr;
	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));

	if (bc->bc_errno || vd == NULL) {
		/*
		 * We can't scrub this block, but we can continue to scrub
		 * the rest of the pool.  Note the error and move along.
		 */
		mutex_enter(&spa->spa_scrub_lock);
		spa->spa_scrub_errors++;
		mutex_exit(&spa->spa_scrub_lock);

		if (vd != NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			vd->vdev_stat.vs_scrub_errors++;
			mutex_exit(&vd->vdev_stat_lock);
		}

		return (ERESTART);
	}

	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);

	/*
	 * Keep track of how much data we've examined so that
	 * zpool(1M) status can make useful progress reports.
	 */
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
	mutex_exit(&vd->vdev_stat_lock);

	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
		if (DVA_GET_GANG(&bp->blk_dva[0])) {
			/*
			 * Gang members may be spread across multiple vdevs,
			 * so the best we can do is look at the pool-wide DTL.
			 * XXX -- it would be better to change our allocation
			 * policy to ensure that this can't happen.
			 */
			vd = spa->spa_root_vdev;
		}
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
			    ZIO_FLAG_RESILVER);
		}
	} else {
		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
	}

	return (0);
}

static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	spa_config_enter(spa, RW_WRITER);
	vdev_reopen(rvd, NULL);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;

	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		while (spa->spa_scrub_suspend) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		if (error != EAGAIN)
			break;
	}

	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If the traverse completed, and there were no errors,
	 * then the scrub was completely successful.
	 */
	complete = (error == 0 && spa->spa_scrub_errors == 0);

	dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	spa_config_enter(spa, RW_WRITER);
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	spa_config_exit(spa);

	spa_vdev_replace_done(spa);

	spa_config_enter(spa, RW_READER);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_config_exit(spa);

	mutex_enter(&spa->spa_scrub_lock);

	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;

	cv_broadcast(&spa->spa_scrub_cv);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);

	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}

void
spa_scrub_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_suspend++;
	while (spa->spa_scrub_active) {
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_scrub_lock);
	ASSERT(spa->spa_scrub_suspend != 0);
	if (--spa->spa_scrub_suspend == 0)
		cv_broadcast(&spa->spa_scrub_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

void
spa_scrub_restart(spa_t *spa, uint64_t txg)
{
	/*
	 * Something happened (e.g. snapshot create/delete) that means
	 * we must restart any in-progress scrubs.  The itinerary will
	 * fix this properly.
	 */
	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_restart_txg = txg;
	mutex_exit(&spa->spa_scrub_lock);
}

static int
spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;
	int advance = 0;

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
			return (EBUSY);

		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	switch (type) {

	case POOL_SCRUB_NONE:
		break;

	case POOL_SCRUB_RESILVER:
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		mutex_enter(&rvd->vdev_dtl_lock);
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss ? ss->ss_start - 1 : 0;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = ss ? ss->ss_end : 0;
		maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
		mutex_exit(&rvd->vdev_dtl_lock);

		advance = ADVANCE_PRE | ADVANCE_PRUNE;
		break;

	case POOL_SCRUB_EVERYTHING:
		/*
		 * A scrub is like a resilver, but not pruned by DTL.
int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	int error;
	traverse_handle_t *th;

	mutex_enter(&spa->spa_scrub_lock);
	error = spa_scrub_locked(spa, type, force);
	th = spa->spa_scrub_th;
	mutex_exit(&spa->spa_scrub_lock);

	if (th == NULL && type != POOL_SCRUB_NONE)
		spa_vdev_replace_done(spa);

	return (error);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static void
spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
{
	bplist_t *bpl = &spa->spa_sync_bplist;
	dmu_tx_t *tx;
	blkptr_t blk;
	uint64_t itor = 0;
	zio_t *zio;
	int error;
	uint8_t c = 1;

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);

	while (bplist_iterate(bpl, &itor, &blk) == 0)
		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));

	error = zio_wait(zio);
	ASSERT3U(error, ==, 0);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	bplist_vacate(bpl, tx);

	/*
	 * Pre-dirty the first block so we sync to convergence faster.
	 * (Usually only the first block is needed.)
	 */
	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
	dmu_tx_commit(tx);
}
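/*
 * Illustrative sketch (editorial addition, not part of the original code):
 * spa_sync_deferred_frees() above uses the standard zio fan-out/fan-in
 * pattern -- create a root zio, hang any number of child I/Os off it with
 * zio_nowait(), then zio_wait() on the root to block until every child has
 * completed.  A minimal restatement of the shape:
 *
 *	zio_t *zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
 *
 *	for each block pointer blk to free:
 *		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
 *
 *	error = zio_wait(zio);	// returns once all children are done
 */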
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;
	char *packed = NULL;
	size_t nvsize = 0;
	dmu_buf_t *db;

	if (list_is_empty(&spa->spa_dirty_list))
		return;

	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);

	spa_config_set(spa, config);

	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);

	packed = kmem_alloc(nvsize, KM_SLEEP);

	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);

	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
	    packed, tx);

	kmem_free(packed, nvsize);

	db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db);
}
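/*
 * Illustrative sketch (editorial addition, not part of the original code):
 * spa_sync_config_object() above stores the pool config as an XDR-packed
 * nvlist in the config object's data and records the packed size in that
 * object's bonus buffer.  A reader therefore reverses the steps: hold the
 * bonus buffer to learn the size, read that many bytes, and unpack.
 * Roughly (error handling and the exact DMU read call elided):
 *
 *	db = dmu_bonus_hold(mos, spa->spa_config_object);
 *	nvsize = *(uint64_t *)db->db_data;
 *	dmu_buf_rele(db);
 *
 *	packed = kmem_alloc(nvsize, KM_SLEEP);
 *	// read nvsize bytes at offset 0 of spa_config_object into packed
 *	VERIFY(nvlist_unpack(packed, nvsize, &config, 0) == 0);
 *	kmem_free(packed, nvsize);
 */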
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	bplist_open(bpl, mos, spa->spa_sync_bplist_obj);

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		tx = dmu_tx_create_assigned(dp, txg);
		spa_sync_config_object(spa, tx);
		dmu_tx_commit(tx);

		dsl_pool_sync(dp, txg);

		dirty_vdevs = 0;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
		    txg)) != NULL) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		tx = dmu_tx_create_assigned(dp, txg);
		bplist_sync(bpl, tx);
		dmu_tx_commit(tx);

	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 */
	while (spa_sync_labels(spa, txg)) {
		dprintf("waiting for devices to heal\n");
		delay(hz);
		vdev_reopen(rvd, NULL);
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
	    TXG_CLEAN(txg))) != NULL)
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since spa_sync_labels().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa);
}
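/*
 * Editorial note (not part of the original code): spa_sync() is driven by
 * the pool's txg sync machinery rather than called directly by consumers.
 * Code that needs a transaction group safely on disk waits for it instead,
 * as spa_sync_allpools() below does:
 *
 *	txg_wait_synced(spa_get_dsl(spa), 0);	// 0 = the currently open txg
 */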
/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during
 * the sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE)
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

int
spa_busy(void)
{
	return (spa_active_count != 0);
}

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop all scrub and resilver activity.  spa_scrub() needs to
		 * wait for the scrub thread, which may do a detach and sync
		 * the configs, which needs spa_namespace_lock.  Drop the lock
		 * while maintaining a hold on the spa_t.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}
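/*
 * Illustrative sketch (editorial addition, not part of the original code):
 * spa_busy() and spa_evict_all() are intended for module teardown.  A
 * hypothetical unload path might gate on the former and then discard all
 * cached pool state with the latter:
 *
 *	if (spa_busy())
 *		return (EBUSY);		// someone still has a pool active
 *	spa_evict_all();		// drop all cached spa_t state
 */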