1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 5789Sahrens * Common Development and Distribution License, Version 1.0 only 6789Sahrens * (the "License"). You may not use this file except in compliance 7789Sahrens * with the License. 8789Sahrens * 9789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10789Sahrens * or http://www.opensolaris.org/os/licensing. 11789Sahrens * See the License for the specific language governing permissions 12789Sahrens * and limitations under the License. 13789Sahrens * 14789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16789Sahrens * If applicable, add the following below this CDDL HEADER, with the 17789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19789Sahrens * 20789Sahrens * CDDL HEADER END 21789Sahrens */ 22789Sahrens /* 23789Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens /* 30789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32789Sahrens * pool. 33789Sahrens */ 34789Sahrens 35789Sahrens #include <sys/zfs_context.h> 36789Sahrens #include <sys/spa_impl.h> 37789Sahrens #include <sys/zio.h> 38789Sahrens #include <sys/zio_checksum.h> 39789Sahrens #include <sys/zio_compress.h> 40789Sahrens #include <sys/dmu.h> 41789Sahrens #include <sys/dmu_tx.h> 42789Sahrens #include <sys/zap.h> 43789Sahrens #include <sys/zil.h> 44789Sahrens #include <sys/vdev_impl.h> 45789Sahrens #include <sys/metaslab.h> 46789Sahrens #include <sys/uberblock_impl.h> 47789Sahrens #include <sys/txg.h> 48789Sahrens #include <sys/avl.h> 49789Sahrens #include <sys/dmu_traverse.h> 50789Sahrens #include <sys/unique.h> 51789Sahrens #include <sys/dsl_pool.h> 52789Sahrens #include <sys/dsl_dir.h> 53789Sahrens #include <sys/dsl_prop.h> 54789Sahrens #include <sys/fs/zfs.h> 55789Sahrens #include <sys/callb.h> 56789Sahrens 57789Sahrens static uint32_t spa_active_count; 58789Sahrens 59789Sahrens /* 60789Sahrens * ========================================================================== 61789Sahrens * SPA state manipulation (open/create/destroy/import/export) 62789Sahrens * ========================================================================== 63789Sahrens */ 64789Sahrens 65789Sahrens /* 66789Sahrens * Activate an uninitialized pool. 67789Sahrens */ 68789Sahrens static void 69789Sahrens spa_activate(spa_t *spa) 70789Sahrens { 71789Sahrens int t; 72789Sahrens 73789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 74789Sahrens 75789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 76789Sahrens 77789Sahrens spa->spa_normal_class = metaslab_class_create(); 78789Sahrens 79789Sahrens spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry", 80789Sahrens 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 81789Sahrens 82789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 83789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 84789Sahrens 8, maxclsyspri, 50, INT_MAX, 85789Sahrens TASKQ_PREPOPULATE); 86789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 87789Sahrens 8, maxclsyspri, 50, INT_MAX, 88789Sahrens TASKQ_PREPOPULATE); 89789Sahrens } 90789Sahrens 91789Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 92789Sahrens 93789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 94789Sahrens offsetof(vdev_t, vdev_dirty_node)); 95789Sahrens 96789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 97789Sahrens offsetof(struct vdev, vdev_txg_node)); 98789Sahrens } 99789Sahrens 100789Sahrens /* 101789Sahrens * Opposite of spa_activate(). 102789Sahrens */ 103789Sahrens static void 104789Sahrens spa_deactivate(spa_t *spa) 105789Sahrens { 106789Sahrens int t; 107789Sahrens 108789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 109789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 110789Sahrens ASSERT(spa->spa_root_vdev == NULL); 111789Sahrens 112789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 113789Sahrens 114789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 115789Sahrens 116789Sahrens list_destroy(&spa->spa_dirty_list); 117789Sahrens 118789Sahrens rw_destroy(&spa->spa_traverse_lock); 119789Sahrens 120789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 121789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 122789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 123789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 124789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 125789Sahrens } 126789Sahrens 127789Sahrens taskq_destroy(spa->spa_vdev_retry_taskq); 128789Sahrens spa->spa_vdev_retry_taskq = NULL; 129789Sahrens 130789Sahrens metaslab_class_destroy(spa->spa_normal_class); 131789Sahrens spa->spa_normal_class = NULL; 132789Sahrens 133789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 134789Sahrens } 135789Sahrens 136789Sahrens /* 137789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 138789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 139789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 140789Sahrens * All vdev validation is done by the vdev_alloc() routine. 141789Sahrens */ 142789Sahrens static vdev_t * 143789Sahrens spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 144789Sahrens { 145789Sahrens nvlist_t **child; 146789Sahrens uint_t c, children; 147789Sahrens vdev_t *vd; 148789Sahrens 149789Sahrens if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 150789Sahrens return (NULL); 151789Sahrens 152789Sahrens if (vd->vdev_ops->vdev_op_leaf) 153789Sahrens return (vd); 154789Sahrens 155789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 156789Sahrens &child, &children) != 0) { 157789Sahrens vdev_free(vd); 158789Sahrens return (NULL); 159789Sahrens } 160789Sahrens 161789Sahrens for (c = 0; c < children; c++) { 162789Sahrens if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 163789Sahrens vdev_free(vd); 164789Sahrens return (NULL); 165789Sahrens } 166789Sahrens } 167789Sahrens 168789Sahrens return (vd); 169789Sahrens } 170789Sahrens 171789Sahrens /* 172789Sahrens * Opposite of spa_load(). 173789Sahrens */ 174789Sahrens static void 175789Sahrens spa_unload(spa_t *spa) 176789Sahrens { 177789Sahrens /* 178789Sahrens * Stop syncing. 179789Sahrens */ 180789Sahrens if (spa->spa_sync_on) { 181789Sahrens txg_sync_stop(spa->spa_dsl_pool); 182789Sahrens spa->spa_sync_on = B_FALSE; 183789Sahrens } 184789Sahrens 185789Sahrens /* 186789Sahrens * Wait for any outstanding prefetch I/O to complete. 187789Sahrens */ 188789Sahrens spa_config_enter(spa, RW_WRITER); 189789Sahrens spa_config_exit(spa); 190789Sahrens 191789Sahrens /* 192789Sahrens * Close the dsl pool. 193789Sahrens */ 194789Sahrens if (spa->spa_dsl_pool) { 195789Sahrens dsl_pool_close(spa->spa_dsl_pool); 196789Sahrens spa->spa_dsl_pool = NULL; 197789Sahrens } 198789Sahrens 199789Sahrens /* 200789Sahrens * Close all vdevs. 201789Sahrens */ 202789Sahrens if (spa->spa_root_vdev) { 203789Sahrens vdev_free(spa->spa_root_vdev); 204789Sahrens spa->spa_root_vdev = NULL; 205789Sahrens } 206789Sahrens } 207789Sahrens 208789Sahrens /* 209789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 210789Sahrens * source of configuration information. The 'readonly' flag will prevent us 211789Sahrens * from writing any updated state to disk, and can be use when testing a pool 212789Sahrens * for import. 213789Sahrens */ 214789Sahrens static int 215789Sahrens spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) 216789Sahrens { 217789Sahrens int error = 0; 218789Sahrens nvlist_t *nvroot = NULL; 219789Sahrens vdev_t *rvd; 220789Sahrens uberblock_t *ub = &spa->spa_uberblock; 221789Sahrens uint64_t pool_guid; 222789Sahrens zio_t *zio; 223789Sahrens 224789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 225789Sahrens nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 226789Sahrens return (EINVAL); 227789Sahrens 228789Sahrens (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 229789Sahrens &spa->spa_config_txg); 230789Sahrens 231789Sahrens if (import && spa_guid_exists(pool_guid, 0)) 232789Sahrens return (EEXIST); 233789Sahrens 234789Sahrens /* 235789Sahrens * Parse the configuration into a vdev tree. 236789Sahrens */ 237789Sahrens spa_config_enter(spa, RW_WRITER); 238789Sahrens rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 239789Sahrens spa_config_exit(spa); 240789Sahrens 241789Sahrens if (rvd == NULL) 242789Sahrens return (EINVAL); 243789Sahrens 244789Sahrens spa->spa_root_vdev = rvd; 245789Sahrens ASSERT(spa_guid(spa) == pool_guid); 246789Sahrens 247789Sahrens /* 248789Sahrens * Try to open all vdevs, loading each label in the process. 249789Sahrens */ 250789Sahrens if (vdev_open(rvd) != 0) 251789Sahrens return (ENXIO); 252789Sahrens 253789Sahrens /* 254789Sahrens * Find the best uberblock. 255789Sahrens */ 256789Sahrens bzero(ub, sizeof (uberblock_t)); 257789Sahrens 258789Sahrens zio = zio_root(spa, NULL, NULL, 259789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 260789Sahrens vdev_uberblock_load(zio, rvd, ub); 261789Sahrens error = zio_wait(zio); 262789Sahrens 263789Sahrens /* 264789Sahrens * If we weren't able to find a single valid uberblock, return failure. 265789Sahrens */ 266789Sahrens if (ub->ub_txg == 0) { 267789Sahrens dprintf("ub_txg is zero\n"); 268789Sahrens return (ENXIO); 269789Sahrens } 270789Sahrens 271789Sahrens /* 272789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 273789Sahrens * incomplete configuration. 274789Sahrens */ 275789Sahrens if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 276789Sahrens rvd->vdev_state = VDEV_STATE_CANT_OPEN; 277789Sahrens rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM; 278789Sahrens dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n", 279789Sahrens rvd->vdev_guid_sum, ub->ub_guid_sum); 280789Sahrens return (ENXIO); 281789Sahrens } 282789Sahrens 283789Sahrens /* 284789Sahrens * Initialize internal SPA structures. 285789Sahrens */ 286789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 287789Sahrens spa->spa_ubsync = spa->spa_uberblock; 288789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 289789Sahrens spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg); 290789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 291789Sahrens 292789Sahrens VERIFY(zap_lookup(spa->spa_meta_objset, 293789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 294789Sahrens sizeof (uint64_t), 1, &spa->spa_config_object) == 0); 295789Sahrens 296789Sahrens if (!mosconfig) { 297789Sahrens dmu_buf_t *db; 298789Sahrens char *packed = NULL; 299789Sahrens size_t nvsize = 0; 300789Sahrens nvlist_t *newconfig = NULL; 301789Sahrens 302789Sahrens db = dmu_bonus_hold(spa->spa_meta_objset, 303789Sahrens spa->spa_config_object); 304789Sahrens dmu_buf_read(db); 305789Sahrens nvsize = *(uint64_t *)db->db_data; 306789Sahrens dmu_buf_rele(db); 307789Sahrens 308789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 309789Sahrens error = dmu_read_canfail(spa->spa_meta_objset, 310789Sahrens spa->spa_config_object, 0, nvsize, packed); 311789Sahrens if (error == 0) 312789Sahrens error = nvlist_unpack(packed, nvsize, &newconfig, 0); 313789Sahrens kmem_free(packed, nvsize); 314789Sahrens 315789Sahrens if (error) 316789Sahrens return (ENXIO); 317789Sahrens 318789Sahrens spa_config_set(spa, newconfig); 319789Sahrens 320789Sahrens spa_unload(spa); 321789Sahrens spa_deactivate(spa); 322789Sahrens spa_activate(spa); 323789Sahrens 324789Sahrens return (spa_load(spa, newconfig, readonly, import, B_TRUE)); 325789Sahrens } 326789Sahrens 327789Sahrens VERIFY(zap_lookup(spa->spa_meta_objset, 328789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 329789Sahrens sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0); 330789Sahrens 331789Sahrens /* 332789Sahrens * Load the vdev state for all top level vdevs. 333789Sahrens */ 334789Sahrens if ((error = vdev_load(rvd, import)) != 0) 335789Sahrens return (error); 336789Sahrens 337789Sahrens /* 338789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 339789Sahrens */ 340789Sahrens spa_config_enter(spa, RW_WRITER); 341789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 342789Sahrens spa_config_exit(spa); 343789Sahrens 344789Sahrens /* 345789Sahrens * Check the state of the root vdev. If it can't be opened, it 346789Sahrens * indicates one or more toplevel vdevs are faulted. 347789Sahrens */ 348789Sahrens if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 349789Sahrens return (ENXIO); 350789Sahrens 351789Sahrens /* 352789Sahrens * Claim log blocks that haven't been committed yet, and update all 353789Sahrens * top-level vdevs to sync any config changes found in vdev_load(). 354789Sahrens * This must all happen in a single txg. 355789Sahrens */ 356789Sahrens if ((spa_mode & FWRITE) && !readonly) { 357789Sahrens dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), 358789Sahrens spa_first_txg(spa)); 359789Sahrens dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 360789Sahrens vdev_config_dirty(rvd); 361789Sahrens dmu_tx_commit(tx); 362789Sahrens 363789Sahrens spa->spa_sync_on = B_TRUE; 364789Sahrens txg_sync_start(spa->spa_dsl_pool); 365789Sahrens 366789Sahrens /* 367789Sahrens * Wait for all claims to sync. 368789Sahrens */ 369789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 370789Sahrens } 371789Sahrens 372789Sahrens return (0); 373789Sahrens } 374789Sahrens 375789Sahrens /* 376789Sahrens * Pool Open/Import 377789Sahrens * 378789Sahrens * The import case is identical to an open except that the configuration is sent 379789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 380789Sahrens * case of an open, the pool configuration will exist in the 381789Sahrens * POOL_STATE_UNITIALIZED state. 382789Sahrens * 383789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 384789Sahrens * the same time open the pool, without having to keep around the spa_t in some 385789Sahrens * ambiguous state. 386789Sahrens */ 387789Sahrens static int 388789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 389789Sahrens { 390789Sahrens spa_t *spa; 391789Sahrens int error; 392789Sahrens int loaded = B_FALSE; 393789Sahrens int locked = B_FALSE; 394789Sahrens 395789Sahrens *spapp = NULL; 396789Sahrens 397789Sahrens /* 398789Sahrens * As disgusting as this is, we need to support recursive calls to this 399789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 400789Sahrens * up calling spa_open() again. The real fix is to figure out how to 401789Sahrens * avoid dsl_dir_open() calling this in the first place. 402789Sahrens */ 403789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 404789Sahrens mutex_enter(&spa_namespace_lock); 405789Sahrens locked = B_TRUE; 406789Sahrens } 407789Sahrens 408789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 409789Sahrens if (locked) 410789Sahrens mutex_exit(&spa_namespace_lock); 411789Sahrens return (ENOENT); 412789Sahrens } 413789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 414789Sahrens 415789Sahrens spa_activate(spa); 416789Sahrens 417789Sahrens error = spa_load(spa, spa->spa_config, 418789Sahrens B_FALSE, B_FALSE, B_FALSE); 419789Sahrens 420789Sahrens if (error == EBADF) { 421789Sahrens /* 422789Sahrens * If vdev_load() returns EBADF, it indicates that one 423789Sahrens * of the vdevs indicates that the pool has been 424789Sahrens * exported or destroyed. If this is the case, the 425789Sahrens * config cache is out of sync and we should remove the 426789Sahrens * pool from the namespace. 427789Sahrens */ 428789Sahrens spa_unload(spa); 429789Sahrens spa_deactivate(spa); 430789Sahrens spa_remove(spa); 431789Sahrens spa_config_sync(); 432789Sahrens if (locked) 433789Sahrens mutex_exit(&spa_namespace_lock); 434789Sahrens return (ENOENT); 435789Sahrens } if (error) { 436789Sahrens /* 437789Sahrens * We can't open the pool, but we still have useful 438789Sahrens * information: the state of each vdev after the 439789Sahrens * attempted vdev_open(). Return this to the user. 440789Sahrens */ 441789Sahrens if (config != NULL && spa->spa_root_vdev != NULL) 442789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 443789Sahrens B_TRUE); 444789Sahrens spa_unload(spa); 445789Sahrens spa_deactivate(spa); 446789Sahrens if (locked) 447789Sahrens mutex_exit(&spa_namespace_lock); 448789Sahrens *spapp = NULL; 449789Sahrens return (error); 450789Sahrens } 451789Sahrens 452789Sahrens loaded = B_TRUE; 453789Sahrens } 454789Sahrens 455789Sahrens spa_open_ref(spa, tag); 456789Sahrens if (locked) 457789Sahrens mutex_exit(&spa_namespace_lock); 458789Sahrens 459789Sahrens *spapp = spa; 460789Sahrens 461789Sahrens if (config != NULL) { 462789Sahrens spa_config_enter(spa, RW_READER); 463789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 464789Sahrens spa_config_exit(spa); 465789Sahrens } 466789Sahrens 467789Sahrens /* 468789Sahrens * If we just loaded the pool, resilver anything that's out of date. 469789Sahrens */ 470789Sahrens if (loaded && (spa_mode & FWRITE)) 471789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 472789Sahrens 473789Sahrens return (0); 474789Sahrens } 475789Sahrens 476789Sahrens int 477789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 478789Sahrens { 479789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 480789Sahrens } 481789Sahrens 482789Sahrens int 483789Sahrens spa_get_stats(const char *name, nvlist_t **config) 484789Sahrens { 485789Sahrens int error; 486789Sahrens spa_t *spa; 487789Sahrens 488789Sahrens *config = NULL; 489789Sahrens error = spa_open_common(name, &spa, FTAG, config); 490789Sahrens 491789Sahrens if (spa != NULL) 492789Sahrens spa_close(spa, FTAG); 493789Sahrens 494789Sahrens return (error); 495789Sahrens } 496789Sahrens 497789Sahrens /* 498789Sahrens * Pool Creation 499789Sahrens */ 500789Sahrens int 501789Sahrens spa_create(const char *pool, nvlist_t *nvroot, char *altroot) 502789Sahrens { 503789Sahrens spa_t *spa; 504789Sahrens dsl_pool_t *dp; 505789Sahrens dmu_tx_t *tx; 506789Sahrens int error; 507789Sahrens uint64_t txg = TXG_INITIAL; 508789Sahrens 509789Sahrens /* 510789Sahrens * If this pool already exists, return failure. 511789Sahrens */ 512789Sahrens mutex_enter(&spa_namespace_lock); 513789Sahrens if (spa_lookup(pool) != NULL) { 514789Sahrens mutex_exit(&spa_namespace_lock); 515789Sahrens return (EEXIST); 516789Sahrens } 517789Sahrens spa = spa_add(pool); 518789Sahrens 519789Sahrens /* 520789Sahrens * Allocate a new spa_t structure. 521789Sahrens */ 522789Sahrens spa_activate(spa); 523789Sahrens 524789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 525789Sahrens spa->spa_ubsync = spa->spa_uberblock; 526789Sahrens 527789Sahrens error = spa_vdev_add(spa, nvroot); 528789Sahrens 529789Sahrens if (error) { 530789Sahrens spa_unload(spa); 531789Sahrens spa_deactivate(spa); 532789Sahrens spa_remove(spa); 533789Sahrens mutex_exit(&spa_namespace_lock); 534789Sahrens return (error); 535789Sahrens } 536789Sahrens 537789Sahrens if (altroot != NULL) { 538789Sahrens spa->spa_root = spa_strdup(altroot); 539789Sahrens atomic_add_32(&spa_active_count, 1); 540789Sahrens } 541789Sahrens 542789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 543789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 544789Sahrens 545789Sahrens tx = dmu_tx_create_assigned(dp, txg); 546789Sahrens 547789Sahrens /* 548789Sahrens * Create the pool config object. 549789Sahrens */ 550789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 551789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 552789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 553789Sahrens 554789Sahrens VERIFY(zap_add(spa->spa_meta_objset, 555789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 556789Sahrens sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0); 557789Sahrens 558789Sahrens /* 559789Sahrens * Create the deferred-free bplist object. Turn off compression 560789Sahrens * because sync-to-convergence takes longer if the blocksize 561789Sahrens * keeps changing. 562789Sahrens */ 563789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 564789Sahrens 1 << 14, tx); 565789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 566789Sahrens ZIO_COMPRESS_OFF, tx); 567789Sahrens 568789Sahrens VERIFY(zap_add(spa->spa_meta_objset, 569789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 570789Sahrens sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0); 571789Sahrens 572789Sahrens dmu_tx_commit(tx); 573789Sahrens 574789Sahrens spa->spa_sync_on = B_TRUE; 575789Sahrens txg_sync_start(spa->spa_dsl_pool); 576789Sahrens 577789Sahrens /* 578789Sahrens * We explicitly wait for the first transaction to complete so that our 579789Sahrens * bean counters are appropriately updated. 580789Sahrens */ 581789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 582789Sahrens 583789Sahrens spa_config_sync(); 584789Sahrens 585789Sahrens mutex_exit(&spa_namespace_lock); 586789Sahrens 587789Sahrens return (0); 588789Sahrens } 589789Sahrens 590789Sahrens /* 591789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 592789Sahrens * then call spa_load() to do the dirty work. 593789Sahrens */ 594789Sahrens int 595789Sahrens spa_import(const char *pool, nvlist_t *config, char *altroot) 596789Sahrens { 597789Sahrens spa_t *spa; 598789Sahrens int error; 599789Sahrens 600789Sahrens if (!(spa_mode & FWRITE)) 601789Sahrens return (EROFS); 602789Sahrens 603789Sahrens /* 604789Sahrens * If a pool with this name exists, return failure. 605789Sahrens */ 606789Sahrens mutex_enter(&spa_namespace_lock); 607789Sahrens if (spa_lookup(pool) != NULL) { 608789Sahrens mutex_exit(&spa_namespace_lock); 609789Sahrens return (EEXIST); 610789Sahrens } 611789Sahrens 612789Sahrens /* 613789Sahrens * Create an initialize the spa structure 614789Sahrens */ 615789Sahrens spa = spa_add(pool); 616789Sahrens spa_activate(spa); 617789Sahrens 618789Sahrens /* 619789Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 620789Sahrens * so that we don't try to open the pool if the config is damaged. 621789Sahrens */ 622789Sahrens error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE); 623789Sahrens 624789Sahrens if (error) { 625789Sahrens spa_unload(spa); 626789Sahrens spa_deactivate(spa); 627789Sahrens spa_remove(spa); 628789Sahrens mutex_exit(&spa_namespace_lock); 629789Sahrens return (error); 630789Sahrens } 631789Sahrens 632789Sahrens /* 633789Sahrens * Set the alternate root, if there is one. 634789Sahrens */ 635789Sahrens if (altroot != NULL) { 636789Sahrens atomic_add_32(&spa_active_count, 1); 637789Sahrens spa->spa_root = spa_strdup(altroot); 638789Sahrens } 639789Sahrens 640789Sahrens /* 641789Sahrens * Initialize the config based on the in-core state. 642789Sahrens */ 643789Sahrens config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0); 644789Sahrens 645789Sahrens spa_config_set(spa, config); 646789Sahrens 647789Sahrens /* 648789Sahrens * Sync the configuration cache. 649789Sahrens */ 650789Sahrens spa_config_sync(); 651789Sahrens 652789Sahrens mutex_exit(&spa_namespace_lock); 653789Sahrens 654789Sahrens /* 655789Sahrens * Resilver anything that's out of date. 656789Sahrens */ 657789Sahrens if (spa_mode & FWRITE) 658789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 659789Sahrens 660789Sahrens return (0); 661789Sahrens } 662789Sahrens 663789Sahrens /* 664789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 665789Sahrens * to get the vdev stats associated with the imported devices. 666789Sahrens */ 667789Sahrens #define TRYIMPORT_NAME "$import" 668789Sahrens 669789Sahrens nvlist_t * 670789Sahrens spa_tryimport(nvlist_t *tryconfig) 671789Sahrens { 672789Sahrens nvlist_t *config = NULL; 673789Sahrens char *poolname; 674789Sahrens spa_t *spa; 675789Sahrens uint64_t state; 676789Sahrens 677789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 678789Sahrens return (NULL); 679789Sahrens 680789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 681789Sahrens return (NULL); 682789Sahrens 683789Sahrens mutex_enter(&spa_namespace_lock); 684789Sahrens spa = spa_add(TRYIMPORT_NAME); 685789Sahrens 686789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 687789Sahrens 688789Sahrens /* 689789Sahrens * Initialize the spa_t structure. 690789Sahrens */ 691789Sahrens spa_activate(spa); 692789Sahrens 693789Sahrens /* 694789Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 695789Sahrens * so we don't try to open the pool if the config is damaged. 696789Sahrens */ 697789Sahrens (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE); 698789Sahrens 699789Sahrens /* 700789Sahrens * If 'tryconfig' was at least parsable, return the current config. 701789Sahrens */ 702789Sahrens if (spa->spa_root_vdev != NULL) { 703789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 704789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 705789Sahrens poolname) == 0); 706789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 707789Sahrens state) == 0); 708789Sahrens } 709789Sahrens 710789Sahrens spa_unload(spa); 711789Sahrens spa_deactivate(spa); 712789Sahrens spa_remove(spa); 713789Sahrens mutex_exit(&spa_namespace_lock); 714789Sahrens 715789Sahrens return (config); 716789Sahrens } 717789Sahrens 718789Sahrens /* 719789Sahrens * Pool export/destroy 720789Sahrens * 721789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 722789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 723789Sahrens * update the pool state and sync all the labels to disk, removing the 724789Sahrens * configuration from the cache afterwards. 725789Sahrens */ 726789Sahrens static int 727789Sahrens spa_export_common(char *pool, int new_state) 728789Sahrens { 729789Sahrens spa_t *spa; 730789Sahrens 731789Sahrens if (!(spa_mode & FWRITE)) 732789Sahrens return (EROFS); 733789Sahrens 734789Sahrens mutex_enter(&spa_namespace_lock); 735789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 736789Sahrens mutex_exit(&spa_namespace_lock); 737789Sahrens return (ENOENT); 738789Sahrens } 739789Sahrens 740789Sahrens /* 741789Sahrens * The pool will be in core if it's openable, 742789Sahrens * in which case we can modify its state. 743789Sahrens */ 744789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 745789Sahrens /* 746789Sahrens * Objsets may be open only because they're dirty, so we 747789Sahrens * have to force it to sync before checking spa_refcnt. 748789Sahrens */ 749789Sahrens spa_scrub_suspend(spa); 750789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 751789Sahrens 752789Sahrens if (!spa_refcount_zero(spa)) { 753789Sahrens spa_scrub_resume(spa); 754789Sahrens mutex_exit(&spa_namespace_lock); 755789Sahrens return (EBUSY); 756789Sahrens } 757789Sahrens 758789Sahrens /* 759789Sahrens * Update the pool state. 760789Sahrens */ 761789Sahrens spa->spa_state = new_state; 762789Sahrens 763789Sahrens spa_scrub_resume(spa); 764789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 765789Sahrens 766789Sahrens if (spa->spa_root != NULL) 767789Sahrens atomic_add_32(&spa_active_count, -1); 768789Sahrens 769789Sahrens /* 770789Sahrens * We want this to be reflected on every label, 771789Sahrens * so mark them all dirty. spa_unload() will do the 772789Sahrens * final sync that pushes these changes out. 773789Sahrens */ 774789Sahrens vdev_config_dirty(spa->spa_root_vdev); 775789Sahrens } 776789Sahrens 777789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 778789Sahrens spa_unload(spa); 779789Sahrens spa_deactivate(spa); 780789Sahrens } 781789Sahrens 782789Sahrens spa_remove(spa); 783789Sahrens spa_config_sync(); 784789Sahrens mutex_exit(&spa_namespace_lock); 785789Sahrens 786789Sahrens return (0); 787789Sahrens } 788789Sahrens 789789Sahrens /* 790789Sahrens * Destroy a storage pool. 791789Sahrens */ 792789Sahrens int 793789Sahrens spa_destroy(char *pool) 794789Sahrens { 795789Sahrens return (spa_export_common(pool, POOL_STATE_DESTROYED)); 796789Sahrens } 797789Sahrens 798789Sahrens /* 799789Sahrens * Export a storage pool. 800789Sahrens */ 801789Sahrens int 802789Sahrens spa_export(char *pool) 803789Sahrens { 804789Sahrens return (spa_export_common(pool, POOL_STATE_EXPORTED)); 805789Sahrens } 806789Sahrens 807789Sahrens /* 808789Sahrens * ========================================================================== 809789Sahrens * Device manipulation 810789Sahrens * ========================================================================== 811789Sahrens */ 812789Sahrens 813789Sahrens /* 814789Sahrens * Add capacity to a storage pool. 815789Sahrens */ 816789Sahrens int 817789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 818789Sahrens { 819789Sahrens uint64_t txg; 820789Sahrens int c, error; 821789Sahrens vdev_t *rvd = spa->spa_root_vdev; 822789Sahrens vdev_t *vd; 823789Sahrens 824789Sahrens txg = spa_vdev_enter(spa); 825789Sahrens 826789Sahrens vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 827789Sahrens 828789Sahrens if (vd == NULL) 829789Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 830789Sahrens 831789Sahrens if (rvd == NULL) /* spa_create() */ 832789Sahrens spa->spa_root_vdev = rvd = vd; 833789Sahrens 834789Sahrens if ((error = vdev_create(vd, txg)) != 0) 835789Sahrens return (spa_vdev_exit(spa, vd, txg, error)); 836789Sahrens 837789Sahrens /* 838789Sahrens * Transfer each top-level vdev from the temporary root 839789Sahrens * to the spa's root and initialize its metaslabs. 840789Sahrens */ 841789Sahrens for (c = 0; c < vd->vdev_children; c++) { 842789Sahrens vdev_t *tvd = vd->vdev_child[c]; 843789Sahrens if (vd != rvd) { 844789Sahrens vdev_remove_child(vd, tvd); 845789Sahrens tvd->vdev_id = rvd->vdev_children; 846789Sahrens vdev_add_child(rvd, tvd); 847789Sahrens } 848789Sahrens vdev_init(tvd, txg); 849789Sahrens vdev_config_dirty(tvd); 850789Sahrens } 851789Sahrens 852789Sahrens /* 853789Sahrens * Update the config based on the new in-core state. 854789Sahrens */ 855789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 856789Sahrens 857789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 858789Sahrens } 859789Sahrens 860789Sahrens /* 861789Sahrens * Attach a device to a mirror. The arguments are the path to any device 862789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 863789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 864789Sahrens * 865789Sahrens * If 'replacing' is specified, the new device is intended to replace the 866789Sahrens * existing device; in this case the two devices are made into their own 867789Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 868789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 869789Sahrens * extra rules: you can't attach to it after it's been created, and upon 870789Sahrens * completion of resilvering, the first disk (the one being replaced) 871789Sahrens * is automatically detached. 872789Sahrens */ 873789Sahrens int 874789Sahrens spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) 875789Sahrens { 876789Sahrens uint64_t txg, open_txg; 877789Sahrens int error; 878789Sahrens vdev_t *rvd = spa->spa_root_vdev; 879789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 880789Sahrens vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 881789Sahrens 882789Sahrens txg = spa_vdev_enter(spa); 883789Sahrens 884789Sahrens oldvd = vdev_lookup_by_path(rvd, path); 885789Sahrens 886789Sahrens if (oldvd == NULL) 887789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 888789Sahrens 889789Sahrens pvd = oldvd->vdev_parent; 890789Sahrens 891789Sahrens /* 892789Sahrens * The parent must be a mirror or the root, unless we're replacing; 893789Sahrens * in that case, the parent can be anything but another replacing vdev. 894789Sahrens */ 895789Sahrens if (pvd->vdev_ops != &vdev_mirror_ops && 896789Sahrens pvd->vdev_ops != &vdev_root_ops && 897789Sahrens (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 898789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 899789Sahrens 900789Sahrens newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 901789Sahrens 902789Sahrens if (newrootvd == NULL || newrootvd->vdev_children != 1) 903789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 904789Sahrens 905789Sahrens newvd = newrootvd->vdev_child[0]; 906789Sahrens 907789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 908789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 909789Sahrens 910789Sahrens if ((error = vdev_create(newrootvd, txg)) != 0) 911789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 912789Sahrens 913789Sahrens if (newvd->vdev_psize < oldvd->vdev_psize) 914789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 915789Sahrens 916789Sahrens if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 917789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 918789Sahrens 919789Sahrens /* 920789Sahrens * If this is an in-place replacement, update oldvd's path and devid 921789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 922789Sahrens */ 923789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 924789Sahrens spa_strfree(oldvd->vdev_path); 925789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 926789Sahrens KM_SLEEP); 927789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 928789Sahrens newvd->vdev_path, "old"); 929789Sahrens if (oldvd->vdev_devid != NULL) { 930789Sahrens spa_strfree(oldvd->vdev_devid); 931789Sahrens oldvd->vdev_devid = NULL; 932789Sahrens } 933789Sahrens } 934789Sahrens 935789Sahrens /* 936789Sahrens * If the parent is not a mirror, or if we're replacing, 937789Sahrens * insert the new mirror/replacing vdev above oldvd. 938789Sahrens */ 939789Sahrens if (pvd->vdev_ops != pvops) 940789Sahrens pvd = vdev_add_parent(oldvd, pvops); 941789Sahrens 942789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 943789Sahrens ASSERT(pvd->vdev_ops == pvops); 944789Sahrens ASSERT(oldvd->vdev_parent == pvd); 945789Sahrens 946789Sahrens /* 947789Sahrens * Extract the new device from its root and add it to pvd. 948789Sahrens */ 949789Sahrens vdev_remove_child(newrootvd, newvd); 950789Sahrens newvd->vdev_id = pvd->vdev_children; 951789Sahrens vdev_add_child(pvd, newvd); 952789Sahrens 953789Sahrens tvd = newvd->vdev_top; 954789Sahrens ASSERT(pvd->vdev_top == tvd); 955789Sahrens ASSERT(tvd->vdev_parent == rvd); 956789Sahrens 957789Sahrens /* 958789Sahrens * Update the config based on the new in-core state. 959789Sahrens */ 960789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 961789Sahrens 962789Sahrens vdev_config_dirty(tvd); 963789Sahrens 964789Sahrens /* 965789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 966789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 967789Sahrens */ 968789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 969789Sahrens 970789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 971789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 972789Sahrens open_txg - TXG_INITIAL + 1); 973789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 974789Sahrens 975789Sahrens /* 976789Sahrens * Mark newvd's DTL dirty in this txg. 977789Sahrens */ 978789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 979789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 980789Sahrens 981789Sahrens dprintf("attached %s, replacing=%d\n", path, replacing); 982789Sahrens 983789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 984789Sahrens 985789Sahrens /* 986789Sahrens * Kick off a resilver to update newvd. 987789Sahrens */ 988789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 989789Sahrens 990789Sahrens return (0); 991789Sahrens } 992789Sahrens 993789Sahrens /* 994789Sahrens * Detach a device from a mirror or replacing vdev. 995789Sahrens * If 'replace_done' is specified, only detach if the parent 996789Sahrens * is a replacing vdev. 997789Sahrens */ 998789Sahrens int 999789Sahrens spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) 1000789Sahrens { 1001789Sahrens uint64_t txg; 1002789Sahrens int c, t, error; 1003789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1004789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1005789Sahrens 1006789Sahrens txg = spa_vdev_enter(spa); 1007789Sahrens 1008789Sahrens vd = vdev_lookup_by_path(rvd, path); 1009789Sahrens 1010789Sahrens if (vd == NULL) 1011789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1012789Sahrens 1013789Sahrens if (guid != 0 && vd->vdev_guid != guid) 1014789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1015789Sahrens 1016789Sahrens pvd = vd->vdev_parent; 1017789Sahrens 1018789Sahrens /* 1019789Sahrens * If replace_done is specified, only remove this device if it's 1020789Sahrens * the first child of a replacing vdev. 1021789Sahrens */ 1022789Sahrens if (replace_done && 1023789Sahrens (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1024789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1025789Sahrens 1026789Sahrens /* 1027789Sahrens * Only mirror and replacing vdevs support detach. 1028789Sahrens */ 1029789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1030789Sahrens pvd->vdev_ops != &vdev_mirror_ops) 1031789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1032789Sahrens 1033789Sahrens /* 1034789Sahrens * If there's only one replica, you can't detach it. 1035789Sahrens */ 1036789Sahrens if (pvd->vdev_children <= 1) 1037789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1038789Sahrens 1039789Sahrens /* 1040789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1041789Sahrens * valid copy of the data, which means we cannot safely detach it. 1042789Sahrens * 1043789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1044789Sahrens * precise DTL check. 1045789Sahrens */ 1046789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1047789Sahrens uint64_t dirty; 1048789Sahrens 1049789Sahrens cvd = pvd->vdev_child[c]; 1050789Sahrens if (cvd == vd) 1051789Sahrens continue; 1052789Sahrens if (vdev_is_dead(cvd)) 1053789Sahrens continue; 1054789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1055789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1056789Sahrens cvd->vdev_dtl_scrub.sm_space; 1057789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1058789Sahrens if (!dirty) 1059789Sahrens break; 1060789Sahrens } 1061789Sahrens if (c == pvd->vdev_children) 1062789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1063789Sahrens 1064789Sahrens /* 1065789Sahrens * Erase the disk labels so the disk can be used for other things. 1066789Sahrens * This must be done after all other error cases are handled, 1067789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1068789Sahrens * But if we can't do it, don't treat the error as fatal -- 1069789Sahrens * it may be that the unwritability of the disk is the reason 1070789Sahrens * it's being detached! 1071789Sahrens */ 1072789Sahrens error = vdev_label_init(vd, 0); 1073789Sahrens if (error) 1074789Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1075789Sahrens 1076789Sahrens /* 1077789Sahrens * Remove vd from its parent and compact the parent's children. 1078789Sahrens */ 1079789Sahrens vdev_remove_child(pvd, vd); 1080789Sahrens vdev_compact_children(pvd); 1081789Sahrens 1082789Sahrens /* 1083789Sahrens * Remember one of the remaining children so we can get tvd below. 1084789Sahrens */ 1085789Sahrens cvd = pvd->vdev_child[0]; 1086789Sahrens 1087789Sahrens /* 1088789Sahrens * If the parent mirror/replacing vdev only has one child, 1089789Sahrens * the parent is no longer needed. Remove it from the tree. 1090789Sahrens */ 1091789Sahrens if (pvd->vdev_children == 1) 1092789Sahrens vdev_remove_parent(cvd); 1093789Sahrens 1094789Sahrens /* 1095789Sahrens * We don't set tvd until now because the parent we just removed 1096789Sahrens * may have been the previous top-level vdev. 1097789Sahrens */ 1098789Sahrens tvd = cvd->vdev_top; 1099789Sahrens ASSERT(tvd->vdev_parent == rvd); 1100789Sahrens 1101789Sahrens /* 1102789Sahrens * Reopen this top-level vdev to reassess health after detach. 1103789Sahrens */ 1104789Sahrens vdev_reopen(tvd, NULL); 1105789Sahrens 1106789Sahrens /* 1107789Sahrens * If the device we just detached was smaller than the others, 1108789Sahrens * it may be possible to add metaslabs (i.e. grow the pool). 1109789Sahrens */ 1110789Sahrens vdev_metaslab_init(tvd, txg); 1111789Sahrens 1112789Sahrens /* 1113789Sahrens * Update the config based on the new in-core state. 1114789Sahrens */ 1115789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1116789Sahrens 1117789Sahrens vdev_config_dirty(tvd); 1118789Sahrens 1119789Sahrens /* 1120789Sahrens * Mark vd's DTL as dirty in this txg. 1121789Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1122789Sahrens * and free vd's DTL object in syncing context. 1123789Sahrens * But first make sure we're not on any *other* txg's DTL list, 1124789Sahrens * to prevent vd from being accessed after it's freed. 1125789Sahrens */ 1126789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1127789Sahrens vd->vdev_detached = B_TRUE; 1128789Sahrens for (t = 0; t < TXG_SIZE; t++) 1129789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1130789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1131789Sahrens 1132789Sahrens dprintf("detached %s\n", path); 1133789Sahrens 1134789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 1135789Sahrens } 1136789Sahrens 1137789Sahrens /* 1138789Sahrens * If there are any replacing vdevs that have finished replacing, detach them. 1139789Sahrens * We can't hold the config lock across detaches, so we lock the config, 1140789Sahrens * build a list of candidates, unlock the config, and try each candidate. 1141789Sahrens */ 1142789Sahrens typedef struct vdev_detach_link { 1143789Sahrens char *vdl_path; 1144789Sahrens uint64_t vdl_guid; 1145789Sahrens list_node_t vdl_node; 1146789Sahrens } vdev_detach_link_t; 1147789Sahrens 1148789Sahrens static void 1149789Sahrens spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd) 1150789Sahrens { 1151789Sahrens int c; 1152789Sahrens 1153789Sahrens for (c = 0; c < vd->vdev_children; c++) 1154789Sahrens spa_vdev_replace_done_make_list(l, vd->vdev_child[c]); 1155789Sahrens 1156789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1157789Sahrens vdev_t *cvd0 = vd->vdev_child[0]; 1158789Sahrens vdev_t *cvd1 = vd->vdev_child[1]; 1159789Sahrens vdev_detach_link_t *vdl; 1160789Sahrens int dirty1; 1161789Sahrens 1162789Sahrens mutex_enter(&cvd1->vdev_dtl_lock); 1163789Sahrens dirty1 = cvd1->vdev_dtl_map.sm_space | 1164789Sahrens cvd1->vdev_dtl_scrub.sm_space; 1165789Sahrens mutex_exit(&cvd1->vdev_dtl_lock); 1166789Sahrens 1167789Sahrens if (!dirty1) { 1168789Sahrens vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP); 1169789Sahrens vdl->vdl_path = spa_strdup(cvd0->vdev_path); 1170789Sahrens vdl->vdl_guid = cvd0->vdev_guid; 1171789Sahrens list_insert_tail(l, vdl); 1172789Sahrens } 1173789Sahrens } 1174789Sahrens } 1175789Sahrens 1176789Sahrens void 1177789Sahrens spa_vdev_replace_done(spa_t *spa) 1178789Sahrens { 1179789Sahrens vdev_detach_link_t *vdl; 1180789Sahrens list_t vdlist; 1181789Sahrens 1182789Sahrens list_create(&vdlist, sizeof (vdev_detach_link_t), 1183789Sahrens offsetof(vdev_detach_link_t, vdl_node)); 1184789Sahrens 1185789Sahrens spa_config_enter(spa, RW_READER); 1186789Sahrens spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev); 1187789Sahrens spa_config_exit(spa); 1188789Sahrens 1189789Sahrens while ((vdl = list_head(&vdlist)) != NULL) { 1190789Sahrens list_remove(&vdlist, vdl); 1191789Sahrens (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid, 1192789Sahrens B_TRUE); 1193789Sahrens spa_strfree(vdl->vdl_path); 1194789Sahrens kmem_free(vdl, sizeof (*vdl)); 1195789Sahrens } 1196789Sahrens 1197789Sahrens list_destroy(&vdlist); 1198789Sahrens } 1199789Sahrens 1200789Sahrens /* 1201789Sahrens * ========================================================================== 1202789Sahrens * SPA Scrubbing 1203789Sahrens * ========================================================================== 1204789Sahrens */ 1205789Sahrens 1206789Sahrens static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t); 1207789Sahrens 1208789Sahrens static void 1209789Sahrens spa_scrub_io_done(zio_t *zio) 1210789Sahrens { 1211789Sahrens spa_t *spa = zio->io_spa; 1212789Sahrens 1213789Sahrens zio_buf_free(zio->io_data, zio->io_size); 1214789Sahrens 1215789Sahrens mutex_enter(&spa->spa_scrub_lock); 1216789Sahrens if (zio->io_error) 1217789Sahrens spa->spa_scrub_errors++; 1218789Sahrens if (--spa->spa_scrub_inflight == 0) 1219789Sahrens cv_broadcast(&spa->spa_scrub_io_cv); 1220789Sahrens mutex_exit(&spa->spa_scrub_lock); 1221789Sahrens 1222789Sahrens if (zio->io_error) { 1223789Sahrens vdev_t *vd = zio->io_vd; 1224789Sahrens mutex_enter(&vd->vdev_stat_lock); 1225789Sahrens vd->vdev_stat.vs_scrub_errors++; 1226789Sahrens mutex_exit(&vd->vdev_stat_lock); 1227789Sahrens } 1228789Sahrens } 1229789Sahrens 1230789Sahrens static void 1231789Sahrens spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags) 1232789Sahrens { 1233789Sahrens size_t size = BP_GET_LSIZE(bp); 1234789Sahrens void *data = zio_buf_alloc(size); 1235789Sahrens 1236789Sahrens mutex_enter(&spa->spa_scrub_lock); 1237789Sahrens spa->spa_scrub_inflight++; 1238789Sahrens mutex_exit(&spa->spa_scrub_lock); 1239789Sahrens 1240789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 1241789Sahrens spa_scrub_io_done, NULL, priority, flags)); 1242789Sahrens } 1243789Sahrens 1244789Sahrens /* ARGSUSED */ 1245789Sahrens static int 1246789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1247789Sahrens { 1248789Sahrens blkptr_t *bp = &bc->bc_blkptr; 1249789Sahrens vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1250789Sahrens 1251789Sahrens if (bc->bc_errno || vd == NULL) { 1252789Sahrens /* 1253789Sahrens * We can't scrub this block, but we can continue to scrub 1254789Sahrens * the rest of the pool. Note the error and move along. 1255789Sahrens */ 1256789Sahrens mutex_enter(&spa->spa_scrub_lock); 1257789Sahrens spa->spa_scrub_errors++; 1258789Sahrens mutex_exit(&spa->spa_scrub_lock); 1259789Sahrens 1260789Sahrens if (vd != NULL) { 1261789Sahrens mutex_enter(&vd->vdev_stat_lock); 1262789Sahrens vd->vdev_stat.vs_scrub_errors++; 1263789Sahrens mutex_exit(&vd->vdev_stat_lock); 1264789Sahrens } 1265789Sahrens 1266789Sahrens return (ERESTART); 1267789Sahrens } 1268789Sahrens 1269789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1270789Sahrens 1271789Sahrens /* 1272789Sahrens * Keep track of how much data we've examined so that 1273789Sahrens * zpool(1M) status can make useful progress reports. 1274789Sahrens */ 1275789Sahrens mutex_enter(&vd->vdev_stat_lock); 1276789Sahrens vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1277789Sahrens mutex_exit(&vd->vdev_stat_lock); 1278789Sahrens 1279789Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1280789Sahrens if (DVA_GET_GANG(&bp->blk_dva[0])) { 1281789Sahrens /* 1282789Sahrens * Gang members may be spread across multiple vdevs, 1283789Sahrens * so the best we can do is look at the pool-wide DTL. 1284789Sahrens * XXX -- it would be better to change our allocation 1285789Sahrens * policy to ensure that this can't happen. 1286789Sahrens */ 1287789Sahrens vd = spa->spa_root_vdev; 1288789Sahrens } 1289789Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1290789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1291789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | 1292789Sahrens ZIO_FLAG_RESILVER); 1293789Sahrens } 1294789Sahrens } else { 1295789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1296789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB); 1297789Sahrens } 1298789Sahrens 1299789Sahrens return (0); 1300789Sahrens } 1301789Sahrens 1302789Sahrens static void 1303789Sahrens spa_scrub_thread(spa_t *spa) 1304789Sahrens { 1305789Sahrens callb_cpr_t cprinfo; 1306789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 1307789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1308789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1309789Sahrens int error = 0; 1310789Sahrens boolean_t complete; 1311789Sahrens 1312789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1313789Sahrens 1314*797Sbonwick /* 1315*797Sbonwick * If we're restarting due to a snapshot create/delete, 1316*797Sbonwick * wait for that to complete. 1317*797Sbonwick */ 1318*797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 1319*797Sbonwick 1320789Sahrens spa_config_enter(spa, RW_WRITER); 1321789Sahrens vdev_reopen(rvd, NULL); /* purge all vdev caches */ 1322789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 1323789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1324789Sahrens spa_config_exit(spa); 1325789Sahrens 1326789Sahrens mutex_enter(&spa->spa_scrub_lock); 1327789Sahrens spa->spa_scrub_errors = 0; 1328789Sahrens spa->spa_scrub_active = 1; 1329789Sahrens 1330789Sahrens while (!spa->spa_scrub_stop) { 1331789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 1332789Sahrens while (spa->spa_scrub_suspend) { 1333789Sahrens spa->spa_scrub_active = 0; 1334789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1335789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1336789Sahrens spa->spa_scrub_active = 1; 1337789Sahrens } 1338789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1339789Sahrens 1340789Sahrens if (spa->spa_scrub_restart_txg != 0) 1341789Sahrens break; 1342789Sahrens 1343789Sahrens mutex_exit(&spa->spa_scrub_lock); 1344789Sahrens error = traverse_more(th); 1345789Sahrens mutex_enter(&spa->spa_scrub_lock); 1346789Sahrens if (error != EAGAIN) 1347789Sahrens break; 1348789Sahrens } 1349789Sahrens 1350789Sahrens while (spa->spa_scrub_inflight) 1351789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1352789Sahrens 1353789Sahrens if (spa->spa_scrub_restart_txg != 0) 1354789Sahrens error = ERESTART; 1355789Sahrens 1356789Sahrens spa->spa_scrub_active = 0; 1357789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1358789Sahrens 1359789Sahrens /* 1360789Sahrens * If the traverse completed, and there were no errors, 1361789Sahrens * then the scrub was completely successful. 1362789Sahrens */ 1363789Sahrens complete = (error == 0 && spa->spa_scrub_errors == 0); 1364789Sahrens 1365789Sahrens dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1366789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1367789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1368789Sahrens 1369789Sahrens mutex_exit(&spa->spa_scrub_lock); 1370789Sahrens 1371789Sahrens /* 1372789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 1373789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 1374789Sahrens */ 1375789Sahrens spa_config_enter(spa, RW_WRITER); 1376789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1377789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1378789Sahrens spa_config_exit(spa); 1379789Sahrens 1380789Sahrens spa_vdev_replace_done(spa); 1381789Sahrens 1382789Sahrens spa_config_enter(spa, RW_READER); 1383789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1384789Sahrens spa_config_exit(spa); 1385789Sahrens 1386789Sahrens mutex_enter(&spa->spa_scrub_lock); 1387789Sahrens 1388789Sahrens spa->spa_scrub_type = POOL_SCRUB_NONE; 1389789Sahrens spa->spa_scrub_active = 0; 1390789Sahrens spa->spa_scrub_thread = NULL; 1391789Sahrens 1392789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1393789Sahrens 1394789Sahrens /* 1395789Sahrens * If we were told to restart, our final act is to start a new scrub. 1396789Sahrens */ 1397789Sahrens if (error == ERESTART) 1398789Sahrens VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0); 1399789Sahrens 1400789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1401789Sahrens thread_exit(); 1402789Sahrens } 1403789Sahrens 1404789Sahrens void 1405789Sahrens spa_scrub_suspend(spa_t *spa) 1406789Sahrens { 1407789Sahrens mutex_enter(&spa->spa_scrub_lock); 1408789Sahrens spa->spa_scrub_suspend++; 1409789Sahrens while (spa->spa_scrub_active) { 1410789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1411789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1412789Sahrens } 1413789Sahrens while (spa->spa_scrub_inflight) 1414789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1415789Sahrens mutex_exit(&spa->spa_scrub_lock); 1416789Sahrens } 1417789Sahrens 1418789Sahrens void 1419789Sahrens spa_scrub_resume(spa_t *spa) 1420789Sahrens { 1421789Sahrens mutex_enter(&spa->spa_scrub_lock); 1422789Sahrens ASSERT(spa->spa_scrub_suspend != 0); 1423789Sahrens if (--spa->spa_scrub_suspend == 0) 1424789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1425789Sahrens mutex_exit(&spa->spa_scrub_lock); 1426789Sahrens } 1427789Sahrens 1428789Sahrens void 1429789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 1430789Sahrens { 1431789Sahrens /* 1432789Sahrens * Something happened (e.g. snapshot create/delete) that means 1433789Sahrens * we must restart any in-progress scrubs. The itinerary will 1434789Sahrens * fix this properly. 1435789Sahrens */ 1436789Sahrens mutex_enter(&spa->spa_scrub_lock); 1437789Sahrens spa->spa_scrub_restart_txg = txg; 1438789Sahrens mutex_exit(&spa->spa_scrub_lock); 1439789Sahrens } 1440789Sahrens 1441789Sahrens static int 1442789Sahrens spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1443789Sahrens { 1444789Sahrens space_seg_t *ss; 1445789Sahrens uint64_t mintxg, maxtxg; 1446789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1447789Sahrens int advance = 0; 1448789Sahrens 1449789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 1450789Sahrens return (ENOTSUP); 1451789Sahrens 1452789Sahrens /* 1453789Sahrens * If there's a scrub or resilver already in progress, stop it. 1454789Sahrens */ 1455789Sahrens while (spa->spa_scrub_thread != NULL) { 1456789Sahrens /* 1457789Sahrens * Don't stop a resilver unless forced. 1458789Sahrens */ 1459789Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) 1460789Sahrens return (EBUSY); 1461789Sahrens 1462789Sahrens spa->spa_scrub_stop = 1; 1463789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1464789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1465789Sahrens } 1466789Sahrens 1467789Sahrens /* 1468789Sahrens * Terminate the previous traverse. 1469789Sahrens */ 1470789Sahrens if (spa->spa_scrub_th != NULL) { 1471789Sahrens traverse_fini(spa->spa_scrub_th); 1472789Sahrens spa->spa_scrub_th = NULL; 1473789Sahrens } 1474789Sahrens 1475789Sahrens spa->spa_scrub_stop = 0; 1476789Sahrens spa->spa_scrub_type = type; 1477789Sahrens spa->spa_scrub_restart_txg = 0; 1478789Sahrens 1479789Sahrens mintxg = TXG_INITIAL - 1; 1480789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 1481789Sahrens 1482789Sahrens switch (type) { 1483789Sahrens 1484789Sahrens case POOL_SCRUB_NONE: 1485789Sahrens break; 1486789Sahrens 1487789Sahrens case POOL_SCRUB_RESILVER: 1488789Sahrens /* 1489789Sahrens * Determine the resilvering boundaries. 1490789Sahrens * 1491789Sahrens * Note: (mintxg, maxtxg) is an open interval, 1492789Sahrens * i.e. mintxg and maxtxg themselves are not included. 1493789Sahrens * 1494789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1495789Sahrens * so we don't claim to resilver a txg that's still changing. 1496789Sahrens */ 1497789Sahrens mutex_enter(&rvd->vdev_dtl_lock); 1498789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1499789Sahrens mintxg = ss ? ss->ss_start - 1 : 0; 1500789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1501789Sahrens maxtxg = ss ? ss->ss_end : 0; 1502789Sahrens maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1); 1503789Sahrens mutex_exit(&rvd->vdev_dtl_lock); 1504789Sahrens 1505789Sahrens advance = ADVANCE_PRE | ADVANCE_PRUNE; 1506789Sahrens break; 1507789Sahrens 1508789Sahrens case POOL_SCRUB_EVERYTHING: 1509789Sahrens /* 1510789Sahrens * A scrub is like a resilver, but not pruned by DTL. 1511789Sahrens */ 1512789Sahrens advance = ADVANCE_PRE; 1513789Sahrens break; 1514789Sahrens } 1515789Sahrens 1516789Sahrens if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { 1517789Sahrens spa->spa_scrub_maxtxg = maxtxg; 1518789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1519789Sahrens advance, ZIO_FLAG_CANFAIL); 1520789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1521789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 1522789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1523789Sahrens } 1524789Sahrens 1525789Sahrens return (0); 1526789Sahrens } 1527789Sahrens 1528789Sahrens int 1529789Sahrens spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1530789Sahrens { 1531789Sahrens int error; 1532789Sahrens traverse_handle_t *th; 1533789Sahrens 1534789Sahrens mutex_enter(&spa->spa_scrub_lock); 1535789Sahrens error = spa_scrub_locked(spa, type, force); 1536789Sahrens th = spa->spa_scrub_th; 1537789Sahrens mutex_exit(&spa->spa_scrub_lock); 1538789Sahrens 1539789Sahrens if (th == NULL && type != POOL_SCRUB_NONE) 1540789Sahrens spa_vdev_replace_done(spa); 1541789Sahrens 1542789Sahrens return (error); 1543789Sahrens } 1544789Sahrens 1545789Sahrens /* 1546789Sahrens * ========================================================================== 1547789Sahrens * SPA syncing routines 1548789Sahrens * ========================================================================== 1549789Sahrens */ 1550789Sahrens 1551789Sahrens static void 1552789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1553789Sahrens { 1554789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1555789Sahrens dmu_tx_t *tx; 1556789Sahrens blkptr_t blk; 1557789Sahrens uint64_t itor = 0; 1558789Sahrens zio_t *zio; 1559789Sahrens int error; 1560789Sahrens uint8_t c = 1; 1561789Sahrens 1562789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1563789Sahrens 1564789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 1565789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 1566789Sahrens 1567789Sahrens error = zio_wait(zio); 1568789Sahrens ASSERT3U(error, ==, 0); 1569789Sahrens 1570789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1571789Sahrens bplist_vacate(bpl, tx); 1572789Sahrens 1573789Sahrens /* 1574789Sahrens * Pre-dirty the first block so we sync to convergence faster. 1575789Sahrens * (Usually only the first block is needed.) 1576789Sahrens */ 1577789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 1578789Sahrens dmu_tx_commit(tx); 1579789Sahrens } 1580789Sahrens 1581789Sahrens static void 1582789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 1583789Sahrens { 1584789Sahrens nvlist_t *config; 1585789Sahrens char *packed = NULL; 1586789Sahrens size_t nvsize = 0; 1587789Sahrens dmu_buf_t *db; 1588789Sahrens 1589789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 1590789Sahrens return; 1591789Sahrens 1592789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 1593789Sahrens 1594789Sahrens spa_config_set(spa, config); 1595789Sahrens 1596789Sahrens VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 1597789Sahrens 1598789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 1599789Sahrens 1600789Sahrens VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0); 1601789Sahrens 1602789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 1603789Sahrens packed, tx); 1604789Sahrens 1605789Sahrens kmem_free(packed, nvsize); 1606789Sahrens 1607789Sahrens db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object); 1608789Sahrens dmu_buf_will_dirty(db, tx); 1609789Sahrens *(uint64_t *)db->db_data = nvsize; 1610789Sahrens dmu_buf_rele(db); 1611789Sahrens } 1612789Sahrens 1613789Sahrens /* 1614789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 1615789Sahrens * part of the process, so we iterate until it converges. 1616789Sahrens */ 1617789Sahrens void 1618789Sahrens spa_sync(spa_t *spa, uint64_t txg) 1619789Sahrens { 1620789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 1621789Sahrens objset_t *mos = spa->spa_meta_objset; 1622789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1623789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1624789Sahrens vdev_t *vd; 1625789Sahrens dmu_tx_t *tx; 1626789Sahrens int dirty_vdevs; 1627789Sahrens 1628789Sahrens /* 1629789Sahrens * Lock out configuration changes. 1630789Sahrens */ 1631789Sahrens spa_config_enter(spa, RW_READER); 1632789Sahrens 1633789Sahrens spa->spa_syncing_txg = txg; 1634789Sahrens spa->spa_sync_pass = 0; 1635789Sahrens 1636789Sahrens bplist_open(bpl, mos, spa->spa_sync_bplist_obj); 1637789Sahrens 1638789Sahrens /* 1639789Sahrens * If anything has changed in this txg, push the deferred frees 1640789Sahrens * from the previous txg. If not, leave them alone so that we 1641789Sahrens * don't generate work on an otherwise idle system. 1642789Sahrens */ 1643789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 1644789Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 1645789Sahrens spa_sync_deferred_frees(spa, txg); 1646789Sahrens 1647789Sahrens /* 1648789Sahrens * Iterate to convergence. 1649789Sahrens */ 1650789Sahrens do { 1651789Sahrens spa->spa_sync_pass++; 1652789Sahrens 1653789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1654789Sahrens spa_sync_config_object(spa, tx); 1655789Sahrens dmu_tx_commit(tx); 1656789Sahrens 1657789Sahrens dsl_pool_sync(dp, txg); 1658789Sahrens 1659789Sahrens dirty_vdevs = 0; 1660789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 1661789Sahrens vdev_sync(vd, txg); 1662789Sahrens dirty_vdevs++; 1663789Sahrens } 1664789Sahrens 1665789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1666789Sahrens bplist_sync(bpl, tx); 1667789Sahrens dmu_tx_commit(tx); 1668789Sahrens 1669789Sahrens } while (dirty_vdevs); 1670789Sahrens 1671789Sahrens bplist_close(bpl); 1672789Sahrens 1673789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 1674789Sahrens 1675789Sahrens /* 1676789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 1677789Sahrens * to commit the transaction group. 1678789Sahrens */ 1679789Sahrens while (spa_sync_labels(spa, txg)) { 1680789Sahrens dprintf("waiting for devices to heal\n"); 1681789Sahrens delay(hz); 1682789Sahrens vdev_reopen(rvd, NULL); 1683789Sahrens } 1684789Sahrens 1685789Sahrens /* 1686789Sahrens * Make a stable copy of the fully synced uberblock. 1687789Sahrens * We use this as the root for pool traversals. 1688789Sahrens */ 1689789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 1690789Sahrens 1691789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 1692789Sahrens 1693789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 1694789Sahrens spa->spa_traverse_wanted = 0; 1695789Sahrens spa->spa_ubsync = spa->spa_uberblock; 1696789Sahrens rw_exit(&spa->spa_traverse_lock); 1697789Sahrens 1698789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 1699789Sahrens 1700789Sahrens /* 1701789Sahrens * Clean up the ZIL records for the synced txg. 1702789Sahrens */ 1703789Sahrens dsl_pool_zil_clean(dp); 1704789Sahrens 1705789Sahrens /* 1706789Sahrens * Update usable space statistics. 1707789Sahrens */ 1708789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 1709789Sahrens vdev_sync_done(vd, txg); 1710789Sahrens 1711789Sahrens /* 1712789Sahrens * It had better be the case that we didn't dirty anything 1713789Sahrens * since spa_sync_labels(). 1714789Sahrens */ 1715789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 1716789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 1717789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 1718789Sahrens ASSERT(bpl->bpl_queue == NULL); 1719789Sahrens 1720789Sahrens spa_config_exit(spa); 1721789Sahrens } 1722789Sahrens 1723789Sahrens /* 1724789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 1725789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 1726789Sahrens * sync. 1727789Sahrens */ 1728789Sahrens void 1729789Sahrens spa_sync_allpools(void) 1730789Sahrens { 1731789Sahrens spa_t *spa = NULL; 1732789Sahrens mutex_enter(&spa_namespace_lock); 1733789Sahrens while ((spa = spa_next(spa)) != NULL) { 1734789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 1735789Sahrens continue; 1736789Sahrens spa_open_ref(spa, FTAG); 1737789Sahrens mutex_exit(&spa_namespace_lock); 1738789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 1739789Sahrens mutex_enter(&spa_namespace_lock); 1740789Sahrens spa_close(spa, FTAG); 1741789Sahrens } 1742789Sahrens mutex_exit(&spa_namespace_lock); 1743789Sahrens } 1744789Sahrens 1745789Sahrens /* 1746789Sahrens * ========================================================================== 1747789Sahrens * Miscellaneous routines 1748789Sahrens * ========================================================================== 1749789Sahrens */ 1750789Sahrens 1751789Sahrens int 1752789Sahrens spa_busy(void) 1753789Sahrens { 1754789Sahrens return (spa_active_count != 0); 1755789Sahrens } 1756789Sahrens 1757789Sahrens /* 1758789Sahrens * Remove all pools in the system. 1759789Sahrens */ 1760789Sahrens void 1761789Sahrens spa_evict_all(void) 1762789Sahrens { 1763789Sahrens spa_t *spa; 1764789Sahrens 1765789Sahrens /* 1766789Sahrens * Remove all cached state. All pools should be closed now, 1767789Sahrens * so every spa in the AVL tree should be unreferenced. 1768789Sahrens */ 1769789Sahrens mutex_enter(&spa_namespace_lock); 1770789Sahrens while ((spa = spa_next(NULL)) != NULL) { 1771789Sahrens /* 1772789Sahrens * Stop all scrub and resilver activity. spa_scrub() needs to 1773789Sahrens * wait for the scrub thread, which may do a detach and sync the 1774789Sahrens * configs, which needs spa_namespace_lock. Drop the lock while 1775789Sahrens * maintaining a hold on the spa_t. 1776789Sahrens */ 1777789Sahrens spa_open_ref(spa, FTAG); 1778789Sahrens mutex_exit(&spa_namespace_lock); 1779789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1780789Sahrens mutex_enter(&spa_namespace_lock); 1781789Sahrens spa_close(spa, FTAG); 1782789Sahrens 1783789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1784789Sahrens spa_unload(spa); 1785789Sahrens spa_deactivate(spa); 1786789Sahrens } 1787789Sahrens spa_remove(spa); 1788789Sahrens } 1789789Sahrens mutex_exit(&spa_namespace_lock); 1790789Sahrens } 1791