1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 5789Sahrens * Common Development and Distribution License, Version 1.0 only 6789Sahrens * (the "License"). You may not use this file except in compliance 7789Sahrens * with the License. 8789Sahrens * 9789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10789Sahrens * or http://www.opensolaris.org/os/licensing. 11789Sahrens * See the License for the specific language governing permissions 12789Sahrens * and limitations under the License. 13789Sahrens * 14789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16789Sahrens * If applicable, add the following below this CDDL HEADER, with the 17789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19789Sahrens * 20789Sahrens * CDDL HEADER END 21789Sahrens */ 22789Sahrens /* 23789Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24789Sahrens * Use is subject to license terms. 25789Sahrens */ 26789Sahrens 27789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28789Sahrens 29789Sahrens /* 30789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 31789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 32789Sahrens * pool. 33789Sahrens */ 34789Sahrens 35789Sahrens #include <sys/zfs_context.h> 36789Sahrens #include <sys/spa_impl.h> 37789Sahrens #include <sys/zio.h> 38789Sahrens #include <sys/zio_checksum.h> 39789Sahrens #include <sys/zio_compress.h> 40789Sahrens #include <sys/dmu.h> 41789Sahrens #include <sys/dmu_tx.h> 42789Sahrens #include <sys/zap.h> 43789Sahrens #include <sys/zil.h> 44789Sahrens #include <sys/vdev_impl.h> 45789Sahrens #include <sys/metaslab.h> 46789Sahrens #include <sys/uberblock_impl.h> 47789Sahrens #include <sys/txg.h> 48789Sahrens #include <sys/avl.h> 49789Sahrens #include <sys/dmu_traverse.h> 50789Sahrens #include <sys/unique.h> 51789Sahrens #include <sys/dsl_pool.h> 52789Sahrens #include <sys/dsl_dir.h> 53789Sahrens #include <sys/dsl_prop.h> 54789Sahrens #include <sys/fs/zfs.h> 55789Sahrens #include <sys/callb.h> 56789Sahrens 57789Sahrens static uint32_t spa_active_count; 58789Sahrens 59789Sahrens /* 60789Sahrens * ========================================================================== 61789Sahrens * SPA state manipulation (open/create/destroy/import/export) 62789Sahrens * ========================================================================== 63789Sahrens */ 64789Sahrens 65789Sahrens /* 66789Sahrens * Activate an uninitialized pool. 67789Sahrens */ 68789Sahrens static void 69789Sahrens spa_activate(spa_t *spa) 70789Sahrens { 71789Sahrens int t; 72789Sahrens 73789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 74789Sahrens 75789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 76789Sahrens 77789Sahrens spa->spa_normal_class = metaslab_class_create(); 78789Sahrens 79789Sahrens spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry", 80789Sahrens 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); 81789Sahrens 82789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 83789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 84789Sahrens 8, maxclsyspri, 50, INT_MAX, 85789Sahrens TASKQ_PREPOPULATE); 86789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 87789Sahrens 8, maxclsyspri, 50, INT_MAX, 88789Sahrens TASKQ_PREPOPULATE); 89789Sahrens } 90789Sahrens 91789Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 92789Sahrens 93789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 94789Sahrens offsetof(vdev_t, vdev_dirty_node)); 95789Sahrens 96789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 97789Sahrens offsetof(struct vdev, vdev_txg_node)); 98789Sahrens } 99789Sahrens 100789Sahrens /* 101789Sahrens * Opposite of spa_activate(). 102789Sahrens */ 103789Sahrens static void 104789Sahrens spa_deactivate(spa_t *spa) 105789Sahrens { 106789Sahrens int t; 107789Sahrens 108789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 109789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 110789Sahrens ASSERT(spa->spa_root_vdev == NULL); 111789Sahrens 112789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 113789Sahrens 114789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 115789Sahrens 116789Sahrens list_destroy(&spa->spa_dirty_list); 117789Sahrens 118789Sahrens rw_destroy(&spa->spa_traverse_lock); 119789Sahrens 120789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 121789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 122789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 123789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 124789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 125789Sahrens } 126789Sahrens 127789Sahrens taskq_destroy(spa->spa_vdev_retry_taskq); 128789Sahrens spa->spa_vdev_retry_taskq = NULL; 129789Sahrens 130789Sahrens metaslab_class_destroy(spa->spa_normal_class); 131789Sahrens spa->spa_normal_class = NULL; 132789Sahrens 133789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 134789Sahrens } 135789Sahrens 136789Sahrens /* 137789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 138789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 139789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 140789Sahrens * All vdev validation is done by the vdev_alloc() routine. 141789Sahrens */ 142789Sahrens static vdev_t * 143789Sahrens spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 144789Sahrens { 145789Sahrens nvlist_t **child; 146789Sahrens uint_t c, children; 147789Sahrens vdev_t *vd; 148789Sahrens 149789Sahrens if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 150789Sahrens return (NULL); 151789Sahrens 152789Sahrens if (vd->vdev_ops->vdev_op_leaf) 153789Sahrens return (vd); 154789Sahrens 155789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 156789Sahrens &child, &children) != 0) { 157789Sahrens vdev_free(vd); 158789Sahrens return (NULL); 159789Sahrens } 160789Sahrens 161789Sahrens for (c = 0; c < children; c++) { 162789Sahrens if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 163789Sahrens vdev_free(vd); 164789Sahrens return (NULL); 165789Sahrens } 166789Sahrens } 167789Sahrens 168789Sahrens return (vd); 169789Sahrens } 170789Sahrens 171789Sahrens /* 172789Sahrens * Opposite of spa_load(). 173789Sahrens */ 174789Sahrens static void 175789Sahrens spa_unload(spa_t *spa) 176789Sahrens { 177789Sahrens /* 178789Sahrens * Stop syncing. 179789Sahrens */ 180789Sahrens if (spa->spa_sync_on) { 181789Sahrens txg_sync_stop(spa->spa_dsl_pool); 182789Sahrens spa->spa_sync_on = B_FALSE; 183789Sahrens } 184789Sahrens 185789Sahrens /* 186789Sahrens * Wait for any outstanding prefetch I/O to complete. 187789Sahrens */ 188789Sahrens spa_config_enter(spa, RW_WRITER); 189789Sahrens spa_config_exit(spa); 190789Sahrens 191789Sahrens /* 192789Sahrens * Close the dsl pool. 193789Sahrens */ 194789Sahrens if (spa->spa_dsl_pool) { 195789Sahrens dsl_pool_close(spa->spa_dsl_pool); 196789Sahrens spa->spa_dsl_pool = NULL; 197789Sahrens } 198789Sahrens 199789Sahrens /* 200789Sahrens * Close all vdevs. 201789Sahrens */ 202789Sahrens if (spa->spa_root_vdev) { 203789Sahrens vdev_free(spa->spa_root_vdev); 204789Sahrens spa->spa_root_vdev = NULL; 205789Sahrens } 206789Sahrens } 207789Sahrens 208789Sahrens /* 209789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 210789Sahrens * source of configuration information. The 'readonly' flag will prevent us 211789Sahrens * from writing any updated state to disk, and can be use when testing a pool 212789Sahrens * for import. 213789Sahrens */ 214789Sahrens static int 215789Sahrens spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) 216789Sahrens { 217789Sahrens int error = 0; 218789Sahrens nvlist_t *nvroot = NULL; 219789Sahrens vdev_t *rvd; 220789Sahrens uberblock_t *ub = &spa->spa_uberblock; 221789Sahrens uint64_t pool_guid; 222789Sahrens zio_t *zio; 223789Sahrens 224789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 225789Sahrens nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 226789Sahrens return (EINVAL); 227789Sahrens 228789Sahrens (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 229789Sahrens &spa->spa_config_txg); 230789Sahrens 231789Sahrens if (import && spa_guid_exists(pool_guid, 0)) 232789Sahrens return (EEXIST); 233789Sahrens 234789Sahrens /* 235789Sahrens * Parse the configuration into a vdev tree. 236789Sahrens */ 237789Sahrens spa_config_enter(spa, RW_WRITER); 238789Sahrens rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 239789Sahrens spa_config_exit(spa); 240789Sahrens 241789Sahrens if (rvd == NULL) 242789Sahrens return (EINVAL); 243789Sahrens 244789Sahrens spa->spa_root_vdev = rvd; 245789Sahrens ASSERT(spa_guid(spa) == pool_guid); 246789Sahrens 247789Sahrens /* 248789Sahrens * Try to open all vdevs, loading each label in the process. 249789Sahrens */ 250789Sahrens if (vdev_open(rvd) != 0) 251789Sahrens return (ENXIO); 252789Sahrens 253789Sahrens /* 254789Sahrens * Find the best uberblock. 255789Sahrens */ 256789Sahrens bzero(ub, sizeof (uberblock_t)); 257789Sahrens 258789Sahrens zio = zio_root(spa, NULL, NULL, 259789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 260789Sahrens vdev_uberblock_load(zio, rvd, ub); 261789Sahrens error = zio_wait(zio); 262789Sahrens 263789Sahrens /* 264789Sahrens * If we weren't able to find a single valid uberblock, return failure. 265789Sahrens */ 266789Sahrens if (ub->ub_txg == 0) { 267789Sahrens dprintf("ub_txg is zero\n"); 268789Sahrens return (ENXIO); 269789Sahrens } 270789Sahrens 271789Sahrens /* 272789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 273789Sahrens * incomplete configuration. 274789Sahrens */ 275789Sahrens if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 276789Sahrens rvd->vdev_state = VDEV_STATE_CANT_OPEN; 277789Sahrens rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM; 278789Sahrens dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n", 279789Sahrens rvd->vdev_guid_sum, ub->ub_guid_sum); 280789Sahrens return (ENXIO); 281789Sahrens } 282789Sahrens 283789Sahrens /* 284789Sahrens * Initialize internal SPA structures. 285789Sahrens */ 286789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 287789Sahrens spa->spa_ubsync = spa->spa_uberblock; 288789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 289789Sahrens spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg); 290789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 291789Sahrens 292789Sahrens VERIFY(zap_lookup(spa->spa_meta_objset, 293789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 294789Sahrens sizeof (uint64_t), 1, &spa->spa_config_object) == 0); 295789Sahrens 296789Sahrens if (!mosconfig) { 297789Sahrens dmu_buf_t *db; 298789Sahrens char *packed = NULL; 299789Sahrens size_t nvsize = 0; 300789Sahrens nvlist_t *newconfig = NULL; 301789Sahrens 302789Sahrens db = dmu_bonus_hold(spa->spa_meta_objset, 303789Sahrens spa->spa_config_object); 304789Sahrens dmu_buf_read(db); 305789Sahrens nvsize = *(uint64_t *)db->db_data; 306789Sahrens dmu_buf_rele(db); 307789Sahrens 308789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 309789Sahrens error = dmu_read_canfail(spa->spa_meta_objset, 310789Sahrens spa->spa_config_object, 0, nvsize, packed); 311789Sahrens if (error == 0) 312789Sahrens error = nvlist_unpack(packed, nvsize, &newconfig, 0); 313789Sahrens kmem_free(packed, nvsize); 314789Sahrens 315789Sahrens if (error) 316789Sahrens return (ENXIO); 317789Sahrens 318789Sahrens spa_config_set(spa, newconfig); 319789Sahrens 320789Sahrens spa_unload(spa); 321789Sahrens spa_deactivate(spa); 322789Sahrens spa_activate(spa); 323789Sahrens 324789Sahrens return (spa_load(spa, newconfig, readonly, import, B_TRUE)); 325789Sahrens } 326789Sahrens 327789Sahrens VERIFY(zap_lookup(spa->spa_meta_objset, 328789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 329789Sahrens sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0); 330789Sahrens 331789Sahrens /* 332789Sahrens * Load the vdev state for all top level vdevs. 333789Sahrens */ 334789Sahrens if ((error = vdev_load(rvd, import)) != 0) 335789Sahrens return (error); 336789Sahrens 337789Sahrens /* 338789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 339789Sahrens */ 340789Sahrens spa_config_enter(spa, RW_WRITER); 341789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 342789Sahrens spa_config_exit(spa); 343789Sahrens 344789Sahrens /* 345789Sahrens * Check the state of the root vdev. If it can't be opened, it 346789Sahrens * indicates one or more toplevel vdevs are faulted. 347789Sahrens */ 348789Sahrens if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 349789Sahrens return (ENXIO); 350789Sahrens 351789Sahrens /* 352789Sahrens * Claim log blocks that haven't been committed yet, and update all 353789Sahrens * top-level vdevs to sync any config changes found in vdev_load(). 354789Sahrens * This must all happen in a single txg. 355789Sahrens */ 356789Sahrens if ((spa_mode & FWRITE) && !readonly) { 357789Sahrens dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), 358789Sahrens spa_first_txg(spa)); 359789Sahrens dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 360789Sahrens vdev_config_dirty(rvd); 361789Sahrens dmu_tx_commit(tx); 362789Sahrens 363789Sahrens spa->spa_sync_on = B_TRUE; 364789Sahrens txg_sync_start(spa->spa_dsl_pool); 365789Sahrens 366789Sahrens /* 367789Sahrens * Wait for all claims to sync. 368789Sahrens */ 369789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 370789Sahrens } 371789Sahrens 372789Sahrens return (0); 373789Sahrens } 374789Sahrens 375789Sahrens /* 376789Sahrens * Pool Open/Import 377789Sahrens * 378789Sahrens * The import case is identical to an open except that the configuration is sent 379789Sahrens * down from userland, instead of grabbed from the configuration cache. For the 380789Sahrens * case of an open, the pool configuration will exist in the 381789Sahrens * POOL_STATE_UNITIALIZED state. 382789Sahrens * 383789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 384789Sahrens * the same time open the pool, without having to keep around the spa_t in some 385789Sahrens * ambiguous state. 386789Sahrens */ 387789Sahrens static int 388789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 389789Sahrens { 390789Sahrens spa_t *spa; 391789Sahrens int error; 392789Sahrens int loaded = B_FALSE; 393789Sahrens int locked = B_FALSE; 394789Sahrens 395789Sahrens *spapp = NULL; 396789Sahrens 397789Sahrens /* 398789Sahrens * As disgusting as this is, we need to support recursive calls to this 399789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 400789Sahrens * up calling spa_open() again. The real fix is to figure out how to 401789Sahrens * avoid dsl_dir_open() calling this in the first place. 402789Sahrens */ 403789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 404789Sahrens mutex_enter(&spa_namespace_lock); 405789Sahrens locked = B_TRUE; 406789Sahrens } 407789Sahrens 408789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 409789Sahrens if (locked) 410789Sahrens mutex_exit(&spa_namespace_lock); 411789Sahrens return (ENOENT); 412789Sahrens } 413789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 414789Sahrens 415789Sahrens spa_activate(spa); 416789Sahrens 417789Sahrens error = spa_load(spa, spa->spa_config, 418789Sahrens B_FALSE, B_FALSE, B_FALSE); 419789Sahrens 420789Sahrens if (error == EBADF) { 421789Sahrens /* 422789Sahrens * If vdev_load() returns EBADF, it indicates that one 423789Sahrens * of the vdevs indicates that the pool has been 424789Sahrens * exported or destroyed. If this is the case, the 425789Sahrens * config cache is out of sync and we should remove the 426789Sahrens * pool from the namespace. 427789Sahrens */ 428789Sahrens spa_unload(spa); 429789Sahrens spa_deactivate(spa); 430789Sahrens spa_remove(spa); 431789Sahrens spa_config_sync(); 432789Sahrens if (locked) 433789Sahrens mutex_exit(&spa_namespace_lock); 434789Sahrens return (ENOENT); 435789Sahrens } if (error) { 436789Sahrens /* 437789Sahrens * We can't open the pool, but we still have useful 438789Sahrens * information: the state of each vdev after the 439789Sahrens * attempted vdev_open(). Return this to the user. 440789Sahrens */ 441789Sahrens if (config != NULL && spa->spa_root_vdev != NULL) 442789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 443789Sahrens B_TRUE); 444789Sahrens spa_unload(spa); 445789Sahrens spa_deactivate(spa); 446789Sahrens if (locked) 447789Sahrens mutex_exit(&spa_namespace_lock); 448789Sahrens *spapp = NULL; 449789Sahrens return (error); 450789Sahrens } 451789Sahrens 452789Sahrens loaded = B_TRUE; 453789Sahrens } 454789Sahrens 455789Sahrens spa_open_ref(spa, tag); 456789Sahrens if (locked) 457789Sahrens mutex_exit(&spa_namespace_lock); 458789Sahrens 459789Sahrens *spapp = spa; 460789Sahrens 461789Sahrens if (config != NULL) { 462789Sahrens spa_config_enter(spa, RW_READER); 463789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 464789Sahrens spa_config_exit(spa); 465789Sahrens } 466789Sahrens 467789Sahrens /* 468789Sahrens * If we just loaded the pool, resilver anything that's out of date. 469789Sahrens */ 470789Sahrens if (loaded && (spa_mode & FWRITE)) 471789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 472789Sahrens 473789Sahrens return (0); 474789Sahrens } 475789Sahrens 476789Sahrens int 477789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 478789Sahrens { 479789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 480789Sahrens } 481789Sahrens 482789Sahrens int 483789Sahrens spa_get_stats(const char *name, nvlist_t **config) 484789Sahrens { 485789Sahrens int error; 486789Sahrens spa_t *spa; 487789Sahrens 488789Sahrens *config = NULL; 489789Sahrens error = spa_open_common(name, &spa, FTAG, config); 490789Sahrens 491789Sahrens if (spa != NULL) 492789Sahrens spa_close(spa, FTAG); 493789Sahrens 494789Sahrens return (error); 495789Sahrens } 496789Sahrens 497789Sahrens /* 498789Sahrens * Pool Creation 499789Sahrens */ 500789Sahrens int 501789Sahrens spa_create(const char *pool, nvlist_t *nvroot, char *altroot) 502789Sahrens { 503789Sahrens spa_t *spa; 504789Sahrens dsl_pool_t *dp; 505789Sahrens dmu_tx_t *tx; 506789Sahrens int error; 507789Sahrens uint64_t txg = TXG_INITIAL; 508789Sahrens 509789Sahrens /* 510789Sahrens * If this pool already exists, return failure. 511789Sahrens */ 512789Sahrens mutex_enter(&spa_namespace_lock); 513789Sahrens if (spa_lookup(pool) != NULL) { 514789Sahrens mutex_exit(&spa_namespace_lock); 515789Sahrens return (EEXIST); 516789Sahrens } 517789Sahrens spa = spa_add(pool); 518789Sahrens 519789Sahrens /* 520789Sahrens * Allocate a new spa_t structure. 521789Sahrens */ 522789Sahrens spa_activate(spa); 523789Sahrens 524789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 525789Sahrens spa->spa_ubsync = spa->spa_uberblock; 526789Sahrens 527789Sahrens error = spa_vdev_add(spa, nvroot); 528789Sahrens 529789Sahrens if (error) { 530789Sahrens spa_unload(spa); 531789Sahrens spa_deactivate(spa); 532789Sahrens spa_remove(spa); 533789Sahrens mutex_exit(&spa_namespace_lock); 534789Sahrens return (error); 535789Sahrens } 536789Sahrens 537789Sahrens if (altroot != NULL) { 538789Sahrens spa->spa_root = spa_strdup(altroot); 539789Sahrens atomic_add_32(&spa_active_count, 1); 540789Sahrens } 541789Sahrens 542789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 543789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 544789Sahrens 545789Sahrens tx = dmu_tx_create_assigned(dp, txg); 546789Sahrens 547789Sahrens /* 548789Sahrens * Create the pool config object. 549789Sahrens */ 550789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 551789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 552789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 553789Sahrens 554789Sahrens VERIFY(zap_add(spa->spa_meta_objset, 555789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 556789Sahrens sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0); 557789Sahrens 558789Sahrens /* 559789Sahrens * Create the deferred-free bplist object. Turn off compression 560789Sahrens * because sync-to-convergence takes longer if the blocksize 561789Sahrens * keeps changing. 562789Sahrens */ 563789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 564789Sahrens 1 << 14, tx); 565789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 566789Sahrens ZIO_COMPRESS_OFF, tx); 567789Sahrens 568789Sahrens VERIFY(zap_add(spa->spa_meta_objset, 569789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 570789Sahrens sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0); 571789Sahrens 572789Sahrens dmu_tx_commit(tx); 573789Sahrens 574789Sahrens spa->spa_sync_on = B_TRUE; 575789Sahrens txg_sync_start(spa->spa_dsl_pool); 576789Sahrens 577789Sahrens /* 578789Sahrens * We explicitly wait for the first transaction to complete so that our 579789Sahrens * bean counters are appropriately updated. 580789Sahrens */ 581789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 582789Sahrens 583789Sahrens spa_config_sync(); 584789Sahrens 585789Sahrens mutex_exit(&spa_namespace_lock); 586789Sahrens 587789Sahrens return (0); 588789Sahrens } 589789Sahrens 590789Sahrens /* 591789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 592789Sahrens * then call spa_load() to do the dirty work. 593789Sahrens */ 594789Sahrens int 595789Sahrens spa_import(const char *pool, nvlist_t *config, char *altroot) 596789Sahrens { 597789Sahrens spa_t *spa; 598789Sahrens int error; 599789Sahrens 600789Sahrens if (!(spa_mode & FWRITE)) 601789Sahrens return (EROFS); 602789Sahrens 603789Sahrens /* 604789Sahrens * If a pool with this name exists, return failure. 605789Sahrens */ 606789Sahrens mutex_enter(&spa_namespace_lock); 607789Sahrens if (spa_lookup(pool) != NULL) { 608789Sahrens mutex_exit(&spa_namespace_lock); 609789Sahrens return (EEXIST); 610789Sahrens } 611789Sahrens 612789Sahrens /* 613789Sahrens * Create an initialize the spa structure 614789Sahrens */ 615789Sahrens spa = spa_add(pool); 616789Sahrens spa_activate(spa); 617789Sahrens 618789Sahrens /* 619789Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 620789Sahrens * so that we don't try to open the pool if the config is damaged. 621789Sahrens */ 622789Sahrens error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE); 623789Sahrens 624789Sahrens if (error) { 625789Sahrens spa_unload(spa); 626789Sahrens spa_deactivate(spa); 627789Sahrens spa_remove(spa); 628789Sahrens mutex_exit(&spa_namespace_lock); 629789Sahrens return (error); 630789Sahrens } 631789Sahrens 632789Sahrens /* 633789Sahrens * Set the alternate root, if there is one. 634789Sahrens */ 635789Sahrens if (altroot != NULL) { 636789Sahrens atomic_add_32(&spa_active_count, 1); 637789Sahrens spa->spa_root = spa_strdup(altroot); 638789Sahrens } 639789Sahrens 640789Sahrens /* 641789Sahrens * Initialize the config based on the in-core state. 642789Sahrens */ 643789Sahrens config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0); 644789Sahrens 645789Sahrens spa_config_set(spa, config); 646789Sahrens 647789Sahrens /* 648789Sahrens * Sync the configuration cache. 649789Sahrens */ 650789Sahrens spa_config_sync(); 651789Sahrens 652789Sahrens mutex_exit(&spa_namespace_lock); 653789Sahrens 654789Sahrens /* 655789Sahrens * Resilver anything that's out of date. 656789Sahrens */ 657789Sahrens if (spa_mode & FWRITE) 658789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 659789Sahrens 660789Sahrens return (0); 661789Sahrens } 662789Sahrens 663789Sahrens /* 664789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 665789Sahrens * to get the vdev stats associated with the imported devices. 666789Sahrens */ 667789Sahrens #define TRYIMPORT_NAME "$import" 668789Sahrens 669789Sahrens nvlist_t * 670789Sahrens spa_tryimport(nvlist_t *tryconfig) 671789Sahrens { 672789Sahrens nvlist_t *config = NULL; 673789Sahrens char *poolname; 674789Sahrens spa_t *spa; 675789Sahrens uint64_t state; 676789Sahrens 677789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 678789Sahrens return (NULL); 679789Sahrens 680789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 681789Sahrens return (NULL); 682789Sahrens 683789Sahrens mutex_enter(&spa_namespace_lock); 684789Sahrens spa = spa_add(TRYIMPORT_NAME); 685789Sahrens 686789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 687789Sahrens 688789Sahrens /* 689789Sahrens * Initialize the spa_t structure. 690789Sahrens */ 691789Sahrens spa_activate(spa); 692789Sahrens 693789Sahrens /* 694789Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 695789Sahrens * so we don't try to open the pool if the config is damaged. 696789Sahrens */ 697789Sahrens (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE); 698789Sahrens 699789Sahrens /* 700789Sahrens * If 'tryconfig' was at least parsable, return the current config. 701789Sahrens */ 702789Sahrens if (spa->spa_root_vdev != NULL) { 703789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 704789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 705789Sahrens poolname) == 0); 706789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 707789Sahrens state) == 0); 708789Sahrens } 709789Sahrens 710789Sahrens spa_unload(spa); 711789Sahrens spa_deactivate(spa); 712789Sahrens spa_remove(spa); 713789Sahrens mutex_exit(&spa_namespace_lock); 714789Sahrens 715789Sahrens return (config); 716789Sahrens } 717789Sahrens 718789Sahrens /* 719789Sahrens * Pool export/destroy 720789Sahrens * 721789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 722789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 723789Sahrens * update the pool state and sync all the labels to disk, removing the 724789Sahrens * configuration from the cache afterwards. 725789Sahrens */ 726789Sahrens static int 727789Sahrens spa_export_common(char *pool, int new_state) 728789Sahrens { 729789Sahrens spa_t *spa; 730789Sahrens 731789Sahrens if (!(spa_mode & FWRITE)) 732789Sahrens return (EROFS); 733789Sahrens 734789Sahrens mutex_enter(&spa_namespace_lock); 735789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 736789Sahrens mutex_exit(&spa_namespace_lock); 737789Sahrens return (ENOENT); 738789Sahrens } 739789Sahrens 740789Sahrens /* 741789Sahrens * The pool will be in core if it's openable, 742789Sahrens * in which case we can modify its state. 743789Sahrens */ 744789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 745789Sahrens /* 746789Sahrens * Objsets may be open only because they're dirty, so we 747789Sahrens * have to force it to sync before checking spa_refcnt. 748789Sahrens */ 749789Sahrens spa_scrub_suspend(spa); 750789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 751789Sahrens 752789Sahrens if (!spa_refcount_zero(spa)) { 753789Sahrens spa_scrub_resume(spa); 754789Sahrens mutex_exit(&spa_namespace_lock); 755789Sahrens return (EBUSY); 756789Sahrens } 757789Sahrens 758789Sahrens /* 759789Sahrens * Update the pool state. 760789Sahrens */ 761789Sahrens spa->spa_state = new_state; 762789Sahrens 763789Sahrens spa_scrub_resume(spa); 764789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 765789Sahrens 766789Sahrens if (spa->spa_root != NULL) 767789Sahrens atomic_add_32(&spa_active_count, -1); 768789Sahrens 769789Sahrens /* 770789Sahrens * We want this to be reflected on every label, 771789Sahrens * so mark them all dirty. spa_unload() will do the 772789Sahrens * final sync that pushes these changes out. 773789Sahrens */ 774789Sahrens vdev_config_dirty(spa->spa_root_vdev); 775789Sahrens } 776789Sahrens 777789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 778789Sahrens spa_unload(spa); 779789Sahrens spa_deactivate(spa); 780789Sahrens } 781789Sahrens 782789Sahrens spa_remove(spa); 783789Sahrens spa_config_sync(); 784789Sahrens mutex_exit(&spa_namespace_lock); 785789Sahrens 786789Sahrens return (0); 787789Sahrens } 788789Sahrens 789789Sahrens /* 790789Sahrens * Destroy a storage pool. 791789Sahrens */ 792789Sahrens int 793789Sahrens spa_destroy(char *pool) 794789Sahrens { 795789Sahrens return (spa_export_common(pool, POOL_STATE_DESTROYED)); 796789Sahrens } 797789Sahrens 798789Sahrens /* 799789Sahrens * Export a storage pool. 800789Sahrens */ 801789Sahrens int 802789Sahrens spa_export(char *pool) 803789Sahrens { 804789Sahrens return (spa_export_common(pool, POOL_STATE_EXPORTED)); 805789Sahrens } 806789Sahrens 807789Sahrens /* 808789Sahrens * ========================================================================== 809789Sahrens * Device manipulation 810789Sahrens * ========================================================================== 811789Sahrens */ 812789Sahrens 813789Sahrens /* 814789Sahrens * Add capacity to a storage pool. 815789Sahrens */ 816789Sahrens int 817789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 818789Sahrens { 819789Sahrens uint64_t txg; 820789Sahrens int c, error; 821789Sahrens vdev_t *rvd = spa->spa_root_vdev; 822789Sahrens vdev_t *vd; 823789Sahrens 824789Sahrens txg = spa_vdev_enter(spa); 825789Sahrens 826789Sahrens vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 827789Sahrens 828789Sahrens if (vd == NULL) 829789Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 830789Sahrens 831789Sahrens if (rvd == NULL) /* spa_create() */ 832789Sahrens spa->spa_root_vdev = rvd = vd; 833789Sahrens 834789Sahrens if ((error = vdev_create(vd, txg)) != 0) 835789Sahrens return (spa_vdev_exit(spa, vd, txg, error)); 836789Sahrens 837789Sahrens /* 838789Sahrens * Transfer each top-level vdev from the temporary root 839789Sahrens * to the spa's root and initialize its metaslabs. 840789Sahrens */ 841789Sahrens for (c = 0; c < vd->vdev_children; c++) { 842789Sahrens vdev_t *tvd = vd->vdev_child[c]; 843789Sahrens if (vd != rvd) { 844789Sahrens vdev_remove_child(vd, tvd); 845789Sahrens tvd->vdev_id = rvd->vdev_children; 846789Sahrens vdev_add_child(rvd, tvd); 847789Sahrens } 848789Sahrens vdev_init(tvd, txg); 849789Sahrens vdev_config_dirty(tvd); 850789Sahrens } 851789Sahrens 852789Sahrens /* 853789Sahrens * Update the config based on the new in-core state. 854789Sahrens */ 855789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 856789Sahrens 857789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 858789Sahrens } 859789Sahrens 860789Sahrens /* 861789Sahrens * Attach a device to a mirror. The arguments are the path to any device 862789Sahrens * in the mirror, and the nvroot for the new device. If the path specifies 863789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 864789Sahrens * 865789Sahrens * If 'replacing' is specified, the new device is intended to replace the 866789Sahrens * existing device; in this case the two devices are made into their own 867789Sahrens * mirror using the 'replacing' vdev, which is functionally idendical to 868789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 869789Sahrens * extra rules: you can't attach to it after it's been created, and upon 870789Sahrens * completion of resilvering, the first disk (the one being replaced) 871789Sahrens * is automatically detached. 872789Sahrens */ 873789Sahrens int 874789Sahrens spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) 875789Sahrens { 876789Sahrens uint64_t txg, open_txg; 877789Sahrens int error; 878789Sahrens vdev_t *rvd = spa->spa_root_vdev; 879789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 880789Sahrens vdev_ops_t *pvops = replacing ? &vdev_replacing_ops : &vdev_mirror_ops; 881789Sahrens 882789Sahrens txg = spa_vdev_enter(spa); 883789Sahrens 884789Sahrens oldvd = vdev_lookup_by_path(rvd, path); 885789Sahrens 886789Sahrens if (oldvd == NULL) 887789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 888789Sahrens 889789Sahrens pvd = oldvd->vdev_parent; 890789Sahrens 891789Sahrens /* 892789Sahrens * The parent must be a mirror or the root, unless we're replacing; 893789Sahrens * in that case, the parent can be anything but another replacing vdev. 894789Sahrens */ 895789Sahrens if (pvd->vdev_ops != &vdev_mirror_ops && 896789Sahrens pvd->vdev_ops != &vdev_root_ops && 897789Sahrens (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 898789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 899789Sahrens 900789Sahrens newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 901789Sahrens 902789Sahrens if (newrootvd == NULL || newrootvd->vdev_children != 1) 903789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 904789Sahrens 905789Sahrens newvd = newrootvd->vdev_child[0]; 906789Sahrens 907789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 908789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 909789Sahrens 910789Sahrens if ((error = vdev_create(newrootvd, txg)) != 0) 911789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 912789Sahrens 913*1175Slling /* 914*1175Slling * Compare the new device size with the replaceable/attachable 915*1175Slling * device size. 916*1175Slling */ 917*1175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 918789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 919789Sahrens 920789Sahrens if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 921789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 922789Sahrens 923789Sahrens /* 924789Sahrens * If this is an in-place replacement, update oldvd's path and devid 925789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 926789Sahrens */ 927789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 928789Sahrens spa_strfree(oldvd->vdev_path); 929789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 930789Sahrens KM_SLEEP); 931789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 932789Sahrens newvd->vdev_path, "old"); 933789Sahrens if (oldvd->vdev_devid != NULL) { 934789Sahrens spa_strfree(oldvd->vdev_devid); 935789Sahrens oldvd->vdev_devid = NULL; 936789Sahrens } 937789Sahrens } 938789Sahrens 939789Sahrens /* 940789Sahrens * If the parent is not a mirror, or if we're replacing, 941789Sahrens * insert the new mirror/replacing vdev above oldvd. 942789Sahrens */ 943789Sahrens if (pvd->vdev_ops != pvops) 944789Sahrens pvd = vdev_add_parent(oldvd, pvops); 945789Sahrens 946789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 947789Sahrens ASSERT(pvd->vdev_ops == pvops); 948789Sahrens ASSERT(oldvd->vdev_parent == pvd); 949789Sahrens 950789Sahrens /* 951789Sahrens * Extract the new device from its root and add it to pvd. 952789Sahrens */ 953789Sahrens vdev_remove_child(newrootvd, newvd); 954789Sahrens newvd->vdev_id = pvd->vdev_children; 955789Sahrens vdev_add_child(pvd, newvd); 956789Sahrens 957789Sahrens tvd = newvd->vdev_top; 958789Sahrens ASSERT(pvd->vdev_top == tvd); 959789Sahrens ASSERT(tvd->vdev_parent == rvd); 960789Sahrens 961789Sahrens /* 962789Sahrens * Update the config based on the new in-core state. 963789Sahrens */ 964789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 965789Sahrens 966789Sahrens vdev_config_dirty(tvd); 967789Sahrens 968789Sahrens /* 969789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 970789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 971789Sahrens */ 972789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 973789Sahrens 974789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 975789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 976789Sahrens open_txg - TXG_INITIAL + 1); 977789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 978789Sahrens 979789Sahrens /* 980789Sahrens * Mark newvd's DTL dirty in this txg. 981789Sahrens */ 982789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 983789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 984789Sahrens 985789Sahrens dprintf("attached %s, replacing=%d\n", path, replacing); 986789Sahrens 987789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 988789Sahrens 989789Sahrens /* 990789Sahrens * Kick off a resilver to update newvd. 991789Sahrens */ 992789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 993789Sahrens 994789Sahrens return (0); 995789Sahrens } 996789Sahrens 997789Sahrens /* 998789Sahrens * Detach a device from a mirror or replacing vdev. 999789Sahrens * If 'replace_done' is specified, only detach if the parent 1000789Sahrens * is a replacing vdev. 1001789Sahrens */ 1002789Sahrens int 1003789Sahrens spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) 1004789Sahrens { 1005789Sahrens uint64_t txg; 1006789Sahrens int c, t, error; 1007789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1008789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1009789Sahrens 1010789Sahrens txg = spa_vdev_enter(spa); 1011789Sahrens 1012789Sahrens vd = vdev_lookup_by_path(rvd, path); 1013789Sahrens 1014789Sahrens if (vd == NULL) 1015789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1016789Sahrens 1017789Sahrens if (guid != 0 && vd->vdev_guid != guid) 1018789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1019789Sahrens 1020789Sahrens pvd = vd->vdev_parent; 1021789Sahrens 1022789Sahrens /* 1023789Sahrens * If replace_done is specified, only remove this device if it's 1024789Sahrens * the first child of a replacing vdev. 1025789Sahrens */ 1026789Sahrens if (replace_done && 1027789Sahrens (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1028789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1029789Sahrens 1030789Sahrens /* 1031789Sahrens * Only mirror and replacing vdevs support detach. 1032789Sahrens */ 1033789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1034789Sahrens pvd->vdev_ops != &vdev_mirror_ops) 1035789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1036789Sahrens 1037789Sahrens /* 1038789Sahrens * If there's only one replica, you can't detach it. 1039789Sahrens */ 1040789Sahrens if (pvd->vdev_children <= 1) 1041789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1042789Sahrens 1043789Sahrens /* 1044789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1045789Sahrens * valid copy of the data, which means we cannot safely detach it. 1046789Sahrens * 1047789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1048789Sahrens * precise DTL check. 1049789Sahrens */ 1050789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1051789Sahrens uint64_t dirty; 1052789Sahrens 1053789Sahrens cvd = pvd->vdev_child[c]; 1054789Sahrens if (cvd == vd) 1055789Sahrens continue; 1056789Sahrens if (vdev_is_dead(cvd)) 1057789Sahrens continue; 1058789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1059789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1060789Sahrens cvd->vdev_dtl_scrub.sm_space; 1061789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1062789Sahrens if (!dirty) 1063789Sahrens break; 1064789Sahrens } 1065789Sahrens if (c == pvd->vdev_children) 1066789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1067789Sahrens 1068789Sahrens /* 1069789Sahrens * Erase the disk labels so the disk can be used for other things. 1070789Sahrens * This must be done after all other error cases are handled, 1071789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1072789Sahrens * But if we can't do it, don't treat the error as fatal -- 1073789Sahrens * it may be that the unwritability of the disk is the reason 1074789Sahrens * it's being detached! 1075789Sahrens */ 1076789Sahrens error = vdev_label_init(vd, 0); 1077789Sahrens if (error) 1078789Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1079789Sahrens 1080789Sahrens /* 1081789Sahrens * Remove vd from its parent and compact the parent's children. 1082789Sahrens */ 1083789Sahrens vdev_remove_child(pvd, vd); 1084789Sahrens vdev_compact_children(pvd); 1085789Sahrens 1086789Sahrens /* 1087789Sahrens * Remember one of the remaining children so we can get tvd below. 1088789Sahrens */ 1089789Sahrens cvd = pvd->vdev_child[0]; 1090789Sahrens 1091789Sahrens /* 1092789Sahrens * If the parent mirror/replacing vdev only has one child, 1093789Sahrens * the parent is no longer needed. Remove it from the tree. 1094789Sahrens */ 1095789Sahrens if (pvd->vdev_children == 1) 1096789Sahrens vdev_remove_parent(cvd); 1097789Sahrens 1098789Sahrens /* 1099789Sahrens * We don't set tvd until now because the parent we just removed 1100789Sahrens * may have been the previous top-level vdev. 1101789Sahrens */ 1102789Sahrens tvd = cvd->vdev_top; 1103789Sahrens ASSERT(tvd->vdev_parent == rvd); 1104789Sahrens 1105789Sahrens /* 1106789Sahrens * Reopen this top-level vdev to reassess health after detach. 1107789Sahrens */ 1108789Sahrens vdev_reopen(tvd, NULL); 1109789Sahrens 1110789Sahrens /* 1111789Sahrens * If the device we just detached was smaller than the others, 1112789Sahrens * it may be possible to add metaslabs (i.e. grow the pool). 1113789Sahrens */ 1114789Sahrens vdev_metaslab_init(tvd, txg); 1115789Sahrens 1116789Sahrens /* 1117789Sahrens * Update the config based on the new in-core state. 1118789Sahrens */ 1119789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1120789Sahrens 1121789Sahrens vdev_config_dirty(tvd); 1122789Sahrens 1123789Sahrens /* 1124789Sahrens * Mark vd's DTL as dirty in this txg. 1125789Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1126789Sahrens * and free vd's DTL object in syncing context. 1127789Sahrens * But first make sure we're not on any *other* txg's DTL list, 1128789Sahrens * to prevent vd from being accessed after it's freed. 1129789Sahrens */ 1130789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1131789Sahrens vd->vdev_detached = B_TRUE; 1132789Sahrens for (t = 0; t < TXG_SIZE; t++) 1133789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1134789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1135789Sahrens 1136789Sahrens dprintf("detached %s\n", path); 1137789Sahrens 1138789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 1139789Sahrens } 1140789Sahrens 1141789Sahrens /* 1142789Sahrens * If there are any replacing vdevs that have finished replacing, detach them. 1143789Sahrens * We can't hold the config lock across detaches, so we lock the config, 1144789Sahrens * build a list of candidates, unlock the config, and try each candidate. 1145789Sahrens */ 1146789Sahrens typedef struct vdev_detach_link { 1147789Sahrens char *vdl_path; 1148789Sahrens uint64_t vdl_guid; 1149789Sahrens list_node_t vdl_node; 1150789Sahrens } vdev_detach_link_t; 1151789Sahrens 1152789Sahrens static void 1153789Sahrens spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd) 1154789Sahrens { 1155789Sahrens int c; 1156789Sahrens 1157789Sahrens for (c = 0; c < vd->vdev_children; c++) 1158789Sahrens spa_vdev_replace_done_make_list(l, vd->vdev_child[c]); 1159789Sahrens 1160789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1161789Sahrens vdev_t *cvd0 = vd->vdev_child[0]; 1162789Sahrens vdev_t *cvd1 = vd->vdev_child[1]; 1163789Sahrens vdev_detach_link_t *vdl; 1164789Sahrens int dirty1; 1165789Sahrens 1166789Sahrens mutex_enter(&cvd1->vdev_dtl_lock); 1167789Sahrens dirty1 = cvd1->vdev_dtl_map.sm_space | 1168789Sahrens cvd1->vdev_dtl_scrub.sm_space; 1169789Sahrens mutex_exit(&cvd1->vdev_dtl_lock); 1170789Sahrens 1171789Sahrens if (!dirty1) { 1172789Sahrens vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP); 1173789Sahrens vdl->vdl_path = spa_strdup(cvd0->vdev_path); 1174789Sahrens vdl->vdl_guid = cvd0->vdev_guid; 1175789Sahrens list_insert_tail(l, vdl); 1176789Sahrens } 1177789Sahrens } 1178789Sahrens } 1179789Sahrens 1180789Sahrens void 1181789Sahrens spa_vdev_replace_done(spa_t *spa) 1182789Sahrens { 1183789Sahrens vdev_detach_link_t *vdl; 1184789Sahrens list_t vdlist; 1185789Sahrens 1186789Sahrens list_create(&vdlist, sizeof (vdev_detach_link_t), 1187789Sahrens offsetof(vdev_detach_link_t, vdl_node)); 1188789Sahrens 1189789Sahrens spa_config_enter(spa, RW_READER); 1190789Sahrens spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev); 1191789Sahrens spa_config_exit(spa); 1192789Sahrens 1193789Sahrens while ((vdl = list_head(&vdlist)) != NULL) { 1194789Sahrens list_remove(&vdlist, vdl); 1195789Sahrens (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid, 1196789Sahrens B_TRUE); 1197789Sahrens spa_strfree(vdl->vdl_path); 1198789Sahrens kmem_free(vdl, sizeof (*vdl)); 1199789Sahrens } 1200789Sahrens 1201789Sahrens list_destroy(&vdlist); 1202789Sahrens } 1203789Sahrens 1204789Sahrens /* 1205789Sahrens * ========================================================================== 1206789Sahrens * SPA Scrubbing 1207789Sahrens * ========================================================================== 1208789Sahrens */ 1209789Sahrens 1210789Sahrens static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t); 1211789Sahrens 1212789Sahrens static void 1213789Sahrens spa_scrub_io_done(zio_t *zio) 1214789Sahrens { 1215789Sahrens spa_t *spa = zio->io_spa; 1216789Sahrens 1217789Sahrens zio_buf_free(zio->io_data, zio->io_size); 1218789Sahrens 1219789Sahrens mutex_enter(&spa->spa_scrub_lock); 1220789Sahrens if (zio->io_error) 1221789Sahrens spa->spa_scrub_errors++; 1222789Sahrens if (--spa->spa_scrub_inflight == 0) 1223789Sahrens cv_broadcast(&spa->spa_scrub_io_cv); 1224789Sahrens mutex_exit(&spa->spa_scrub_lock); 1225789Sahrens 1226789Sahrens if (zio->io_error) { 1227789Sahrens vdev_t *vd = zio->io_vd; 1228789Sahrens mutex_enter(&vd->vdev_stat_lock); 1229789Sahrens vd->vdev_stat.vs_scrub_errors++; 1230789Sahrens mutex_exit(&vd->vdev_stat_lock); 1231789Sahrens } 1232789Sahrens } 1233789Sahrens 1234789Sahrens static void 1235789Sahrens spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags) 1236789Sahrens { 1237789Sahrens size_t size = BP_GET_LSIZE(bp); 1238789Sahrens void *data = zio_buf_alloc(size); 1239789Sahrens 1240789Sahrens mutex_enter(&spa->spa_scrub_lock); 1241789Sahrens spa->spa_scrub_inflight++; 1242789Sahrens mutex_exit(&spa->spa_scrub_lock); 1243789Sahrens 1244789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 1245789Sahrens spa_scrub_io_done, NULL, priority, flags)); 1246789Sahrens } 1247789Sahrens 1248789Sahrens /* ARGSUSED */ 1249789Sahrens static int 1250789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1251789Sahrens { 1252789Sahrens blkptr_t *bp = &bc->bc_blkptr; 1253789Sahrens vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0])); 1254789Sahrens 1255789Sahrens if (bc->bc_errno || vd == NULL) { 1256789Sahrens /* 1257789Sahrens * We can't scrub this block, but we can continue to scrub 1258789Sahrens * the rest of the pool. Note the error and move along. 1259789Sahrens */ 1260789Sahrens mutex_enter(&spa->spa_scrub_lock); 1261789Sahrens spa->spa_scrub_errors++; 1262789Sahrens mutex_exit(&spa->spa_scrub_lock); 1263789Sahrens 1264789Sahrens if (vd != NULL) { 1265789Sahrens mutex_enter(&vd->vdev_stat_lock); 1266789Sahrens vd->vdev_stat.vs_scrub_errors++; 1267789Sahrens mutex_exit(&vd->vdev_stat_lock); 1268789Sahrens } 1269789Sahrens 1270789Sahrens return (ERESTART); 1271789Sahrens } 1272789Sahrens 1273789Sahrens ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg); 1274789Sahrens 1275789Sahrens /* 1276789Sahrens * Keep track of how much data we've examined so that 1277789Sahrens * zpool(1M) status can make useful progress reports. 1278789Sahrens */ 1279789Sahrens mutex_enter(&vd->vdev_stat_lock); 1280789Sahrens vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp); 1281789Sahrens mutex_exit(&vd->vdev_stat_lock); 1282789Sahrens 1283789Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) { 1284789Sahrens if (DVA_GET_GANG(&bp->blk_dva[0])) { 1285789Sahrens /* 1286789Sahrens * Gang members may be spread across multiple vdevs, 1287789Sahrens * so the best we can do is look at the pool-wide DTL. 1288789Sahrens * XXX -- it would be better to change our allocation 1289789Sahrens * policy to ensure that this can't happen. 1290789Sahrens */ 1291789Sahrens vd = spa->spa_root_vdev; 1292789Sahrens } 1293789Sahrens if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { 1294789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, 1295789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | 1296789Sahrens ZIO_FLAG_RESILVER); 1297789Sahrens } 1298789Sahrens } else { 1299789Sahrens spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, 1300789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB); 1301789Sahrens } 1302789Sahrens 1303789Sahrens return (0); 1304789Sahrens } 1305789Sahrens 1306789Sahrens static void 1307789Sahrens spa_scrub_thread(spa_t *spa) 1308789Sahrens { 1309789Sahrens callb_cpr_t cprinfo; 1310789Sahrens traverse_handle_t *th = spa->spa_scrub_th; 1311789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1312789Sahrens pool_scrub_type_t scrub_type = spa->spa_scrub_type; 1313789Sahrens int error = 0; 1314789Sahrens boolean_t complete; 1315789Sahrens 1316789Sahrens CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG); 1317789Sahrens 1318797Sbonwick /* 1319797Sbonwick * If we're restarting due to a snapshot create/delete, 1320797Sbonwick * wait for that to complete. 1321797Sbonwick */ 1322797Sbonwick txg_wait_synced(spa_get_dsl(spa), 0); 1323797Sbonwick 1324789Sahrens spa_config_enter(spa, RW_WRITER); 1325789Sahrens vdev_reopen(rvd, NULL); /* purge all vdev caches */ 1326789Sahrens vdev_config_dirty(rvd); /* rewrite all disk labels */ 1327789Sahrens vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); 1328789Sahrens spa_config_exit(spa); 1329789Sahrens 1330789Sahrens mutex_enter(&spa->spa_scrub_lock); 1331789Sahrens spa->spa_scrub_errors = 0; 1332789Sahrens spa->spa_scrub_active = 1; 1333789Sahrens 1334789Sahrens while (!spa->spa_scrub_stop) { 1335789Sahrens CALLB_CPR_SAFE_BEGIN(&cprinfo); 1336789Sahrens while (spa->spa_scrub_suspend) { 1337789Sahrens spa->spa_scrub_active = 0; 1338789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1339789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1340789Sahrens spa->spa_scrub_active = 1; 1341789Sahrens } 1342789Sahrens CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock); 1343789Sahrens 1344789Sahrens if (spa->spa_scrub_restart_txg != 0) 1345789Sahrens break; 1346789Sahrens 1347789Sahrens mutex_exit(&spa->spa_scrub_lock); 1348789Sahrens error = traverse_more(th); 1349789Sahrens mutex_enter(&spa->spa_scrub_lock); 1350789Sahrens if (error != EAGAIN) 1351789Sahrens break; 1352789Sahrens } 1353789Sahrens 1354789Sahrens while (spa->spa_scrub_inflight) 1355789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1356789Sahrens 1357789Sahrens if (spa->spa_scrub_restart_txg != 0) 1358789Sahrens error = ERESTART; 1359789Sahrens 1360789Sahrens spa->spa_scrub_active = 0; 1361789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1362789Sahrens 1363789Sahrens /* 1364789Sahrens * If the traverse completed, and there were no errors, 1365789Sahrens * then the scrub was completely successful. 1366789Sahrens */ 1367789Sahrens complete = (error == 0 && spa->spa_scrub_errors == 0); 1368789Sahrens 1369789Sahrens dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", 1370789Sahrens spa->spa_scrub_maxtxg, complete ? "done" : "FAILED", 1371789Sahrens error, spa->spa_scrub_errors, spa->spa_scrub_stop); 1372789Sahrens 1373789Sahrens mutex_exit(&spa->spa_scrub_lock); 1374789Sahrens 1375789Sahrens /* 1376789Sahrens * If the scrub/resilver completed, update all DTLs to reflect this. 1377789Sahrens * Whether it succeeded or not, vacate all temporary scrub DTLs. 1378789Sahrens */ 1379789Sahrens spa_config_enter(spa, RW_WRITER); 1380789Sahrens vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, 1381789Sahrens complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); 1382789Sahrens spa_config_exit(spa); 1383789Sahrens 1384789Sahrens spa_vdev_replace_done(spa); 1385789Sahrens 1386789Sahrens spa_config_enter(spa, RW_READER); 1387789Sahrens vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); 1388789Sahrens spa_config_exit(spa); 1389789Sahrens 1390789Sahrens mutex_enter(&spa->spa_scrub_lock); 1391789Sahrens 1392789Sahrens spa->spa_scrub_type = POOL_SCRUB_NONE; 1393789Sahrens spa->spa_scrub_active = 0; 1394789Sahrens spa->spa_scrub_thread = NULL; 1395789Sahrens 1396789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1397789Sahrens 1398789Sahrens /* 1399789Sahrens * If we were told to restart, our final act is to start a new scrub. 1400789Sahrens */ 1401789Sahrens if (error == ERESTART) 1402789Sahrens VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0); 1403789Sahrens 1404789Sahrens CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ 1405789Sahrens thread_exit(); 1406789Sahrens } 1407789Sahrens 1408789Sahrens void 1409789Sahrens spa_scrub_suspend(spa_t *spa) 1410789Sahrens { 1411789Sahrens mutex_enter(&spa->spa_scrub_lock); 1412789Sahrens spa->spa_scrub_suspend++; 1413789Sahrens while (spa->spa_scrub_active) { 1414789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1415789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1416789Sahrens } 1417789Sahrens while (spa->spa_scrub_inflight) 1418789Sahrens cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1419789Sahrens mutex_exit(&spa->spa_scrub_lock); 1420789Sahrens } 1421789Sahrens 1422789Sahrens void 1423789Sahrens spa_scrub_resume(spa_t *spa) 1424789Sahrens { 1425789Sahrens mutex_enter(&spa->spa_scrub_lock); 1426789Sahrens ASSERT(spa->spa_scrub_suspend != 0); 1427789Sahrens if (--spa->spa_scrub_suspend == 0) 1428789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1429789Sahrens mutex_exit(&spa->spa_scrub_lock); 1430789Sahrens } 1431789Sahrens 1432789Sahrens void 1433789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg) 1434789Sahrens { 1435789Sahrens /* 1436789Sahrens * Something happened (e.g. snapshot create/delete) that means 1437789Sahrens * we must restart any in-progress scrubs. The itinerary will 1438789Sahrens * fix this properly. 1439789Sahrens */ 1440789Sahrens mutex_enter(&spa->spa_scrub_lock); 1441789Sahrens spa->spa_scrub_restart_txg = txg; 1442789Sahrens mutex_exit(&spa->spa_scrub_lock); 1443789Sahrens } 1444789Sahrens 1445789Sahrens static int 1446789Sahrens spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1447789Sahrens { 1448789Sahrens space_seg_t *ss; 1449789Sahrens uint64_t mintxg, maxtxg; 1450789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1451789Sahrens int advance = 0; 1452789Sahrens 1453789Sahrens if ((uint_t)type >= POOL_SCRUB_TYPES) 1454789Sahrens return (ENOTSUP); 1455789Sahrens 1456789Sahrens /* 1457789Sahrens * If there's a scrub or resilver already in progress, stop it. 1458789Sahrens */ 1459789Sahrens while (spa->spa_scrub_thread != NULL) { 1460789Sahrens /* 1461789Sahrens * Don't stop a resilver unless forced. 1462789Sahrens */ 1463789Sahrens if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) 1464789Sahrens return (EBUSY); 1465789Sahrens 1466789Sahrens spa->spa_scrub_stop = 1; 1467789Sahrens cv_broadcast(&spa->spa_scrub_cv); 1468789Sahrens cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); 1469789Sahrens } 1470789Sahrens 1471789Sahrens /* 1472789Sahrens * Terminate the previous traverse. 1473789Sahrens */ 1474789Sahrens if (spa->spa_scrub_th != NULL) { 1475789Sahrens traverse_fini(spa->spa_scrub_th); 1476789Sahrens spa->spa_scrub_th = NULL; 1477789Sahrens } 1478789Sahrens 1479789Sahrens spa->spa_scrub_stop = 0; 1480789Sahrens spa->spa_scrub_type = type; 1481789Sahrens spa->spa_scrub_restart_txg = 0; 1482789Sahrens 1483789Sahrens mintxg = TXG_INITIAL - 1; 1484789Sahrens maxtxg = spa_last_synced_txg(spa) + 1; 1485789Sahrens 1486789Sahrens switch (type) { 1487789Sahrens 1488789Sahrens case POOL_SCRUB_NONE: 1489789Sahrens break; 1490789Sahrens 1491789Sahrens case POOL_SCRUB_RESILVER: 1492789Sahrens /* 1493789Sahrens * Determine the resilvering boundaries. 1494789Sahrens * 1495789Sahrens * Note: (mintxg, maxtxg) is an open interval, 1496789Sahrens * i.e. mintxg and maxtxg themselves are not included. 1497789Sahrens * 1498789Sahrens * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 1499789Sahrens * so we don't claim to resilver a txg that's still changing. 1500789Sahrens */ 1501789Sahrens mutex_enter(&rvd->vdev_dtl_lock); 1502789Sahrens ss = avl_first(&rvd->vdev_dtl_map.sm_root); 1503789Sahrens mintxg = ss ? ss->ss_start - 1 : 0; 1504789Sahrens ss = avl_last(&rvd->vdev_dtl_map.sm_root); 1505789Sahrens maxtxg = ss ? ss->ss_end : 0; 1506789Sahrens maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1); 1507789Sahrens mutex_exit(&rvd->vdev_dtl_lock); 1508789Sahrens 1509789Sahrens advance = ADVANCE_PRE | ADVANCE_PRUNE; 1510789Sahrens break; 1511789Sahrens 1512789Sahrens case POOL_SCRUB_EVERYTHING: 1513789Sahrens /* 1514789Sahrens * A scrub is like a resilver, but not pruned by DTL. 1515789Sahrens */ 1516789Sahrens advance = ADVANCE_PRE; 1517789Sahrens break; 1518789Sahrens } 1519789Sahrens 1520789Sahrens if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { 1521789Sahrens spa->spa_scrub_maxtxg = maxtxg; 1522789Sahrens spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, 1523789Sahrens advance, ZIO_FLAG_CANFAIL); 1524789Sahrens traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg); 1525789Sahrens spa->spa_scrub_thread = thread_create(NULL, 0, 1526789Sahrens spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); 1527789Sahrens } 1528789Sahrens 1529789Sahrens return (0); 1530789Sahrens } 1531789Sahrens 1532789Sahrens int 1533789Sahrens spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) 1534789Sahrens { 1535789Sahrens int error; 1536789Sahrens traverse_handle_t *th; 1537789Sahrens 1538789Sahrens mutex_enter(&spa->spa_scrub_lock); 1539789Sahrens error = spa_scrub_locked(spa, type, force); 1540789Sahrens th = spa->spa_scrub_th; 1541789Sahrens mutex_exit(&spa->spa_scrub_lock); 1542789Sahrens 1543789Sahrens if (th == NULL && type != POOL_SCRUB_NONE) 1544789Sahrens spa_vdev_replace_done(spa); 1545789Sahrens 1546789Sahrens return (error); 1547789Sahrens } 1548789Sahrens 1549789Sahrens /* 1550789Sahrens * ========================================================================== 1551789Sahrens * SPA syncing routines 1552789Sahrens * ========================================================================== 1553789Sahrens */ 1554789Sahrens 1555789Sahrens static void 1556789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg) 1557789Sahrens { 1558789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1559789Sahrens dmu_tx_t *tx; 1560789Sahrens blkptr_t blk; 1561789Sahrens uint64_t itor = 0; 1562789Sahrens zio_t *zio; 1563789Sahrens int error; 1564789Sahrens uint8_t c = 1; 1565789Sahrens 1566789Sahrens zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD); 1567789Sahrens 1568789Sahrens while (bplist_iterate(bpl, &itor, &blk) == 0) 1569789Sahrens zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL)); 1570789Sahrens 1571789Sahrens error = zio_wait(zio); 1572789Sahrens ASSERT3U(error, ==, 0); 1573789Sahrens 1574789Sahrens tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1575789Sahrens bplist_vacate(bpl, tx); 1576789Sahrens 1577789Sahrens /* 1578789Sahrens * Pre-dirty the first block so we sync to convergence faster. 1579789Sahrens * (Usually only the first block is needed.) 1580789Sahrens */ 1581789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx); 1582789Sahrens dmu_tx_commit(tx); 1583789Sahrens } 1584789Sahrens 1585789Sahrens static void 1586789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 1587789Sahrens { 1588789Sahrens nvlist_t *config; 1589789Sahrens char *packed = NULL; 1590789Sahrens size_t nvsize = 0; 1591789Sahrens dmu_buf_t *db; 1592789Sahrens 1593789Sahrens if (list_is_empty(&spa->spa_dirty_list)) 1594789Sahrens return; 1595789Sahrens 1596789Sahrens config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE); 1597789Sahrens 1598789Sahrens spa_config_set(spa, config); 1599789Sahrens 1600789Sahrens VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0); 1601789Sahrens 1602789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 1603789Sahrens 1604789Sahrens VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0); 1605789Sahrens 1606789Sahrens dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, 1607789Sahrens packed, tx); 1608789Sahrens 1609789Sahrens kmem_free(packed, nvsize); 1610789Sahrens 1611789Sahrens db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object); 1612789Sahrens dmu_buf_will_dirty(db, tx); 1613789Sahrens *(uint64_t *)db->db_data = nvsize; 1614789Sahrens dmu_buf_rele(db); 1615789Sahrens } 1616789Sahrens 1617789Sahrens /* 1618789Sahrens * Sync the specified transaction group. New blocks may be dirtied as 1619789Sahrens * part of the process, so we iterate until it converges. 1620789Sahrens */ 1621789Sahrens void 1622789Sahrens spa_sync(spa_t *spa, uint64_t txg) 1623789Sahrens { 1624789Sahrens dsl_pool_t *dp = spa->spa_dsl_pool; 1625789Sahrens objset_t *mos = spa->spa_meta_objset; 1626789Sahrens bplist_t *bpl = &spa->spa_sync_bplist; 1627789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1628789Sahrens vdev_t *vd; 1629789Sahrens dmu_tx_t *tx; 1630789Sahrens int dirty_vdevs; 1631789Sahrens 1632789Sahrens /* 1633789Sahrens * Lock out configuration changes. 1634789Sahrens */ 1635789Sahrens spa_config_enter(spa, RW_READER); 1636789Sahrens 1637789Sahrens spa->spa_syncing_txg = txg; 1638789Sahrens spa->spa_sync_pass = 0; 1639789Sahrens 1640789Sahrens bplist_open(bpl, mos, spa->spa_sync_bplist_obj); 1641789Sahrens 1642789Sahrens /* 1643789Sahrens * If anything has changed in this txg, push the deferred frees 1644789Sahrens * from the previous txg. If not, leave them alone so that we 1645789Sahrens * don't generate work on an otherwise idle system. 1646789Sahrens */ 1647789Sahrens if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 1648789Sahrens !txg_list_empty(&dp->dp_dirty_dirs, txg)) 1649789Sahrens spa_sync_deferred_frees(spa, txg); 1650789Sahrens 1651789Sahrens /* 1652789Sahrens * Iterate to convergence. 1653789Sahrens */ 1654789Sahrens do { 1655789Sahrens spa->spa_sync_pass++; 1656789Sahrens 1657789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1658789Sahrens spa_sync_config_object(spa, tx); 1659789Sahrens dmu_tx_commit(tx); 1660789Sahrens 1661789Sahrens dsl_pool_sync(dp, txg); 1662789Sahrens 1663789Sahrens dirty_vdevs = 0; 1664789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { 1665789Sahrens vdev_sync(vd, txg); 1666789Sahrens dirty_vdevs++; 1667789Sahrens } 1668789Sahrens 1669789Sahrens tx = dmu_tx_create_assigned(dp, txg); 1670789Sahrens bplist_sync(bpl, tx); 1671789Sahrens dmu_tx_commit(tx); 1672789Sahrens 1673789Sahrens } while (dirty_vdevs); 1674789Sahrens 1675789Sahrens bplist_close(bpl); 1676789Sahrens 1677789Sahrens dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); 1678789Sahrens 1679789Sahrens /* 1680789Sahrens * Rewrite the vdev configuration (which includes the uberblock) 1681789Sahrens * to commit the transaction group. 1682789Sahrens */ 1683789Sahrens while (spa_sync_labels(spa, txg)) { 1684789Sahrens dprintf("waiting for devices to heal\n"); 1685789Sahrens delay(hz); 1686789Sahrens vdev_reopen(rvd, NULL); 1687789Sahrens } 1688789Sahrens 1689789Sahrens /* 1690789Sahrens * Make a stable copy of the fully synced uberblock. 1691789Sahrens * We use this as the root for pool traversals. 1692789Sahrens */ 1693789Sahrens spa->spa_traverse_wanted = 1; /* tells traverse_more() to stop */ 1694789Sahrens 1695789Sahrens spa_scrub_suspend(spa); /* stop scrubbing and finish I/Os */ 1696789Sahrens 1697789Sahrens rw_enter(&spa->spa_traverse_lock, RW_WRITER); 1698789Sahrens spa->spa_traverse_wanted = 0; 1699789Sahrens spa->spa_ubsync = spa->spa_uberblock; 1700789Sahrens rw_exit(&spa->spa_traverse_lock); 1701789Sahrens 1702789Sahrens spa_scrub_resume(spa); /* resume scrub with new ubsync */ 1703789Sahrens 1704789Sahrens /* 1705789Sahrens * Clean up the ZIL records for the synced txg. 1706789Sahrens */ 1707789Sahrens dsl_pool_zil_clean(dp); 1708789Sahrens 1709789Sahrens /* 1710789Sahrens * Update usable space statistics. 1711789Sahrens */ 1712789Sahrens while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 1713789Sahrens vdev_sync_done(vd, txg); 1714789Sahrens 1715789Sahrens /* 1716789Sahrens * It had better be the case that we didn't dirty anything 1717789Sahrens * since spa_sync_labels(). 1718789Sahrens */ 1719789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 1720789Sahrens ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 1721789Sahrens ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 1722789Sahrens ASSERT(bpl->bpl_queue == NULL); 1723789Sahrens 1724789Sahrens spa_config_exit(spa); 1725789Sahrens } 1726789Sahrens 1727789Sahrens /* 1728789Sahrens * Sync all pools. We don't want to hold the namespace lock across these 1729789Sahrens * operations, so we take a reference on the spa_t and drop the lock during the 1730789Sahrens * sync. 1731789Sahrens */ 1732789Sahrens void 1733789Sahrens spa_sync_allpools(void) 1734789Sahrens { 1735789Sahrens spa_t *spa = NULL; 1736789Sahrens mutex_enter(&spa_namespace_lock); 1737789Sahrens while ((spa = spa_next(spa)) != NULL) { 1738789Sahrens if (spa_state(spa) != POOL_STATE_ACTIVE) 1739789Sahrens continue; 1740789Sahrens spa_open_ref(spa, FTAG); 1741789Sahrens mutex_exit(&spa_namespace_lock); 1742789Sahrens txg_wait_synced(spa_get_dsl(spa), 0); 1743789Sahrens mutex_enter(&spa_namespace_lock); 1744789Sahrens spa_close(spa, FTAG); 1745789Sahrens } 1746789Sahrens mutex_exit(&spa_namespace_lock); 1747789Sahrens } 1748789Sahrens 1749789Sahrens /* 1750789Sahrens * ========================================================================== 1751789Sahrens * Miscellaneous routines 1752789Sahrens * ========================================================================== 1753789Sahrens */ 1754789Sahrens 1755789Sahrens int 1756789Sahrens spa_busy(void) 1757789Sahrens { 1758789Sahrens return (spa_active_count != 0); 1759789Sahrens } 1760789Sahrens 1761789Sahrens /* 1762789Sahrens * Remove all pools in the system. 1763789Sahrens */ 1764789Sahrens void 1765789Sahrens spa_evict_all(void) 1766789Sahrens { 1767789Sahrens spa_t *spa; 1768789Sahrens 1769789Sahrens /* 1770789Sahrens * Remove all cached state. All pools should be closed now, 1771789Sahrens * so every spa in the AVL tree should be unreferenced. 1772789Sahrens */ 1773789Sahrens mutex_enter(&spa_namespace_lock); 1774789Sahrens while ((spa = spa_next(NULL)) != NULL) { 1775789Sahrens /* 1776789Sahrens * Stop all scrub and resilver activity. spa_scrub() needs to 1777789Sahrens * wait for the scrub thread, which may do a detach and sync the 1778789Sahrens * configs, which needs spa_namespace_lock. Drop the lock while 1779789Sahrens * maintaining a hold on the spa_t. 1780789Sahrens */ 1781789Sahrens spa_open_ref(spa, FTAG); 1782789Sahrens mutex_exit(&spa_namespace_lock); 1783789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 1784789Sahrens mutex_enter(&spa_namespace_lock); 1785789Sahrens spa_close(spa, FTAG); 1786789Sahrens 1787789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 1788789Sahrens spa_unload(spa); 1789789Sahrens spa_deactivate(spa); 1790789Sahrens } 1791789Sahrens spa_remove(spa); 1792789Sahrens } 1793789Sahrens mutex_exit(&spa_namespace_lock); 1794789Sahrens } 1795