1789Sahrens /* 2789Sahrens * CDDL HEADER START 3789Sahrens * 4789Sahrens * The contents of this file are subject to the terms of the 5*1544Seschrock * Common Development and Distribution License (the "License"). 6*1544Seschrock * You may not use this file except in compliance with the License. 7789Sahrens * 8789Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9789Sahrens * or http://www.opensolaris.org/os/licensing. 10789Sahrens * See the License for the specific language governing permissions 11789Sahrens * and limitations under the License. 12789Sahrens * 13789Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14789Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15789Sahrens * If applicable, add the following below this CDDL HEADER, with the 16789Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17789Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18789Sahrens * 19789Sahrens * CDDL HEADER END 20789Sahrens */ 21789Sahrens /* 221354Seschrock * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23789Sahrens * Use is subject to license terms. 24789Sahrens */ 25789Sahrens 26789Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27789Sahrens 28789Sahrens /* 29789Sahrens * This file contains all the routines used when modifying on-disk SPA state. 30789Sahrens * This includes opening, importing, destroying, exporting a pool, and syncing a 31789Sahrens * pool. 32789Sahrens */ 33789Sahrens 34789Sahrens #include <sys/zfs_context.h> 35*1544Seschrock #include <sys/fm/fs/zfs.h> 36789Sahrens #include <sys/spa_impl.h> 37789Sahrens #include <sys/zio.h> 38789Sahrens #include <sys/zio_checksum.h> 39789Sahrens #include <sys/zio_compress.h> 40789Sahrens #include <sys/dmu.h> 41789Sahrens #include <sys/dmu_tx.h> 42789Sahrens #include <sys/zap.h> 43789Sahrens #include <sys/zil.h> 44789Sahrens #include <sys/vdev_impl.h> 45789Sahrens #include <sys/metaslab.h> 46789Sahrens #include <sys/uberblock_impl.h> 47789Sahrens #include <sys/txg.h> 48789Sahrens #include <sys/avl.h> 49789Sahrens #include <sys/dmu_traverse.h> 50789Sahrens #include <sys/unique.h> 51789Sahrens #include <sys/dsl_pool.h> 52789Sahrens #include <sys/dsl_dir.h> 53789Sahrens #include <sys/dsl_prop.h> 54789Sahrens #include <sys/fs/zfs.h> 55789Sahrens #include <sys/callb.h> 56789Sahrens 57789Sahrens static uint32_t spa_active_count; 58789Sahrens 59789Sahrens /* 60789Sahrens * ========================================================================== 61789Sahrens * SPA state manipulation (open/create/destroy/import/export) 62789Sahrens * ========================================================================== 63789Sahrens */ 64789Sahrens 65*1544Seschrock static int 66*1544Seschrock spa_error_entry_compare(const void *a, const void *b) 67*1544Seschrock { 68*1544Seschrock spa_error_entry_t *sa = (spa_error_entry_t *)a; 69*1544Seschrock spa_error_entry_t *sb = (spa_error_entry_t *)b; 70*1544Seschrock int ret; 71*1544Seschrock 72*1544Seschrock ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 73*1544Seschrock sizeof (zbookmark_t)); 74*1544Seschrock 75*1544Seschrock if (ret < 0) 76*1544Seschrock return (-1); 77*1544Seschrock else if (ret > 0) 78*1544Seschrock return (1); 79*1544Seschrock else 80*1544Seschrock return (0); 81*1544Seschrock } 82*1544Seschrock 83*1544Seschrock /* 84*1544Seschrock * Utility function which retrieves copies of the current logs and 85*1544Seschrock * re-initializes them in the process. 
86*1544Seschrock */ 87*1544Seschrock void 88*1544Seschrock spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 89*1544Seschrock { 90*1544Seschrock ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 91*1544Seschrock 92*1544Seschrock bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 93*1544Seschrock bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 94*1544Seschrock 95*1544Seschrock avl_create(&spa->spa_errlist_scrub, 96*1544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 97*1544Seschrock offsetof(spa_error_entry_t, se_avl)); 98*1544Seschrock avl_create(&spa->spa_errlist_last, 99*1544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 100*1544Seschrock offsetof(spa_error_entry_t, se_avl)); 101*1544Seschrock } 102*1544Seschrock 103789Sahrens /* 104789Sahrens * Activate an uninitialized pool. 105789Sahrens */ 106789Sahrens static void 107789Sahrens spa_activate(spa_t *spa) 108789Sahrens { 109789Sahrens int t; 110789Sahrens 111789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 112789Sahrens 113789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 114789Sahrens 115789Sahrens spa->spa_normal_class = metaslab_class_create(); 116789Sahrens 117789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 118789Sahrens spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 119789Sahrens 8, maxclsyspri, 50, INT_MAX, 120789Sahrens TASKQ_PREPOPULATE); 121789Sahrens spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr", 122789Sahrens 8, maxclsyspri, 50, INT_MAX, 123789Sahrens TASKQ_PREPOPULATE); 124789Sahrens } 125789Sahrens 126789Sahrens rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL); 127789Sahrens 128789Sahrens list_create(&spa->spa_dirty_list, sizeof (vdev_t), 129789Sahrens offsetof(vdev_t, vdev_dirty_node)); 130789Sahrens 131789Sahrens txg_list_create(&spa->spa_vdev_txg_list, 132789Sahrens offsetof(struct vdev, vdev_txg_node)); 133*1544Seschrock 134*1544Seschrock avl_create(&spa->spa_errlist_scrub, 135*1544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 136*1544Seschrock offsetof(spa_error_entry_t, se_avl)); 137*1544Seschrock avl_create(&spa->spa_errlist_last, 138*1544Seschrock spa_error_entry_compare, sizeof (spa_error_entry_t), 139*1544Seschrock offsetof(spa_error_entry_t, se_avl)); 140789Sahrens } 141789Sahrens 142789Sahrens /* 143789Sahrens * Opposite of spa_activate(). 144789Sahrens */ 145789Sahrens static void 146789Sahrens spa_deactivate(spa_t *spa) 147789Sahrens { 148789Sahrens int t; 149789Sahrens 150789Sahrens ASSERT(spa->spa_sync_on == B_FALSE); 151789Sahrens ASSERT(spa->spa_dsl_pool == NULL); 152789Sahrens ASSERT(spa->spa_root_vdev == NULL); 153789Sahrens 154789Sahrens ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 155789Sahrens 156789Sahrens txg_list_destroy(&spa->spa_vdev_txg_list); 157789Sahrens 158789Sahrens list_destroy(&spa->spa_dirty_list); 159789Sahrens 160789Sahrens rw_destroy(&spa->spa_traverse_lock); 161789Sahrens 162789Sahrens for (t = 0; t < ZIO_TYPES; t++) { 163789Sahrens taskq_destroy(spa->spa_zio_issue_taskq[t]); 164789Sahrens taskq_destroy(spa->spa_zio_intr_taskq[t]); 165789Sahrens spa->spa_zio_issue_taskq[t] = NULL; 166789Sahrens spa->spa_zio_intr_taskq[t] = NULL; 167789Sahrens } 168789Sahrens 169789Sahrens metaslab_class_destroy(spa->spa_normal_class); 170789Sahrens spa->spa_normal_class = NULL; 171789Sahrens 172*1544Seschrock /* 173*1544Seschrock * If this was part of an import or the open otherwise failed, we may 174*1544Seschrock * still have errors left in the queues. 
Empty them just in case. 175*1544Seschrock */ 176*1544Seschrock spa_errlog_drain(spa); 177*1544Seschrock 178*1544Seschrock avl_destroy(&spa->spa_errlist_scrub); 179*1544Seschrock avl_destroy(&spa->spa_errlist_last); 180*1544Seschrock 181789Sahrens spa->spa_state = POOL_STATE_UNINITIALIZED; 182789Sahrens } 183789Sahrens 184789Sahrens /* 185789Sahrens * Verify a pool configuration, and construct the vdev tree appropriately. This 186789Sahrens * will create all the necessary vdevs in the appropriate layout, with each vdev 187789Sahrens * in the CLOSED state. This will prep the pool before open/creation/import. 188789Sahrens * All vdev validation is done by the vdev_alloc() routine. 189789Sahrens */ 190789Sahrens static vdev_t * 191789Sahrens spa_config_parse(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) 192789Sahrens { 193789Sahrens nvlist_t **child; 194789Sahrens uint_t c, children; 195789Sahrens vdev_t *vd; 196789Sahrens 197789Sahrens if ((vd = vdev_alloc(spa, nv, parent, id, atype)) == NULL) 198789Sahrens return (NULL); 199789Sahrens 200789Sahrens if (vd->vdev_ops->vdev_op_leaf) 201789Sahrens return (vd); 202789Sahrens 203789Sahrens if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 204789Sahrens &child, &children) != 0) { 205789Sahrens vdev_free(vd); 206789Sahrens return (NULL); 207789Sahrens } 208789Sahrens 209789Sahrens for (c = 0; c < children; c++) { 210789Sahrens if (spa_config_parse(spa, child[c], vd, c, atype) == NULL) { 211789Sahrens vdev_free(vd); 212789Sahrens return (NULL); 213789Sahrens } 214789Sahrens } 215789Sahrens 216789Sahrens return (vd); 217789Sahrens } 218789Sahrens 219789Sahrens /* 220789Sahrens * Opposite of spa_load(). 221789Sahrens */ 222789Sahrens static void 223789Sahrens spa_unload(spa_t *spa) 224789Sahrens { 225789Sahrens /* 226*1544Seschrock * Stop async tasks. 227*1544Seschrock */ 228*1544Seschrock spa_async_suspend(spa); 229*1544Seschrock 230*1544Seschrock /* 231789Sahrens * Stop syncing. 232789Sahrens */ 233789Sahrens if (spa->spa_sync_on) { 234789Sahrens txg_sync_stop(spa->spa_dsl_pool); 235789Sahrens spa->spa_sync_on = B_FALSE; 236789Sahrens } 237789Sahrens 238789Sahrens /* 239789Sahrens * Wait for any outstanding prefetch I/O to complete. 240789Sahrens */ 241*1544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 242*1544Seschrock spa_config_exit(spa, FTAG); 243789Sahrens 244789Sahrens /* 245789Sahrens * Close the dsl pool. 246789Sahrens */ 247789Sahrens if (spa->spa_dsl_pool) { 248789Sahrens dsl_pool_close(spa->spa_dsl_pool); 249789Sahrens spa->spa_dsl_pool = NULL; 250789Sahrens } 251789Sahrens 252789Sahrens /* 253789Sahrens * Close all vdevs. 254789Sahrens */ 255789Sahrens if (spa->spa_root_vdev) { 256789Sahrens vdev_free(spa->spa_root_vdev); 257789Sahrens spa->spa_root_vdev = NULL; 258789Sahrens } 259*1544Seschrock 260*1544Seschrock spa->spa_async_suspended = 0; 261789Sahrens } 262789Sahrens 263789Sahrens /* 264789Sahrens * Load an existing storage pool, using the pool's builtin spa_config as a 265*1544Seschrock * source of configuration information. 
266789Sahrens */ 267789Sahrens static int 268*1544Seschrock spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) 269789Sahrens { 270789Sahrens int error = 0; 271789Sahrens nvlist_t *nvroot = NULL; 272789Sahrens vdev_t *rvd; 273789Sahrens uberblock_t *ub = &spa->spa_uberblock; 274789Sahrens uint64_t pool_guid; 275789Sahrens zio_t *zio; 276789Sahrens 277*1544Seschrock spa->spa_load_state = state; 278789Sahrens if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || 279*1544Seschrock nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 280*1544Seschrock error = EINVAL; 281*1544Seschrock goto out; 282*1544Seschrock } 283789Sahrens 284789Sahrens (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 285789Sahrens &spa->spa_config_txg); 286789Sahrens 287*1544Seschrock if ((spa->spa_load_state == SPA_LOAD_IMPORT || 288*1544Seschrock spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 289*1544Seschrock spa_guid_exists(pool_guid, 0)) { 290*1544Seschrock error = EEXIST; 291*1544Seschrock goto out; 292*1544Seschrock } 293789Sahrens 294789Sahrens /* 295789Sahrens * Parse the configuration into a vdev tree. 296789Sahrens */ 297*1544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 298789Sahrens rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); 299*1544Seschrock spa_config_exit(spa, FTAG); 300789Sahrens 301*1544Seschrock if (rvd == NULL) { 302*1544Seschrock error = EINVAL; 303*1544Seschrock goto out; 304*1544Seschrock } 305789Sahrens 306789Sahrens spa->spa_root_vdev = rvd; 307789Sahrens ASSERT(spa_guid(spa) == pool_guid); 308789Sahrens 309789Sahrens /* 310789Sahrens * Try to open all vdevs, loading each label in the process. 311789Sahrens */ 312*1544Seschrock if (vdev_open(rvd) != 0) { 313*1544Seschrock error = ENXIO; 314*1544Seschrock goto out; 315*1544Seschrock } 316789Sahrens 317789Sahrens /* 318789Sahrens * Find the best uberblock. 319789Sahrens */ 320789Sahrens bzero(ub, sizeof (uberblock_t)); 321789Sahrens 322789Sahrens zio = zio_root(spa, NULL, NULL, 323789Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 324789Sahrens vdev_uberblock_load(zio, rvd, ub); 325789Sahrens error = zio_wait(zio); 326789Sahrens 327789Sahrens /* 328789Sahrens * If we weren't able to find a single valid uberblock, return failure. 329789Sahrens */ 330789Sahrens if (ub->ub_txg == 0) { 331*1544Seschrock error = ENXIO; 332*1544Seschrock goto out; 333*1544Seschrock } 334*1544Seschrock 335*1544Seschrock /* 336*1544Seschrock * If the pool is newer than the code, we can't open it. 337*1544Seschrock */ 338*1544Seschrock if (ub->ub_version > UBERBLOCK_VERSION) { 339*1544Seschrock error = ENOTSUP; 340*1544Seschrock goto out; 341789Sahrens } 342789Sahrens 343789Sahrens /* 344789Sahrens * If the vdev guid sum doesn't match the uberblock, we have an 345789Sahrens * incomplete configuration. 346789Sahrens */ 347789Sahrens if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { 348*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 349*1544Seschrock VDEV_AUX_BAD_GUID_SUM); 350*1544Seschrock error = ENXIO; 351*1544Seschrock goto out; 352789Sahrens } 353789Sahrens 354789Sahrens /* 355789Sahrens * Initialize internal SPA structures. 
356789Sahrens */ 357789Sahrens spa->spa_state = POOL_STATE_ACTIVE; 358789Sahrens spa->spa_ubsync = spa->spa_uberblock; 359789Sahrens spa->spa_first_txg = spa_last_synced_txg(spa) + 1; 360*1544Seschrock error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 361*1544Seschrock if (error) { 362*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 363*1544Seschrock VDEV_AUX_CORRUPT_DATA); 364*1544Seschrock goto out; 365*1544Seschrock } 366789Sahrens spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 367789Sahrens 368*1544Seschrock if (zap_lookup(spa->spa_meta_objset, 369789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 370*1544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { 371*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 372*1544Seschrock VDEV_AUX_CORRUPT_DATA); 373*1544Seschrock error = EIO; 374*1544Seschrock goto out; 375*1544Seschrock } 376789Sahrens 377789Sahrens if (!mosconfig) { 378789Sahrens dmu_buf_t *db; 379789Sahrens char *packed = NULL; 380789Sahrens size_t nvsize = 0; 381789Sahrens nvlist_t *newconfig = NULL; 382789Sahrens 383*1544Seschrock VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, 384*1544Seschrock spa->spa_config_object, FTAG, &db)); 385789Sahrens nvsize = *(uint64_t *)db->db_data; 386*1544Seschrock dmu_buf_rele(db, FTAG); 387789Sahrens 388789Sahrens packed = kmem_alloc(nvsize, KM_SLEEP); 389*1544Seschrock error = dmu_read(spa->spa_meta_objset, 390789Sahrens spa->spa_config_object, 0, nvsize, packed); 391789Sahrens if (error == 0) 392789Sahrens error = nvlist_unpack(packed, nvsize, &newconfig, 0); 393789Sahrens kmem_free(packed, nvsize); 394789Sahrens 395*1544Seschrock if (error) { 396*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 397*1544Seschrock VDEV_AUX_CORRUPT_DATA); 398*1544Seschrock error = EIO; 399*1544Seschrock goto out; 400*1544Seschrock } 401789Sahrens 402789Sahrens spa_config_set(spa, newconfig); 403789Sahrens 404789Sahrens spa_unload(spa); 405789Sahrens spa_deactivate(spa); 406789Sahrens spa_activate(spa); 407789Sahrens 408*1544Seschrock return (spa_load(spa, newconfig, state, B_TRUE)); 409*1544Seschrock } 410*1544Seschrock 411*1544Seschrock if (zap_lookup(spa->spa_meta_objset, 412*1544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 413*1544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { 414*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 415*1544Seschrock VDEV_AUX_CORRUPT_DATA); 416*1544Seschrock error = EIO; 417*1544Seschrock goto out; 418789Sahrens } 419789Sahrens 420*1544Seschrock /* 421*1544Seschrock * Load the persistent error log. If we have an older pool, this will 422*1544Seschrock * not be present. 
423*1544Seschrock */ 424*1544Seschrock error = zap_lookup(spa->spa_meta_objset, 425*1544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, 426*1544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_last); 427*1544Seschrock if (error != 0 && error != ENOENT) { 428*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 429*1544Seschrock VDEV_AUX_CORRUPT_DATA); 430*1544Seschrock error = EIO; 431*1544Seschrock goto out; 432*1544Seschrock } 433*1544Seschrock 434*1544Seschrock error = zap_lookup(spa->spa_meta_objset, 435*1544Seschrock DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, 436*1544Seschrock sizeof (uint64_t), 1, &spa->spa_errlog_scrub); 437*1544Seschrock if (error != 0 && error != ENOENT) { 438*1544Seschrock vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, 439*1544Seschrock VDEV_AUX_CORRUPT_DATA); 440*1544Seschrock error = EIO; 441*1544Seschrock goto out; 442*1544Seschrock } 443789Sahrens 444789Sahrens /* 445*1544Seschrock * Load the vdev state for all top level vdevs. We need to grab the 446*1544Seschrock * config lock because all label I/O is done with the 447*1544Seschrock * ZIO_FLAG_CONFIG_HELD flag. 448789Sahrens */ 449*1544Seschrock spa_config_enter(spa, RW_READER, FTAG); 450*1544Seschrock if ((error = vdev_load(rvd)) != 0) { 451*1544Seschrock spa_config_exit(spa, FTAG); 452*1544Seschrock goto out; 453*1544Seschrock } 454*1544Seschrock spa_config_exit(spa, FTAG); 455789Sahrens 456789Sahrens /* 457789Sahrens * Propagate the leaf DTLs we just loaded all the way up the tree. 458789Sahrens */ 459*1544Seschrock spa_config_enter(spa, RW_WRITER, FTAG); 460789Sahrens vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 461*1544Seschrock spa_config_exit(spa, FTAG); 462789Sahrens 463789Sahrens /* 464789Sahrens * Check the state of the root vdev. If it can't be opened, it 465789Sahrens * indicates one or more toplevel vdevs are faulted. 466789Sahrens */ 467*1544Seschrock if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 468*1544Seschrock error = ENXIO; 469*1544Seschrock goto out; 470*1544Seschrock } 471789Sahrens 472789Sahrens /* 473789Sahrens * Claim log blocks that haven't been committed yet, and update all 474789Sahrens * top-level vdevs to sync any config changes found in vdev_load(). 475789Sahrens * This must all happen in a single txg. 476789Sahrens */ 477*1544Seschrock if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { 478789Sahrens dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), 479789Sahrens spa_first_txg(spa)); 480789Sahrens dmu_objset_find(spa->spa_name, zil_claim, tx, 0); 481789Sahrens vdev_config_dirty(rvd); 482789Sahrens dmu_tx_commit(tx); 483789Sahrens 484789Sahrens spa->spa_sync_on = B_TRUE; 485789Sahrens txg_sync_start(spa->spa_dsl_pool); 486789Sahrens 487789Sahrens /* 488789Sahrens * Wait for all claims to sync. 489789Sahrens */ 490789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 491789Sahrens } 492789Sahrens 493*1544Seschrock error = 0; 494*1544Seschrock out: 495*1544Seschrock if (error) 496*1544Seschrock zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0); 497*1544Seschrock spa->spa_load_state = SPA_LOAD_NONE; 498*1544Seschrock spa->spa_ena = 0; 499*1544Seschrock 500*1544Seschrock return (error); 501789Sahrens } 502789Sahrens 503789Sahrens /* 504789Sahrens * Pool Open/Import 505789Sahrens * 506789Sahrens * The import case is identical to an open except that the configuration is sent 507789Sahrens * down from userland, instead of grabbed from the configuration cache.
For the 508789Sahrens * case of an open, the pool configuration will exist in the 509789Sahrens * POOL_STATE_UNINITIALIZED state. 510789Sahrens * 511789Sahrens * The stats information (gen/count/ustats) is used to gather vdev statistics at 512789Sahrens * the same time we open the pool, without having to keep around the spa_t in some 513789Sahrens * ambiguous state. 514789Sahrens */ 515789Sahrens static int 516789Sahrens spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) 517789Sahrens { 518789Sahrens spa_t *spa; 519789Sahrens int error; 520789Sahrens int loaded = B_FALSE; 521789Sahrens int locked = B_FALSE; 522789Sahrens 523789Sahrens *spapp = NULL; 524789Sahrens 525789Sahrens /* 526789Sahrens * As disgusting as this is, we need to support recursive calls to this 527789Sahrens * function because dsl_dir_open() is called during spa_load(), and ends 528789Sahrens * up calling spa_open() again. The real fix is to figure out how to 529789Sahrens * avoid dsl_dir_open() calling this in the first place. 530789Sahrens */ 531789Sahrens if (mutex_owner(&spa_namespace_lock) != curthread) { 532789Sahrens mutex_enter(&spa_namespace_lock); 533789Sahrens locked = B_TRUE; 534789Sahrens } 535789Sahrens 536789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 537789Sahrens if (locked) 538789Sahrens mutex_exit(&spa_namespace_lock); 539789Sahrens return (ENOENT); 540789Sahrens } 541789Sahrens if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 542789Sahrens 543789Sahrens spa_activate(spa); 544789Sahrens 545789Sahrens error = spa_load(spa, spa->spa_config, 546*1544Seschrock SPA_LOAD_OPEN, B_FALSE); 547789Sahrens 548789Sahrens if (error == EBADF) { 549789Sahrens /* 550789Sahrens * If vdev_load() returns EBADF, it indicates that one 551789Sahrens * of the vdevs has detected that the pool has been 552789Sahrens * exported or destroyed. If this is the case, the 553789Sahrens * config cache is out of sync and we should remove the 554789Sahrens * pool from the namespace. 555789Sahrens */ 556789Sahrens spa_unload(spa); 557789Sahrens spa_deactivate(spa); 558789Sahrens spa_remove(spa); 559789Sahrens spa_config_sync(); 560789Sahrens if (locked) 561789Sahrens mutex_exit(&spa_namespace_lock); 562789Sahrens return (ENOENT); 563*1544Seschrock } 564*1544Seschrock 565*1544Seschrock if (error) { 566789Sahrens /* 567789Sahrens * We can't open the pool, but we still have useful 568789Sahrens * information: the state of each vdev after the 569789Sahrens * attempted vdev_open(). Return this to the user.
570789Sahrens */ 571789Sahrens if (config != NULL && spa->spa_root_vdev != NULL) 572789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, 573789Sahrens B_TRUE); 574789Sahrens spa_unload(spa); 575789Sahrens spa_deactivate(spa); 576*1544Seschrock spa->spa_last_open_failed = B_TRUE; 577789Sahrens if (locked) 578789Sahrens mutex_exit(&spa_namespace_lock); 579789Sahrens *spapp = NULL; 580789Sahrens return (error); 581*1544Seschrock } else { 582*1544Seschrock zfs_post_ok(spa, NULL); 583*1544Seschrock spa->spa_last_open_failed = B_FALSE; 584789Sahrens } 585789Sahrens 586789Sahrens loaded = B_TRUE; 587789Sahrens } 588789Sahrens 589789Sahrens spa_open_ref(spa, tag); 590789Sahrens if (locked) 591789Sahrens mutex_exit(&spa_namespace_lock); 592789Sahrens 593789Sahrens *spapp = spa; 594789Sahrens 595789Sahrens if (config != NULL) { 596*1544Seschrock spa_config_enter(spa, RW_READER, FTAG); 597789Sahrens *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 598*1544Seschrock spa_config_exit(spa, FTAG); 599789Sahrens } 600789Sahrens 601789Sahrens /* 602789Sahrens * If we just loaded the pool, resilver anything that's out of date. 603789Sahrens */ 604789Sahrens if (loaded && (spa_mode & FWRITE)) 605789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 606789Sahrens 607789Sahrens return (0); 608789Sahrens } 609789Sahrens 610789Sahrens int 611789Sahrens spa_open(const char *name, spa_t **spapp, void *tag) 612789Sahrens { 613789Sahrens return (spa_open_common(name, spapp, tag, NULL)); 614789Sahrens } 615789Sahrens 616*1544Seschrock /* 617*1544Seschrock * Lookup the given spa_t, incrementing the inject count in the process, 618*1544Seschrock * preventing it from being exported or destroyed. 619*1544Seschrock */ 620*1544Seschrock spa_t * 621*1544Seschrock spa_inject_addref(char *name) 622*1544Seschrock { 623*1544Seschrock spa_t *spa; 624*1544Seschrock 625*1544Seschrock mutex_enter(&spa_namespace_lock); 626*1544Seschrock if ((spa = spa_lookup(name)) == NULL) { 627*1544Seschrock mutex_exit(&spa_namespace_lock); 628*1544Seschrock return (NULL); 629*1544Seschrock } 630*1544Seschrock spa->spa_inject_ref++; 631*1544Seschrock mutex_exit(&spa_namespace_lock); 632*1544Seschrock 633*1544Seschrock return (spa); 634*1544Seschrock } 635*1544Seschrock 636*1544Seschrock void 637*1544Seschrock spa_inject_delref(spa_t *spa) 638*1544Seschrock { 639*1544Seschrock mutex_enter(&spa_namespace_lock); 640*1544Seschrock spa->spa_inject_ref--; 641*1544Seschrock mutex_exit(&spa_namespace_lock); 642*1544Seschrock } 643*1544Seschrock 644789Sahrens int 645*1544Seschrock spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 646789Sahrens { 647789Sahrens int error; 648789Sahrens spa_t *spa; 649789Sahrens 650789Sahrens *config = NULL; 651789Sahrens error = spa_open_common(name, &spa, FTAG, config); 652789Sahrens 653*1544Seschrock if (spa && *config != NULL) 654*1544Seschrock VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, 655*1544Seschrock spa_get_errlog_size(spa)) == 0); 656*1544Seschrock 657*1544Seschrock /* 658*1544Seschrock * We want to get the alternate root even for faulted pools, so we cheat 659*1544Seschrock * and call spa_lookup() directly. 
660*1544Seschrock */ 661*1544Seschrock if (altroot) { 662*1544Seschrock if (spa == NULL) { 663*1544Seschrock mutex_enter(&spa_namespace_lock); 664*1544Seschrock spa = spa_lookup(name); 665*1544Seschrock if (spa) 666*1544Seschrock spa_altroot(spa, altroot, buflen); 667*1544Seschrock else 668*1544Seschrock altroot[0] = '\0'; 669*1544Seschrock spa = NULL; 670*1544Seschrock mutex_exit(&spa_namespace_lock); 671*1544Seschrock } else { 672*1544Seschrock spa_altroot(spa, altroot, buflen); 673*1544Seschrock } 674*1544Seschrock } 675*1544Seschrock 676789Sahrens if (spa != NULL) 677789Sahrens spa_close(spa, FTAG); 678789Sahrens 679789Sahrens return (error); 680789Sahrens } 681789Sahrens 682789Sahrens /* 683789Sahrens * Pool Creation 684789Sahrens */ 685789Sahrens int 686789Sahrens spa_create(const char *pool, nvlist_t *nvroot, char *altroot) 687789Sahrens { 688789Sahrens spa_t *spa; 689789Sahrens dsl_pool_t *dp; 690789Sahrens dmu_tx_t *tx; 691789Sahrens int error; 692789Sahrens uint64_t txg = TXG_INITIAL; 693789Sahrens 694789Sahrens /* 695789Sahrens * If this pool already exists, return failure. 696789Sahrens */ 697789Sahrens mutex_enter(&spa_namespace_lock); 698789Sahrens if (spa_lookup(pool) != NULL) { 699789Sahrens mutex_exit(&spa_namespace_lock); 700789Sahrens return (EEXIST); 701789Sahrens } 702789Sahrens spa = spa_add(pool); 703789Sahrens 704789Sahrens /* 705789Sahrens * Allocate a new spa_t structure. 706789Sahrens */ 707789Sahrens spa_activate(spa); 708789Sahrens 709789Sahrens spa->spa_uberblock.ub_txg = txg - 1; 710789Sahrens spa->spa_ubsync = spa->spa_uberblock; 711789Sahrens 712789Sahrens error = spa_vdev_add(spa, nvroot); 713789Sahrens 714789Sahrens if (error) { 715789Sahrens spa_unload(spa); 716789Sahrens spa_deactivate(spa); 717789Sahrens spa_remove(spa); 718789Sahrens mutex_exit(&spa_namespace_lock); 719789Sahrens return (error); 720789Sahrens } 721789Sahrens 722789Sahrens if (altroot != NULL) { 723789Sahrens spa->spa_root = spa_strdup(altroot); 724789Sahrens atomic_add_32(&spa_active_count, 1); 725789Sahrens } 726789Sahrens 727789Sahrens spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); 728789Sahrens spa->spa_meta_objset = dp->dp_meta_objset; 729789Sahrens 730789Sahrens tx = dmu_tx_create_assigned(dp, txg); 731789Sahrens 732789Sahrens /* 733789Sahrens * Create the pool config object. 734789Sahrens */ 735789Sahrens spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 736789Sahrens DMU_OT_PACKED_NVLIST, 1 << 14, 737789Sahrens DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 738789Sahrens 739*1544Seschrock if (zap_add(spa->spa_meta_objset, 740789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 741*1544Seschrock sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 742*1544Seschrock cmn_err(CE_PANIC, "failed to add pool config"); 743*1544Seschrock } 744789Sahrens 745789Sahrens /* 746789Sahrens * Create the deferred-free bplist object. Turn off compression 747789Sahrens * because sync-to-convergence takes longer if the blocksize 748789Sahrens * keeps changing. 
749789Sahrens */ 750789Sahrens spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, 751789Sahrens 1 << 14, tx); 752789Sahrens dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 753789Sahrens ZIO_COMPRESS_OFF, tx); 754789Sahrens 755*1544Seschrock if (zap_add(spa->spa_meta_objset, 756789Sahrens DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, 757*1544Seschrock sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { 758*1544Seschrock cmn_err(CE_PANIC, "failed to add bplist"); 759*1544Seschrock } 760789Sahrens 761789Sahrens dmu_tx_commit(tx); 762789Sahrens 763789Sahrens spa->spa_sync_on = B_TRUE; 764789Sahrens txg_sync_start(spa->spa_dsl_pool); 765789Sahrens 766789Sahrens /* 767789Sahrens * We explicitly wait for the first transaction to complete so that our 768789Sahrens * bean counters are appropriately updated. 769789Sahrens */ 770789Sahrens txg_wait_synced(spa->spa_dsl_pool, txg); 771789Sahrens 772789Sahrens spa_config_sync(); 773789Sahrens 774789Sahrens mutex_exit(&spa_namespace_lock); 775789Sahrens 776789Sahrens return (0); 777789Sahrens } 778789Sahrens 779789Sahrens /* 780789Sahrens * Import the given pool into the system. We set up the necessary spa_t and 781789Sahrens * then call spa_load() to do the dirty work. 782789Sahrens */ 783789Sahrens int 784789Sahrens spa_import(const char *pool, nvlist_t *config, char *altroot) 785789Sahrens { 786789Sahrens spa_t *spa; 787789Sahrens int error; 788789Sahrens 789789Sahrens if (!(spa_mode & FWRITE)) 790789Sahrens return (EROFS); 791789Sahrens 792789Sahrens /* 793789Sahrens * If a pool with this name exists, return failure. 794789Sahrens */ 795789Sahrens mutex_enter(&spa_namespace_lock); 796789Sahrens if (spa_lookup(pool) != NULL) { 797789Sahrens mutex_exit(&spa_namespace_lock); 798789Sahrens return (EEXIST); 799789Sahrens } 800789Sahrens 801789Sahrens /* 802789Sahrens * Create and initialize the spa structure 803789Sahrens */ 804789Sahrens spa = spa_add(pool); 805789Sahrens spa_activate(spa); 806789Sahrens 807789Sahrens /* 808789Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 809789Sahrens * so that we don't try to open the pool if the config is damaged. 810789Sahrens */ 811*1544Seschrock error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); 812789Sahrens 813789Sahrens if (error) { 814789Sahrens spa_unload(spa); 815789Sahrens spa_deactivate(spa); 816789Sahrens spa_remove(spa); 817789Sahrens mutex_exit(&spa_namespace_lock); 818789Sahrens return (error); 819789Sahrens } 820789Sahrens 821789Sahrens /* 822789Sahrens * Set the alternate root, if there is one. 823789Sahrens */ 824789Sahrens if (altroot != NULL) { 825789Sahrens atomic_add_32(&spa_active_count, 1); 826789Sahrens spa->spa_root = spa_strdup(altroot); 827789Sahrens } 828789Sahrens 829789Sahrens /* 830789Sahrens * Initialize the config based on the in-core state. 831789Sahrens */ 832789Sahrens config = spa_config_generate(spa, NULL, spa_last_synced_txg(spa), 0); 833789Sahrens 834789Sahrens spa_config_set(spa, config); 835789Sahrens 836789Sahrens /* 837789Sahrens * Sync the configuration cache. 838789Sahrens */ 839789Sahrens spa_config_sync(); 840789Sahrens 841789Sahrens mutex_exit(&spa_namespace_lock); 842789Sahrens 843789Sahrens /* 844789Sahrens * Resilver anything that's out of date.
845789Sahrens */ 846789Sahrens if (spa_mode & FWRITE) 847789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 848789Sahrens 849789Sahrens return (0); 850789Sahrens } 851789Sahrens 852789Sahrens /* 853789Sahrens * This (illegal) pool name is used when temporarily importing a spa_t in order 854789Sahrens * to get the vdev stats associated with the imported devices. 855789Sahrens */ 856789Sahrens #define TRYIMPORT_NAME "$import" 857789Sahrens 858789Sahrens nvlist_t * 859789Sahrens spa_tryimport(nvlist_t *tryconfig) 860789Sahrens { 861789Sahrens nvlist_t *config = NULL; 862789Sahrens char *poolname; 863789Sahrens spa_t *spa; 864789Sahrens uint64_t state; 865789Sahrens 866789Sahrens if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 867789Sahrens return (NULL); 868789Sahrens 869789Sahrens if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 870789Sahrens return (NULL); 871789Sahrens 872789Sahrens mutex_enter(&spa_namespace_lock); 873789Sahrens spa = spa_add(TRYIMPORT_NAME); 874789Sahrens 875789Sahrens ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 876789Sahrens 877789Sahrens /* 878789Sahrens * Initialize the spa_t structure. 879789Sahrens */ 880789Sahrens spa_activate(spa); 881789Sahrens 882789Sahrens /* 883789Sahrens * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig 884789Sahrens * so we don't try to open the pool if the config is damaged. 885789Sahrens */ 886*1544Seschrock (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); 887789Sahrens 888789Sahrens /* 889789Sahrens * If 'tryconfig' was at least parsable, return the current config. 890789Sahrens */ 891789Sahrens if (spa->spa_root_vdev != NULL) { 892789Sahrens config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 893789Sahrens VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 894789Sahrens poolname) == 0); 895789Sahrens VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 896789Sahrens state) == 0); 897789Sahrens } 898789Sahrens 899789Sahrens spa_unload(spa); 900789Sahrens spa_deactivate(spa); 901789Sahrens spa_remove(spa); 902789Sahrens mutex_exit(&spa_namespace_lock); 903789Sahrens 904789Sahrens return (config); 905789Sahrens } 906789Sahrens 907789Sahrens /* 908789Sahrens * Pool export/destroy 909789Sahrens * 910789Sahrens * The act of destroying or exporting a pool is very simple. We make sure there 911789Sahrens * is no more pending I/O and any references to the pool are gone. Then, we 912789Sahrens * update the pool state and sync all the labels to disk, removing the 913789Sahrens * configuration from the cache afterwards. 914789Sahrens */ 915789Sahrens static int 916789Sahrens spa_export_common(char *pool, int new_state) 917789Sahrens { 918789Sahrens spa_t *spa; 919789Sahrens 920789Sahrens if (!(spa_mode & FWRITE)) 921789Sahrens return (EROFS); 922789Sahrens 923789Sahrens mutex_enter(&spa_namespace_lock); 924789Sahrens if ((spa = spa_lookup(pool)) == NULL) { 925789Sahrens mutex_exit(&spa_namespace_lock); 926789Sahrens return (ENOENT); 927789Sahrens } 928789Sahrens 929789Sahrens /* 930*1544Seschrock * Put a hold on the pool, drop the namespace lock, stop async tasks, 931*1544Seschrock * reacquire the namespace lock, and see if we can export. 
932*1544Seschrock */ 933*1544Seschrock spa_open_ref(spa, FTAG); 934*1544Seschrock mutex_exit(&spa_namespace_lock); 935*1544Seschrock spa_async_suspend(spa); 936*1544Seschrock mutex_enter(&spa_namespace_lock); 937*1544Seschrock spa_close(spa, FTAG); 938*1544Seschrock 939*1544Seschrock /* 940789Sahrens * The pool will be in core if it's openable, 941789Sahrens * in which case we can modify its state. 942789Sahrens */ 943789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 944789Sahrens /* 945789Sahrens * Objsets may be open only because they're dirty, so we 946789Sahrens * have to force it to sync before checking spa_refcnt. 947789Sahrens */ 948789Sahrens spa_scrub_suspend(spa); 949789Sahrens txg_wait_synced(spa->spa_dsl_pool, 0); 950789Sahrens 951*1544Seschrock /* 952*1544Seschrock * A pool cannot be exported or destroyed if there are active 953*1544Seschrock * references. If we are resetting a pool, allow references by 954*1544Seschrock * fault injection handlers. 955*1544Seschrock */ 956*1544Seschrock if (!spa_refcount_zero(spa) || 957*1544Seschrock (spa->spa_inject_ref != 0 && 958*1544Seschrock new_state != POOL_STATE_UNINITIALIZED)) { 959789Sahrens spa_scrub_resume(spa); 960*1544Seschrock spa_async_resume(spa); 961789Sahrens mutex_exit(&spa_namespace_lock); 962789Sahrens return (EBUSY); 963789Sahrens } 964789Sahrens 965789Sahrens spa_scrub_resume(spa); 966789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); 967789Sahrens 968789Sahrens if (spa->spa_root != NULL) 969789Sahrens atomic_add_32(&spa_active_count, -1); 970789Sahrens 971789Sahrens /* 972789Sahrens * We want this to be reflected on every label, 973789Sahrens * so mark them all dirty. spa_unload() will do the 974789Sahrens * final sync that pushes these changes out. 975789Sahrens */ 976*1544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 977*1544Seschrock spa->spa_state = new_state; 978*1544Seschrock vdev_config_dirty(spa->spa_root_vdev); 979*1544Seschrock } 980789Sahrens } 981789Sahrens 982789Sahrens if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 983789Sahrens spa_unload(spa); 984789Sahrens spa_deactivate(spa); 985789Sahrens } 986789Sahrens 987*1544Seschrock if (new_state != POOL_STATE_UNINITIALIZED) { 988*1544Seschrock spa_remove(spa); 989*1544Seschrock spa_config_sync(); 990*1544Seschrock } 991789Sahrens mutex_exit(&spa_namespace_lock); 992789Sahrens 993789Sahrens return (0); 994789Sahrens } 995789Sahrens 996789Sahrens /* 997789Sahrens * Destroy a storage pool. 998789Sahrens */ 999789Sahrens int 1000789Sahrens spa_destroy(char *pool) 1001789Sahrens { 1002789Sahrens return (spa_export_common(pool, POOL_STATE_DESTROYED)); 1003789Sahrens } 1004789Sahrens 1005789Sahrens /* 1006789Sahrens * Export a storage pool. 1007789Sahrens */ 1008789Sahrens int 1009789Sahrens spa_export(char *pool) 1010789Sahrens { 1011789Sahrens return (spa_export_common(pool, POOL_STATE_EXPORTED)); 1012789Sahrens } 1013789Sahrens 1014789Sahrens /* 1015*1544Seschrock * Similar to spa_export(), this unloads the spa_t without actually removing it 1016*1544Seschrock * from the namespace in any way. 
1017*1544Seschrock */ 1018*1544Seschrock int 1019*1544Seschrock spa_reset(char *pool) 1020*1544Seschrock { 1021*1544Seschrock return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); 1022*1544Seschrock } 1023*1544Seschrock 1024*1544Seschrock 1025*1544Seschrock /* 1026789Sahrens * ========================================================================== 1027789Sahrens * Device manipulation 1028789Sahrens * ========================================================================== 1029789Sahrens */ 1030789Sahrens 1031789Sahrens /* 1032789Sahrens * Add capacity to a storage pool. 1033789Sahrens */ 1034789Sahrens int 1035789Sahrens spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 1036789Sahrens { 1037789Sahrens uint64_t txg; 1038789Sahrens int c, error; 1039789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1040789Sahrens vdev_t *vd; 1041789Sahrens 1042789Sahrens txg = spa_vdev_enter(spa); 1043789Sahrens 1044789Sahrens vd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1045789Sahrens 1046789Sahrens if (vd == NULL) 1047789Sahrens return (spa_vdev_exit(spa, vd, txg, EINVAL)); 1048789Sahrens 1049789Sahrens if (rvd == NULL) /* spa_create() */ 1050789Sahrens spa->spa_root_vdev = rvd = vd; 1051789Sahrens 1052789Sahrens if ((error = vdev_create(vd, txg)) != 0) 1053789Sahrens return (spa_vdev_exit(spa, vd, txg, error)); 1054789Sahrens 1055789Sahrens /* 1056789Sahrens * Transfer each top-level vdev from the temporary root 1057789Sahrens * to the spa's root and initialize its metaslabs. 1058789Sahrens */ 1059789Sahrens for (c = 0; c < vd->vdev_children; c++) { 1060789Sahrens vdev_t *tvd = vd->vdev_child[c]; 1061789Sahrens if (vd != rvd) { 1062789Sahrens vdev_remove_child(vd, tvd); 1063789Sahrens tvd->vdev_id = rvd->vdev_children; 1064789Sahrens vdev_add_child(rvd, tvd); 1065789Sahrens } 1066*1544Seschrock if ((error = vdev_init(tvd, txg)) != 0) 1067*1544Seschrock return (spa_vdev_exit(spa, vd, txg, error)); 1068789Sahrens vdev_config_dirty(tvd); 1069789Sahrens } 1070789Sahrens 1071789Sahrens /* 1072789Sahrens * Update the config based on the new in-core state. 1073789Sahrens */ 1074789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1075789Sahrens 1076789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 1077789Sahrens } 1078789Sahrens 1079789Sahrens /* 1080789Sahrens * Attach a device to a mirror. The arguments are the guid of any device 1081789Sahrens * in the mirror, and the nvroot for the new device. If the guid specifies 1082789Sahrens * a device that is not mirrored, we automatically insert the mirror vdev. 1083789Sahrens * 1084789Sahrens * If 'replacing' is specified, the new device is intended to replace the 1085789Sahrens * existing device; in this case the two devices are made into their own 1086789Sahrens * mirror using the 'replacing' vdev, which is functionally identical to 1087789Sahrens * the mirror vdev (it actually reuses all the same ops) but has a few 1088789Sahrens * extra rules: you can't attach to it after it's been created, and upon 1089789Sahrens * completion of resilvering, the first disk (the one being replaced) 1090789Sahrens * is automatically detached. 1091789Sahrens */ 1092789Sahrens int 1093*1544Seschrock spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 1094789Sahrens { 1095789Sahrens uint64_t txg, open_txg; 1096789Sahrens int error; 1097789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1098789Sahrens vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 1099789Sahrens vdev_ops_t *pvops = replacing ?
&vdev_replacing_ops : &vdev_mirror_ops; 1100789Sahrens 1101789Sahrens txg = spa_vdev_enter(spa); 1102789Sahrens 1103*1544Seschrock oldvd = vdev_lookup_by_guid(rvd, guid); 1104789Sahrens 1105789Sahrens if (oldvd == NULL) 1106789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1107789Sahrens 1108789Sahrens pvd = oldvd->vdev_parent; 1109789Sahrens 1110789Sahrens /* 1111789Sahrens * The parent must be a mirror or the root, unless we're replacing; 1112789Sahrens * in that case, the parent can be anything but another replacing vdev. 1113789Sahrens */ 1114789Sahrens if (pvd->vdev_ops != &vdev_mirror_ops && 1115789Sahrens pvd->vdev_ops != &vdev_root_ops && 1116789Sahrens (!replacing || pvd->vdev_ops == &vdev_replacing_ops)) 1117789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1118789Sahrens 1119789Sahrens newrootvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_ADD); 1120789Sahrens 1121789Sahrens if (newrootvd == NULL || newrootvd->vdev_children != 1) 1122789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1123789Sahrens 1124789Sahrens newvd = newrootvd->vdev_child[0]; 1125789Sahrens 1126789Sahrens if (!newvd->vdev_ops->vdev_op_leaf) 1127789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 1128789Sahrens 1129789Sahrens if ((error = vdev_create(newrootvd, txg)) != 0) 1130789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, error)); 1131789Sahrens 11321175Slling /* 11331175Slling * Compare the new device size with the replaceable/attachable 11341175Slling * device size. 11351175Slling */ 11361175Slling if (newvd->vdev_psize < vdev_get_rsize(oldvd)) 1137789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 1138789Sahrens 1139789Sahrens if (newvd->vdev_ashift != oldvd->vdev_ashift && oldvd->vdev_ashift != 0) 1140789Sahrens return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 1141789Sahrens 1142789Sahrens /* 1143789Sahrens * If this is an in-place replacement, update oldvd's path and devid 1144789Sahrens * to make it distinguishable from newvd, and unopenable from now on. 1145789Sahrens */ 1146789Sahrens if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 1147789Sahrens spa_strfree(oldvd->vdev_path); 1148789Sahrens oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 1149789Sahrens KM_SLEEP); 1150789Sahrens (void) sprintf(oldvd->vdev_path, "%s/%s", 1151789Sahrens newvd->vdev_path, "old"); 1152789Sahrens if (oldvd->vdev_devid != NULL) { 1153789Sahrens spa_strfree(oldvd->vdev_devid); 1154789Sahrens oldvd->vdev_devid = NULL; 1155789Sahrens } 1156789Sahrens } 1157789Sahrens 1158789Sahrens /* 1159789Sahrens * If the parent is not a mirror, or if we're replacing, 1160789Sahrens * insert the new mirror/replacing vdev above oldvd. 1161789Sahrens */ 1162789Sahrens if (pvd->vdev_ops != pvops) 1163789Sahrens pvd = vdev_add_parent(oldvd, pvops); 1164789Sahrens 1165789Sahrens ASSERT(pvd->vdev_top->vdev_parent == rvd); 1166789Sahrens ASSERT(pvd->vdev_ops == pvops); 1167789Sahrens ASSERT(oldvd->vdev_parent == pvd); 1168789Sahrens 1169789Sahrens /* 1170789Sahrens * Extract the new device from its root and add it to pvd. 1171789Sahrens */ 1172789Sahrens vdev_remove_child(newrootvd, newvd); 1173789Sahrens newvd->vdev_id = pvd->vdev_children; 1174789Sahrens vdev_add_child(pvd, newvd); 1175789Sahrens 1176*1544Seschrock /* 1177*1544Seschrock * If newvd is smaller than oldvd, but larger than its rsize, 1178*1544Seschrock * the addition of newvd may have decreased our parent's asize. 
1179*1544Seschrock */ 1180*1544Seschrock pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); 1181*1544Seschrock 1182789Sahrens tvd = newvd->vdev_top; 1183789Sahrens ASSERT(pvd->vdev_top == tvd); 1184789Sahrens ASSERT(tvd->vdev_parent == rvd); 1185789Sahrens 1186789Sahrens /* 1187789Sahrens * Update the config based on the new in-core state. 1188789Sahrens */ 1189789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1190789Sahrens vdev_config_dirty(tvd); 1191789Sahrens 1192789Sahrens /* 1193789Sahrens * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate 1194789Sahrens * upward when spa_vdev_exit() calls vdev_dtl_reassess(). 1195789Sahrens */ 1196789Sahrens open_txg = txg + TXG_CONCURRENT_STATES - 1; 1197789Sahrens 1198789Sahrens mutex_enter(&newvd->vdev_dtl_lock); 1199789Sahrens space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, 1200789Sahrens open_txg - TXG_INITIAL + 1); 1201789Sahrens mutex_exit(&newvd->vdev_dtl_lock); 1202789Sahrens 1203*1544Seschrock dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); 1204*1544Seschrock 1205789Sahrens /* 1206789Sahrens * Mark newvd's DTL dirty in this txg. 1207789Sahrens */ 1208789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1209789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); 1210789Sahrens 1211789Sahrens (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); 1212789Sahrens 1213789Sahrens /* 1214789Sahrens * Kick off a resilver to update newvd. 1215789Sahrens */ 1216789Sahrens VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1217789Sahrens 1218789Sahrens return (0); 1219789Sahrens } 1220789Sahrens 1221789Sahrens /* 1222789Sahrens * Detach a device from a mirror or replacing vdev. 1223789Sahrens * If 'replace_done' is specified, only detach if the parent 1224789Sahrens * is a replacing vdev. 1225789Sahrens */ 1226789Sahrens int 1227*1544Seschrock spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) 1228789Sahrens { 1229789Sahrens uint64_t txg; 1230789Sahrens int c, t, error; 1231789Sahrens vdev_t *rvd = spa->spa_root_vdev; 1232789Sahrens vdev_t *vd, *pvd, *cvd, *tvd; 1233789Sahrens 1234789Sahrens txg = spa_vdev_enter(spa); 1235789Sahrens 1236*1544Seschrock vd = vdev_lookup_by_guid(rvd, guid); 1237789Sahrens 1238789Sahrens if (vd == NULL) 1239789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1240789Sahrens 1241789Sahrens pvd = vd->vdev_parent; 1242789Sahrens 1243789Sahrens /* 1244789Sahrens * If replace_done is specified, only remove this device if it's 1245789Sahrens * the first child of a replacing vdev. 1246789Sahrens */ 1247789Sahrens if (replace_done && 1248789Sahrens (vd->vdev_id != 0 || pvd->vdev_ops != &vdev_replacing_ops)) 1249789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1250789Sahrens 1251789Sahrens /* 1252789Sahrens * Only mirror and replacing vdevs support detach. 1253789Sahrens */ 1254789Sahrens if (pvd->vdev_ops != &vdev_replacing_ops && 1255789Sahrens pvd->vdev_ops != &vdev_mirror_ops) 1256789Sahrens return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1257789Sahrens 1258789Sahrens /* 1259789Sahrens * If there's only one replica, you can't detach it. 1260789Sahrens */ 1261789Sahrens if (pvd->vdev_children <= 1) 1262789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1263789Sahrens 1264789Sahrens /* 1265789Sahrens * If all siblings have non-empty DTLs, this device may have the only 1266789Sahrens * valid copy of the data, which means we cannot safely detach it. 
1267789Sahrens * 1268789Sahrens * XXX -- as in the vdev_offline() case, we really want a more 1269789Sahrens * precise DTL check. 1270789Sahrens */ 1271789Sahrens for (c = 0; c < pvd->vdev_children; c++) { 1272789Sahrens uint64_t dirty; 1273789Sahrens 1274789Sahrens cvd = pvd->vdev_child[c]; 1275789Sahrens if (cvd == vd) 1276789Sahrens continue; 1277789Sahrens if (vdev_is_dead(cvd)) 1278789Sahrens continue; 1279789Sahrens mutex_enter(&cvd->vdev_dtl_lock); 1280789Sahrens dirty = cvd->vdev_dtl_map.sm_space | 1281789Sahrens cvd->vdev_dtl_scrub.sm_space; 1282789Sahrens mutex_exit(&cvd->vdev_dtl_lock); 1283789Sahrens if (!dirty) 1284789Sahrens break; 1285789Sahrens } 1286789Sahrens if (c == pvd->vdev_children) 1287789Sahrens return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1288789Sahrens 1289789Sahrens /* 1290789Sahrens * Erase the disk labels so the disk can be used for other things. 1291789Sahrens * This must be done after all other error cases are handled, 1292789Sahrens * but before we disembowel vd (so we can still do I/O to it). 1293789Sahrens * But if we can't do it, don't treat the error as fatal -- 1294789Sahrens * it may be that the unwritability of the disk is the reason 1295789Sahrens * it's being detached! 1296789Sahrens */ 1297789Sahrens error = vdev_label_init(vd, 0); 1298789Sahrens if (error) 1299789Sahrens dprintf("unable to erase labels on %s\n", vdev_description(vd)); 1300789Sahrens 1301789Sahrens /* 1302789Sahrens * Remove vd from its parent and compact the parent's children. 1303789Sahrens */ 1304789Sahrens vdev_remove_child(pvd, vd); 1305789Sahrens vdev_compact_children(pvd); 1306789Sahrens 1307789Sahrens /* 1308789Sahrens * Remember one of the remaining children so we can get tvd below. 1309789Sahrens */ 1310789Sahrens cvd = pvd->vdev_child[0]; 1311789Sahrens 1312789Sahrens /* 1313789Sahrens * If the parent mirror/replacing vdev only has one child, 1314789Sahrens * the parent is no longer needed. Remove it from the tree. 1315789Sahrens */ 1316789Sahrens if (pvd->vdev_children == 1) 1317789Sahrens vdev_remove_parent(cvd); 1318789Sahrens 1319789Sahrens /* 1320789Sahrens * We don't set tvd until now because the parent we just removed 1321789Sahrens * may have been the previous top-level vdev. 1322789Sahrens */ 1323789Sahrens tvd = cvd->vdev_top; 1324789Sahrens ASSERT(tvd->vdev_parent == rvd); 1325789Sahrens 1326789Sahrens /* 1327789Sahrens * Reopen this top-level vdev to reassess health after detach. 1328789Sahrens */ 1329*1544Seschrock vdev_reopen(tvd); 1330789Sahrens 1331789Sahrens /* 1332789Sahrens * If the device we just detached was smaller than the others, 1333*1544Seschrock * it may be possible to add metaslabs (i.e. grow the pool). We ignore 1334*1544Seschrock * the error here because the detach still succeeded - we just weren't 1335*1544Seschrock * able to reinitialize the metaslabs. This pool is in for a world of 1336*1544Seschrock * hurt, in any case. 1337789Sahrens */ 1338*1544Seschrock (void) vdev_metaslab_init(tvd, txg); 1339789Sahrens 1340789Sahrens /* 1341789Sahrens * Update the config based on the new in-core state. 1342789Sahrens */ 1343789Sahrens spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 1344789Sahrens 1345789Sahrens vdev_config_dirty(tvd); 1346789Sahrens 1347789Sahrens /* 1348789Sahrens * Mark vd's DTL as dirty in this txg. 1349789Sahrens * vdev_dtl_sync() will see that vd->vdev_detached is set 1350789Sahrens * and free vd's DTL object in syncing context. 
1351789Sahrens * But first make sure we're not on any *other* txg's DTL list, 1352789Sahrens * to prevent vd from being accessed after it's freed. 1353789Sahrens */ 1354789Sahrens vdev_dirty(tvd, VDD_DTL, txg); 1355789Sahrens vd->vdev_detached = B_TRUE; 1356789Sahrens for (t = 0; t < TXG_SIZE; t++) 1357789Sahrens (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 1358789Sahrens (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); 1359789Sahrens 1360*1544Seschrock dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); 1361789Sahrens 1362789Sahrens return (spa_vdev_exit(spa, vd, txg, 0)); 1363789Sahrens } 1364789Sahrens 1365789Sahrens /* 1366*1544Seschrock * Find any device that's done replacing, so we can detach it. 1367789Sahrens */ 1368*1544Seschrock static vdev_t * 1369*1544Seschrock spa_vdev_replace_done_hunt(vdev_t *vd) 1370789Sahrens { 1371*1544Seschrock vdev_t *newvd, *oldvd; 1372789Sahrens int c; 1373789Sahrens 1374*1544Seschrock for (c = 0; c < vd->vdev_children; c++) { 1375*1544Seschrock oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); 1376*1544Seschrock if (oldvd != NULL) 1377*1544Seschrock return (oldvd); 1378*1544Seschrock } 1379789Sahrens 1380789Sahrens if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { 1381*1544Seschrock oldvd = vd->vdev_child[0]; 1382*1544Seschrock newvd = vd->vdev_child[1]; 1383789Sahrens 1384*1544Seschrock mutex_enter(&newvd->vdev_dtl_lock); 1385*1544Seschrock if (newvd->vdev_dtl_map.sm_space == 0 && 1386*1544Seschrock newvd->vdev_dtl_scrub.sm_space == 0) { 1387*1544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 1388*1544Seschrock return (oldvd); 1389*1544Seschrock } 1390*1544Seschrock mutex_exit(&newvd->vdev_dtl_lock); 1391*1544Seschrock } 1392789Sahrens 1393*1544Seschrock return (NULL); 1394789Sahrens } 1395789Sahrens 1396*1544Seschrock static void 1397789Sahrens spa_vdev_replace_done(spa_t *spa) 1398789Sahrens { 1399*1544Seschrock vdev_t *vd; 1400*1544Seschrock uint64_t guid; 1401789Sahrens 1402*1544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1403789Sahrens 1404*1544Seschrock while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { 1405*1544Seschrock guid = vd->vdev_guid; 1406*1544Seschrock spa_config_exit(spa, FTAG); 1407*1544Seschrock if (spa_vdev_detach(spa, guid, B_TRUE) != 0) 1408*1544Seschrock return; 1409*1544Seschrock spa_config_enter(spa, RW_READER, FTAG); 1410789Sahrens } 1411789Sahrens 1412*1544Seschrock spa_config_exit(spa, FTAG); 1413789Sahrens } 1414789Sahrens 1415789Sahrens /* 14161354Seschrock * Update the stored path for this vdev. Dirty the vdev configuration, relying 14171354Seschrock * on spa_vdev_enter/exit() to synchronize the labels and cache. 
14181354Seschrock */ 14191354Seschrock int 14201354Seschrock spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 14211354Seschrock { 14221354Seschrock vdev_t *rvd, *vd; 14231354Seschrock uint64_t txg; 14241354Seschrock 14251354Seschrock rvd = spa->spa_root_vdev; 14261354Seschrock 14271354Seschrock txg = spa_vdev_enter(spa); 14281354Seschrock 14291354Seschrock if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) 14301354Seschrock return (spa_vdev_exit(spa, NULL, txg, ENOENT)); 14311354Seschrock 14321354Seschrock spa_strfree(vd->vdev_path); 14331354Seschrock vd->vdev_path = spa_strdup(newpath); 14341354Seschrock 14351354Seschrock spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); 14361354Seschrock 14371354Seschrock vdev_config_dirty(vd->vdev_top); 14381354Seschrock 14391354Seschrock return (spa_vdev_exit(spa, NULL, txg, 0)); 14401354Seschrock } 14411354Seschrock 14421354Seschrock /* 1443789Sahrens * ========================================================================== 1444789Sahrens * SPA Scrubbing 1445789Sahrens * ========================================================================== 1446789Sahrens */ 1447789Sahrens 1448*1544Seschrock void 1449*1544Seschrock spa_scrub_throttle(spa_t *spa, int direction) 1450*1544Seschrock { 1451*1544Seschrock mutex_enter(&spa->spa_scrub_lock); 1452*1544Seschrock spa->spa_scrub_throttled += direction; 1453*1544Seschrock ASSERT(spa->spa_scrub_throttled >= 0); 1454*1544Seschrock if (spa->spa_scrub_throttled == 0) 1455*1544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 1456*1544Seschrock mutex_exit(&spa->spa_scrub_lock); 1457*1544Seschrock } 1458789Sahrens 1459789Sahrens static void 1460789Sahrens spa_scrub_io_done(zio_t *zio) 1461789Sahrens { 1462789Sahrens spa_t *spa = zio->io_spa; 1463789Sahrens 1464789Sahrens zio_buf_free(zio->io_data, zio->io_size); 1465789Sahrens 1466789Sahrens mutex_enter(&spa->spa_scrub_lock); 1467*1544Seschrock if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 1468*1544Seschrock vdev_t *vd = zio->io_vd; 1469789Sahrens spa->spa_scrub_errors++; 1470789Sahrens mutex_enter(&vd->vdev_stat_lock); 1471789Sahrens vd->vdev_stat.vs_scrub_errors++; 1472789Sahrens mutex_exit(&vd->vdev_stat_lock); 1473789Sahrens } 1474*1544Seschrock if (--spa->spa_scrub_inflight == 0) { 1475*1544Seschrock cv_broadcast(&spa->spa_scrub_io_cv); 1476*1544Seschrock ASSERT(spa->spa_scrub_throttled == 0); 1477*1544Seschrock } 1478*1544Seschrock mutex_exit(&spa->spa_scrub_lock); 1479789Sahrens } 1480789Sahrens 1481789Sahrens static void 1482*1544Seschrock spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, 1483*1544Seschrock zbookmark_t *zb) 1484789Sahrens { 1485789Sahrens size_t size = BP_GET_LSIZE(bp); 1486789Sahrens void *data = zio_buf_alloc(size); 1487789Sahrens 1488789Sahrens mutex_enter(&spa->spa_scrub_lock); 1489789Sahrens spa->spa_scrub_inflight++; 1490789Sahrens mutex_exit(&spa->spa_scrub_lock); 1491789Sahrens 1492*1544Seschrock if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) 1493*1544Seschrock flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ 1494*1544Seschrock 1495*1544Seschrock flags |= ZIO_FLAG_CANFAIL; 1496*1544Seschrock 1497789Sahrens zio_nowait(zio_read(NULL, spa, bp, data, size, 1498*1544Seschrock spa_scrub_io_done, NULL, priority, flags, zb)); 1499789Sahrens } 1500789Sahrens 1501789Sahrens /* ARGSUSED */ 1502789Sahrens static int 1503789Sahrens spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) 1504789Sahrens { 1505789Sahrens blkptr_t *bp = &bc->bc_blkptr; 
1506789Sahrens 	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
1507789Sahrens 
1508789Sahrens 	if (bc->bc_errno || vd == NULL) {
1509789Sahrens 		/*
1510789Sahrens 		 * We can't scrub this block, but we can continue to scrub
1511789Sahrens 		 * the rest of the pool.  Note the error and move along.
1512789Sahrens 		 */
1513789Sahrens 		mutex_enter(&spa->spa_scrub_lock);
1514789Sahrens 		spa->spa_scrub_errors++;
1515789Sahrens 		mutex_exit(&spa->spa_scrub_lock);
1516789Sahrens 
1517789Sahrens 		if (vd != NULL) {
1518789Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1519789Sahrens 			vd->vdev_stat.vs_scrub_errors++;
1520789Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1521789Sahrens 		}
1522789Sahrens 
1523789Sahrens 		return (ERESTART);
1524789Sahrens 	}
1525789Sahrens 
1526789Sahrens 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
1527789Sahrens 
1528789Sahrens 	/*
1529789Sahrens 	 * Keep track of how much data we've examined so that
1530789Sahrens 	 * zpool(1M) status can make useful progress reports.
1531789Sahrens 	 */
1532789Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1533789Sahrens 	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
1534789Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1535789Sahrens 
1536789Sahrens 	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
1537789Sahrens 		if (DVA_GET_GANG(&bp->blk_dva[0])) {
1538789Sahrens 			/*
1539789Sahrens 			 * Gang members may be spread across multiple vdevs,
1540789Sahrens 			 * so the best we can do is look at the pool-wide DTL.
1541789Sahrens 			 * XXX -- it would be better to change our allocation
1542789Sahrens 			 * policy to ensure that this can't happen.
1543789Sahrens 			 */
1544789Sahrens 			vd = spa->spa_root_vdev;
1545789Sahrens 		}
1546789Sahrens 		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
1547789Sahrens 			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
1548*1544Seschrock 			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
1549789Sahrens 		}
1550789Sahrens 	} else {
1551789Sahrens 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
1552*1544Seschrock 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
1553789Sahrens 	}
1554789Sahrens 
1555789Sahrens 	return (0);
1556789Sahrens }
1557789Sahrens 
1558789Sahrens static void
1559789Sahrens spa_scrub_thread(spa_t *spa)
1560789Sahrens {
1561789Sahrens 	callb_cpr_t cprinfo;
1562789Sahrens 	traverse_handle_t *th = spa->spa_scrub_th;
1563789Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1564789Sahrens 	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
1565789Sahrens 	int error = 0;
1566789Sahrens 	boolean_t complete;
1567789Sahrens 
1568789Sahrens 	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
1569789Sahrens 
1570797Sbonwick 	/*
1571797Sbonwick 	 * If we're restarting due to a snapshot create/delete,
1572797Sbonwick 	 * wait for that to complete.
1573797Sbonwick 	 */
1574797Sbonwick 	txg_wait_synced(spa_get_dsl(spa), 0);
1575797Sbonwick 
1576*1544Seschrock 	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
1577*1544Seschrock 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
1578*1544Seschrock 	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
1579*1544Seschrock 
1580*1544Seschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
1581*1544Seschrock 	vdev_reopen(rvd);		/* purge all vdev caches */
1582789Sahrens 	vdev_config_dirty(rvd);		/* rewrite all disk labels */
1583789Sahrens 	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
1584*1544Seschrock 	spa_config_exit(spa, FTAG);
1585789Sahrens 
1586789Sahrens 	mutex_enter(&spa->spa_scrub_lock);
1587789Sahrens 	spa->spa_scrub_errors = 0;
1588789Sahrens 	spa->spa_scrub_active = 1;
1589*1544Seschrock 	ASSERT(spa->spa_scrub_inflight == 0);
1590*1544Seschrock 	ASSERT(spa->spa_scrub_throttled == 0);
1591789Sahrens 
1592789Sahrens 	while (!spa->spa_scrub_stop) {
1593789Sahrens 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1594*1544Seschrock 		while (spa->spa_scrub_suspended) {
1595789Sahrens 			spa->spa_scrub_active = 0;
1596789Sahrens 			cv_broadcast(&spa->spa_scrub_cv);
1597789Sahrens 			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
1598789Sahrens 			spa->spa_scrub_active = 1;
1599789Sahrens 		}
1600789Sahrens 		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
1601789Sahrens 
1602789Sahrens 		if (spa->spa_scrub_restart_txg != 0)
1603789Sahrens 			break;
1604789Sahrens 
1605789Sahrens 		mutex_exit(&spa->spa_scrub_lock);
1606789Sahrens 		error = traverse_more(th);
1607789Sahrens 		mutex_enter(&spa->spa_scrub_lock);
1608789Sahrens 		if (error != EAGAIN)
1609789Sahrens 			break;
1610*1544Seschrock 
1611*1544Seschrock 		while (spa->spa_scrub_throttled > 0)
1612*1544Seschrock 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1613789Sahrens 	}
1614789Sahrens 
1615789Sahrens 	while (spa->spa_scrub_inflight)
1616789Sahrens 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1617789Sahrens 
1618789Sahrens 	if (spa->spa_scrub_restart_txg != 0)
1619789Sahrens 		error = ERESTART;
1620789Sahrens 
1621*1544Seschrock 	if (spa->spa_scrub_stop)
1622*1544Seschrock 		error = EINTR;
1623*1544Seschrock 
1624789Sahrens 	spa->spa_scrub_active = 0;
1625789Sahrens 	cv_broadcast(&spa->spa_scrub_cv);
1626789Sahrens 
1627789Sahrens 	/*
1628*1544Seschrock 	 * Even if there were uncorrectable errors, we consider the scrub
1629*1544Seschrock 	 * completed.  The downside is that if there is a transient error during
1630*1544Seschrock 	 * a resilver, we won't resilver the data properly to the target.  But
1631*1544Seschrock 	 * if the damage is permanent (more likely) we will resilver forever,
1632*1544Seschrock 	 * which isn't really acceptable.  Since there is enough information for
1633*1544Seschrock 	 * the user to know what has failed and why, this seems like a more
1634*1544Seschrock 	 * tractable approach.
1635789Sahrens 	 */
1636*1544Seschrock 	complete = (error == 0);
1637789Sahrens 
1638*1544Seschrock 	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
1639*1544Seschrock 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
1640789Sahrens 	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
1641789Sahrens 	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
1642789Sahrens 
1643789Sahrens 	mutex_exit(&spa->spa_scrub_lock);
1644789Sahrens 
1645789Sahrens 	/*
1646789Sahrens 	 * If the scrub/resilver completed, update all DTLs to reflect this.
1647789Sahrens 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
1648789Sahrens 	 */
1649*1544Seschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
1650789Sahrens 	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
1651789Sahrens 	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
1652789Sahrens 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
1653*1544Seschrock 	spa_errlog_rotate(spa);
1654*1544Seschrock 	spa_config_exit(spa, FTAG);
1655789Sahrens 
1656789Sahrens 	mutex_enter(&spa->spa_scrub_lock);
1657789Sahrens 
1658*1544Seschrock 	/*
1659*1544Seschrock 	 * We may have finished replacing a device.
1660*1544Seschrock 	 * Let the async thread assess this and handle the detach.
1661*1544Seschrock 	 */
1662*1544Seschrock 	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
1663789Sahrens 
1664789Sahrens 	/*
1665789Sahrens 	 * If we were told to restart, our final act is to start a new scrub.
1666789Sahrens 	 */
1667789Sahrens 	if (error == ERESTART)
1668*1544Seschrock 		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
1669*1544Seschrock 		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
1670789Sahrens 
1671*1544Seschrock 	spa->spa_scrub_type = POOL_SCRUB_NONE;
1672*1544Seschrock 	spa->spa_scrub_active = 0;
1673*1544Seschrock 	spa->spa_scrub_thread = NULL;
1674*1544Seschrock 	cv_broadcast(&spa->spa_scrub_cv);
1675789Sahrens 	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
1676789Sahrens 	thread_exit();
1677789Sahrens }
1678789Sahrens 
1679789Sahrens void
1680789Sahrens spa_scrub_suspend(spa_t *spa)
1681789Sahrens {
1682789Sahrens 	mutex_enter(&spa->spa_scrub_lock);
1683*1544Seschrock 	spa->spa_scrub_suspended++;
1684789Sahrens 	while (spa->spa_scrub_active) {
1685789Sahrens 		cv_broadcast(&spa->spa_scrub_cv);
1686789Sahrens 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
1687789Sahrens 	}
1688789Sahrens 	while (spa->spa_scrub_inflight)
1689789Sahrens 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
1690789Sahrens 	mutex_exit(&spa->spa_scrub_lock);
1691789Sahrens }
1692789Sahrens 
1693789Sahrens void
1694789Sahrens spa_scrub_resume(spa_t *spa)
1695789Sahrens {
1696789Sahrens 	mutex_enter(&spa->spa_scrub_lock);
1697*1544Seschrock 	ASSERT(spa->spa_scrub_suspended != 0);
1698*1544Seschrock 	if (--spa->spa_scrub_suspended == 0)
1699789Sahrens 		cv_broadcast(&spa->spa_scrub_cv);
1700789Sahrens 	mutex_exit(&spa->spa_scrub_lock);
1701789Sahrens }
1702789Sahrens 
1703789Sahrens void
1704789Sahrens spa_scrub_restart(spa_t *spa, uint64_t txg)
1705789Sahrens {
1706789Sahrens 	/*
1707789Sahrens 	 * Something happened (e.g. snapshot create/delete) that means
1708789Sahrens 	 * we must restart any in-progress scrubs.  The itinerary will
1709789Sahrens 	 * fix this properly.
1710789Sahrens 	 */
1711789Sahrens 	mutex_enter(&spa->spa_scrub_lock);
1712789Sahrens 	spa->spa_scrub_restart_txg = txg;
1713789Sahrens 	mutex_exit(&spa->spa_scrub_lock);
1714789Sahrens }
1715789Sahrens 
1716*1544Seschrock int
1717*1544Seschrock spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
1718789Sahrens {
1719789Sahrens 	space_seg_t *ss;
1720789Sahrens 	uint64_t mintxg, maxtxg;
1721789Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1722*1544Seschrock 	int advance = ADVANCE_PRE | ADVANCE_ZIL;
1723789Sahrens 
1724789Sahrens 	if ((uint_t)type >= POOL_SCRUB_TYPES)
1725789Sahrens 		return (ENOTSUP);
1726789Sahrens 
1727*1544Seschrock 	mutex_enter(&spa->spa_scrub_lock);
1728*1544Seschrock 
1729789Sahrens 	/*
1730789Sahrens 	 * If there's a scrub or resilver already in progress, stop it.
1731789Sahrens 	 */
1732789Sahrens 	while (spa->spa_scrub_thread != NULL) {
1733789Sahrens 		/*
1734789Sahrens 		 * Don't stop a resilver unless forced.
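		 * An ordinary scrub request does not preempt an in-progress
		 * resilver; without the force flag the caller simply gets
		 * EBUSY back.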
1735789Sahrens 		 */
1736*1544Seschrock 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
1737*1544Seschrock 			mutex_exit(&spa->spa_scrub_lock);
1738789Sahrens 			return (EBUSY);
1739*1544Seschrock 		}
1740789Sahrens 		spa->spa_scrub_stop = 1;
1741789Sahrens 		cv_broadcast(&spa->spa_scrub_cv);
1742789Sahrens 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
1743789Sahrens 	}
1744789Sahrens 
1745789Sahrens 	/*
1746789Sahrens 	 * Terminate the previous traverse.
1747789Sahrens 	 */
1748789Sahrens 	if (spa->spa_scrub_th != NULL) {
1749789Sahrens 		traverse_fini(spa->spa_scrub_th);
1750789Sahrens 		spa->spa_scrub_th = NULL;
1751789Sahrens 	}
1752789Sahrens 
1753*1544Seschrock 	if (rvd == NULL) {
1754*1544Seschrock 		ASSERT(spa->spa_scrub_stop == 0);
1755*1544Seschrock 		ASSERT(spa->spa_scrub_type == type);
1756*1544Seschrock 		ASSERT(spa->spa_scrub_restart_txg == 0);
1757*1544Seschrock 		mutex_exit(&spa->spa_scrub_lock);
1758*1544Seschrock 		return (0);
1759*1544Seschrock 	}
1760789Sahrens 
1761789Sahrens 	mintxg = TXG_INITIAL - 1;
1762789Sahrens 	maxtxg = spa_last_synced_txg(spa) + 1;
1763789Sahrens 
1764*1544Seschrock 	mutex_enter(&rvd->vdev_dtl_lock);
1765789Sahrens 
1766*1544Seschrock 	if (rvd->vdev_dtl_map.sm_space == 0) {
1767*1544Seschrock 		/*
1768*1544Seschrock 		 * The pool-wide DTL is empty.
1769*1544Seschrock 		 * If this is a resilver, there's nothing to do.
1770*1544Seschrock 		 */
1771*1544Seschrock 		if (type == POOL_SCRUB_RESILVER)
1772*1544Seschrock 			type = POOL_SCRUB_NONE;
1773*1544Seschrock 	} else {
1774*1544Seschrock 		/*
1775*1544Seschrock 		 * The pool-wide DTL is non-empty.
1776*1544Seschrock 		 * If this is a normal scrub, upgrade to a resilver instead.
1777*1544Seschrock 		 */
1778*1544Seschrock 		if (type == POOL_SCRUB_EVERYTHING)
1779*1544Seschrock 			type = POOL_SCRUB_RESILVER;
1780*1544Seschrock 	}
1781789Sahrens 
1782*1544Seschrock 	if (type == POOL_SCRUB_RESILVER) {
1783789Sahrens 		/*
1784789Sahrens 		 * Determine the resilvering boundaries.
1785789Sahrens 		 *
1786789Sahrens 		 * Note: (mintxg, maxtxg) is an open interval,
1787789Sahrens 		 * i.e. mintxg and maxtxg themselves are not included.
1788789Sahrens 		 *
1789789Sahrens 		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
1790789Sahrens 		 * so we don't claim to resilver a txg that's still changing.
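		 *
		 * For example, if the DTL says txgs 100 through 200 are
		 * missing but only txg 150 has been synced so far, we get
		 * the open interval (99, 151) and resilver txgs 100-150.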
1791789Sahrens 		 */
1792789Sahrens 		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
1793*1544Seschrock 		mintxg = ss->ss_start - 1;
1794789Sahrens 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
1795*1544Seschrock 		maxtxg = MIN(ss->ss_end, maxtxg);
1796789Sahrens 
1797*1544Seschrock 		advance |= ADVANCE_PRUNE;
1798789Sahrens 	}
1799789Sahrens 
1800*1544Seschrock 	mutex_exit(&rvd->vdev_dtl_lock);
1801*1544Seschrock 
1802*1544Seschrock 	spa->spa_scrub_stop = 0;
1803*1544Seschrock 	spa->spa_scrub_type = type;
1804*1544Seschrock 	spa->spa_scrub_restart_txg = 0;
1805*1544Seschrock 
1806*1544Seschrock 	if (type != POOL_SCRUB_NONE) {
1807*1544Seschrock 		spa->spa_scrub_mintxg = mintxg;
1808789Sahrens 		spa->spa_scrub_maxtxg = maxtxg;
1809789Sahrens 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
1810789Sahrens 		    advance, ZIO_FLAG_CANFAIL);
1811789Sahrens 		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
1812789Sahrens 		spa->spa_scrub_thread = thread_create(NULL, 0,
1813789Sahrens 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
1814789Sahrens 	}
1815789Sahrens 
1816*1544Seschrock 	mutex_exit(&spa->spa_scrub_lock);
1817*1544Seschrock 
1818789Sahrens 	return (0);
1819789Sahrens }
1820789Sahrens 
1821*1544Seschrock /*
1822*1544Seschrock  * ==========================================================================
1823*1544Seschrock  * SPA async task processing
1824*1544Seschrock  * ==========================================================================
1825*1544Seschrock  */
1826*1544Seschrock 
1827*1544Seschrock static void
1828*1544Seschrock spa_async_reopen(spa_t *spa)
1829789Sahrens {
1830*1544Seschrock 	vdev_t *rvd = spa->spa_root_vdev;
1831*1544Seschrock 	vdev_t *tvd;
1832*1544Seschrock 	int c;
1833*1544Seschrock 
1834*1544Seschrock 	spa_config_enter(spa, RW_WRITER, FTAG);
1835*1544Seschrock 
1836*1544Seschrock 	for (c = 0; c < rvd->vdev_children; c++) {
1837*1544Seschrock 		tvd = rvd->vdev_child[c];
1838*1544Seschrock 		if (tvd->vdev_reopen_wanted) {
1839*1544Seschrock 			tvd->vdev_reopen_wanted = 0;
1840*1544Seschrock 			vdev_reopen(tvd);
1841*1544Seschrock 		}
1842*1544Seschrock 	}
1843789Sahrens 
1844*1544Seschrock 	spa_config_exit(spa, FTAG);
1845*1544Seschrock }
1846*1544Seschrock 
1847*1544Seschrock static void
1848*1544Seschrock spa_async_thread(spa_t *spa)
1849*1544Seschrock {
1850*1544Seschrock 	int tasks;
1851*1544Seschrock 
1852*1544Seschrock 	ASSERT(spa->spa_sync_on);
1853789Sahrens 
1854*1544Seschrock 	mutex_enter(&spa->spa_async_lock);
1855*1544Seschrock 	tasks = spa->spa_async_tasks;
1856*1544Seschrock 	spa->spa_async_tasks = 0;
1857*1544Seschrock 	mutex_exit(&spa->spa_async_lock);
1858*1544Seschrock 
1859*1544Seschrock 	/*
1860*1544Seschrock 	 * See if any devices need to be reopened.
1861*1544Seschrock 	 */
1862*1544Seschrock 	if (tasks & SPA_ASYNC_REOPEN)
1863*1544Seschrock 		spa_async_reopen(spa);
1864*1544Seschrock 
1865*1544Seschrock 	/*
1866*1544Seschrock 	 * If any devices are done replacing, detach them.
1867*1544Seschrock 	 */
1868*1544Seschrock 	if (tasks & SPA_ASYNC_REPLACE_DONE)
1869789Sahrens 		spa_vdev_replace_done(spa);
1870789Sahrens 
1871*1544Seschrock 	/*
1872*1544Seschrock 	 * Kick off a scrub.
1873*1544Seschrock 	 */
1874*1544Seschrock 	if (tasks & SPA_ASYNC_SCRUB)
1875*1544Seschrock 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
1876*1544Seschrock 
1877*1544Seschrock 	/*
1878*1544Seschrock 	 * Kick off a resilver.
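	 * This is requested, for example, when an interrupted resilver must
	 * be restarted; passing B_TRUE forces any scrub already in progress
	 * to be stopped first.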
1879*1544Seschrock 	 */
1880*1544Seschrock 	if (tasks & SPA_ASYNC_RESILVER)
1881*1544Seschrock 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1882*1544Seschrock 
1883*1544Seschrock 	/*
1884*1544Seschrock 	 * Let the world know that we're done.
1885*1544Seschrock 	 */
1886*1544Seschrock 	mutex_enter(&spa->spa_async_lock);
1887*1544Seschrock 	spa->spa_async_thread = NULL;
1888*1544Seschrock 	cv_broadcast(&spa->spa_async_cv);
1889*1544Seschrock 	mutex_exit(&spa->spa_async_lock);
1890*1544Seschrock 	thread_exit();
1891*1544Seschrock }
1892*1544Seschrock 
1893*1544Seschrock void
1894*1544Seschrock spa_async_suspend(spa_t *spa)
1895*1544Seschrock {
1896*1544Seschrock 	mutex_enter(&spa->spa_async_lock);
1897*1544Seschrock 	spa->spa_async_suspended++;
1898*1544Seschrock 	while (spa->spa_async_thread != NULL)
1899*1544Seschrock 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
1900*1544Seschrock 	mutex_exit(&spa->spa_async_lock);
1901*1544Seschrock }
1902*1544Seschrock 
1903*1544Seschrock void
1904*1544Seschrock spa_async_resume(spa_t *spa)
1905*1544Seschrock {
1906*1544Seschrock 	mutex_enter(&spa->spa_async_lock);
1907*1544Seschrock 	ASSERT(spa->spa_async_suspended != 0);
1908*1544Seschrock 	spa->spa_async_suspended--;
1909*1544Seschrock 	mutex_exit(&spa->spa_async_lock);
1910*1544Seschrock }
1911*1544Seschrock 
1912*1544Seschrock static void
1913*1544Seschrock spa_async_dispatch(spa_t *spa)
1914*1544Seschrock {
1915*1544Seschrock 	mutex_enter(&spa->spa_async_lock);
1916*1544Seschrock 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
1917*1544Seschrock 	    spa->spa_async_thread == NULL)
1918*1544Seschrock 		spa->spa_async_thread = thread_create(NULL, 0,
1919*1544Seschrock 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
1920*1544Seschrock 	mutex_exit(&spa->spa_async_lock);
1921*1544Seschrock }
1922*1544Seschrock 
1923*1544Seschrock void
1924*1544Seschrock spa_async_request(spa_t *spa, int task)
1925*1544Seschrock {
1926*1544Seschrock 	mutex_enter(&spa->spa_async_lock);
1927*1544Seschrock 	spa->spa_async_tasks |= task;
1928*1544Seschrock 	mutex_exit(&spa->spa_async_lock);
1929789Sahrens }
1930789Sahrens 
1931789Sahrens /*
1932789Sahrens  * ==========================================================================
1933789Sahrens  * SPA syncing routines
1934789Sahrens  * ==========================================================================
1935789Sahrens  */
1936789Sahrens 
1937789Sahrens static void
1938789Sahrens spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
1939789Sahrens {
1940789Sahrens 	bplist_t *bpl = &spa->spa_sync_bplist;
1941789Sahrens 	dmu_tx_t *tx;
1942789Sahrens 	blkptr_t blk;
1943789Sahrens 	uint64_t itor = 0;
1944789Sahrens 	zio_t *zio;
1945789Sahrens 	int error;
1946789Sahrens 	uint8_t c = 1;
1947789Sahrens 
1948789Sahrens 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
1949789Sahrens 
1950789Sahrens 	while (bplist_iterate(bpl, &itor, &blk) == 0)
1951789Sahrens 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
1952789Sahrens 
1953789Sahrens 	error = zio_wait(zio);
1954789Sahrens 	ASSERT3U(error, ==, 0);
1955789Sahrens 
1956789Sahrens 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1957789Sahrens 	bplist_vacate(bpl, tx);
1958789Sahrens 
1959789Sahrens 	/*
1960789Sahrens 	 * Pre-dirty the first block so we sync to convergence faster.
1961789Sahrens 	 * (Usually only the first block is needed.)
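	 * Writing a single byte at offset 0 below is enough to mark the
	 * bplist object dirty in this txg.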
1962789Sahrens 	 */
1963789Sahrens 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
1964789Sahrens 	dmu_tx_commit(tx);
1965789Sahrens }
1966789Sahrens 
1967789Sahrens static void
1968789Sahrens spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
1969789Sahrens {
1970789Sahrens 	nvlist_t *config;
1971789Sahrens 	char *packed = NULL;
1972789Sahrens 	size_t nvsize = 0;
1973789Sahrens 	dmu_buf_t *db;
1974789Sahrens 
1975789Sahrens 	if (list_is_empty(&spa->spa_dirty_list))
1976789Sahrens 		return;
1977789Sahrens 
1978789Sahrens 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
1979789Sahrens 
1980789Sahrens 	spa_config_set(spa, config);
1981789Sahrens 
1982789Sahrens 	VERIFY(nvlist_size(config, &nvsize, NV_ENCODE_XDR) == 0);
1983789Sahrens 
1984789Sahrens 	packed = kmem_alloc(nvsize, KM_SLEEP);
1985789Sahrens 
1986*1544Seschrock 	VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
1987*1544Seschrock 	    KM_SLEEP) == 0);
1988789Sahrens 
1989789Sahrens 	dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
1990789Sahrens 	    packed, tx);
1991789Sahrens 
1992789Sahrens 	kmem_free(packed, nvsize);
1993789Sahrens 
1994*1544Seschrock 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
1995*1544Seschrock 	    spa->spa_config_object, FTAG, &db));
1996789Sahrens 	dmu_buf_will_dirty(db, tx);
1997789Sahrens 	*(uint64_t *)db->db_data = nvsize;
1998*1544Seschrock 	dmu_buf_rele(db, FTAG);
1999789Sahrens }
2000789Sahrens 
2001789Sahrens /*
2002789Sahrens  * Sync the specified transaction group.  New blocks may be dirtied as
2003789Sahrens  * part of the process, so we iterate until it converges.
2004789Sahrens  */
2005789Sahrens void
2006789Sahrens spa_sync(spa_t *spa, uint64_t txg)
2007789Sahrens {
2008789Sahrens 	dsl_pool_t *dp = spa->spa_dsl_pool;
2009789Sahrens 	objset_t *mos = spa->spa_meta_objset;
2010789Sahrens 	bplist_t *bpl = &spa->spa_sync_bplist;
2011789Sahrens 	vdev_t *vd;
2012789Sahrens 	dmu_tx_t *tx;
2013789Sahrens 	int dirty_vdevs;
2014789Sahrens 
2015789Sahrens 	/*
2016789Sahrens 	 * Lock out configuration changes.
2017789Sahrens 	 */
2018*1544Seschrock 	spa_config_enter(spa, RW_READER, FTAG);
2019789Sahrens 
2020789Sahrens 	spa->spa_syncing_txg = txg;
2021789Sahrens 	spa->spa_sync_pass = 0;
2022789Sahrens 
2023*1544Seschrock 	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
2024789Sahrens 
2025789Sahrens 	/*
2026789Sahrens 	 * If anything has changed in this txg, push the deferred frees
2027789Sahrens 	 * from the previous txg.  If not, leave them alone so that we
2028789Sahrens 	 * don't generate work on an otherwise idle system.
2029789Sahrens 	 */
2030789Sahrens 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
2031789Sahrens 	    !txg_list_empty(&dp->dp_dirty_dirs, txg))
2032789Sahrens 		spa_sync_deferred_frees(spa, txg);
2033789Sahrens 
2034789Sahrens 	/*
2035789Sahrens 	 * Iterate to convergence.
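	 * Each pass below may dirty additional vdevs as new blocks are
	 * allocated, so we keep looping until a pass finds no dirty vdevs
	 * left to sync.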
2036789Sahrens 	 */
2037789Sahrens 	do {
2038789Sahrens 		spa->spa_sync_pass++;
2039789Sahrens 
2040789Sahrens 		tx = dmu_tx_create_assigned(dp, txg);
2041789Sahrens 		spa_sync_config_object(spa, tx);
2042789Sahrens 		dmu_tx_commit(tx);
2043789Sahrens 
2044*1544Seschrock 		spa_errlog_sync(spa, txg);
2045*1544Seschrock 
2046789Sahrens 		dsl_pool_sync(dp, txg);
2047789Sahrens 
2048789Sahrens 		dirty_vdevs = 0;
2049789Sahrens 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2050789Sahrens 			vdev_sync(vd, txg);
2051789Sahrens 			dirty_vdevs++;
2052789Sahrens 		}
2053789Sahrens 
2054789Sahrens 		tx = dmu_tx_create_assigned(dp, txg);
2055789Sahrens 		bplist_sync(bpl, tx);
2056789Sahrens 		dmu_tx_commit(tx);
2057789Sahrens 
2058789Sahrens 	} while (dirty_vdevs);
2059789Sahrens 
2060789Sahrens 	bplist_close(bpl);
2061789Sahrens 
2062789Sahrens 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2063789Sahrens 
2064789Sahrens 	/*
2065789Sahrens 	 * Rewrite the vdev configuration (which includes the uberblock)
2066789Sahrens 	 * to commit the transaction group.
2067789Sahrens 	 */
2068*1544Seschrock 	VERIFY(0 == spa_sync_labels(spa, txg));
2069789Sahrens 
2070789Sahrens 	/*
2071789Sahrens 	 * Make a stable copy of the fully synced uberblock.
2072789Sahrens 	 * We use this as the root for pool traversals.
2073789Sahrens 	 */
2074789Sahrens 	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
2075789Sahrens 
2076789Sahrens 	spa_scrub_suspend(spa);	/* stop scrubbing and finish I/Os */
2077789Sahrens 
2078789Sahrens 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
2079789Sahrens 	spa->spa_traverse_wanted = 0;
2080789Sahrens 	spa->spa_ubsync = spa->spa_uberblock;
2081789Sahrens 	rw_exit(&spa->spa_traverse_lock);
2082789Sahrens 
2083789Sahrens 	spa_scrub_resume(spa);	/* resume scrub with new ubsync */
2084789Sahrens 
2085789Sahrens 	/*
2086789Sahrens 	 * Clean up the ZIL records for the synced txg.
2087789Sahrens 	 */
2088789Sahrens 	dsl_pool_zil_clean(dp);
2089789Sahrens 
2090789Sahrens 	/*
2091789Sahrens 	 * Update usable space statistics.
2092789Sahrens 	 */
2093789Sahrens 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
2094789Sahrens 		vdev_sync_done(vd, txg);
2095789Sahrens 
2096789Sahrens 	/*
2097789Sahrens 	 * It had better be the case that we didn't dirty anything
2098789Sahrens 	 * since spa_sync_labels().
2099789Sahrens 	 */
2100789Sahrens 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
2101789Sahrens 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
2102789Sahrens 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
2103789Sahrens 	ASSERT(bpl->bpl_queue == NULL);
2104789Sahrens 
2105*1544Seschrock 	spa_config_exit(spa, FTAG);
2106*1544Seschrock 
2107*1544Seschrock 	/*
2108*1544Seschrock 	 * If any async tasks have been requested, kick them off.
2109*1544Seschrock 	 */
2110*1544Seschrock 	spa_async_dispatch(spa);
2111789Sahrens }
2112789Sahrens 
2113789Sahrens /*
2114789Sahrens  * Sync all pools.  We don't want to hold the namespace lock across these
2115789Sahrens  * operations, so we take a reference on the spa_t and drop the lock during the
2116789Sahrens  * sync.
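 * Holding the reference keeps each pool from being unloaded while we wait,
 * with the namespace lock dropped, for its current txg to sync.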
2117789Sahrens  */
2118789Sahrens void
2119789Sahrens spa_sync_allpools(void)
2120789Sahrens {
2121789Sahrens 	spa_t *spa = NULL;
2122789Sahrens 	mutex_enter(&spa_namespace_lock);
2123789Sahrens 	while ((spa = spa_next(spa)) != NULL) {
2124789Sahrens 		if (spa_state(spa) != POOL_STATE_ACTIVE)
2125789Sahrens 			continue;
2126789Sahrens 		spa_open_ref(spa, FTAG);
2127789Sahrens 		mutex_exit(&spa_namespace_lock);
2128789Sahrens 		txg_wait_synced(spa_get_dsl(spa), 0);
2129789Sahrens 		mutex_enter(&spa_namespace_lock);
2130789Sahrens 		spa_close(spa, FTAG);
2131789Sahrens 	}
2132789Sahrens 	mutex_exit(&spa_namespace_lock);
2133789Sahrens }
2134789Sahrens 
2135789Sahrens /*
2136789Sahrens  * ==========================================================================
2137789Sahrens  * Miscellaneous routines
2138789Sahrens  * ==========================================================================
2139789Sahrens  */
2140789Sahrens 
2141789Sahrens int
2142789Sahrens spa_busy(void)
2143789Sahrens {
2144789Sahrens 	return (spa_active_count != 0);
2145789Sahrens }
2146789Sahrens 
2147789Sahrens /*
2148789Sahrens  * Remove all pools in the system.
2149789Sahrens  */
2150789Sahrens void
2151789Sahrens spa_evict_all(void)
2152789Sahrens {
2153789Sahrens 	spa_t *spa;
2154789Sahrens 
2155789Sahrens 	/*
2156789Sahrens 	 * Remove all cached state.  All pools should be closed now,
2157789Sahrens 	 * so every spa in the AVL tree should be unreferenced.
2158789Sahrens 	 */
2159789Sahrens 	mutex_enter(&spa_namespace_lock);
2160789Sahrens 	while ((spa = spa_next(NULL)) != NULL) {
2161789Sahrens 		/*
2162*1544Seschrock 		 * Stop async tasks.  The async thread may need to detach
2163*1544Seschrock 		 * a device that's been replaced, which requires grabbing
2164*1544Seschrock 		 * spa_namespace_lock, so we must drop it here.
2165789Sahrens 		 */
2166789Sahrens 		spa_open_ref(spa, FTAG);
2167789Sahrens 		mutex_exit(&spa_namespace_lock);
2168*1544Seschrock 		spa_async_suspend(spa);
2169789Sahrens 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
2170789Sahrens 		mutex_enter(&spa_namespace_lock);
2171789Sahrens 		spa_close(spa, FTAG);
2172789Sahrens 
2173789Sahrens 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2174789Sahrens 			spa_unload(spa);
2175789Sahrens 			spa_deactivate(spa);
2176789Sahrens 		}
2177789Sahrens 		spa_remove(spa);
2178789Sahrens 	}
2179789Sahrens 	mutex_exit(&spa_namespace_lock);
2180789Sahrens }
2181*1544Seschrock 
2182*1544Seschrock vdev_t *
2183*1544Seschrock spa_lookup_by_guid(spa_t *spa, uint64_t guid)
2184*1544Seschrock {
2185*1544Seschrock 	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
2186*1544Seschrock }
2187