/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_initialize.h>

/*
 * Value that is written to disk during initialization.
 */
#ifdef _ILP32
unsigned long zfs_initialize_value = 0xdeadbeefUL;
#else
unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
#endif

/* maximum number of I/Os outstanding per leaf vdev */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
unsigned long zfs_initialize_chunk_size = 1024 * 1024;

static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}
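/*
 * Sync task that persists the current initializing state (last offset,
 * action time, and state) into the vdev's leaf ZAP in the MOS.
 */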
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}
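/*
 * Update the vdev's initializing state, record the change in the pool
 * history, and schedule vdev_initialize_zap_update_sync() to persist the
 * new state to the vdev's leaf ZAP.
 */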
static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_initialize_state)
		return;

	/*
	 * Copy the vd's guid, this will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're suspending, then preserve the original start time.
	 */
	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
		vd->vdev_initialize_action_time = gethrestime_sec();
	}
	vd->vdev_initialize_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

	switch (new_state) {
	case VDEV_INITIALIZE_ACTIVE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_SUSPENDED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_CANCELED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_COMPLETE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);

	if (new_state != VDEV_INITIALIZE_ACTIVE)
		spa_notify_waiters(spa);
}
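/*
 * Completion callback for initializing writes issued by
 * vdev_initialize_write(). Accounts for the bytes written, rolls the last
 * offset back when the vdev has become unwriteable, and wakes up threads
 * waiting on the inflight I/O count.
 */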
static void
vdev_initialize_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mutex_enter(&vd->vdev_initialize_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *off =
		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else {
		/*
		 * Since initializing is best-effort, we ignore I/O errors and
		 * rely on vdev_probe to determine if the errors are more
		 * critical.
		 */
		if (zio->io_error != 0)
			vd->vdev_stat.vs_initialize_errors++;

		vd->vdev_initialize_bytes_done += zio->io_orig_size;
	}
	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
	vd->vdev_initialize_inflight--;
	cv_broadcast(&vd->vdev_initialize_io_cv);
	mutex_exit(&vd->vdev_initialize_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}
/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
	ASSERT0(len % sizeof (uint64_t));
#ifdef _ILP32
	for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
		*(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#else
	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#endif
	return (0);
}

static abd_t *
vdev_initialize_block_alloc(void)
{
	/* Allocate ABD for filler data */
	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
	    vdev_initialize_block_fill, NULL);

	return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}
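/*
 * Walk the segments in the vdev_initialize_tree and issue an initializing
 * write for each one, splitting segments into chunks of at most
 * zfs_initialize_chunk_size bytes.
 */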
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
	range_tree_t *rt = vd->vdev_initialize_tree;
	zfs_btree_t *bt = &rt->rt_root;
	zfs_btree_index_t where;

	for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
	    rs = zfs_btree_next(bt, &where, &where)) {
		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required =
		    ((size - 1) / zfs_initialize_chunk_size) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			error = vdev_initialize_write(vd,
			    VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
			    (w * zfs_initialize_chunk_size),
			    MIN(size - (w * zfs_initialize_chunk_size),
			    zfs_initialize_chunk_size), data);
			if (error != 0)
				return (error);
		}
	}
	return (0);
}
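/*
 * Estimate the amount of initializing work by walking the metaslabs of the
 * top-level vdev and comparing their physical ranges against
 * vdev_initialize_last_offset; this updates vdev_initialize_bytes_est and
 * vdev_initialize_bytes_done.
 */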
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_initialize_last_offset >
		    physical_rs.rs_end) {
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		zfs_btree_index_t where;
		range_tree_t *rt = msp->ms_allocatable;
		for (range_seg_t *rs =
		    zfs_btree_first(&rt->rt_root, &where); rs;
		    rs = zfs_btree_next(&rt->rt_root, &where,
		    &where)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_initialize_bytes_est += size;
			if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done += size;
			} else if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_initialize_last_offset <
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done +=
				    vd->vdev_initialize_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}
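/*
 * Load the last initialized offset from the vdev's leaf ZAP when an
 * initialization was previously active or suspended, then recompute the
 * progress estimate.
 */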
static int
vdev_initialize_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (vd->vdev_initialize_last_offset), 1,
		    &vd->vdev_initialize_last_offset);
		if (err == ENOENT) {
			vd->vdev_initialize_last_offset = 0;
			err = 0;
		}
	}

	vdev_initialize_calculate_progress(vd);
	return (err);
}

/*
 * Convert the logical range into a physical range and add it to our
 * range tree.
 */
static void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg64_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/* Only add segments that we have not visited yet */
	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs.rs_start,
		    (u_longlong_t)physical_rs.rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs.rs_end);
		ASSERT3U(physical_rs.rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs.rs_start = vd->vdev_initialize_last_offset;
	}
	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev's tree
	 * if it has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}
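/*
 * Body of the per-vdev initializing thread. For each metaslab of the
 * top-level vdev, collect the allocatable (free) ranges that map to this
 * leaf and overwrite them with the zfs_initialize_value pattern. On a
 * clean pass the state is advanced to VDEV_INITIALIZE_COMPLETE.
 */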
static void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
	    0, 0);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		boolean_t unload_when_done = B_FALSE;

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_initialize_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		if (!msp->ms_loaded && !msp->ms_loading)
			unload_when_done = B_TRUE;
		VERIFY0(metaslab_load(msp));

		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
		    vd);
		mutex_exit(&msp->ms_lock);

		error = vdev_initialize_ranges(vd, deadbeef);
		metaslab_enable(msp, B_TRUE, unload_when_done);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight > 0) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	mutex_exit(&vd->vdev_initialize_io_lock);

	range_tree_destroy(vd->vdev_initialize_tree);
	vdev_initialize_block_free(deadbeef);
	vd->vdev_initialize_tree = NULL;

	mutex_enter(&vd->vdev_initialize_lock);
	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
	}
	ASSERT(vd->vdev_initialize_thread != NULL ||
	    vd->vdev_initialize_inflight == 0);

	/*
	 * Drop the vdev_initialize_lock while we sync out the
	 * txg since it's possible that a device might be trying to
	 * come online and must check to see if it needs to restart an
	 * initialization. That thread will be holding the spa_config_lock
	 * which would prevent the txg_wait_synced from completing.
	 */
	mutex_exit(&vd->vdev_initialize_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_initialize_lock);

	vd->vdev_initialize_thread = NULL;
	cv_broadcast(&vd->vdev_initialize_cv);
	mutex_exit(&vd->vdev_initialize_lock);

	thread_exit();
}
/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_initialize_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
	vd->vdev_initialize_thread = thread_create(NULL, 0,
	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}

/*
 * Wait for the initialize thread to be terminated (cancelled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

	while (vd->vdev_initialize_thread != NULL)
		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for the vdev initialize threads on the provided list to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_initialize_lock);
	}
}
/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list. Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit. The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the initialize thread
	 * has stopped.
	 */
	if (vd->vdev_initialize_thread == NULL &&
	    tgt_state != VDEV_INITIALIZE_CANCELED) {
		return;
	}

	vdev_initialize_change_state(vd, tgt_state);
	vd->vdev_initialize_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_initialize_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}
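/*
 * Recursively descend the vdev tree, requesting that each concrete leaf
 * vdev stop initializing with the given target state; stopping vdevs are
 * appended to vd_list for the caller to wait on.
 */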
static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_initialize_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop initializing of a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_initialize_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}
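/*
 * Walk the vdev tree, restoring each leaf's initializing state and action
 * time from its leaf ZAP. Progress is reloaded for suspended or offline
 * vdevs, and an initialization that was left active is restarted.
 */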
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = timestamp;

		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
		    !vd->vdev_top->vdev_removing &&
		    vd->vdev_initialize_thread == NULL) {
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}

EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
EXPORT_SYMBOL(vdev_initialize_restart);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW,
	"Value written during zpool initialize");

ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW,
	"Size in bytes of writes by zpool initialize");
/* END CSTYLED */