/* xref: /freebsd-src/sys/contrib/openzfs/module/zfs/vdev_initialize.c (revision eda14cbc264d6969b02f2b1994cef11148e914f1) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_initialize.h>

/*
 * Value that is written to disk during initialization.
 */
#ifdef _ILP32
unsigned long zfs_initialize_value = 0xdeadbeefUL;
#else
unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
#endif

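/*
 * Note: every 32-bit (ILP32) or 64-bit word of an initialized region is
 * overwritten with this pattern, so a freshly initialized device reads back
 * as repeating 0xdeadbeef (or 0xdeadbeefdeadbeee) words.  The value is
 * tunable through the zfs_initialize_value module parameter declared at the
 * bottom of this file.
 */
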
/* maximum number of I/Os outstanding per leaf vdev */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
unsigned long zfs_initialize_chunk_size = 1024 * 1024;

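/*
 * Returns B_TRUE when the initializing thread should bail out: an explicit
 * stop/suspend/cancel was requested, the vdev is no longer writeable or has
 * been detached, or its top-level vdev is being removed.
 */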
static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
	    vd->vdev_detached || vd->vdev_top->vdev_removing);
}

static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}

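/*
 * Record a state transition (active, suspended, canceled, complete) for the
 * vdev.  The new state takes effect in memory immediately; a sync task is
 * scheduled to persist it in the leaf vdev ZAP, and a matching entry is
 * written to the pool history.
 */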
static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	spa_t *spa = vd->vdev_spa;

	if (new_state == vd->vdev_initialize_state)
		return;

	/*
	 * Copy the vd's guid; it will be freed by the sync task.
	 */
	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
	*guid = vd->vdev_guid;

	/*
	 * If we're resuming from a suspension, preserve the original
	 * start time.
	 */
	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
		vd->vdev_initialize_action_time = gethrestime_sec();
	}
	vd->vdev_initialize_state = new_state;

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);

	switch (new_state) {
	case VDEV_INITIALIZE_ACTIVE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s activated", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_SUSPENDED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s suspended", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_CANCELED:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s canceled", vd->vdev_path);
		break;
	case VDEV_INITIALIZE_COMPLETE:
		spa_history_log_internal(spa, "initialize", tx,
		    "vdev=%s complete", vd->vdev_path);
		break;
	default:
		panic("invalid state %llu", (unsigned long long)new_state);
	}

	dmu_tx_commit(tx);

	if (new_state != VDEV_INITIALIZE_ACTIVE)
		spa_notify_waiters(spa);
}

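/*
 * Completion callback for initializing writes.  If the write failed because
 * the device became unwriteable, roll the per-txg offset back so the region
 * is retried; otherwise count any error and credit the bytes written.  In
 * all cases drop the in-flight count and the SCL_STATE_ALL hold taken by
 * vdev_initialize_write().
 */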
static void
vdev_initialize_cb(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	mutex_enter(&vd->vdev_initialize_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the vdev was unavailable; roll the
		 * last offset back. (This works because spa_sync waits on
		 * spa_txg_zio before it runs sync tasks.)
		 */
		uint64_t *off =
		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else {
		/*
		 * Since initializing is best-effort, we ignore I/O errors and
		 * rely on vdev_probe to determine if the errors are more
		 * critical.
		 */
		if (zio->io_error != 0)
			vd->vdev_stat.vs_initialize_errors++;

		vd->vdev_initialize_bytes_done += zio->io_orig_size;
	}
	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
	vd->vdev_initialize_inflight--;
	cv_broadcast(&vd->vdev_initialize_io_cv);
	mutex_exit(&vd->vdev_initialize_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
	ASSERT0(len % sizeof (uint64_t));
#ifdef _ILP32
	for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
		*(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#else
	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
	}
#endif
	return (0);
}

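/*
 * Allocate a single zfs_initialize_chunk_size ABD and pre-fill it with the
 * initialize pattern.  The same buffer is reused for every write issued by
 * the initializing thread.
 */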
static abd_t *
vdev_initialize_block_alloc(void)
{
	/* Allocate ABD for filler data */
	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
	    vdev_initialize_block_fill, NULL);

	return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}

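/*
 * Walk every segment collected in vd->vdev_initialize_tree and write the
 * pattern over it in zfs_initialize_chunk_size pieces.  For example, with
 * the default 1 MiB chunk a 2.5 MiB segment becomes three writes
 * (((size - 1) / chunk) + 1 == 3): two full 1 MiB chunks plus a final
 * 0.5 MiB remainder.  Returns EINTR if the thread was asked to stop
 * mid-range.
 */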
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
	range_tree_t *rt = vd->vdev_initialize_tree;
	zfs_btree_t *bt = &rt->rt_root;
	zfs_btree_index_t where;

	for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
	    rs = zfs_btree_next(bt, &where, &where)) {
		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);

		/* Split range into legally-sized physical chunks */
		uint64_t writes_required =
		    ((size - 1) / zfs_initialize_chunk_size) + 1;

		for (uint64_t w = 0; w < writes_required; w++) {
			int error;

			error = vdev_initialize_write(vd,
			    VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
			    (w * zfs_initialize_chunk_size),
			    MIN(size - (w * zfs_initialize_chunk_size),
			    zfs_initialize_chunk_size), data);
			if (error != 0)
				return (error);
		}
	}
	return (0);
}

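/*
 * Estimate how much of the vdev has been initialized.  Metaslabs that lie
 * entirely below vdev_initialize_last_offset count fully toward both the
 * estimate and the bytes done; metaslabs entirely above it count only
 * toward the estimate.  The metaslab containing the offset is loaded and
 * its free segments are walked for a more precise split.
 */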
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_initialize_last_offset >
		    physical_rs.rs_end) {
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		zfs_btree_index_t where;
		range_tree_t *rt = msp->ms_allocatable;
		for (range_seg_t *rs =
		    zfs_btree_first(&rt->rt_root, &where); rs;
		    rs = zfs_btree_next(&rt->rt_root, &where,
		    &where)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_initialize_bytes_est += size;
			if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done += size;
			} else if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_initialize_last_offset <
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done +=
				    vd->vdev_initialize_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}

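/*
 * Load the persisted last-initialized offset from the leaf vdev ZAP when an
 * initialization is active or suspended (a missing entry simply means we
 * start from offset 0), then recompute the progress counters.
 */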
static int
vdev_initialize_load(vdev_t *vd)
{
	int err = 0;
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (vd->vdev_initialize_last_offset), 1,
		    &vd->vdev_initialize_last_offset);
		if (err == ENOENT) {
			vd->vdev_initialize_last_offset = 0;
			err = 0;
		}
	}

	vdev_initialize_calculate_progress(vd);
	return (err);
}

/*
 * Convert the logical range into a physical range and add it to our
 * range tree.
 */
static void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg64_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/* Only add segments that we have not visited yet */
	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs.rs_start,
		    (u_longlong_t)physical_rs.rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs.rs_end);
		ASSERT3U(physical_rs.rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs.rs_start = vd->vdev_initialize_last_offset;
	}
	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev's
	 * tree if it has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}

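/*
 * Body of the per-vdev initializing kthread.  It loads any saved progress,
 * allocates the pattern buffer, and then walks the top-level vdev's
 * metaslabs one at a time: each metaslab is disabled, loaded, and its free
 * segments are copied into vdev_initialize_tree and overwritten with the
 * pattern.  On exit the thread waits for outstanding zios, marks the state
 * COMPLETE if it was not interrupted, and syncs out the final txg.
 */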
static void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
	    0, 0);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		boolean_t unload_when_done = B_FALSE;

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_initialize_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		if (!msp->ms_loaded && !msp->ms_loading)
			unload_when_done = B_TRUE;
		VERIFY0(metaslab_load(msp));

		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
		    vd);
		mutex_exit(&msp->ms_lock);

		error = vdev_initialize_ranges(vd, deadbeef);
		metaslab_enable(msp, B_TRUE, unload_when_done);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight > 0) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	mutex_exit(&vd->vdev_initialize_io_lock);

	range_tree_destroy(vd->vdev_initialize_tree);
	vdev_initialize_block_free(deadbeef);
	vd->vdev_initialize_tree = NULL;

	mutex_enter(&vd->vdev_initialize_lock);
	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
	}
	ASSERT(vd->vdev_initialize_thread != NULL ||
	    vd->vdev_initialize_inflight == 0);

	/*
	 * Drop the vdev_initialize_lock while we sync out the
	 * txg since it's possible that a device might be trying to
	 * come online and must check to see if it needs to restart an
	 * initialization. That thread will be holding the spa_config_lock
	 * which would prevent the txg_wait_synced from completing.
	 */
	mutex_exit(&vd->vdev_initialize_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_initialize_lock);

	vd->vdev_initialize_thread = NULL;
	cv_broadcast(&vd->vdev_initialize_cv);
	mutex_exit(&vd->vdev_initialize_lock);

	thread_exit();
}

/*
 * Initiates initialization of a device.  Caller must hold
 * vdev_initialize_lock.  Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_initialize_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
	vd->vdev_initialize_thread = thread_create(NULL, 0,
	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}
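
/*
 * Illustrative only (the actual caller lives outside this file): the code
 * that services "zpool initialize" is expected to start initialization on
 * each requested leaf roughly like this, under the checks asserted above:
 *
 *	mutex_enter(&vd->vdev_initialize_lock);
 *	vdev_initialize(vd);
 *	mutex_exit(&vd->vdev_initialize_lock);
 */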

/*
 * Wait for the initialize thread to be terminated (cancelled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

	while (vd->vdev_initialize_thread != NULL)
		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for the vdev initialize threads on the provided list to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
	vdev_t *vd;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	while ((vd = list_remove_head(vd_list)) != NULL) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop_wait_impl(vd);
		mutex_exit(&vd->vdev_initialize_lock);
	}
}

/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state.  For blocking behavior pass NULL for vd_list.  Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list.  Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit.  The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the initialize thread
	 * has stopped.
	 */
	if (vd->vdev_initialize_thread == NULL &&
	    tgt_state != VDEV_INITIALIZE_CANCELED) {
		return;
	}

	vdev_initialize_change_state(vd, tgt_state);
	vd->vdev_initialize_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		vdev_initialize_stop_wait_impl(vd);
	} else {
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}

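/*
 * Recursively request a stop on every concrete leaf vdev beneath vd, adding
 * each stopping leaf to vd_list so the caller can wait for all of them with
 * vdev_initialize_stop_wait().
 */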
static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
		mutex_enter(&vd->vdev_initialize_lock);
		vdev_initialize_stop(vd, tgt_state, vd_list);
		mutex_exit(&vd->vdev_initialize_lock);
		return;
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
		    vd_list);
	}
}

/*
 * Convenience function to stop initializing of a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
	spa_t *spa = vd->vdev_spa;
	list_t vd_list;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	list_create(&vd_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_initialize_node));

	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
	vdev_initialize_stop_wait(spa, &vd_list);

	if (vd->vdev_spa->spa_sync_on) {
		/* Make sure that our state has been synced to disk */
		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
	}

	list_destroy(&vd_list);
}

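/*
 * Restore initialization state for a vdev tree from the leaf vdev ZAPs,
 * typically when a pool is imported or a device comes back online.  Leaves
 * that were ACTIVE and are still writeable get their initializing thread
 * restarted; suspended or offline leaves only have their saved progress
 * loaded for reporting.
 */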
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = timestamp;

		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
		    !vd->vdev_top->vdev_removing &&
		    vd->vdev_initialize_thread == NULL) {
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}

EXPORT_SYMBOL(vdev_initialize);
EXPORT_SYMBOL(vdev_initialize_stop);
EXPORT_SYMBOL(vdev_initialize_stop_all);
EXPORT_SYMBOL(vdev_initialize_stop_wait);
EXPORT_SYMBOL(vdev_initialize_restart);

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW,
	"Value written during zpool initialize");

ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW,
	"Size in bytes of writes by zpool initialize");
/* END CSTYLED */