xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_initialize.c (revision 4d7988d6050abba5c1ff60e7fd196e95c22e20f4)
1094e47e9SGeorge Wilson /*
2094e47e9SGeorge Wilson  * CDDL HEADER START
3094e47e9SGeorge Wilson  *
4094e47e9SGeorge Wilson  * The contents of this file are subject to the terms of the
5094e47e9SGeorge Wilson  * Common Development and Distribution License (the "License").
6094e47e9SGeorge Wilson  * You may not use this file except in compliance with the License.
7094e47e9SGeorge Wilson  *
8094e47e9SGeorge Wilson  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9094e47e9SGeorge Wilson  * or http://www.opensolaris.org/os/licensing.
10094e47e9SGeorge Wilson  * See the License for the specific language governing permissions
11094e47e9SGeorge Wilson  * and limitations under the License.
12094e47e9SGeorge Wilson  *
13094e47e9SGeorge Wilson  * When distributing Covered Code, include this CDDL HEADER in each
14094e47e9SGeorge Wilson  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15094e47e9SGeorge Wilson  * If applicable, add the following below this CDDL HEADER, with the
16094e47e9SGeorge Wilson  * fields enclosed by brackets "[]" replaced with your own identifying
17094e47e9SGeorge Wilson  * information: Portions Copyright [yyyy] [name of copyright owner]
18094e47e9SGeorge Wilson  *
19094e47e9SGeorge Wilson  * CDDL HEADER END
20094e47e9SGeorge Wilson  */
21094e47e9SGeorge Wilson 
22094e47e9SGeorge Wilson /*
23af1d63abSPaul Dagnelie  * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
24094e47e9SGeorge Wilson  */
25094e47e9SGeorge Wilson 
26094e47e9SGeorge Wilson #include <sys/spa.h>
27094e47e9SGeorge Wilson #include <sys/spa_impl.h>
28094e47e9SGeorge Wilson #include <sys/txg.h>
29094e47e9SGeorge Wilson #include <sys/vdev_impl.h>
30094e47e9SGeorge Wilson #include <sys/refcount.h>
31094e47e9SGeorge Wilson #include <sys/metaslab_impl.h>
32094e47e9SGeorge Wilson #include <sys/dsl_synctask.h>
33094e47e9SGeorge Wilson #include <sys/zap.h>
34094e47e9SGeorge Wilson #include <sys/dmu_tx.h>
35094e47e9SGeorge Wilson 
/*
 * Pattern value that is written to disk during initialization (tunable).
 */
uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;

/* maximum number of I/Os outstanding per leaf vdev (tunable) */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
uint64_t zfs_initialize_chunk_size = 1024 * 1024;
46094e47e9SGeorge Wilson 
47094e47e9SGeorge Wilson static boolean_t
vdev_initialize_should_stop(vdev_t * vd)48094e47e9SGeorge Wilson vdev_initialize_should_stop(vdev_t *vd)
49094e47e9SGeorge Wilson {
50094e47e9SGeorge Wilson 	return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
51094e47e9SGeorge Wilson 	    vd->vdev_detached || vd->vdev_top->vdev_removing);
52094e47e9SGeorge Wilson }
53094e47e9SGeorge Wilson 
/*
 * Sync task that persists a leaf vdev's initializing progress (last
 * offset, action time, and state) into its leaf ZAP in the MOS.
 */
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
	/*
	 * We pass in the guid instead of the vdev_t since the vdev may
	 * have been freed prior to the sync task being processed. This
	 * happens when a vdev is detached as we call spa_config_vdev_exit(),
	 * stop the initializing thread, schedule the sync task, and free
	 * the vdev. Later when the scheduled sync task is invoked, it would
	 * find that the vdev has been freed.
	 */
	uint64_t guid = *(uint64_t *)arg;
	uint64_t txg = dmu_tx_get_txg(tx);
	/* The caller allocated the guid; this sync task owns and frees it. */
	kmem_free(arg, sizeof (uint64_t));

	vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
	if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
		return;

	/* Consume (and clear) the per-txg offset recorded by the writer. */
	uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
	vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

	VERIFY(vd->vdev_leaf_zap != 0);

	objset_t *mos = vd->vdev_spa->spa_meta_objset;

	/* Persist the furthest offset initialized in this txg, if any. */
	if (last_offset > 0) {
		vd->vdev_initialize_last_offset = last_offset;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
		    sizeof (last_offset), 1, &last_offset, tx));
	}
	/* Persist the time the current initialize action was started. */
	if (vd->vdev_initialize_action_time > 0) {
		uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
		VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
		    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
		    1, &val, tx));
	}

	/* Persist the current state (active/suspended/canceled/complete). */
	uint64_t initialize_state = vd->vdev_initialize_state;
	VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
	    VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
	    &initialize_state, tx));
}
98094e47e9SGeorge Wilson 
99094e47e9SGeorge Wilson static void
vdev_initialize_change_state(vdev_t * vd,vdev_initializing_state_t new_state)100094e47e9SGeorge Wilson vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
101094e47e9SGeorge Wilson {
102094e47e9SGeorge Wilson 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
103094e47e9SGeorge Wilson 	spa_t *spa = vd->vdev_spa;
104094e47e9SGeorge Wilson 
105094e47e9SGeorge Wilson 	if (new_state == vd->vdev_initialize_state)
106094e47e9SGeorge Wilson 		return;
107094e47e9SGeorge Wilson 
108094e47e9SGeorge Wilson 	/*
109094e47e9SGeorge Wilson 	 * Copy the vd's guid, this will be freed by the sync task.
110094e47e9SGeorge Wilson 	 */
111094e47e9SGeorge Wilson 	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
112094e47e9SGeorge Wilson 	*guid = vd->vdev_guid;
113094e47e9SGeorge Wilson 
114094e47e9SGeorge Wilson 	/*
115094e47e9SGeorge Wilson 	 * If we're suspending, then preserving the original start time.
116094e47e9SGeorge Wilson 	 */
117094e47e9SGeorge Wilson 	if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
118094e47e9SGeorge Wilson 		vd->vdev_initialize_action_time = gethrestime_sec();
119094e47e9SGeorge Wilson 	}
120094e47e9SGeorge Wilson 	vd->vdev_initialize_state = new_state;
121094e47e9SGeorge Wilson 
122094e47e9SGeorge Wilson 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
123094e47e9SGeorge Wilson 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
124094e47e9SGeorge Wilson 	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
125084fd14fSBrian Behlendorf 	    guid, 2, ZFS_SPACE_CHECK_NONE, tx);
126094e47e9SGeorge Wilson 
127094e47e9SGeorge Wilson 	switch (new_state) {
128094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_ACTIVE:
129094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
130094e47e9SGeorge Wilson 		    "vdev=%s activated", vd->vdev_path);
131094e47e9SGeorge Wilson 		break;
132094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_SUSPENDED:
133094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
134094e47e9SGeorge Wilson 		    "vdev=%s suspended", vd->vdev_path);
135094e47e9SGeorge Wilson 		break;
136094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_CANCELED:
137094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
138094e47e9SGeorge Wilson 		    "vdev=%s canceled", vd->vdev_path);
139094e47e9SGeorge Wilson 		break;
140094e47e9SGeorge Wilson 	case VDEV_INITIALIZE_COMPLETE:
141094e47e9SGeorge Wilson 		spa_history_log_internal(spa, "initialize", tx,
142094e47e9SGeorge Wilson 		    "vdev=%s complete", vd->vdev_path);
143094e47e9SGeorge Wilson 		break;
144094e47e9SGeorge Wilson 	default:
145094e47e9SGeorge Wilson 		panic("invalid state %llu", (unsigned long long)new_state);
146094e47e9SGeorge Wilson 	}
147094e47e9SGeorge Wilson 
148094e47e9SGeorge Wilson 	dmu_tx_commit(tx);
149094e47e9SGeorge Wilson }
150094e47e9SGeorge Wilson 
151094e47e9SGeorge Wilson static void
vdev_initialize_cb(zio_t * zio)152094e47e9SGeorge Wilson vdev_initialize_cb(zio_t *zio)
153094e47e9SGeorge Wilson {
154094e47e9SGeorge Wilson 	vdev_t *vd = zio->io_vd;
155094e47e9SGeorge Wilson 	mutex_enter(&vd->vdev_initialize_io_lock);
156094e47e9SGeorge Wilson 	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
157094e47e9SGeorge Wilson 		/*
158094e47e9SGeorge Wilson 		 * The I/O failed because the vdev was unavailable; roll the
159094e47e9SGeorge Wilson 		 * last offset back. (This works because spa_sync waits on
160094e47e9SGeorge Wilson 		 * spa_txg_zio before it runs sync tasks.)
161094e47e9SGeorge Wilson 		 */
162094e47e9SGeorge Wilson 		uint64_t *off =
163094e47e9SGeorge Wilson 		    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
164094e47e9SGeorge Wilson 		*off = MIN(*off, zio->io_offset);
165094e47e9SGeorge Wilson 	} else {
166094e47e9SGeorge Wilson 		/*
167094e47e9SGeorge Wilson 		 * Since initializing is best-effort, we ignore I/O errors and
168094e47e9SGeorge Wilson 		 * rely on vdev_probe to determine if the errors are more
169094e47e9SGeorge Wilson 		 * critical.
170094e47e9SGeorge Wilson 		 */
171094e47e9SGeorge Wilson 		if (zio->io_error != 0)
172094e47e9SGeorge Wilson 			vd->vdev_stat.vs_initialize_errors++;
173094e47e9SGeorge Wilson 
174094e47e9SGeorge Wilson 		vd->vdev_initialize_bytes_done += zio->io_orig_size;
175094e47e9SGeorge Wilson 	}
176094e47e9SGeorge Wilson 	ASSERT3U(vd->vdev_initialize_inflight, >, 0);
177094e47e9SGeorge Wilson 	vd->vdev_initialize_inflight--;
178094e47e9SGeorge Wilson 	cv_broadcast(&vd->vdev_initialize_io_cv);
179094e47e9SGeorge Wilson 	mutex_exit(&vd->vdev_initialize_io_lock);
180094e47e9SGeorge Wilson 
181094e47e9SGeorge Wilson 	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
182094e47e9SGeorge Wilson }
183094e47e9SGeorge Wilson 
/*
 * Issue one initializing write and limit the number of concurrent ZIOs
 * per leaf vdev. Takes an SCL_STATE_ALL hold that is dropped by
 * vdev_initialize_cb(). Returns EINTR if the initialization should stop,
 * otherwise 0.
 */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		/* Freed by vdev_initialize_zap_update_sync(). */
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		/* Undo the inflight accounting and back out cleanly. */
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	/*
	 * Record how far this txg's writes extend; on device failure the
	 * completion callback rolls this back so the range is retried.
	 */
	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}
242094e47e9SGeorge Wilson 
243094e47e9SGeorge Wilson /*
244094e47e9SGeorge Wilson  * Callback to fill each ABD chunk with zfs_initialize_value. len must be
245094e47e9SGeorge Wilson  * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
246094e47e9SGeorge Wilson  * allocation will guarantee these for us.
247094e47e9SGeorge Wilson  */
248094e47e9SGeorge Wilson /* ARGSUSED */
249094e47e9SGeorge Wilson static int
vdev_initialize_block_fill(void * buf,size_t len,void * unused)250094e47e9SGeorge Wilson vdev_initialize_block_fill(void *buf, size_t len, void *unused)
251094e47e9SGeorge Wilson {
252094e47e9SGeorge Wilson 	ASSERT0(len % sizeof (uint64_t));
253094e47e9SGeorge Wilson 	for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
254094e47e9SGeorge Wilson 		*(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
255094e47e9SGeorge Wilson 	}
256094e47e9SGeorge Wilson 	return (0);
257094e47e9SGeorge Wilson }
258094e47e9SGeorge Wilson 
259094e47e9SGeorge Wilson static abd_t *
vdev_initialize_block_alloc()260094e47e9SGeorge Wilson vdev_initialize_block_alloc()
261094e47e9SGeorge Wilson {
262094e47e9SGeorge Wilson 	/* Allocate ABD for filler data */
263094e47e9SGeorge Wilson 	abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
264094e47e9SGeorge Wilson 
265094e47e9SGeorge Wilson 	ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
266094e47e9SGeorge Wilson 	(void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
267094e47e9SGeorge Wilson 	    vdev_initialize_block_fill, NULL);
268094e47e9SGeorge Wilson 
269094e47e9SGeorge Wilson 	return (data);
270094e47e9SGeorge Wilson }
271094e47e9SGeorge Wilson 
/* Release the filler-data ABD from vdev_initialize_block_alloc(). */
static void
vdev_initialize_block_free(abd_t *data)
{
	abd_free(data);
}
277094e47e9SGeorge Wilson 
278094e47e9SGeorge Wilson static int
vdev_initialize_ranges(vdev_t * vd,abd_t * data)279094e47e9SGeorge Wilson vdev_initialize_ranges(vdev_t *vd, abd_t *data)
280094e47e9SGeorge Wilson {
281*4d7988d6SPaul Dagnelie 	range_tree_t *rt = vd->vdev_initialize_tree;
282*4d7988d6SPaul Dagnelie 	zfs_btree_t *bt = &rt->rt_root;
283*4d7988d6SPaul Dagnelie 	zfs_btree_index_t where;
284094e47e9SGeorge Wilson 
285*4d7988d6SPaul Dagnelie 	for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
286*4d7988d6SPaul Dagnelie 	    rs = zfs_btree_next(bt, &where, &where)) {
287*4d7988d6SPaul Dagnelie 		uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
288094e47e9SGeorge Wilson 
289094e47e9SGeorge Wilson 		/* Split range into legally-sized physical chunks */
290094e47e9SGeorge Wilson 		uint64_t writes_required =
291094e47e9SGeorge Wilson 		    ((size - 1) / zfs_initialize_chunk_size) + 1;
292094e47e9SGeorge Wilson 
293094e47e9SGeorge Wilson 		for (uint64_t w = 0; w < writes_required; w++) {
294094e47e9SGeorge Wilson 			int error;
295094e47e9SGeorge Wilson 
296094e47e9SGeorge Wilson 			error = vdev_initialize_write(vd,
297*4d7988d6SPaul Dagnelie 			    VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
298094e47e9SGeorge Wilson 			    (w * zfs_initialize_chunk_size),
299094e47e9SGeorge Wilson 			    MIN(size - (w * zfs_initialize_chunk_size),
300094e47e9SGeorge Wilson 			    zfs_initialize_chunk_size), data);
301094e47e9SGeorge Wilson 			if (error != 0)
302094e47e9SGeorge Wilson 				return (error);
303094e47e9SGeorge Wilson 		}
304094e47e9SGeorge Wilson 	}
305094e47e9SGeorge Wilson 	return (0);
306094e47e9SGeorge Wilson }
307094e47e9SGeorge Wilson 
/*
 * Estimate total and completed initialization bytes for this vdev based
 * on vdev_initialize_last_offset and the free space in each metaslab.
 * Results are stored in vdev_initialize_bytes_est and
 * vdev_initialize_bytes_done.
 */
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
	ASSERT(vd->vdev_leaf_zap != 0);

	vd->vdev_initialize_bytes_est = 0;
	vd->vdev_initialize_bytes_done = 0;

	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		mutex_enter(&msp->ms_lock);

		uint64_t ms_free = msp->ms_size -
		    metaslab_allocated_space(msp);

		/* For raidz, approximate this leaf's share of free space. */
		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
			ms_free /= vd->vdev_top->vdev_children;

		/*
		 * Convert the metaslab range to a physical range
		 * on our vdev. We use this to determine if we are
		 * in the middle of this metaslab range.
		 */
		range_seg64_t logical_rs, physical_rs;
		logical_rs.rs_start = msp->ms_start;
		logical_rs.rs_end = msp->ms_start + msp->ms_size;
		vdev_xlate(vd, &logical_rs, &physical_rs);

		/* Metaslab entirely ahead of our position: all remaining. */
		if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		} else if (vd->vdev_initialize_last_offset >
		    physical_rs.rs_end) {
			/* Metaslab entirely behind our position: all done. */
			vd->vdev_initialize_bytes_done += ms_free;
			vd->vdev_initialize_bytes_est += ms_free;
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If we get here, we're in the middle of initializing this
		 * metaslab. Load it and walk the free tree for more accurate
		 * progress estimation.
		 */
		VERIFY0(metaslab_load(msp));

		zfs_btree_index_t where;
		range_tree_t *rt = msp->ms_allocatable;
		for (range_seg_t *rs =
		    zfs_btree_first(&rt->rt_root, &where); rs;
		    rs = zfs_btree_next(&rt->rt_root, &where,
		    &where)) {
			logical_rs.rs_start = rs_get_start(rs, rt);
			logical_rs.rs_end = rs_get_end(rs, rt);
			vdev_xlate(vd, &logical_rs, &physical_rs);

			uint64_t size = physical_rs.rs_end -
			    physical_rs.rs_start;
			vd->vdev_initialize_bytes_est += size;
			if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_end) {
				vd->vdev_initialize_bytes_done += size;
			} else if (vd->vdev_initialize_last_offset >
			    physical_rs.rs_start &&
			    vd->vdev_initialize_last_offset <
			    physical_rs.rs_end) {
				/* This free segment is partially done. */
				vd->vdev_initialize_bytes_done +=
				    vd->vdev_initialize_last_offset -
				    physical_rs.rs_start;
			}
		}
		mutex_exit(&msp->ms_lock);
	}
}
385094e47e9SGeorge Wilson 
386084fd14fSBrian Behlendorf static int
vdev_initialize_load(vdev_t * vd)387094e47e9SGeorge Wilson vdev_initialize_load(vdev_t *vd)
388094e47e9SGeorge Wilson {
389084fd14fSBrian Behlendorf 	int err = 0;
390094e47e9SGeorge Wilson 	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
391094e47e9SGeorge Wilson 	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
392094e47e9SGeorge Wilson 	ASSERT(vd->vdev_leaf_zap != 0);
393094e47e9SGeorge Wilson 
394094e47e9SGeorge Wilson 	if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
395094e47e9SGeorge Wilson 	    vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
396084fd14fSBrian Behlendorf 		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
397094e47e9SGeorge Wilson 		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
398094e47e9SGeorge Wilson 		    sizeof (vd->vdev_initialize_last_offset), 1,
399094e47e9SGeorge Wilson 		    &vd->vdev_initialize_last_offset);
400084fd14fSBrian Behlendorf 		if (err == ENOENT) {
401084fd14fSBrian Behlendorf 			vd->vdev_initialize_last_offset = 0;
402084fd14fSBrian Behlendorf 			err = 0;
403084fd14fSBrian Behlendorf 		}
404094e47e9SGeorge Wilson 	}
405094e47e9SGeorge Wilson 
406094e47e9SGeorge Wilson 	vdev_initialize_calculate_progress(vd);
407084fd14fSBrian Behlendorf 	return (err);
408094e47e9SGeorge Wilson }
409094e47e9SGeorge Wilson 
410094e47e9SGeorge Wilson 
/*
 * range_tree_walk() callback: convert the logical range into a physical
 * range on this leaf vdev and add the not-yet-initialized portion to the
 * vdev's initialize range tree.
 */
void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;
	range_seg64_t logical_rs, physical_rs;
	logical_rs.rs_start = start;
	logical_rs.rs_end = start + size;

	ASSERT(vd->vdev_ops->vdev_op_leaf);
	vdev_xlate(vd, &logical_rs, &physical_rs);

	/* When the leaf is its own top-level vdev, xlate is the identity. */
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_start == physical_rs.rs_start);
	IMPLY(vd->vdev_top == vd,
	    logical_rs.rs_end == physical_rs.rs_end);

	/* Only add segments that we have not visited yet */
	if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
		return;

	/* Pick up where we left off mid-range. */
	if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
		zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
		    "(%llu, %llu)", vd->vdev_path,
		    (u_longlong_t)physical_rs.rs_start,
		    (u_longlong_t)physical_rs.rs_end,
		    (u_longlong_t)vd->vdev_initialize_last_offset,
		    (u_longlong_t)physical_rs.rs_end);
		ASSERT3U(physical_rs.rs_end, >,
		    vd->vdev_initialize_last_offset);
		physical_rs.rs_start = vd->vdev_initialize_last_offset;
	}
	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

	/*
	 * With raidz, it's possible that the logical range does not live on
	 * this leaf vdev. We only add the physical range to this vdev's tree
	 * if it has a length greater than 0.
	 */
	if (physical_rs.rs_end > physical_rs.rs_start) {
		range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
		    physical_rs.rs_end - physical_rs.rs_start);
	} else {
		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
	}
}
461094e47e9SGeorge Wilson 
/*
 * Worker thread that performs the initialization of a leaf vdev: walks
 * each metaslab of the top-level vdev, collects its free ranges, writes
 * the pattern over them, and persists progress so an interrupted run can
 * resume. Exits when done or when vdev_initialize_should_stop() fires.
 */
static void
vdev_initialize_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;
	uint64_t ms_count = 0;

	ASSERT(vdev_is_concrete(vd));
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	vd->vdev_initialize_last_offset = 0;
	VERIFY0(vdev_initialize_load(vd));

	/* Pattern buffer shared by every write issued from this thread. */
	abd_t *deadbeef = vdev_initialize_block_alloc();

	vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
	    0, 0);

	for (uint64_t i = 0; !vd->vdev_detached &&
	    i < vd->vdev_top->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
		boolean_t unload_when_done = B_FALSE;

		/*
		 * If we've expanded the top-level vdev or it's our
		 * first pass, calculate our progress.
		 */
		if (vd->vdev_top->vdev_ms_count != ms_count) {
			vdev_initialize_calculate_progress(vd);
			ms_count = vd->vdev_top->vdev_ms_count;
		}

		/* Drop SCL_CONFIG while allocations from msp are blocked. */
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);
		/* Only unload afterwards if we triggered the load here. */
		if (!msp->ms_loaded && !msp->ms_loading)
			unload_when_done = B_TRUE;
		VERIFY0(metaslab_load(msp));

		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
		    vd);
		mutex_exit(&msp->ms_lock);

		error = vdev_initialize_ranges(vd, deadbeef);
		metaslab_enable(msp, B_TRUE, unload_when_done);
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

		range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
		if (error != 0)
			break;
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);
	/* Wait for all outstanding initializing writes to complete. */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight > 0) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	mutex_exit(&vd->vdev_initialize_io_lock);

	range_tree_destroy(vd->vdev_initialize_tree);
	vdev_initialize_block_free(deadbeef);
	vd->vdev_initialize_tree = NULL;

	mutex_enter(&vd->vdev_initialize_lock);
	/* If we finished without a stop request, record completion. */
	if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
		vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
	}
	ASSERT(vd->vdev_initialize_thread != NULL ||
	    vd->vdev_initialize_inflight == 0);

	/*
	 * Drop the vdev_initialize_lock while we sync out the
	 * txg since it's possible that a device might be trying to
	 * come online and must check to see if it needs to restart an
	 * initialization. That thread will be holding the spa_config_lock
	 * which would prevent the txg_wait_synced from completing.
	 */
	mutex_exit(&vd->vdev_initialize_lock);
	txg_wait_synced(spa_get_dsl(spa), 0);
	mutex_enter(&vd->vdev_initialize_lock);

	vd->vdev_initialize_thread = NULL;
	cv_broadcast(&vd->vdev_initialize_cv);
	mutex_exit(&vd->vdev_initialize_lock);
}
549094e47e9SGeorge Wilson 
/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a concrete leaf and not already
 * be initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
	ASSERT(!vd->vdev_detached);
	ASSERT(!vd->vdev_initialize_exit_wanted);
	ASSERT(!vd->vdev_top->vdev_removing);

	/* Persist the new state first, then start the worker thread. */
	vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
	vd->vdev_initialize_thread = thread_create(NULL, 0,
	    vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}
569094e47e9SGeorge Wilson 
570094e47e9SGeorge Wilson /*
571084fd14fSBrian Behlendorf  * Wait for the initialize thread to be terminated (cancelled or stopped).
572084fd14fSBrian Behlendorf  */
573084fd14fSBrian Behlendorf static void
vdev_initialize_stop_wait_impl(vdev_t * vd)574084fd14fSBrian Behlendorf vdev_initialize_stop_wait_impl(vdev_t *vd)
575084fd14fSBrian Behlendorf {
576084fd14fSBrian Behlendorf 	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
577084fd14fSBrian Behlendorf 
578084fd14fSBrian Behlendorf 	while (vd->vdev_initialize_thread != NULL)
579084fd14fSBrian Behlendorf 		cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
580084fd14fSBrian Behlendorf 
581084fd14fSBrian Behlendorf 	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
582084fd14fSBrian Behlendorf 	vd->vdev_initialize_exit_wanted = B_FALSE;
583084fd14fSBrian Behlendorf }
584084fd14fSBrian Behlendorf 
585084fd14fSBrian Behlendorf /*
586084fd14fSBrian Behlendorf  * Wait for vdev initialize threads which were either to cleanly exit.
587094e47e9SGeorge Wilson  */
588094e47e9SGeorge Wilson void
vdev_initialize_stop_wait(spa_t * spa,list_t * vd_list)589084fd14fSBrian Behlendorf vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
590094e47e9SGeorge Wilson {
591084fd14fSBrian Behlendorf 	vdev_t *vd;
592094e47e9SGeorge Wilson 
593084fd14fSBrian Behlendorf 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
594084fd14fSBrian Behlendorf 
595084fd14fSBrian Behlendorf 	while ((vd = list_remove_head(vd_list)) != NULL) {
596084fd14fSBrian Behlendorf 		mutex_enter(&vd->vdev_initialize_lock);
597084fd14fSBrian Behlendorf 		vdev_initialize_stop_wait_impl(vd);
598084fd14fSBrian Behlendorf 		mutex_exit(&vd->vdev_initialize_lock);
599084fd14fSBrian Behlendorf 	}
600084fd14fSBrian Behlendorf }
601084fd14fSBrian Behlendorf 
/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state.  For blocking behavior pass NULL for vd_list.  Otherwise, when
 * a list_t is provided the stopping vdev is inserted in to the list.  Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit.  The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
	ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
	ASSERT(vd->vdev_ops->vdev_op_leaf);
	ASSERT(vdev_is_concrete(vd));

	/*
	 * Allow cancel requests to proceed even if the initialize thread
	 * has stopped.
	 */
	if (vd->vdev_initialize_thread == NULL &&
	    tgt_state != VDEV_INITIALIZE_CANCELED) {
		return;
	}

	/* Record the target state, then ask the worker thread to exit. */
	vdev_initialize_change_state(vd, tgt_state);
	vd->vdev_initialize_exit_wanted = B_TRUE;

	if (vd_list == NULL) {
		/* Blocking mode: wait here for the thread to terminate. */
		vdev_initialize_stop_wait_impl(vd);
	} else {
		/*
		 * Deferred mode: queue this vdev; the caller batches the
		 * waits via vdev_initialize_stop_wait().
		 */
		ASSERT(MUTEX_HELD(&spa_namespace_lock));
		list_insert_tail(vd_list, vd);
	}
}
639094e47e9SGeorge Wilson 
640094e47e9SGeorge Wilson static void
vdev_initialize_stop_all_impl(vdev_t * vd,vdev_initializing_state_t tgt_state,list_t * vd_list)641084fd14fSBrian Behlendorf vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
642084fd14fSBrian Behlendorf     list_t *vd_list)
643094e47e9SGeorge Wilson {
644094e47e9SGeorge Wilson 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
645094e47e9SGeorge Wilson 		mutex_enter(&vd->vdev_initialize_lock);
646084fd14fSBrian Behlendorf 		vdev_initialize_stop(vd, tgt_state, vd_list);
647094e47e9SGeorge Wilson 		mutex_exit(&vd->vdev_initialize_lock);
648094e47e9SGeorge Wilson 		return;
649094e47e9SGeorge Wilson 	}
650094e47e9SGeorge Wilson 
651094e47e9SGeorge Wilson 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
652084fd14fSBrian Behlendorf 		vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
653084fd14fSBrian Behlendorf 		    vd_list);
654094e47e9SGeorge Wilson 	}
655094e47e9SGeorge Wilson }
656094e47e9SGeorge Wilson 
657094e47e9SGeorge Wilson /*
658094e47e9SGeorge Wilson  * Convenience function to stop initializing of a vdev tree and set all
659094e47e9SGeorge Wilson  * initialize thread pointers to NULL.
660094e47e9SGeorge Wilson  */
661094e47e9SGeorge Wilson void
vdev_initialize_stop_all(vdev_t * vd,vdev_initializing_state_t tgt_state)662094e47e9SGeorge Wilson vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
663094e47e9SGeorge Wilson {
664084fd14fSBrian Behlendorf 	spa_t *spa = vd->vdev_spa;
665084fd14fSBrian Behlendorf 	list_t vd_list;
666084fd14fSBrian Behlendorf 
667084fd14fSBrian Behlendorf 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
668084fd14fSBrian Behlendorf 
669084fd14fSBrian Behlendorf 	list_create(&vd_list, sizeof (vdev_t),
670084fd14fSBrian Behlendorf 	    offsetof(vdev_t, vdev_initialize_node));
671084fd14fSBrian Behlendorf 
672084fd14fSBrian Behlendorf 	vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
673084fd14fSBrian Behlendorf 	vdev_initialize_stop_wait(spa, &vd_list);
674094e47e9SGeorge Wilson 
675094e47e9SGeorge Wilson 	if (vd->vdev_spa->spa_sync_on) {
676094e47e9SGeorge Wilson 		/* Make sure that our state has been synced to disk */
677094e47e9SGeorge Wilson 		txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
678094e47e9SGeorge Wilson 	}
679084fd14fSBrian Behlendorf 
680084fd14fSBrian Behlendorf 	list_destroy(&vd_list);
681094e47e9SGeorge Wilson }
682094e47e9SGeorge Wilson 
/*
 * Restore a vdev's persisted initializing state from its leaf ZAP and, if
 * initialization was left ACTIVE, resume it.  Recurses over all children,
 * so it may be called on an interior or root vdev.  Caller must hold the
 * spa_namespace_lock and must not hold the spa config as writer.
 */
void
vdev_initialize_restart(vdev_t *vd)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

	if (vd->vdev_leaf_zap != 0) {
		mutex_enter(&vd->vdev_initialize_lock);
		/* Default to NONE when no state was ever persisted. */
		uint64_t initialize_state = VDEV_INITIALIZE_NONE;
		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
		    sizeof (initialize_state), 1, &initialize_state);
		/* ENOENT just means the ZAP entry was never written. */
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_state = initialize_state;

		uint64_t timestamp = 0;
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
		    sizeof (timestamp), 1, &timestamp);
		ASSERT(err == 0 || err == ENOENT);
		vd->vdev_initialize_action_time = (time_t)timestamp;

		if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
		    vd->vdev_offline) {
			/* load progress for reporting, but don't resume */
			VERIFY0(vdev_initialize_load(vd));
		} else if (vd->vdev_initialize_state ==
		    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
		    !vd->vdev_top->vdev_removing &&
		    vd->vdev_initialize_thread == NULL) {
			/*
			 * ACTIVE state with no thread running means the
			 * initialize was interrupted; spawn a new thread
			 * to resume it.
			 */
			vdev_initialize(vd);
		}

		mutex_exit(&vd->vdev_initialize_lock);
	}

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		vdev_initialize_restart(vd->vdev_child[i]);
	}
}
723