xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev.c (revision 789)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}
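
/*
 * Usage note (illustrative, not in the original source): lookups are keyed
 * by the type strings from sys/fs/zfs.h, e.g. vdev_getops(VDEV_TYPE_MIRROR)
 * should return &vdev_mirror_ops, while an unknown type walks off the end
 * of vdev_ops_table and returns NULL.
 */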

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}
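
/*
 * Worked example (illustrative, assuming a two-way mirror of disks with
 * ashift 9, i.e. 512-byte sectors): for psize = 1000, P2ROUNDUP(1000, 512)
 * yields 1024; each child reports the same 1024 via vdev_psize_to_asize(),
 * so the mirror's asize is 1024.  RAID-Z supplies its own asize function
 * that also accounts for parity, which is why this is only the default.
 */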

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}
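
/*
 * Aside (not in the original source): vdev_guid_sum is maintained so that
 * every vdev's guid sum equals its own guid plus those of all descendants.
 * The root vdev's sum is recorded in the uberblock at sync time, which lets
 * a later import detect that some device of the pool is missing or foreign.
 */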

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	while (guid == 0)
		guid = spa_get_random(-1ULL);

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
	list_create(&vd->vdev_io_pending, sizeof (zio_t),
	    offsetof(zio_t, io_pending));
	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();

	return (vd);
}

/*
 * Free a vdev_t that has been removed from service.
 */
static void
vdev_free_common(vdev_t *vd)
{
	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dirty_lock);
	list_destroy(&vd->vdev_io_pending);
	mutex_destroy(&vd->vdev_io_lock);
	cv_destroy(&vd->vdev_io_cv);

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
vdev_t *
vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (NULL);

	if ((ops = vdev_getops(type)) == NULL)
		return (NULL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (NULL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (NULL);
	}

	vd = vdev_alloc_common(spa, id, guid, ops);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
		    &vd->vdev_ashift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	return (vd);
}

void
vdev_free(vdev_t *vd)
{
	int c;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	/*
	 * It's possible to free a vdev that's been added to the dirty
	 * list when in the middle of spa_vdev_add().  Handle that case
	 * correctly here.
	 */
	if (vd->vdev_is_dirty)
		vdev_config_clean(vd);

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	vdev_free_common(vd);
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_mg->mg_vd = tvd;
	tvd->vdev_ms = svd->vdev_ms;
	tvd->vdev_smo = svd->vdev_smo;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;
	svd->vdev_smo = NULL;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
		svd->vdev_dirty[t] = 0;
	}

	if (svd->vdev_is_dirty) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	ASSERT(svd->vdev_io_retry == NULL);
	ASSERT(list_is_empty(&svd->vdev_io_pending));
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops);

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}
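
/*
 * Context (illustrative, not in the original source): vdev_add_parent()
 * and vdev_remove_parent() bracket the attach/detach life cycle.  A sketch
 * of the expected flow for a device replacement:
 *
 *	mvd = vdev_add_parent(cvd, &vdev_replacing_ops);
 *	... resilver copies cvd's data onto the new child ...
 *	vdev_remove_parent(new_cvd);	(collapses the 1-way replacing vdev)
 */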

void
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	metaslab_class_t *mc = spa_metaslab_class_select(spa);
	uint64_t c;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	space_map_obj_t *smo = vd->vdev_smo;
	metaslab_t **mspp = vd->vdev_ms;

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
	vd->vdev_ms_count = newc;

	if (vd->vdev_mg == NULL) {
		if (txg == 0) {
			dmu_buf_t *db;
			uint64_t *ms_array;

			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
			    KM_SLEEP);

			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
			    0, newc * sizeof (uint64_t), ms_array);

			for (c = 0; c < newc; c++) {
				if (ms_array[c] == 0)
					continue;
				db = dmu_bonus_hold(spa->spa_meta_objset,
				    ms_array[c]);
				dmu_buf_read(db);
				ASSERT3U(db->db_size, ==, sizeof (*smo));
				bcopy(db->db_data, &vd->vdev_smo[c],
				    db->db_size);
				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
				    ms_array[c]);
				dmu_buf_rele(db);
			}
			kmem_free(ms_array, newc * sizeof (uint64_t));
		}
		vd->vdev_mg = metaslab_group_create(mc, vd);
	}

	for (c = 0; c < oldc; c++) {
		vd->vdev_smo[c] = smo[c];
		vd->vdev_ms[c] = mspp[c];
		mspp[c]->ms_smo = &vd->vdev_smo[c];
	}

	for (c = oldc; c < newc; c++)
		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);

	if (oldc != 0) {
		kmem_free(smo, oldc * sizeof (*smo));
		kmem_free(mspp, oldc * sizeof (*mspp));
	}
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}

	if (vd->vdev_smo != NULL) {
		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
		vd->vdev_smo = NULL;
	}
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	vdev_knob_t *vk;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = -1ULL;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);

		*valp = vk->vk_default;
		*valp = MAX(*valp, vk->vk_min);
		*valp = MIN(*valp, vk->vk_max);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_cache_init(vd);
		vdev_queue_init(vd);
		vd->vdev_cache_active = B_TRUE;
	}

	if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
		vd->vdev_state = VDEV_STATE_OFFLINE;
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	dprintf("%s = %d, osize %llu, state = %d\n",
	    vdev_description(vd), error, osize, vd->vdev_state);

	if (error) {
		dprintf("%s in %s failed to open, error %d, aux %d\n",
		    vdev_description(vd),
		    vdev_description(vd->vdev_parent),
		    error,
		    vd->vdev_stat.vs_aux);

		vd->vdev_state = VDEV_STATE_CANT_OPEN;
		return (error);
	}

	vd->vdev_state = VDEV_STATE_HEALTHY;

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
			vd->vdev_state = VDEV_STATE_DEGRADED;

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = ashift;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_ashift) {
			dprintf("%s: ashift grew\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			dprintf("%s: device shrank\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			dprintf("%s: device grew\n", vdev_description(vd));
			vd->vdev_asize = asize;
		}
	}

	return (0);
}
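
/*
 * Note (illustrative, not in the original source): ashift is the base-2
 * logarithm of the device's minimum I/O size, so a 512-byte-sector disk
 * reports ashift 9 and a 4 KB-sector disk reports 12.  The "ashift grew"
 * check above rejects a device whose alignment requirement increased after
 * the pool was created, since existing allocations assumed the old one.
 */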

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);

	vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_cache_active) {
		vdev_cache_fini(vd);
		vdev_queue_fini(vd);
		vd->vdev_cache_active = B_FALSE;
	}

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
}

void
vdev_reopen(vdev_t *vd, zio_t **rq)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		ASSERT(rq == NULL);
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_reopen(rvd->vdev_child[c], NULL);
		return;
	}

	/* only valid for top-level vdevs */
	ASSERT3P(vd, ==, vd->vdev_top);

	/*
	 * vdev_state can change when spa_config_lock is held as writer,
	 * or when it's held as reader and we're doing a vdev_reopen().
	 * To handle the latter case, we grab rvd's io_lock to serialize
	 * reopens.  This ensures that there's never more than one vdev
	 * state changer active at a time.
	 */
	mutex_enter(&rvd->vdev_io_lock);

	mutex_enter(&vd->vdev_io_lock);
	while (list_head(&vd->vdev_io_pending) != NULL)
		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
	vdev_close(vd);
	(void) vdev_open(vd);
	if (rq != NULL) {
		*rq = vd->vdev_io_retry;
		vd->vdev_io_retry = NULL;
	}
	mutex_exit(&vd->vdev_io_lock);

	/*
	 * Reassess root vdev's health.
	 */
	rvd->vdev_state = VDEV_STATE_HEALTHY;
	for (c = 0; c < rvd->vdev_children; c++) {
		uint64_t state = rvd->vdev_child[c]->vdev_state;
		rvd->vdev_state = MIN(rvd->vdev_state, state);
	}

	mutex_exit(&rvd->vdev_io_lock);
}

int
vdev_create(vdev_t *vd, uint64_t txg)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
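
	/*
	 * Worked example (illustrative): for a 1 TB vdev, asize / 200 is
	 * about 5.5 GB, so highbit() yields a vdev_ms_shift of 33 and the
	 * vdev ends up with 2^40 >> 33 = 128 metaslabs of 8 GB each.  The
	 * MAX() against SPA_MAXBLOCKSHIFT keeps tiny vdevs from creating
	 * metaslabs smaller than the largest possible block.
	 */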

	/*
	 * Initialize the vdev's metaslabs.
	 */
	vdev_metaslab_init(vd, txg);
}

void
vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
{
	vdev_t *tvd = vd->vdev_top;

	mutex_enter(&tvd->vdev_dirty_lock);
	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
		    tvd, txg);
	}
	mutex_exit(&tvd->vdev_dirty_lock);
}

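/*
 * Background (illustrative, not in the original source): a DTL (dirty time
 * log) is a space map whose "offsets" are transaction group numbers rather
 * than byte offsets.  vdev_dtl_dirty(sm, txg, 1) therefore records "this
 * vdev may be missing data written in txg", and resilvering later walks
 * these ranges to decide what must be copied.
 */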
void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	int c;

	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0) {
			vdev_t *tvd = vd->vdev_top;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
		}
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_read(db);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(db->db_data, smo, db->db_size);
	dmu_buf_rele(db);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	space_map_t smsync;
	kmutex_t smlock;
	avl_tree_t *t = &sm->sm_root;
	space_seg_t *ss;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(spa->spa_meta_objset,
			    smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
	    0, smo->smo_objsize, tx);

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
	mutex_exit(&vd->vdev_dtl_lock);

	smo->smo_objsize = 0;
	smo->smo_alloc = smsync.sm_space;

	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(smo, db->db_data, db->db_size);
	dmu_buf_rele(db);

	dmu_tx_commit(tx);
}

int
vdev_load(vdev_t *vd, int import)
{
	spa_t *spa = vd->vdev_spa;
	int c, error;
	nvlist_t *label;
	uint64_t guid, state;

	dprintf("loading %s\n", vdev_description(vd));

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
			return (error);

	/*
	 * If this is a leaf vdev, make sure it agrees with its disk labels.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {

		if (vdev_is_dead(vd))
			return (0);

		/*
		 * XXX state transitions don't propagate to parent here.
		 * Also, merely setting the state isn't sufficient because
		 * it's not persistent; a vdev_reopen() would make us
		 * forget all about it.
		 */
		if ((label = vdev_label_read_config(vd)) == NULL) {
			dprintf("can't load label config\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			dprintf("bad or missing pool GUID (%llu)\n", guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
		    guid != vd->vdev_guid) {
			dprintf("bad or missing vdev guid (%llu != %llu)\n",
			    guid, vd->vdev_guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		/*
		 * If we find a vdev with a matching pool guid and vdev guid,
		 * but the pool state is not active, it indicates that the user
		 * exported or destroyed the pool without affecting the config
		 * cache (if / was mounted readonly, for example).  In this
		 * case, immediately return EBADF so the caller can remove it
		 * from the config.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state)) {
			dprintf("missing pool state\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (state != POOL_STATE_ACTIVE &&
		    (!import || state != POOL_STATE_EXPORTED)) {
			dprintf("pool state not active (%llu)\n", state);
			nvlist_free(label);
			return (EBADF);
		}

		nvlist_free(label);
	}

	/*
	 * If this is a top-level vdev, make sure its allocation parameters
	 * exist and initialize its metaslabs.
	 */
	if (vd == vd->vdev_top) {

		if (vd->vdev_ms_array == 0 ||
		    vd->vdev_ms_shift == 0 ||
		    vd->vdev_ashift == 0 ||
		    vd->vdev_asize == 0) {
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		vdev_metaslab_init(vd, 0);
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		error = vdev_dtl_load(vd);
		if (error) {
			dprintf("can't load DTL for %s, error %d\n",
			    vdev_description(vd), error);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}
	}

	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
	    != NULL)
		metaslab_sync_done(msp, txg);
}

void
vdev_add_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	ASSERT(vd == vd->vdev_top);

	if (vd->vdev_ms_array == 0)
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);

	ASSERT(vd->vdev_ms_array != 0);

	vdev_config_dirty(vd);

	dmu_tx_commit(tx);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
	uint8_t dirty = *dirtyp;

	mutex_enter(&vd->vdev_dirty_lock);
	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
	mutex_exit(&vd->vdev_dirty_lock);

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (dirty & VDD_ADD)
		vdev_add_sync(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
		metaslab_sync(msp, txg);

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}
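
/*
 * Note (illustrative, not in the original source): this per-ops hook is
 * what lets RAID-Z report a larger allocated size than the physical size
 * requested, since parity and padding consume extra sectors; plain disks,
 * files and mirrors fall back to vdev_default_asize() above.
 */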

void
vdev_io_start(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
}

void
vdev_io_done(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

int
vdev_online(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("ONLINE: %s\n", vdev_description(vd));

	vd->vdev_offline = B_FALSE;

	/*
	 * Clear the error counts.  The idea is that you expect to see all
	 * zeroes when everything is working, so if you've just onlined a
	 * device, you don't want to keep hearing about errors from before.
	 */
	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	vdev_reopen(vd->vdev_top, NULL);

	spa_config_exit(spa);

	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
vdev_offline(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("OFFLINE: %s\n", vdev_description(vd));

	/*
	 * If this device's top-level vdev has a non-empty DTL,
	 * don't allow the device to be offlined.
	 *
	 * XXX -- we should make this more precise by allowing the offline
	 * as long as the remaining devices don't have any DTL holes.
	 */
	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
		spa_config_exit(spa);
		return (EBUSY);
	}

	/*
	 * Set this device to offline state and reopen its top-level vdev.
	 * If this action results in the top-level vdev becoming unusable,
	 * undo it and fail the request.
	 */
	vd->vdev_offline = B_TRUE;
	vdev_reopen(vd->vdev_top, NULL);
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_offline = B_FALSE;
		vdev_reopen(vd->vdev_top, NULL);
		spa_config_exit(spa);
		return (EBUSY);
	}

	spa_config_exit(spa);

	return (0);
}

int
vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	vd->vdev_fault_mode = mode;
	vd->vdev_fault_mask = mask;
	vd->vdev_fault_arg = arg;

	spa_config_exit(spa);

	return (0);
}
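
/*
 * Usage sketch (illustrative; the device path is hypothetical): to make
 * roughly the next ten reads of a leaf vdev fail with EIO via
 * vdev_error_inject() below, one would set up a counted fault with the
 * mask selecting read zios:
 *
 *	(void) vdev_error_setup(spa, "/dev/dsk/c0t0d0s0",
 *	    VDEV_FAULT_COUNT, 1 << ZIO_TYPE_READ, 10);
 */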
1384*789Sahrens 
1385*789Sahrens int
1386*789Sahrens vdev_is_dead(vdev_t *vd)
1387*789Sahrens {
1388*789Sahrens 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1389*789Sahrens }
1390*789Sahrens 
1391*789Sahrens int
1392*789Sahrens vdev_error_inject(vdev_t *vd, zio_t *zio)
1393*789Sahrens {
1394*789Sahrens 	int error = 0;
1395*789Sahrens 
1396*789Sahrens 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1397*789Sahrens 		return (0);
1398*789Sahrens 
1399*789Sahrens 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1400*789Sahrens 		return (0);
1401*789Sahrens 
1402*789Sahrens 	switch (vd->vdev_fault_mode) {
1403*789Sahrens 	case VDEV_FAULT_RANDOM:
1404*789Sahrens 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1405*789Sahrens 			error = EIO;
1406*789Sahrens 		break;
1407*789Sahrens 
1408*789Sahrens 	case VDEV_FAULT_COUNT:
1409*789Sahrens 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1410*789Sahrens 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1411*789Sahrens 		error = EIO;
1412*789Sahrens 		break;
1413*789Sahrens 	}
1414*789Sahrens 
1415*789Sahrens 	if (error != 0) {
1416*789Sahrens 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1417*789Sahrens 		    error, zio->io_type, vdev_description(vd),
1418*789Sahrens 		    vd->vdev_state, zio->io_offset);
1419*789Sahrens 	}
1420*789Sahrens 
1421*789Sahrens 	return (error);
1422*789Sahrens }
1423*789Sahrens 
1424*789Sahrens /*
1425*789Sahrens  * Get statistics for the given vdev.
1426*789Sahrens  */
1427*789Sahrens void
1428*789Sahrens vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1429*789Sahrens {
1430*789Sahrens 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1431*789Sahrens 	int c, t;
1432*789Sahrens 
1433*789Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1434*789Sahrens 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1435*789Sahrens 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1436*789Sahrens 	vs->vs_state = vd->vdev_state;
1437*789Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1438*789Sahrens 
1439*789Sahrens 	/*
1440*789Sahrens 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1441*789Sahrens 	 * over all top-level vdevs (i.e. the direct children of the root).
1442*789Sahrens 	 */
1443*789Sahrens 	if (vd == rvd) {
1444*789Sahrens 		for (c = 0; c < rvd->vdev_children; c++) {
1445*789Sahrens 			vdev_t *cvd = rvd->vdev_child[c];
1446*789Sahrens 			vdev_stat_t *cvs = &cvd->vdev_stat;
1447*789Sahrens 
1448*789Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1449*789Sahrens 			for (t = 0; t < ZIO_TYPES; t++) {
1450*789Sahrens 				vs->vs_ops[t] += cvs->vs_ops[t];
1451*789Sahrens 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1452*789Sahrens 			}
1453*789Sahrens 			vs->vs_read_errors += cvs->vs_read_errors;
1454*789Sahrens 			vs->vs_write_errors += cvs->vs_write_errors;
1455*789Sahrens 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1456*789Sahrens 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1457*789Sahrens 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1458*789Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1459*789Sahrens 		}
1460*789Sahrens 	}
1461*789Sahrens }
1462*789Sahrens 
1463*789Sahrens void
1464*789Sahrens vdev_stat_update(zio_t *zio)
1465*789Sahrens {
1466*789Sahrens 	vdev_t *vd = zio->io_vd;
1467*789Sahrens 	vdev_t *pvd;
1468*789Sahrens 	uint64_t txg = zio->io_txg;
1469*789Sahrens 	vdev_stat_t *vs = &vd->vdev_stat;
1470*789Sahrens 	zio_type_t type = zio->io_type;
1471*789Sahrens 	int flags = zio->io_flags;
1472*789Sahrens 
1473*789Sahrens 	if (zio->io_error == 0) {
1474*789Sahrens 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1475*789Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1476*789Sahrens 			vs->vs_ops[type]++;
1477*789Sahrens 			vs->vs_bytes[type] += zio->io_size;
1478*789Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1479*789Sahrens 		}
1480*789Sahrens 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1481*789Sahrens 		    zio->io_delegate_list == NULL) {
1482*789Sahrens 			mutex_enter(&vd->vdev_stat_lock);
1483*789Sahrens 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1484*789Sahrens 				vs->vs_scrub_repaired += zio->io_size;
1485*789Sahrens 			else
1486*789Sahrens 				vs->vs_self_healed += zio->io_size;
1487*789Sahrens 			mutex_exit(&vd->vdev_stat_lock);
1488*789Sahrens 		}
1489*789Sahrens 		return;
1490*789Sahrens 	}
1491*789Sahrens 
1492*789Sahrens 	if (flags & ZIO_FLAG_SPECULATIVE)
1493*789Sahrens 		return;
1494*789Sahrens 
1495*789Sahrens 	if (!vdev_is_dead(vd)) {
1496*789Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1497*789Sahrens 		if (type == ZIO_TYPE_READ) {
1498*789Sahrens 			if (zio->io_error == ECKSUM)
1499*789Sahrens 				vs->vs_checksum_errors++;
1500*789Sahrens 			else
1501*789Sahrens 				vs->vs_read_errors++;
1502*789Sahrens 		}
1503*789Sahrens 		if (type == ZIO_TYPE_WRITE)
1504*789Sahrens 			vs->vs_write_errors++;
1505*789Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1506*789Sahrens 	}
1507*789Sahrens 
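	/*
	 * A failed write to a leaf vdev (txg != 0, no children) means the
	 * data for this txg never made it onto the device.  Record the
	 * txg in the DTLs up the vdev tree: the scrub DTLs for failed
	 * repair writes, and the main DTL maps (plus the top-level
	 * vdev's DTL sync list) for ordinary writes.
	 */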
1508*789Sahrens 	if (type == ZIO_TYPE_WRITE) {
1509*789Sahrens 		if (txg == 0 || vd->vdev_children != 0)
1510*789Sahrens 			return;
1511*789Sahrens 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1512*789Sahrens 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1513*789Sahrens 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1514*789Sahrens 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1515*789Sahrens 		}
1516*789Sahrens 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1517*789Sahrens 			vdev_t *tvd = vd->vdev_top;
1518*789Sahrens 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1519*789Sahrens 				return;
1520*789Sahrens 			vdev_dirty(tvd, VDD_DTL, txg);
1521*789Sahrens 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1522*789Sahrens 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1523*789Sahrens 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1524*789Sahrens 		}
1525*789Sahrens 	}
1526*789Sahrens }
1527*789Sahrens 
1528*789Sahrens void
1529*789Sahrens vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1530*789Sahrens {
1531*789Sahrens 	int c;
1532*789Sahrens 	vdev_stat_t *vs = &vd->vdev_stat;
1533*789Sahrens 
1534*789Sahrens 	for (c = 0; c < vd->vdev_children; c++)
1535*789Sahrens 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1536*789Sahrens 
1537*789Sahrens 	mutex_enter(&vd->vdev_stat_lock);
1538*789Sahrens 
1539*789Sahrens 	if (type == POOL_SCRUB_NONE) {
1540*789Sahrens 		/*
1541*789Sahrens 		 * Update completion and end time.  Leave everything else alone
1542*789Sahrens 		 * so we can report what happened during the previous scrub.
1543*789Sahrens 		 */
1544*789Sahrens 		vs->vs_scrub_complete = complete;
1545*789Sahrens 		vs->vs_scrub_end = gethrestime_sec();
1546*789Sahrens 	} else {
1547*789Sahrens 		vs->vs_scrub_type = type;
1548*789Sahrens 		vs->vs_scrub_complete = 0;
1549*789Sahrens 		vs->vs_scrub_examined = 0;
1550*789Sahrens 		vs->vs_scrub_repaired = 0;
1551*789Sahrens 		vs->vs_scrub_errors = 0;
1552*789Sahrens 		vs->vs_scrub_start = gethrestime_sec();
1553*789Sahrens 		vs->vs_scrub_end = 0;
1554*789Sahrens 	}
1555*789Sahrens 
1556*789Sahrens 	mutex_exit(&vd->vdev_stat_lock);
1557*789Sahrens }
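
/*
 * Illustrative sketch (not part of the original source; compiled out):
 * the expected calling pattern for vdev_scrub_stat_update() -- reset
 * the counters when a scrub starts, then stamp completion without
 * clobbering them so the previous run remains reportable.  Assumes the
 * POOL_SCRUB_EVERYTHING scrub type from sys/fs/zfs.h.
 */
#ifdef	ZFS_SCRUB_EXAMPLE
static void
example_scrub_bookkeeping(spa_t *spa)
{
	/* scrub begins: zero the counters, record the start time */
	vdev_scrub_stat_update(spa->spa_root_vdev, POOL_SCRUB_EVERYTHING,
	    B_FALSE);

	/* ... scrub runs ... */

	/* scrub ends: record completion status and end time only */
	vdev_scrub_stat_update(spa->spa_root_vdev, POOL_SCRUB_NONE, B_TRUE);
}
#endif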
1558*789Sahrens 
1559*789Sahrens /*
1560*789Sahrens  * Report checksum errors that a vdev didn't realize it made.
1561*789Sahrens  * This can happen, for example, when RAID-Z combinatorial reconstruction
1562*789Sahrens  * infers that one of its components returned bad data.
1563*789Sahrens  */
1564*789Sahrens void
1565*789Sahrens vdev_checksum_error(zio_t *zio, vdev_t *vd)
1566*789Sahrens {
1567*789Sahrens 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1568*789Sahrens 	    vdev_description(vd));
1569*789Sahrens 
1570*789Sahrens 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1571*789Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1572*789Sahrens 		vd->vdev_stat.vs_checksum_errors++;
1573*789Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1574*789Sahrens 	}
1575*789Sahrens }
1576*789Sahrens 
1577*789Sahrens /*
1578*789Sahrens  * Update the in-core space usage stats for this vdev and the root vdev.
1579*789Sahrens  */
1580*789Sahrens void
1581*789Sahrens vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1582*789Sahrens {
1583*789Sahrens 	ASSERT(vd == vd->vdev_top);
1584*789Sahrens 
1585*789Sahrens 	do {
1586*789Sahrens 		mutex_enter(&vd->vdev_stat_lock);
1587*789Sahrens 		vd->vdev_stat.vs_space += space_delta;
1588*789Sahrens 		vd->vdev_stat.vs_alloc += alloc_delta;
1589*789Sahrens 		mutex_exit(&vd->vdev_stat_lock);
1590*789Sahrens 	} while ((vd = vd->vdev_parent) != NULL);
1591*789Sahrens }
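
/*
 * Illustrative sketch (not part of the original source; compiled out):
 * frees are expressed as negative deltas.  The parameters are uint64_t,
 * so the negation relies on two's-complement wraparound, which the
 * additions above apply correctly.
 */
#ifdef	ZFS_SPACE_EXAMPLE
static void
example_record_free(vdev_t *tvd, uint64_t size)
{
	/* allocated bytes drop by 'size'; total space is unchanged */
	vdev_space_update(tvd, 0, -size);
}
#endif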
1592*789Sahrens 
1593*789Sahrens /*
1594*789Sahrens  * Various knobs to tune a vdev.
1595*789Sahrens  */
1596*789Sahrens static vdev_knob_t vdev_knob[] = {
1597*789Sahrens 	{
1598*789Sahrens 		"cache_size",
1599*789Sahrens 		"size of the read-ahead cache",
1600*789Sahrens 		0,
1601*789Sahrens 		1ULL << 30,
1602*789Sahrens 		10ULL << 20,
1603*789Sahrens 		offsetof(struct vdev, vdev_cache.vc_size)
1604*789Sahrens 	},
1605*789Sahrens 	{
1606*789Sahrens 		"cache_bshift",
1607*789Sahrens 		"log2 of cache blocksize",
1608*789Sahrens 		SPA_MINBLOCKSHIFT,
1609*789Sahrens 		SPA_MAXBLOCKSHIFT,
1610*789Sahrens 		16,
1611*789Sahrens 		offsetof(struct vdev, vdev_cache.vc_bshift)
1612*789Sahrens 	},
1613*789Sahrens 	{
1614*789Sahrens 		"cache_max",
1615*789Sahrens 		"largest block size to cache",
1616*789Sahrens 		0,
1617*789Sahrens 		SPA_MAXBLOCKSIZE,
1618*789Sahrens 		1ULL << 14,
1619*789Sahrens 		offsetof(struct vdev, vdev_cache.vc_max)
1620*789Sahrens 	},
1621*789Sahrens 	{
1622*789Sahrens 		"min_pending",
1623*789Sahrens 		"minimum pending I/Os to the disk",
1624*789Sahrens 		1,
1625*789Sahrens 		10000,
1626*789Sahrens 		2,
1627*789Sahrens 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1628*789Sahrens 	},
1629*789Sahrens 	{
1630*789Sahrens 		"max_pending",
1631*789Sahrens 		"maximum pending I/Os to the disk",
1632*789Sahrens 		1,
1633*789Sahrens 		10000,
1634*789Sahrens 		35,
1635*789Sahrens 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1636*789Sahrens 	},
1637*789Sahrens 	{
1638*789Sahrens 		"agg_limit",
1639*789Sahrens 		"maximum size of aggregated I/Os",
1640*789Sahrens 		0,
1641*789Sahrens 		SPA_MAXBLOCKSIZE,
1642*789Sahrens 		SPA_MAXBLOCKSIZE,
1643*789Sahrens 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1644*789Sahrens 	},
1645*789Sahrens 	{
1646*789Sahrens 		"time_shift",
1647*789Sahrens 		"deadline = pri + (lbolt >> time_shift)",
1648*789Sahrens 		0,
1649*789Sahrens 		63,
1650*789Sahrens 		4,
1651*789Sahrens 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1652*789Sahrens 	},
1653*789Sahrens 	{
1654*789Sahrens 		"ramp_rate",
1655*789Sahrens 		"exponential I/O issue ramp-up rate",
1656*789Sahrens 		1,
1657*789Sahrens 		10000,
1658*789Sahrens 		2,
1659*789Sahrens 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1660*789Sahrens 	},
1661*789Sahrens };
1662*789Sahrens 
1663*789Sahrens vdev_knob_t *
1664*789Sahrens vdev_knob_next(vdev_knob_t *vk)
1665*789Sahrens {
1666*789Sahrens 	if (vk == NULL)
1667*789Sahrens 		return (vdev_knob);
1668*789Sahrens 
1669*789Sahrens 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1670*789Sahrens 		return (NULL);
1671*789Sahrens 
1672*789Sahrens 	return (vk);
1673*789Sahrens }
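
/*
 * Illustrative sketch (not part of the original source; compiled out):
 * walking every knob with the iterator above.  The field names
 * (vk_offset, vk_default) are assumed from the vdev_knob_t initializers
 * earlier in this file (name, description, min, max, default, offset);
 * the offset locates the tunable uint64_t inside the vdev structure.
 */
#ifdef	ZFS_KNOB_EXAMPLE
static void
example_apply_knob_defaults(vdev_t *vd)
{
	vdev_knob_t *vk;

	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk))
		*(uint64_t *)((char *)vd + vk->vk_offset) = vk->vk_default;
}
#endif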
1674*789Sahrens 
1675*789Sahrens /*
1676*789Sahrens  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1677*789Sahrens  * so that it will be written out next time the vdev configuration is synced.
1678*789Sahrens  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1679*789Sahrens  */
1680*789Sahrens void
1681*789Sahrens vdev_config_dirty(vdev_t *vd)
1682*789Sahrens {
1683*789Sahrens 	spa_t *spa = vd->vdev_spa;
1684*789Sahrens 	vdev_t *rvd = spa->spa_root_vdev;
1685*789Sahrens 	int c;
1686*789Sahrens 
1687*789Sahrens 	if (vd == rvd) {
1688*789Sahrens 		for (c = 0; c < rvd->vdev_children; c++)
1689*789Sahrens 			vdev_config_dirty(rvd->vdev_child[c]);
1690*789Sahrens 	} else {
1691*789Sahrens 		ASSERT(vd == vd->vdev_top);
1692*789Sahrens 
1693*789Sahrens 		if (!vd->vdev_is_dirty) {
1694*789Sahrens 			list_insert_head(&spa->spa_dirty_list, vd);
1695*789Sahrens 			vd->vdev_is_dirty = B_TRUE;
1696*789Sahrens 		}
1697*789Sahrens 	}
1698*789Sahrens }
1699*789Sahrens 
1700*789Sahrens void
1701*789Sahrens vdev_config_clean(vdev_t *vd)
1702*789Sahrens {
1703*789Sahrens 	ASSERT(vd->vdev_is_dirty);
1704*789Sahrens 
1705*789Sahrens 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1706*789Sahrens 	vd->vdev_is_dirty = B_FALSE;
1707*789Sahrens }
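
/*
 * Illustrative sketch (not part of the original source; compiled out):
 * how a config-sync loop might drain the dirty list built by
 * vdev_config_dirty().  The real consumer lives in the SPA sync path.
 */
#ifdef	ZFS_CONFIG_EXAMPLE
static void
example_sync_dirty_configs(spa_t *spa)
{
	vdev_t *vd;

	while ((vd = list_head(&spa->spa_dirty_list)) != NULL) {
		/* ... write out vd's updated config/label here ... */
		vdev_config_clean(vd);
	}
}
#endif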
1708*789Sahrens 
1709*789Sahrens /*
1710*789Sahrens  * Set a vdev's state, updating any parent's state as well.
1711*789Sahrens  */
1712*789Sahrens void
1713*789Sahrens vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1714*789Sahrens {
1715*789Sahrens 	if (state == vd->vdev_state)
1716*789Sahrens 		return;
1717*789Sahrens 
1718*789Sahrens 	vd->vdev_state = state;
1719*789Sahrens 	vd->vdev_stat.vs_aux = aux;
1720*789Sahrens 
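	/*
	 * Propagate upward: count how many of the parent's children are
	 * faulted or degraded, then let the parent's ops vector derive
	 * its own state from those counts (a mirror, for example, is
	 * merely degraded while at least one child remains healthy).
	 */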
1721*789Sahrens 	if (vd->vdev_parent != NULL) {
1722*789Sahrens 		int c;
1723*789Sahrens 		int degraded = 0, faulted = 0;
1724*789Sahrens 		vdev_t *parent, *child;
1725*789Sahrens 
1726*789Sahrens 		parent = vd->vdev_parent;
1727*789Sahrens 		for (c = 0; c < parent->vdev_children; c++) {
1728*789Sahrens 			child = parent->vdev_child[c];
1729*789Sahrens 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1730*789Sahrens 				faulted++;
1731*789Sahrens 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1732*789Sahrens 				degraded++;
1733*789Sahrens 		}
1734*789Sahrens 
1735*789Sahrens 		parent->vdev_ops->vdev_op_state_change(parent,
1736*789Sahrens 		    faulted, degraded);
1737*789Sahrens 	}
1738*789Sahrens }
1739