xref: /onnv-gate/usr/src/uts/common/fs/zfs/zio.c (revision 7754:b80e4842ad54)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
51544Seschrock  * Common Development and Distribution License (the "License").
61544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
21789Sahrens /*
226245Smaybee  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23789Sahrens  * Use is subject to license terms.
24789Sahrens  */
25789Sahrens 
26789Sahrens #include <sys/zfs_context.h>
271544Seschrock #include <sys/fm/fs/zfs.h>
28789Sahrens #include <sys/spa.h>
29789Sahrens #include <sys/txg.h>
30789Sahrens #include <sys/spa_impl.h>
31789Sahrens #include <sys/vdev_impl.h>
32789Sahrens #include <sys/zio_impl.h>
33789Sahrens #include <sys/zio_compress.h>
34789Sahrens #include <sys/zio_checksum.h>
35789Sahrens 
36789Sahrens /*
37789Sahrens  * ==========================================================================
38789Sahrens  * I/O priority table
39789Sahrens  * ==========================================================================
40789Sahrens  */
41789Sahrens uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
42789Sahrens 	0,	/* ZIO_PRIORITY_NOW		*/
43789Sahrens 	0,	/* ZIO_PRIORITY_SYNC_READ	*/
44789Sahrens 	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
45789Sahrens 	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
46789Sahrens 	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
47789Sahrens 	4,	/* ZIO_PRIORITY_FREE		*/
48789Sahrens 	0,	/* ZIO_PRIORITY_CACHE_FILL	*/
49789Sahrens 	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
50789Sahrens 	10,	/* ZIO_PRIORITY_RESILVER	*/
51789Sahrens 	20,	/* ZIO_PRIORITY_SCRUB		*/
52789Sahrens };
53789Sahrens 
54789Sahrens /*
55789Sahrens  * ==========================================================================
56789Sahrens  * I/O type descriptions
57789Sahrens  * ==========================================================================
58789Sahrens  */
59789Sahrens char *zio_type_name[ZIO_TYPES] = {
60789Sahrens 	"null", "read", "write", "free", "claim", "ioctl" };
61789Sahrens 
62*7754SJeff.Bonwick@Sun.COM #define	SYNC_PASS_DEFERRED_FREE	1	/* defer frees after this pass */
63*7754SJeff.Bonwick@Sun.COM #define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
64*7754SJeff.Bonwick@Sun.COM #define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
655329Sgw25295 
66789Sahrens /*
67789Sahrens  * ==========================================================================
68789Sahrens  * I/O kmem caches
69789Sahrens  * ==========================================================================
70789Sahrens  */
714055Seschrock kmem_cache_t *zio_cache;
72789Sahrens kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
733290Sjohansen kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
743290Sjohansen 
753290Sjohansen #ifdef _KERNEL
763290Sjohansen extern vmem_t *zio_alloc_arena;
773290Sjohansen #endif
78789Sahrens 
795329Sgw25295 /*
80*7754SJeff.Bonwick@Sun.COM  * An allocating zio is one that either currently has the DVA allocate
81*7754SJeff.Bonwick@Sun.COM  * stage set or will have it later in its lifetime.
825329Sgw25295  */
835329Sgw25295 #define	IO_IS_ALLOCATING(zio) \
845688Sbonwick 	((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE))
855329Sgw25295 
/*
 * Create the kmem caches used by the zio subsystem: the zio_t cache
 * itself, plus a set of buffer caches (metadata and data variants)
 * covering every legal ZFS block size.
 */
void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	/* In the kernel, data buffers come from a dedicated arena. */
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
	 * for each quarter-power of 2.  For large buffers, we want
	 * a cache for each multiple of PAGESIZE.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;

		/* Reduce p2 to the largest power of two <= size. */
		while (p2 & (p2 - 1))
			p2 &= p2 - 1;

		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (P2PHASE(size, PAGESIZE) == 0) {
			align = PAGESIZE;
		} else if (P2PHASE(size, p2 >> 2) == 0) {
			align = p2 >> 2;
		}

		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);

			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    KMC_NODEBUG);
		}
	}

	/*
	 * Fill in the gaps: any size for which no cache was created
	 * above shares the next-larger cache that does exist.
	 */
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}
145789Sahrens 
/*
 * Destroy the caches created by zio_init().  Because zio_init() fills
 * gaps by aliasing adjacent slots to the same cache, remember the last
 * cache destroyed and skip duplicates to avoid double-destroying.
 */
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}
171789Sahrens 
172789Sahrens /*
173789Sahrens  * ==========================================================================
174789Sahrens  * Allocate and free I/O buffers
175789Sahrens  * ==========================================================================
176789Sahrens  */
1773290Sjohansen 
1783290Sjohansen /*
1793290Sjohansen  * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
1803290Sjohansen  * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
1813290Sjohansen  * useful to inspect ZFS metadata, but if possible, we should avoid keeping
1823290Sjohansen  * excess / transient data in-core during a crashdump.
1833290Sjohansen  */
184789Sahrens void *
185789Sahrens zio_buf_alloc(size_t size)
186789Sahrens {
187789Sahrens 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
188789Sahrens 
189789Sahrens 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
190789Sahrens 
1916245Smaybee 	return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
192789Sahrens }
193789Sahrens 
1943290Sjohansen /*
1953290Sjohansen  * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
1963290Sjohansen  * crashdump if the kernel panics.  This exists so that we will limit the amount
1973290Sjohansen  * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
1983290Sjohansen  * of kernel heap dumped to disk when the kernel panics)
1993290Sjohansen  */
2003290Sjohansen void *
2013290Sjohansen zio_data_buf_alloc(size_t size)
2023290Sjohansen {
2033290Sjohansen 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
2043290Sjohansen 
2053290Sjohansen 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
2063290Sjohansen 
2076245Smaybee 	return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
2083290Sjohansen }
2093290Sjohansen 
210789Sahrens void
211789Sahrens zio_buf_free(void *buf, size_t size)
212789Sahrens {
213789Sahrens 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
214789Sahrens 
215789Sahrens 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
216789Sahrens 
217789Sahrens 	kmem_cache_free(zio_buf_cache[c], buf);
218789Sahrens }
219789Sahrens 
2203290Sjohansen void
2213290Sjohansen zio_data_buf_free(void *buf, size_t size)
2223290Sjohansen {
2233290Sjohansen 	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
2243290Sjohansen 
2253290Sjohansen 	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
2263290Sjohansen 
2273290Sjohansen 	kmem_cache_free(zio_data_buf_cache[c], buf);
2283290Sjohansen }
2293463Sahrens 
230789Sahrens /*
231789Sahrens  * ==========================================================================
232789Sahrens  * Push and pop I/O transform buffers
233789Sahrens  * ==========================================================================
234789Sahrens  */
/*
 * Push a transform onto the zio's transform stack: save the current
 * (io_data, io_size) as the originals, then retarget the zio at the
 * new buffer.  'transform' (may be NULL) is invoked when the stack is
 * popped to convert the transformed data back into the original buffer.
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
	zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	/* Remember the zio's pre-transform buffer and size. */
	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;	/* allocated size of 'data', for free */
	zt->zt_transform = transform;

	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	/* From now on, the zio operates on the transform buffer. */
	zio->io_data = data;
	zio->io_size = size;
}
252789Sahrens 
/*
 * Unwind the entire transform stack.  For each entry, innermost first:
 * run its callback (if any) to deliver results into the original
 * buffer, free the intermediate buffer, restore the saved (data, size),
 * then free the stack entry itself.
 */
static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		/* Free the transform buffer before restoring the original. */
		zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
272789Sahrens 
273789Sahrens /*
274789Sahrens  * ==========================================================================
275*7754SJeff.Bonwick@Sun.COM  * I/O transform callbacks for subblocks and decompression
276*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
277*7754SJeff.Bonwick@Sun.COM  */
278*7754SJeff.Bonwick@Sun.COM static void
279*7754SJeff.Bonwick@Sun.COM zio_subblock(zio_t *zio, void *data, uint64_t size)
280*7754SJeff.Bonwick@Sun.COM {
281*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_size > size);
282*7754SJeff.Bonwick@Sun.COM 
283*7754SJeff.Bonwick@Sun.COM 	if (zio->io_type == ZIO_TYPE_READ)
284*7754SJeff.Bonwick@Sun.COM 		bcopy(zio->io_data, data, size);
285*7754SJeff.Bonwick@Sun.COM }
286*7754SJeff.Bonwick@Sun.COM 
287*7754SJeff.Bonwick@Sun.COM static void
288*7754SJeff.Bonwick@Sun.COM zio_decompress(zio_t *zio, void *data, uint64_t size)
289*7754SJeff.Bonwick@Sun.COM {
290*7754SJeff.Bonwick@Sun.COM 	if (zio->io_error == 0 &&
291*7754SJeff.Bonwick@Sun.COM 	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
292*7754SJeff.Bonwick@Sun.COM 	    zio->io_data, zio->io_size, data, size) != 0)
293*7754SJeff.Bonwick@Sun.COM 		zio->io_error = EIO;
294*7754SJeff.Bonwick@Sun.COM }
295*7754SJeff.Bonwick@Sun.COM 
296*7754SJeff.Bonwick@Sun.COM /*
297*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
298*7754SJeff.Bonwick@Sun.COM  * I/O parent/child relationships and pipeline interlocks
299*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
300*7754SJeff.Bonwick@Sun.COM  */
301*7754SJeff.Bonwick@Sun.COM 
/*
 * Register zio as a child of pio.  Under pio's lock: bump pio's
 * outstanding-child count for each wait point (READY/DONE) the child
 * has not yet passed, and push the child onto the head of pio's
 * doubly-linked sibling list.
 */
static void
zio_add_child(zio_t *pio, zio_t *zio)
{
	mutex_enter(&pio->io_lock);
	if (zio->io_stage < ZIO_STAGE_READY)
		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
	if (zio->io_stage < ZIO_STAGE_DONE)
		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
	/* Head insertion into pio's child list. */
	zio->io_sibling_prev = NULL;
	zio->io_sibling_next = pio->io_child;
	if (pio->io_child != NULL)
		pio->io_child->io_sibling_prev = zio;
	pio->io_child = zio;
	zio->io_parent = pio;
	mutex_exit(&pio->io_lock);
}
318*7754SJeff.Bonwick@Sun.COM 
319*7754SJeff.Bonwick@Sun.COM static void
320*7754SJeff.Bonwick@Sun.COM zio_remove_child(zio_t *pio, zio_t *zio)
321*7754SJeff.Bonwick@Sun.COM {
322*7754SJeff.Bonwick@Sun.COM 	zio_t *next, *prev;
323*7754SJeff.Bonwick@Sun.COM 
324*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_parent == pio);
325*7754SJeff.Bonwick@Sun.COM 
326*7754SJeff.Bonwick@Sun.COM 	mutex_enter(&pio->io_lock);
327*7754SJeff.Bonwick@Sun.COM 	next = zio->io_sibling_next;
328*7754SJeff.Bonwick@Sun.COM 	prev = zio->io_sibling_prev;
329*7754SJeff.Bonwick@Sun.COM 	if (next != NULL)
330*7754SJeff.Bonwick@Sun.COM 		next->io_sibling_prev = prev;
331*7754SJeff.Bonwick@Sun.COM 	if (prev != NULL)
332*7754SJeff.Bonwick@Sun.COM 		prev->io_sibling_next = next;
333*7754SJeff.Bonwick@Sun.COM 	if (pio->io_child == zio)
334*7754SJeff.Bonwick@Sun.COM 		pio->io_child = next;
335*7754SJeff.Bonwick@Sun.COM 	mutex_exit(&pio->io_lock);
336*7754SJeff.Bonwick@Sun.COM }
337*7754SJeff.Bonwick@Sun.COM 
/*
 * If zio still has outstanding children of type 'child' at wait point
 * 'wait', stall its pipeline: back io_stage up by one so the current
 * stage reruns on resume, and record which counter we are stalled on
 * (zio_notify_parent() restarts us when it hits zero).  Returns B_TRUE
 * if the zio must wait.
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		zio->io_stage--;	/* rerun this stage when resumed */
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}
355*7754SJeff.Bonwick@Sun.COM 
/*
 * A child has passed the given wait point: fold its error (unless
 * DONT_PROPAGATE) and reexecute bits into the parent, and decrement
 * the parent's outstanding-child count.  If that was the counter the
 * parent is stalled on and it reached zero, resume the parent's
 * pipeline -- dropping pio's lock before calling zio_execute().
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);
	if (--*countp == 0 && pio->io_stall == countp) {
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
375*7754SJeff.Bonwick@Sun.COM 
376*7754SJeff.Bonwick@Sun.COM static void
377*7754SJeff.Bonwick@Sun.COM zio_inherit_child_errors(zio_t *zio, enum zio_child c)
378*7754SJeff.Bonwick@Sun.COM {
379*7754SJeff.Bonwick@Sun.COM 	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
380*7754SJeff.Bonwick@Sun.COM 		zio->io_error = zio->io_child_error[c];
381*7754SJeff.Bonwick@Sun.COM }
382*7754SJeff.Bonwick@Sun.COM 
383*7754SJeff.Bonwick@Sun.COM /*
384*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
385*7754SJeff.Bonwick@Sun.COM  * Create the various types of I/O (read, write, free, etc)
386789Sahrens  * ==========================================================================
387789Sahrens  */
/*
 * Common constructor behind every zio_* creation routine.  Allocates
 * and zeroes a zio, classifies it (logical/gang/vdev child), snapshots
 * the block pointer if any, records the initial flags/stage/pipeline
 * (the io_orig_* copies are consulted later, e.g. io_orig_pipeline by
 * IO_IS_ALLOCATING()), and links the zio to its parent if one is given.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset,
    const zbookmark_t *zb, uint8_t stage, uint32_t pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	/* Classify: vdev I/Os and gang children get their own child type. */
	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
			/* Gang blocks need the extra gang pipeline stages. */
			if (BP_IS_GANG(bp))
				pipeline |= ZIO_GANG_STAGES;
			zio->io_logical = zio;
		}
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_data = data;
	zio->io_size = size;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	/* Keep pristine copies so the zio can be re-run from scratch. */
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/*
		 * Logical I/Os can have logical, gang, or vdev children.
		 * Gang I/Os can have gang or vdev children.
		 * Vdev I/Os can only have vdev children.
		 * The following ASSERT captures all of these constraints.
		 */
		ASSERT(zio->io_child_type <= pio->io_child_type);
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		zio_add_child(pio, zio);
	}

	return (zio);
}
462789Sahrens 
/*
 * Tear down a zio: destroy its lock and cv and return it to the cache.
 * io_spa and io_async_root are read out first because the zio is
 * invalid after kmem_cache_free().  If this was an async root, drop
 * the spa's async-root count and wake waiters when it reaches zero.
 */
static void
zio_destroy(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	uint8_t async_root = zio->io_async_root;

	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);

	if (async_root) {
		mutex_enter(&spa->spa_async_root_lock);
		if (--spa->spa_async_root_count == 0)
			cv_broadcast(&spa->spa_async_root_cv);
		mutex_exit(&spa->spa_async_root_lock);
	}
}
4805329Sgw25295 
481789Sahrens zio_t *
482789Sahrens zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
483789Sahrens 	int flags)
484789Sahrens {
485789Sahrens 	zio_t *zio;
486789Sahrens 
487789Sahrens 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
488*7754SJeff.Bonwick@Sun.COM 	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
489*7754SJeff.Bonwick@Sun.COM 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
490789Sahrens 
491789Sahrens 	return (zio);
492789Sahrens }
493789Sahrens 
/*
 * Create a parentless null zio, typically used as the root of an I/O
 * tree that other zios are attached to.
 */
zio_t *
zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
{
	return (zio_null(NULL, spa, done, private, flags));
}
499789Sahrens 
500789Sahrens zio_t *
501*7754SJeff.Bonwick@Sun.COM zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
502*7754SJeff.Bonwick@Sun.COM     void *data, uint64_t size, zio_done_func_t *done, void *private,
5037046Sahrens     int priority, int flags, const zbookmark_t *zb)
504789Sahrens {
505789Sahrens 	zio_t *zio;
506789Sahrens 
5077046Sahrens 	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
5087046Sahrens 	    data, size, done, private,
509*7754SJeff.Bonwick@Sun.COM 	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
5102981Sahrens 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
511789Sahrens 
512789Sahrens 	return (zio);
513789Sahrens }
514789Sahrens 
/*
 * Create a logical write.  The write properties in *zp (checksum,
 * compression, DMU object type, level, DVA count) are range-checked
 * and copied into the zio by value.  The 'ready' callback is mandatory
 * (asserted below).
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *done, void *private,
    int priority, int flags, const zbookmark_t *zb)
{
	zio_t *zio;

	/* Validate every write property before committing to the write. */
	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    zp->zp_type < DMU_OT_NUMTYPES &&
	    zp->zp_level < 32 &&
	    zp->zp_ndvas > 0 &&
	    zp->zp_ndvas <= spa_max_replication(spa));
	ASSERT(ready != NULL);

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_prop = *zp;	/* copy by value; caller's zp may not persist */

	return (zio);
}
542789Sahrens 
543789Sahrens zio_t *
544*7754SJeff.Bonwick@Sun.COM zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
545*7754SJeff.Bonwick@Sun.COM     uint64_t size, zio_done_func_t *done, void *private, int priority,
546*7754SJeff.Bonwick@Sun.COM     int flags, zbookmark_t *zb)
547789Sahrens {
548789Sahrens 	zio_t *zio;
549789Sahrens 
5507181Sperrin 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
551*7754SJeff.Bonwick@Sun.COM 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
552*7754SJeff.Bonwick@Sun.COM 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
553789Sahrens 
554789Sahrens 	return (zio);
555789Sahrens }
556789Sahrens 
557789Sahrens zio_t *
558789Sahrens zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
559*7754SJeff.Bonwick@Sun.COM     zio_done_func_t *done, void *private, int flags)
560789Sahrens {
561789Sahrens 	zio_t *zio;
562789Sahrens 
563789Sahrens 	ASSERT(!BP_IS_HOLE(bp));
564789Sahrens 
565*7754SJeff.Bonwick@Sun.COM 	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
566*7754SJeff.Bonwick@Sun.COM 		return (zio_null(pio, spa, NULL, NULL, flags));
567*7754SJeff.Bonwick@Sun.COM 
568789Sahrens 	if (txg == spa->spa_syncing_txg &&
569*7754SJeff.Bonwick@Sun.COM 	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
570789Sahrens 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
571*7754SJeff.Bonwick@Sun.COM 		return (zio_null(pio, spa, NULL, NULL, flags));
572789Sahrens 	}
573789Sahrens 
574*7754SJeff.Bonwick@Sun.COM 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
575*7754SJeff.Bonwick@Sun.COM 	    done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
576*7754SJeff.Bonwick@Sun.COM 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
577789Sahrens 
578789Sahrens 	return (zio);
579789Sahrens }
580789Sahrens 
581789Sahrens zio_t *
582789Sahrens zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
583*7754SJeff.Bonwick@Sun.COM     zio_done_func_t *done, void *private, int flags)
584789Sahrens {
585789Sahrens 	zio_t *zio;
586789Sahrens 
587789Sahrens 	/*
588789Sahrens 	 * A claim is an allocation of a specific block.  Claims are needed
589789Sahrens 	 * to support immediate writes in the intent log.  The issue is that
590789Sahrens 	 * immediate writes contain committed data, but in a txg that was
591789Sahrens 	 * *not* committed.  Upon opening the pool after an unclean shutdown,
592789Sahrens 	 * the intent log claims all blocks that contain immediate write data
593789Sahrens 	 * so that the SPA knows they're in use.
594789Sahrens 	 *
595789Sahrens 	 * All claims *must* be resolved in the first txg -- before the SPA
596789Sahrens 	 * starts allocating blocks -- so that nothing is allocated twice.
597789Sahrens 	 */
598789Sahrens 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
599789Sahrens 	ASSERT3U(spa_first_txg(spa), <=, txg);
600789Sahrens 
601*7754SJeff.Bonwick@Sun.COM 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
602*7754SJeff.Bonwick@Sun.COM 	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
603*7754SJeff.Bonwick@Sun.COM 	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
604789Sahrens 
605789Sahrens 	return (zio);
606789Sahrens }
607789Sahrens 
608789Sahrens zio_t *
609789Sahrens zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
610789Sahrens     zio_done_func_t *done, void *private, int priority, int flags)
611789Sahrens {
612789Sahrens 	zio_t *zio;
613789Sahrens 	int c;
614789Sahrens 
615789Sahrens 	if (vd->vdev_children == 0) {
616789Sahrens 		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
617*7754SJeff.Bonwick@Sun.COM 		    ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
618789Sahrens 		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
619789Sahrens 
620789Sahrens 		zio->io_cmd = cmd;
621789Sahrens 	} else {
622789Sahrens 		zio = zio_null(pio, spa, NULL, NULL, flags);
623789Sahrens 
624789Sahrens 		for (c = 0; c < vd->vdev_children; c++)
625789Sahrens 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
626789Sahrens 			    done, private, priority, flags));
627789Sahrens 	}
628789Sahrens 
629789Sahrens 	return (zio);
630789Sahrens }
631789Sahrens 
/*
 * Create a physical read of a leaf vdev at a raw device offset,
 * bypassing DVA translation.  If 'labels' is set, the I/O must fall
 * entirely within the front or back vdev label region.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);		/* leaf vdevs only */
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	/* Physical I/Os carry an explicit checksum rather than a bp's. */
	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
652789Sahrens 
/*
 * Create a physical write to a leaf vdev at a raw device offset,
 * bypassing DVA translation.  Same label-region constraints as
 * zio_read_phys().  Embedded (zbt) checksums require a private copy
 * of the data -- see the comment below.
 */
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    int priority, int flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);		/* leaf vdevs only */
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	if (zio_checksum_table[checksum].ci_zbt) {
		/*
		 * zbt checksums are necessarily destructive -- they modify
		 * the end of the write buffer to hold the verifier/checksum.
		 * Therefore, we must make a local copy in case the data is
		 * being written to multiple places in parallel.
		 */
		void *wbuf = zio_buf_alloc(size);
		bcopy(data, wbuf, size);
		/* The copy is freed when the transform stack is popped. */
		zio_push_transform(zio, wbuf, size, size, NULL);
	}

	return (zio);
}
685789Sahrens 
/*
 * Create a child I/O to do some work for us.
 *
 * The child runs against vdev 'vd', which must be a direct child of the
 * parent i/o's vdev (or of the root vdev if the parent has none).  Flags
 * marked ZIO_FLAG_VDEV_INHERIT are propagated from the parent, and the
 * child is always allowed to fail without failing the parent outright
 * (CANFAIL | DONT_PROPAGATE).
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
	void *data, uint64_t size, int type, int priority, int flags,
	zio_done_func_t *done, void *private)
{
	uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	/* The child's vdev must be exactly one level below the parent's. */
	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
	}

	/*
	 * Leaf vdevs address raw device offsets: shift past the front
	 * label so that logical offset 0 lands just after the labels.
	 */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority,
	    (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) |
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags,
	    vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);

	return (zio);
}
723*7754SJeff.Bonwick@Sun.COM 
724*7754SJeff.Bonwick@Sun.COM zio_t *
725*7754SJeff.Bonwick@Sun.COM zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
726*7754SJeff.Bonwick@Sun.COM 	int type, int priority, int flags, zio_done_func_t *done, void *private)
727*7754SJeff.Bonwick@Sun.COM {
728*7754SJeff.Bonwick@Sun.COM 	zio_t *zio;
729*7754SJeff.Bonwick@Sun.COM 
730*7754SJeff.Bonwick@Sun.COM 	ASSERT(vd->vdev_ops->vdev_op_leaf);
731*7754SJeff.Bonwick@Sun.COM 
732*7754SJeff.Bonwick@Sun.COM 	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
733*7754SJeff.Bonwick@Sun.COM 	    data, size, done, private, type, priority,
734*7754SJeff.Bonwick@Sun.COM 	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
735*7754SJeff.Bonwick@Sun.COM 	    vd, offset, NULL,
736*7754SJeff.Bonwick@Sun.COM 	    ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE);
737*7754SJeff.Bonwick@Sun.COM 
738*7754SJeff.Bonwick@Sun.COM 	return (zio);
739*7754SJeff.Bonwick@Sun.COM }
740*7754SJeff.Bonwick@Sun.COM 
741*7754SJeff.Bonwick@Sun.COM void
742*7754SJeff.Bonwick@Sun.COM zio_flush(zio_t *zio, vdev_t *vd)
743*7754SJeff.Bonwick@Sun.COM {
744*7754SJeff.Bonwick@Sun.COM 	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
745*7754SJeff.Bonwick@Sun.COM 	    NULL, NULL, ZIO_PRIORITY_NOW,
746*7754SJeff.Bonwick@Sun.COM 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
747*7754SJeff.Bonwick@Sun.COM }
748*7754SJeff.Bonwick@Sun.COM 
749*7754SJeff.Bonwick@Sun.COM /*
750*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
751*7754SJeff.Bonwick@Sun.COM  * Prepare to read and write logical blocks
752*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
753*7754SJeff.Bonwick@Sun.COM  */
754*7754SJeff.Bonwick@Sun.COM 
/*
 * Pipeline stage: prepare a logical read described by a block pointer.
 * Sets up decompression if the block is compressed on disk, and marks
 * level-0 non-metadata blocks as uncacheable.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/*
	 * If the block is compressed on disk and we're the logical i/o,
	 * read into a scratch buffer of the physical (compressed) size
	 * and push a zio_decompress transform to inflate it afterward.
	 */
	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) {
		uint64_t csize = BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(csize);

		zio_push_transform(zio, cbuf, csize, csize, zio_decompress);
	}

	/* Level-0 non-metadata blocks needn't linger in the cache. */
	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	return (ZIO_PIPELINE_CONTINUE);
}
772*7754SJeff.Bonwick@Sun.COM 
/*
 * Pipeline stage: prepare a logical write.  Waits for gang and logical
 * children to reach the ready stage, optionally compresses the data,
 * then either sets up a rewrite of the existing bp (late sync passes)
 * or zeroes the bp for a fresh allocation, filling in the bp's size,
 * compression, checksum, type, and level for allocating writes.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	zio_prop_t *zp = &zio->io_prop;
	int compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	void *cbuf;
	uint64_t lsize = zio->io_size;
	uint64_t csize = lsize;
	uint64_t cbufsize = 0;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(compress != ZIO_COMPRESS_INHERIT);

	if (bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(zio->io_spa);
		ASSERT(pass > 1);

		if (pass > SYNC_PASS_DONT_COMPRESS)
			compress = ZIO_COMPRESS_OFF;

		/*
		 * Only MOS (objset 0) data should need to be rewritten.
		 */
		ASSERT(zio->io_logical->io_bookmark.zb_objset == 0);

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp),
		    spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		/* If compression fails, just write the data uncompressed. */
		if (!zio_compress_data(compress, zio->io_data, zio->io_size,
		    &cbuf, &csize, &cbufsize)) {
			compress = ZIO_COMPRESS_OFF;
		} else if (csize != 0) {
			zio_push_transform(zio, cbuf, csize, cbufsize, NULL);
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize &&
	    pass > SYNC_PASS_REWRITE) {
		ASSERT(csize != 0);
		uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (csize == 0) {
		/*
		 * Nothing left to write after compression (presumably the
		 * data was all zeroes -- confirm against zio_compress_data());
		 * run only the interlock stages, no allocation or device i/o.
		 */
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_PSIZE(bp, csize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	}

	return (ZIO_PIPELINE_CONTINUE);
}
867*7754SJeff.Bonwick@Sun.COM 
868*7754SJeff.Bonwick@Sun.COM /*
869*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
870*7754SJeff.Bonwick@Sun.COM  * Execute the I/O pipeline
871*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
872*7754SJeff.Bonwick@Sun.COM  */
873*7754SJeff.Bonwick@Sun.COM 
874*7754SJeff.Bonwick@Sun.COM static void
875*7754SJeff.Bonwick@Sun.COM zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
876*7754SJeff.Bonwick@Sun.COM {
877*7754SJeff.Bonwick@Sun.COM 	zio_type_t t = zio->io_type;
878*7754SJeff.Bonwick@Sun.COM 
879*7754SJeff.Bonwick@Sun.COM 	/*
880*7754SJeff.Bonwick@Sun.COM 	 * If we're a config writer, the normal issue and interrupt threads
881*7754SJeff.Bonwick@Sun.COM 	 * may all be blocked waiting for the config lock.  In this case,
882*7754SJeff.Bonwick@Sun.COM 	 * select the otherwise-unused taskq for ZIO_TYPE_NULL.
883*7754SJeff.Bonwick@Sun.COM 	 */
884*7754SJeff.Bonwick@Sun.COM 	if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER)
885*7754SJeff.Bonwick@Sun.COM 		t = ZIO_TYPE_NULL;
886*7754SJeff.Bonwick@Sun.COM 
887*7754SJeff.Bonwick@Sun.COM 	/*
888*7754SJeff.Bonwick@Sun.COM 	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
889*7754SJeff.Bonwick@Sun.COM 	 */
890*7754SJeff.Bonwick@Sun.COM 	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
891*7754SJeff.Bonwick@Sun.COM 		t = ZIO_TYPE_NULL;
892*7754SJeff.Bonwick@Sun.COM 
893*7754SJeff.Bonwick@Sun.COM 	(void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q],
894*7754SJeff.Bonwick@Sun.COM 	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
895*7754SJeff.Bonwick@Sun.COM }
896*7754SJeff.Bonwick@Sun.COM 
897*7754SJeff.Bonwick@Sun.COM static boolean_t
898*7754SJeff.Bonwick@Sun.COM zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
899*7754SJeff.Bonwick@Sun.COM {
900*7754SJeff.Bonwick@Sun.COM 	kthread_t *executor = zio->io_executor;
901*7754SJeff.Bonwick@Sun.COM 	spa_t *spa = zio->io_spa;
902789Sahrens 
903*7754SJeff.Bonwick@Sun.COM 	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
904*7754SJeff.Bonwick@Sun.COM 		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
905*7754SJeff.Bonwick@Sun.COM 			return (B_TRUE);
906*7754SJeff.Bonwick@Sun.COM 
907*7754SJeff.Bonwick@Sun.COM 	return (B_FALSE);
908*7754SJeff.Bonwick@Sun.COM }
909*7754SJeff.Bonwick@Sun.COM 
/*
 * Pipeline stage: hand the i/o off to an issue taskq thread and stop
 * this pipeline invocation; the taskq thread will resume execution.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);

	return (ZIO_PIPELINE_STOP);
}
917*7754SJeff.Bonwick@Sun.COM 
/*
 * Continue pipeline processing on an interrupt-side taskq thread
 * (typically invoked from an i/o completion path -- the pipeline
 * itself should not run in interrupt context).
 */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT);
}
923*7754SJeff.Bonwick@Sun.COM 
924*7754SJeff.Bonwick@Sun.COM /*
925*7754SJeff.Bonwick@Sun.COM  * Execute the I/O pipeline until one of the following occurs:
926*7754SJeff.Bonwick@Sun.COM  * (1) the I/O completes; (2) the pipeline stalls waiting for
927*7754SJeff.Bonwick@Sun.COM  * dependent child I/Os; (3) the I/O issues, so we're waiting
928*7754SJeff.Bonwick@Sun.COM  * for an I/O completion interrupt; (4) the I/O is delegated by
929*7754SJeff.Bonwick@Sun.COM  * vdev-level caching or aggregation; (5) the I/O is deferred
930*7754SJeff.Bonwick@Sun.COM  * due to vdev-level queueing; (6) the I/O is handed off to
931*7754SJeff.Bonwick@Sun.COM  * another thread.  In all cases, the pipeline stops whenever
932*7754SJeff.Bonwick@Sun.COM  * there's no CPU work; it never burns a thread in cv_wait().
933*7754SJeff.Bonwick@Sun.COM  *
934*7754SJeff.Bonwick@Sun.COM  * There's no locking on io_stage because there's no legitimate way
935*7754SJeff.Bonwick@Sun.COM  * for multiple threads to be attempting to process the same I/O.
936*7754SJeff.Bonwick@Sun.COM  */
937*7754SJeff.Bonwick@Sun.COM static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES];
938789Sahrens 
/*
 * Drive 'zio' through its pipeline stages until it completes, stalls
 * on children, or is handed off to another thread (see the block
 * comment above for the full list of stopping conditions).
 */
void
zio_execute(zio_t *zio)
{
	/* Record which thread owns the pipeline; zio_wait() watches this. */
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		uint32_t pipeline = zio->io_pipeline;
		zio_stage_t stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));

		/* Advance to the next stage enabled in the pipeline mask. */
		while (((1U << ++stage) & pipeline) == 0)
			continue;

		ASSERT(stage <= ZIO_STAGE_DONE);
		ASSERT(zio->io_stall == NULL);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * issue async to avoid deadlock.
		 */
		if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) &&
		    zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
			return;
		}

		zio->io_stage = stage;
		rv = zio_pipeline[stage](zio);

		/* STOP means someone else (taskq, child, interrupt) owns it now. */
		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
978789Sahrens 
979789Sahrens /*
980789Sahrens  * ==========================================================================
981789Sahrens  * Initiate I/O, either sync or async
982789Sahrens  * ==========================================================================
983789Sahrens  */
/*
 * Issue 'zio' synchronously: execute its pipeline, sleep until it
 * completes, then destroy it and return its error.  The zio must not
 * have been issued yet (still at ZIO_STAGE_OPEN, no executor).
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	/*
	 * Sleep until io_executor is cleared, which indicates the i/o
	 * has completed (the completion path signals io_cv -- see the
	 * done-stage code elsewhere in this file).
	 */
	mutex_enter(&zio->io_lock);
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	/* Capture the error before the zio is freed. */
	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}
1006789Sahrens 
/*
 * Issue 'zio' asynchronously; the caller does not wait for completion.
 * Parentless logical i/os are accounted under the pool's async root so
 * that spa_unload() can wait for all outstanding async i/o.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * Attach it to the pool's global async root zio so that
		 * spa_unload() has a way of waiting for async I/O to finish.
		 */
		spa_t *spa = zio->io_spa;
		zio->io_async_root = B_TRUE;
		mutex_enter(&spa->spa_async_root_lock);
		spa->spa_async_root_count++;
		mutex_exit(&spa->spa_async_root_lock);
	}

	zio_execute(zio);
}
10275530Sbonwick 
1028*7754SJeff.Bonwick@Sun.COM /*
1029*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
1030*7754SJeff.Bonwick@Sun.COM  * Reexecute or suspend/resume failed I/O
1031*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
1032*7754SJeff.Bonwick@Sun.COM  */
1033*7754SJeff.Bonwick@Sun.COM 
/*
 * Reset 'pio' and its entire child tree to their pre-issue state and
 * run them all again.  Used to retry i/o that failed while the pool
 * was suspended (see zio_resume()).
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *zio, *zio_next;

	/* Restore the i/o's original flags, stage, and pipeline. */
	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_error = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	if (IO_IS_ALLOCATING(pio)) {
		/*
		 * Remember the failed bp so that the io_ready() callback
		 * can update its accounting upon reexecution.  The block
		 * was already freed in zio_done(); we indicate this with
		 * a fill count of -1 so that zio_free() knows to skip it.
		 */
		blkptr_t *bp = pio->io_bp;
		ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg);
		bp->blk_fill = BLK_FILL_ALREADY_FREED;
		pio->io_bp_orig = *bp;
		BP_ZERO(bp);
	}

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of the io_child list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of the io_child list, from 'zio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'zio'.
	 */
	for (zio = pio->io_child; zio != NULL; zio = zio_next) {
		zio_next = zio->io_sibling_next;
		mutex_enter(&pio->io_lock);
		/* Re-register the child so the parent waits for it again. */
		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(zio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 */
	zio_execute(pio);
}
1082*7754SJeff.Bonwick@Sun.COM 
/*
 * The pool has hit an uncorrectable I/O failure.  Depending on the
 * failmode property, either panic or suspend the pool: post an FMA
 * ereport and park the failed logical i/o (if any) under the pool-wide
 * suspend root zio until zio_resume() reexecutes it.
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	/* Lazily create the root that collects all suspended i/os. */
	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		/* Only fully-done, parentless logical i/os may be parked. */
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio->io_parent == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}
11105530Sbonwick 
/*
 * Clear the pool's suspended state, wake anyone blocked in
 * zio_resume_wait(), and reexecute every i/o parked under the
 * suspend root.
 */
void
zio_resume(spa_t *spa)
{
	zio_t *pio, *zio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	/* Take ownership of the suspend root; a new one forms if we resuspend. */
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return;

	/* Detach each parked i/o from the root before reexecuting it. */
	while ((zio = pio->io_child) != NULL) {
		zio_remove_child(pio, zio);
		zio->io_parent = NULL;
		zio_reexecute(zio);
	}

	ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);

	(void) zio_wait(pio);
}
1139*7754SJeff.Bonwick@Sun.COM 
/*
 * Block the caller until the pool is no longer suspended
 * (zio_resume() broadcasts spa_suspend_cv).
 */
void
zio_resume_wait(spa_t *spa)
{
	mutex_enter(&spa->spa_suspend_lock);
	while (spa_suspended(spa))
		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
	mutex_exit(&spa->spa_suspend_lock);
}
1148789Sahrens 
1149789Sahrens /*
1150789Sahrens  * ==========================================================================
1151*7754SJeff.Bonwick@Sun.COM  * Gang blocks.
1152*7754SJeff.Bonwick@Sun.COM  *
1153*7754SJeff.Bonwick@Sun.COM  * A gang block is a collection of small blocks that looks to the DMU
1154*7754SJeff.Bonwick@Sun.COM  * like one large block.  When zio_dva_allocate() cannot find a block
1155*7754SJeff.Bonwick@Sun.COM  * of the requested size, due to either severe fragmentation or the pool
1156*7754SJeff.Bonwick@Sun.COM  * being nearly full, it calls zio_write_gang_block() to construct the
1157*7754SJeff.Bonwick@Sun.COM  * block from smaller fragments.
1158*7754SJeff.Bonwick@Sun.COM  *
1159*7754SJeff.Bonwick@Sun.COM  * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1160*7754SJeff.Bonwick@Sun.COM  * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1161*7754SJeff.Bonwick@Sun.COM  * an indirect block: it's an array of block pointers.  It consumes
1162*7754SJeff.Bonwick@Sun.COM  * only one sector and hence is allocatable regardless of fragmentation.
1163*7754SJeff.Bonwick@Sun.COM  * The gang header's bps point to its gang members, which hold the data.
1164*7754SJeff.Bonwick@Sun.COM  *
1165*7754SJeff.Bonwick@Sun.COM  * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1166*7754SJeff.Bonwick@Sun.COM  * as the verifier to ensure uniqueness of the SHA256 checksum.
1167*7754SJeff.Bonwick@Sun.COM  * Critically, the gang block bp's blk_cksum is the checksum of the data,
1168*7754SJeff.Bonwick@Sun.COM  * not the gang header.  This ensures that data block signatures (needed for
1169*7754SJeff.Bonwick@Sun.COM  * deduplication) are independent of how the block is physically stored.
1170*7754SJeff.Bonwick@Sun.COM  *
1171*7754SJeff.Bonwick@Sun.COM  * Gang blocks can be nested: a gang member may itself be a gang block.
1172*7754SJeff.Bonwick@Sun.COM  * Thus every gang block is a tree in which root and all interior nodes are
1173*7754SJeff.Bonwick@Sun.COM  * gang headers, and the leaves are normal blocks that contain user data.
1174*7754SJeff.Bonwick@Sun.COM  * The root of the gang tree is called the gang leader.
1175*7754SJeff.Bonwick@Sun.COM  *
1176*7754SJeff.Bonwick@Sun.COM  * To perform any operation (read, rewrite, free, claim) on a gang block,
1177*7754SJeff.Bonwick@Sun.COM  * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1178*7754SJeff.Bonwick@Sun.COM  * in the io_gang_tree field of the original logical i/o by recursively
1179*7754SJeff.Bonwick@Sun.COM  * reading the gang leader and all gang headers below it.  This yields
1180*7754SJeff.Bonwick@Sun.COM  * an in-core tree containing the contents of every gang header and the
1181*7754SJeff.Bonwick@Sun.COM  * bps for every constituent of the gang block.
1182*7754SJeff.Bonwick@Sun.COM  *
1183*7754SJeff.Bonwick@Sun.COM  * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1184*7754SJeff.Bonwick@Sun.COM  * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1185*7754SJeff.Bonwick@Sun.COM  * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1186*7754SJeff.Bonwick@Sun.COM  * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1187*7754SJeff.Bonwick@Sun.COM  * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1188*7754SJeff.Bonwick@Sun.COM  * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1189*7754SJeff.Bonwick@Sun.COM  * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1190*7754SJeff.Bonwick@Sun.COM  * of the gang header plus zio_checksum_compute() of the data to update the
1191*7754SJeff.Bonwick@Sun.COM  * gang header's blk_cksum as described above.
1192*7754SJeff.Bonwick@Sun.COM  *
1193*7754SJeff.Bonwick@Sun.COM  * The two-phase assemble/issue model solves the problem of partial failure --
1194*7754SJeff.Bonwick@Sun.COM  * what if you'd freed part of a gang block but then couldn't read the
1195*7754SJeff.Bonwick@Sun.COM  * gang header for another part?  Assembling the entire gang tree first
1196*7754SJeff.Bonwick@Sun.COM  * ensures that all the necessary gang header I/O has succeeded before
1197*7754SJeff.Bonwick@Sun.COM  * starting the actual work of free, claim, or write.  Once the gang tree
1198*7754SJeff.Bonwick@Sun.COM  * is assembled, free and claim are in-memory operations that cannot fail.
1199*7754SJeff.Bonwick@Sun.COM  *
1200*7754SJeff.Bonwick@Sun.COM  * In the event that a gang write fails, zio_dva_unallocate() walks the
1201*7754SJeff.Bonwick@Sun.COM  * gang tree to immediately free (i.e. insert back into the space map)
1202*7754SJeff.Bonwick@Sun.COM  * everything we've allocated.  This ensures that we don't get ENOSPC
1203*7754SJeff.Bonwick@Sun.COM  * errors during repeated suspend/resume cycles due to a flaky device.
1204*7754SJeff.Bonwick@Sun.COM  *
1205*7754SJeff.Bonwick@Sun.COM  * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1206*7754SJeff.Bonwick@Sun.COM  * the gang tree, we won't modify the block, so we can safely defer the free
1207*7754SJeff.Bonwick@Sun.COM  * (knowing that the block is still intact).  If we *can* assemble the gang
1208*7754SJeff.Bonwick@Sun.COM  * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1209*7754SJeff.Bonwick@Sun.COM  * each constituent bp and we can allocate a new block on the next sync pass.
1210*7754SJeff.Bonwick@Sun.COM  *
1211*7754SJeff.Bonwick@Sun.COM  * In all cases, the gang tree allows complete recovery from partial failure.
1212789Sahrens  * ==========================================================================
1213789Sahrens  */
12145530Sbonwick 
1215*7754SJeff.Bonwick@Sun.COM static zio_t *
1216*7754SJeff.Bonwick@Sun.COM zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1217*7754SJeff.Bonwick@Sun.COM {
1218*7754SJeff.Bonwick@Sun.COM 	if (gn != NULL)
1219*7754SJeff.Bonwick@Sun.COM 		return (pio);
12205530Sbonwick 
1221*7754SJeff.Bonwick@Sun.COM 	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1222*7754SJeff.Bonwick@Sun.COM 	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1223*7754SJeff.Bonwick@Sun.COM 	    &pio->io_bookmark));
1224789Sahrens }
1225789Sahrens 
1226*7754SJeff.Bonwick@Sun.COM zio_t *
1227*7754SJeff.Bonwick@Sun.COM zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
12286523Sek110237 {
1229*7754SJeff.Bonwick@Sun.COM 	zio_t *zio;
12306523Sek110237 
1231*7754SJeff.Bonwick@Sun.COM 	if (gn != NULL) {
1232*7754SJeff.Bonwick@Sun.COM 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1233*7754SJeff.Bonwick@Sun.COM 		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1234*7754SJeff.Bonwick@Sun.COM 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1235*7754SJeff.Bonwick@Sun.COM 		/*
1236*7754SJeff.Bonwick@Sun.COM 		 * As we rewrite each gang header, the pipeline will compute
1237*7754SJeff.Bonwick@Sun.COM 		 * a new gang block header checksum for it; but no one will
1238*7754SJeff.Bonwick@Sun.COM 		 * compute a new data checksum, so we do that here.  The one
1239*7754SJeff.Bonwick@Sun.COM 		 * exception is the gang leader: the pipeline already computed
1240*7754SJeff.Bonwick@Sun.COM 		 * its data checksum because that stage precedes gang assembly.
1241*7754SJeff.Bonwick@Sun.COM 		 * (Presently, nothing actually uses interior data checksums;
1242*7754SJeff.Bonwick@Sun.COM 		 * this is just good hygiene.)
1243*7754SJeff.Bonwick@Sun.COM 		 */
1244*7754SJeff.Bonwick@Sun.COM 		if (gn != pio->io_logical->io_gang_tree) {
1245*7754SJeff.Bonwick@Sun.COM 			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1246*7754SJeff.Bonwick@Sun.COM 			    data, BP_GET_PSIZE(bp));
1247*7754SJeff.Bonwick@Sun.COM 		}
1248*7754SJeff.Bonwick@Sun.COM 	} else {
1249*7754SJeff.Bonwick@Sun.COM 		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1250*7754SJeff.Bonwick@Sun.COM 		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1251*7754SJeff.Bonwick@Sun.COM 		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
12526523Sek110237 	}
12536523Sek110237 
1254*7754SJeff.Bonwick@Sun.COM 	return (zio);
1255*7754SJeff.Bonwick@Sun.COM }
1256*7754SJeff.Bonwick@Sun.COM 
1257*7754SJeff.Bonwick@Sun.COM /* ARGSUSED */
1258*7754SJeff.Bonwick@Sun.COM zio_t *
1259*7754SJeff.Bonwick@Sun.COM zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1260*7754SJeff.Bonwick@Sun.COM {
1261*7754SJeff.Bonwick@Sun.COM 	return (zio_free(pio, pio->io_spa, pio->io_txg, bp,
1262*7754SJeff.Bonwick@Sun.COM 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1263*7754SJeff.Bonwick@Sun.COM }
1264*7754SJeff.Bonwick@Sun.COM 
1265*7754SJeff.Bonwick@Sun.COM /* ARGSUSED */
1266*7754SJeff.Bonwick@Sun.COM zio_t *
1267*7754SJeff.Bonwick@Sun.COM zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1268*7754SJeff.Bonwick@Sun.COM {
1269*7754SJeff.Bonwick@Sun.COM 	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1270*7754SJeff.Bonwick@Sun.COM 	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1271*7754SJeff.Bonwick@Sun.COM }
1272*7754SJeff.Bonwick@Sun.COM 
1273*7754SJeff.Bonwick@Sun.COM static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1274*7754SJeff.Bonwick@Sun.COM 	NULL,
1275*7754SJeff.Bonwick@Sun.COM 	zio_read_gang,
1276*7754SJeff.Bonwick@Sun.COM 	zio_rewrite_gang,
1277*7754SJeff.Bonwick@Sun.COM 	zio_free_gang,
1278*7754SJeff.Bonwick@Sun.COM 	zio_claim_gang,
1279*7754SJeff.Bonwick@Sun.COM 	NULL
1280*7754SJeff.Bonwick@Sun.COM };
1281*7754SJeff.Bonwick@Sun.COM 
1282*7754SJeff.Bonwick@Sun.COM static void zio_gang_tree_assemble_done(zio_t *zio);
1283*7754SJeff.Bonwick@Sun.COM 
1284*7754SJeff.Bonwick@Sun.COM static zio_gang_node_t *
1285*7754SJeff.Bonwick@Sun.COM zio_gang_node_alloc(zio_gang_node_t **gnpp)
1286*7754SJeff.Bonwick@Sun.COM {
1287*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_t *gn;
1288*7754SJeff.Bonwick@Sun.COM 
1289*7754SJeff.Bonwick@Sun.COM 	ASSERT(*gnpp == NULL);
1290*7754SJeff.Bonwick@Sun.COM 
1291*7754SJeff.Bonwick@Sun.COM 	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1292*7754SJeff.Bonwick@Sun.COM 	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1293*7754SJeff.Bonwick@Sun.COM 	*gnpp = gn;
1294*7754SJeff.Bonwick@Sun.COM 
1295*7754SJeff.Bonwick@Sun.COM 	return (gn);
12966523Sek110237 }
12976523Sek110237 
12986523Sek110237 static void
1299*7754SJeff.Bonwick@Sun.COM zio_gang_node_free(zio_gang_node_t **gnpp)
1300*7754SJeff.Bonwick@Sun.COM {
1301*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_t *gn = *gnpp;
1302*7754SJeff.Bonwick@Sun.COM 
1303*7754SJeff.Bonwick@Sun.COM 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1304*7754SJeff.Bonwick@Sun.COM 		ASSERT(gn->gn_child[g] == NULL);
1305*7754SJeff.Bonwick@Sun.COM 
1306*7754SJeff.Bonwick@Sun.COM 	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1307*7754SJeff.Bonwick@Sun.COM 	kmem_free(gn, sizeof (*gn));
1308*7754SJeff.Bonwick@Sun.COM 	*gnpp = NULL;
1309*7754SJeff.Bonwick@Sun.COM }
1310*7754SJeff.Bonwick@Sun.COM 
1311*7754SJeff.Bonwick@Sun.COM static void
1312*7754SJeff.Bonwick@Sun.COM zio_gang_tree_free(zio_gang_node_t **gnpp)
1313789Sahrens {
1314*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_t *gn = *gnpp;
1315*7754SJeff.Bonwick@Sun.COM 
1316*7754SJeff.Bonwick@Sun.COM 	if (gn == NULL)
1317*7754SJeff.Bonwick@Sun.COM 		return;
1318*7754SJeff.Bonwick@Sun.COM 
1319*7754SJeff.Bonwick@Sun.COM 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1320*7754SJeff.Bonwick@Sun.COM 		zio_gang_tree_free(&gn->gn_child[g]);
1321*7754SJeff.Bonwick@Sun.COM 
1322*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_free(gnpp);
1323*7754SJeff.Bonwick@Sun.COM }
1324*7754SJeff.Bonwick@Sun.COM 
1325*7754SJeff.Bonwick@Sun.COM static void
1326*7754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp)
1327*7754SJeff.Bonwick@Sun.COM {
1328*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1329789Sahrens 
1330*7754SJeff.Bonwick@Sun.COM 	ASSERT(lio->io_logical == lio);
1331*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_IS_GANG(bp));
1332*7754SJeff.Bonwick@Sun.COM 
1333*7754SJeff.Bonwick@Sun.COM 	zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh,
1334*7754SJeff.Bonwick@Sun.COM 	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1335*7754SJeff.Bonwick@Sun.COM 	    lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark));
1336*7754SJeff.Bonwick@Sun.COM }
1337*7754SJeff.Bonwick@Sun.COM 
1338*7754SJeff.Bonwick@Sun.COM static void
1339*7754SJeff.Bonwick@Sun.COM zio_gang_tree_assemble_done(zio_t *zio)
1340*7754SJeff.Bonwick@Sun.COM {
1341*7754SJeff.Bonwick@Sun.COM 	zio_t *lio = zio->io_logical;
1342*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_t *gn = zio->io_private;
1343*7754SJeff.Bonwick@Sun.COM 	blkptr_t *bp = zio->io_bp;
1344*7754SJeff.Bonwick@Sun.COM 
1345*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_parent == lio);
1346*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_child == NULL);
1347*7754SJeff.Bonwick@Sun.COM 
1348*7754SJeff.Bonwick@Sun.COM 	if (zio->io_error)
1349*7754SJeff.Bonwick@Sun.COM 		return;
1350*7754SJeff.Bonwick@Sun.COM 
1351*7754SJeff.Bonwick@Sun.COM 	if (BP_SHOULD_BYTESWAP(bp))
1352*7754SJeff.Bonwick@Sun.COM 		byteswap_uint64_array(zio->io_data, zio->io_size);
1353*7754SJeff.Bonwick@Sun.COM 
1354*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_data == gn->gn_gbh);
1355*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1356*7754SJeff.Bonwick@Sun.COM 	ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1357*7754SJeff.Bonwick@Sun.COM 
1358*7754SJeff.Bonwick@Sun.COM 	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1359*7754SJeff.Bonwick@Sun.COM 		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1360*7754SJeff.Bonwick@Sun.COM 		if (!BP_IS_GANG(gbp))
1361*7754SJeff.Bonwick@Sun.COM 			continue;
1362*7754SJeff.Bonwick@Sun.COM 		zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]);
1363789Sahrens 	}
1364789Sahrens }
1365789Sahrens 
1366*7754SJeff.Bonwick@Sun.COM static void
1367*7754SJeff.Bonwick@Sun.COM zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1368789Sahrens {
1369*7754SJeff.Bonwick@Sun.COM 	zio_t *lio = pio->io_logical;
1370*7754SJeff.Bonwick@Sun.COM 	zio_t *zio;
1371*7754SJeff.Bonwick@Sun.COM 
1372*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_IS_GANG(bp) == !!gn);
1373*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp));
1374*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree);
1375*7754SJeff.Bonwick@Sun.COM 
1376*7754SJeff.Bonwick@Sun.COM 	/*
1377*7754SJeff.Bonwick@Sun.COM 	 * If you're a gang header, your data is in gn->gn_gbh.
1378*7754SJeff.Bonwick@Sun.COM 	 * If you're a gang member, your data is in 'data' and gn == NULL.
1379*7754SJeff.Bonwick@Sun.COM 	 */
1380*7754SJeff.Bonwick@Sun.COM 	zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data);
1381789Sahrens 
1382*7754SJeff.Bonwick@Sun.COM 	if (gn != NULL) {
1383*7754SJeff.Bonwick@Sun.COM 		ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC);
1384*7754SJeff.Bonwick@Sun.COM 
1385*7754SJeff.Bonwick@Sun.COM 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1386*7754SJeff.Bonwick@Sun.COM 			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1387*7754SJeff.Bonwick@Sun.COM 			if (BP_IS_HOLE(gbp))
1388*7754SJeff.Bonwick@Sun.COM 				continue;
1389*7754SJeff.Bonwick@Sun.COM 			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1390*7754SJeff.Bonwick@Sun.COM 			data = (char *)data + BP_GET_PSIZE(gbp);
1391*7754SJeff.Bonwick@Sun.COM 		}
1392*7754SJeff.Bonwick@Sun.COM 	}
1393*7754SJeff.Bonwick@Sun.COM 
1394*7754SJeff.Bonwick@Sun.COM 	if (gn == lio->io_gang_tree)
1395*7754SJeff.Bonwick@Sun.COM 		ASSERT3P((char *)lio->io_data + lio->io_size, ==, data);
1396*7754SJeff.Bonwick@Sun.COM 
1397*7754SJeff.Bonwick@Sun.COM 	if (zio != pio)
1398*7754SJeff.Bonwick@Sun.COM 		zio_nowait(zio);
1399789Sahrens }
1400789Sahrens 
14015530Sbonwick static int
1402*7754SJeff.Bonwick@Sun.COM zio_gang_assemble(zio_t *zio)
14035329Sgw25295 {
14045530Sbonwick 	blkptr_t *bp = zio->io_bp;
14055530Sbonwick 
1406*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical);
14075530Sbonwick 
1408*7754SJeff.Bonwick@Sun.COM 	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1409789Sahrens 
14105530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1411789Sahrens }
1412789Sahrens 
14135530Sbonwick static int
1414*7754SJeff.Bonwick@Sun.COM zio_gang_issue(zio_t *zio)
14156523Sek110237 {
1416*7754SJeff.Bonwick@Sun.COM 	zio_t *lio = zio->io_logical;
14176523Sek110237 	blkptr_t *bp = zio->io_bp;
1418789Sahrens 
1419*7754SJeff.Bonwick@Sun.COM 	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1420*7754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_STOP);
14215329Sgw25295 
1422*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_IS_GANG(bp) && zio == lio);
1423789Sahrens 
1424*7754SJeff.Bonwick@Sun.COM 	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1425*7754SJeff.Bonwick@Sun.COM 		zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data);
1426*7754SJeff.Bonwick@Sun.COM 	else
1427*7754SJeff.Bonwick@Sun.COM 		zio_gang_tree_free(&lio->io_gang_tree);
1428789Sahrens 
1429*7754SJeff.Bonwick@Sun.COM 	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
14305530Sbonwick 
14315530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1432789Sahrens }
1433789Sahrens 
1434789Sahrens static void
1435*7754SJeff.Bonwick@Sun.COM zio_write_gang_member_ready(zio_t *zio)
1436789Sahrens {
1437789Sahrens 	zio_t *pio = zio->io_parent;
1438*7754SJeff.Bonwick@Sun.COM 	zio_t *lio = zio->io_logical;
14391775Sbillm 	dva_t *cdva = zio->io_bp->blk_dva;
14401775Sbillm 	dva_t *pdva = pio->io_bp->blk_dva;
1441789Sahrens 	uint64_t asize;
1442*7754SJeff.Bonwick@Sun.COM 
1443*7754SJeff.Bonwick@Sun.COM 	if (BP_IS_HOLE(zio->io_bp))
1444*7754SJeff.Bonwick@Sun.COM 		return;
1445*7754SJeff.Bonwick@Sun.COM 
1446*7754SJeff.Bonwick@Sun.COM 	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1447789Sahrens 
1448*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1449*7754SJeff.Bonwick@Sun.COM 	ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas);
1450*7754SJeff.Bonwick@Sun.COM 	ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
1451*7754SJeff.Bonwick@Sun.COM 	ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
14521775Sbillm 	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
14531775Sbillm 
1454789Sahrens 	mutex_enter(&pio->io_lock);
1455*7754SJeff.Bonwick@Sun.COM 	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
14561775Sbillm 		ASSERT(DVA_GET_GANG(&pdva[d]));
14571775Sbillm 		asize = DVA_GET_ASIZE(&pdva[d]);
14581775Sbillm 		asize += DVA_GET_ASIZE(&cdva[d]);
14591775Sbillm 		DVA_SET_ASIZE(&pdva[d], asize);
14601775Sbillm 	}
1461789Sahrens 	mutex_exit(&pio->io_lock);
1462789Sahrens }
1463789Sahrens 
14645329Sgw25295 static int
1465*7754SJeff.Bonwick@Sun.COM zio_write_gang_block(zio_t *pio)
1466789Sahrens {
1467*7754SJeff.Bonwick@Sun.COM 	spa_t *spa = pio->io_spa;
1468*7754SJeff.Bonwick@Sun.COM 	blkptr_t *bp = pio->io_bp;
1469*7754SJeff.Bonwick@Sun.COM 	zio_t *lio = pio->io_logical;
1470*7754SJeff.Bonwick@Sun.COM 	zio_t *zio;
1471*7754SJeff.Bonwick@Sun.COM 	zio_gang_node_t *gn, **gnpp;
1472789Sahrens 	zio_gbh_phys_t *gbh;
1473*7754SJeff.Bonwick@Sun.COM 	uint64_t txg = pio->io_txg;
1474*7754SJeff.Bonwick@Sun.COM 	uint64_t resid = pio->io_size;
1475*7754SJeff.Bonwick@Sun.COM 	uint64_t lsize;
1476*7754SJeff.Bonwick@Sun.COM 	int ndvas = lio->io_prop.zp_ndvas;
14771775Sbillm 	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
1478*7754SJeff.Bonwick@Sun.COM 	zio_prop_t zp;
1479789Sahrens 	int error;
1480789Sahrens 
1481*7754SJeff.Bonwick@Sun.COM 	error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE,
1482*7754SJeff.Bonwick@Sun.COM 	    bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp,
1483*7754SJeff.Bonwick@Sun.COM 	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
14845530Sbonwick 	if (error) {
1485*7754SJeff.Bonwick@Sun.COM 		pio->io_error = error;
14865530Sbonwick 		return (ZIO_PIPELINE_CONTINUE);
14875530Sbonwick 	}
1488789Sahrens 
1489*7754SJeff.Bonwick@Sun.COM 	if (pio == lio) {
1490*7754SJeff.Bonwick@Sun.COM 		gnpp = &lio->io_gang_tree;
1491*7754SJeff.Bonwick@Sun.COM 	} else {
1492*7754SJeff.Bonwick@Sun.COM 		gnpp = pio->io_private;
1493*7754SJeff.Bonwick@Sun.COM 		ASSERT(pio->io_ready == zio_write_gang_member_ready);
1494789Sahrens 	}
1495789Sahrens 
1496*7754SJeff.Bonwick@Sun.COM 	gn = zio_gang_node_alloc(gnpp);
1497*7754SJeff.Bonwick@Sun.COM 	gbh = gn->gn_gbh;
1498*7754SJeff.Bonwick@Sun.COM 	bzero(gbh, SPA_GANGBLOCKSIZE);
1499789Sahrens 
1500*7754SJeff.Bonwick@Sun.COM 	/*
1501*7754SJeff.Bonwick@Sun.COM 	 * Create the gang header.
1502*7754SJeff.Bonwick@Sun.COM 	 */
1503*7754SJeff.Bonwick@Sun.COM 	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1504*7754SJeff.Bonwick@Sun.COM 	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
15055530Sbonwick 
15061775Sbillm 	/*
1507*7754SJeff.Bonwick@Sun.COM 	 * Create and nowait the gang children.
15081775Sbillm 	 */
1509*7754SJeff.Bonwick@Sun.COM 	for (int g = 0; resid != 0; resid -= lsize, g++) {
1510*7754SJeff.Bonwick@Sun.COM 		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1511*7754SJeff.Bonwick@Sun.COM 		    SPA_MINBLOCKSIZE);
1512*7754SJeff.Bonwick@Sun.COM 		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1513*7754SJeff.Bonwick@Sun.COM 
1514*7754SJeff.Bonwick@Sun.COM 		zp.zp_checksum = lio->io_prop.zp_checksum;
1515*7754SJeff.Bonwick@Sun.COM 		zp.zp_compress = ZIO_COMPRESS_OFF;
1516*7754SJeff.Bonwick@Sun.COM 		zp.zp_type = DMU_OT_NONE;
1517*7754SJeff.Bonwick@Sun.COM 		zp.zp_level = 0;
1518*7754SJeff.Bonwick@Sun.COM 		zp.zp_ndvas = lio->io_prop.zp_ndvas;
1519*7754SJeff.Bonwick@Sun.COM 
1520*7754SJeff.Bonwick@Sun.COM 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1521*7754SJeff.Bonwick@Sun.COM 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1522*7754SJeff.Bonwick@Sun.COM 		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1523*7754SJeff.Bonwick@Sun.COM 		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1524*7754SJeff.Bonwick@Sun.COM 		    &pio->io_bookmark));
1525*7754SJeff.Bonwick@Sun.COM 	}
1526*7754SJeff.Bonwick@Sun.COM 
1527*7754SJeff.Bonwick@Sun.COM 	/*
1528*7754SJeff.Bonwick@Sun.COM 	 * Set pio's pipeline to just wait for zio to finish.
1529*7754SJeff.Bonwick@Sun.COM 	 */
1530*7754SJeff.Bonwick@Sun.COM 	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1531*7754SJeff.Bonwick@Sun.COM 
1532*7754SJeff.Bonwick@Sun.COM 	zio_nowait(zio);
1533*7754SJeff.Bonwick@Sun.COM 
1534*7754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE);
1535789Sahrens }
1536789Sahrens 
1537789Sahrens /*
1538789Sahrens  * ==========================================================================
1539789Sahrens  * Allocate and free blocks
1540789Sahrens  * ==========================================================================
1541789Sahrens  */
1542*7754SJeff.Bonwick@Sun.COM 
15435530Sbonwick static int
1544789Sahrens zio_dva_allocate(zio_t *zio)
1545789Sahrens {
15464527Sperrin 	spa_t *spa = zio->io_spa;
15474527Sperrin 	metaslab_class_t *mc = spa->spa_normal_class;
1548789Sahrens 	blkptr_t *bp = zio->io_bp;
1549789Sahrens 	int error;
1550789Sahrens 
1551789Sahrens 	ASSERT(BP_IS_HOLE(bp));
15521775Sbillm 	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
1553*7754SJeff.Bonwick@Sun.COM 	ASSERT3U(zio->io_prop.zp_ndvas, >, 0);
1554*7754SJeff.Bonwick@Sun.COM 	ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa));
1555789Sahrens 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
1556789Sahrens 
1557*7754SJeff.Bonwick@Sun.COM 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
1558*7754SJeff.Bonwick@Sun.COM 	    zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0);
1559789Sahrens 
1560*7754SJeff.Bonwick@Sun.COM 	if (error) {
1561*7754SJeff.Bonwick@Sun.COM 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
1562*7754SJeff.Bonwick@Sun.COM 			return (zio_write_gang_block(zio));
1563789Sahrens 		zio->io_error = error;
1564789Sahrens 	}
15655530Sbonwick 
15665530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1567789Sahrens }
1568789Sahrens 
15695530Sbonwick static int
1570789Sahrens zio_dva_free(zio_t *zio)
1571789Sahrens {
1572*7754SJeff.Bonwick@Sun.COM 	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
1573789Sahrens 
15745530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1575789Sahrens }
1576789Sahrens 
15775530Sbonwick static int
1578789Sahrens zio_dva_claim(zio_t *zio)
1579789Sahrens {
1580*7754SJeff.Bonwick@Sun.COM 	int error;
1581*7754SJeff.Bonwick@Sun.COM 
1582*7754SJeff.Bonwick@Sun.COM 	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
1583*7754SJeff.Bonwick@Sun.COM 	if (error)
1584*7754SJeff.Bonwick@Sun.COM 		zio->io_error = error;
1585789Sahrens 
15865530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1587789Sahrens }
1588789Sahrens 
1589789Sahrens /*
1590*7754SJeff.Bonwick@Sun.COM  * Undo an allocation.  This is used by zio_done() when an I/O fails
1591*7754SJeff.Bonwick@Sun.COM  * and we want to give back the block we just allocated.
1592*7754SJeff.Bonwick@Sun.COM  * This handles both normal blocks and gang blocks.
1593*7754SJeff.Bonwick@Sun.COM  */
1594*7754SJeff.Bonwick@Sun.COM static void
1595*7754SJeff.Bonwick@Sun.COM zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
1596*7754SJeff.Bonwick@Sun.COM {
1597*7754SJeff.Bonwick@Sun.COM 	spa_t *spa = zio->io_spa;
1598*7754SJeff.Bonwick@Sun.COM 	boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE);
1599*7754SJeff.Bonwick@Sun.COM 
1600*7754SJeff.Bonwick@Sun.COM 	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
1601*7754SJeff.Bonwick@Sun.COM 
1602*7754SJeff.Bonwick@Sun.COM 	if (zio->io_bp == bp && !now) {
1603*7754SJeff.Bonwick@Sun.COM 		/*
1604*7754SJeff.Bonwick@Sun.COM 		 * This is a rewrite for sync-to-convergence.
1605*7754SJeff.Bonwick@Sun.COM 		 * We can't do a metaslab_free(NOW) because bp wasn't allocated
1606*7754SJeff.Bonwick@Sun.COM 		 * during this sync pass, which means that metaslab_sync()
1607*7754SJeff.Bonwick@Sun.COM 		 * already committed the allocation.
1608*7754SJeff.Bonwick@Sun.COM 		 */
1609*7754SJeff.Bonwick@Sun.COM 		ASSERT(DVA_EQUAL(BP_IDENTITY(bp),
1610*7754SJeff.Bonwick@Sun.COM 		    BP_IDENTITY(&zio->io_bp_orig)));
1611*7754SJeff.Bonwick@Sun.COM 		ASSERT(spa_sync_pass(spa) > 1);
1612*7754SJeff.Bonwick@Sun.COM 
1613*7754SJeff.Bonwick@Sun.COM 		if (BP_IS_GANG(bp) && gn == NULL) {
1614*7754SJeff.Bonwick@Sun.COM 			/*
1615*7754SJeff.Bonwick@Sun.COM 			 * This is a gang leader whose gang header(s) we
1616*7754SJeff.Bonwick@Sun.COM 			 * couldn't read now, so defer the free until later.
1617*7754SJeff.Bonwick@Sun.COM 			 * The block should still be intact because without
1618*7754SJeff.Bonwick@Sun.COM 			 * the headers, we'd never even start the rewrite.
1619*7754SJeff.Bonwick@Sun.COM 			 */
1620*7754SJeff.Bonwick@Sun.COM 			bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
1621*7754SJeff.Bonwick@Sun.COM 			return;
1622*7754SJeff.Bonwick@Sun.COM 		}
1623*7754SJeff.Bonwick@Sun.COM 	}
1624*7754SJeff.Bonwick@Sun.COM 
1625*7754SJeff.Bonwick@Sun.COM 	if (!BP_IS_HOLE(bp))
1626*7754SJeff.Bonwick@Sun.COM 		metaslab_free(spa, bp, bp->blk_birth, now);
1627*7754SJeff.Bonwick@Sun.COM 
1628*7754SJeff.Bonwick@Sun.COM 	if (gn != NULL) {
1629*7754SJeff.Bonwick@Sun.COM 		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1630*7754SJeff.Bonwick@Sun.COM 			zio_dva_unallocate(zio, gn->gn_child[g],
1631*7754SJeff.Bonwick@Sun.COM 			    &gn->gn_gbh->zg_blkptr[g]);
1632*7754SJeff.Bonwick@Sun.COM 		}
1633*7754SJeff.Bonwick@Sun.COM 	}
1634*7754SJeff.Bonwick@Sun.COM }
1635*7754SJeff.Bonwick@Sun.COM 
1636*7754SJeff.Bonwick@Sun.COM /*
1637*7754SJeff.Bonwick@Sun.COM  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
1638*7754SJeff.Bonwick@Sun.COM  */
1639*7754SJeff.Bonwick@Sun.COM int
1640*7754SJeff.Bonwick@Sun.COM zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
1641*7754SJeff.Bonwick@Sun.COM     uint64_t txg)
1642*7754SJeff.Bonwick@Sun.COM {
1643*7754SJeff.Bonwick@Sun.COM 	int error;
1644*7754SJeff.Bonwick@Sun.COM 
1645*7754SJeff.Bonwick@Sun.COM 	error = metaslab_alloc(spa, spa->spa_log_class, size,
1646*7754SJeff.Bonwick@Sun.COM 	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
1647*7754SJeff.Bonwick@Sun.COM 
1648*7754SJeff.Bonwick@Sun.COM 	if (error)
1649*7754SJeff.Bonwick@Sun.COM 		error = metaslab_alloc(spa, spa->spa_normal_class, size,
1650*7754SJeff.Bonwick@Sun.COM 		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID);
1651*7754SJeff.Bonwick@Sun.COM 
1652*7754SJeff.Bonwick@Sun.COM 	if (error == 0) {
1653*7754SJeff.Bonwick@Sun.COM 		BP_SET_LSIZE(new_bp, size);
1654*7754SJeff.Bonwick@Sun.COM 		BP_SET_PSIZE(new_bp, size);
1655*7754SJeff.Bonwick@Sun.COM 		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
1656*7754SJeff.Bonwick@Sun.COM 		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
1657*7754SJeff.Bonwick@Sun.COM 		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
1658*7754SJeff.Bonwick@Sun.COM 		BP_SET_LEVEL(new_bp, 0);
1659*7754SJeff.Bonwick@Sun.COM 		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
1660*7754SJeff.Bonwick@Sun.COM 	}
1661*7754SJeff.Bonwick@Sun.COM 
1662*7754SJeff.Bonwick@Sun.COM 	return (error);
1663*7754SJeff.Bonwick@Sun.COM }
1664*7754SJeff.Bonwick@Sun.COM 
1665*7754SJeff.Bonwick@Sun.COM /*
1666*7754SJeff.Bonwick@Sun.COM  * Free an intent log block.  We know it can't be a gang block, so there's
1667*7754SJeff.Bonwick@Sun.COM  * nothing to do except metaslab_free() it.
1668*7754SJeff.Bonwick@Sun.COM  */
1669*7754SJeff.Bonwick@Sun.COM void
1670*7754SJeff.Bonwick@Sun.COM zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
1671*7754SJeff.Bonwick@Sun.COM {
1672*7754SJeff.Bonwick@Sun.COM 	ASSERT(!BP_IS_GANG(bp));
1673*7754SJeff.Bonwick@Sun.COM 
1674*7754SJeff.Bonwick@Sun.COM 	metaslab_free(spa, bp, txg, B_FALSE);
1675*7754SJeff.Bonwick@Sun.COM }
1676*7754SJeff.Bonwick@Sun.COM 
1677*7754SJeff.Bonwick@Sun.COM /*
1678789Sahrens  * ==========================================================================
1679789Sahrens  * Read and write to physical devices
1680789Sahrens  * ==========================================================================
1681789Sahrens  */
1682789Sahrens 
1683*7754SJeff.Bonwick@Sun.COM static void
1684*7754SJeff.Bonwick@Sun.COM zio_vdev_io_probe_done(zio_t *zio)
1685*7754SJeff.Bonwick@Sun.COM {
1686*7754SJeff.Bonwick@Sun.COM 	zio_t *dio;
1687*7754SJeff.Bonwick@Sun.COM 	vdev_t *vd = zio->io_private;
1688*7754SJeff.Bonwick@Sun.COM 
1689*7754SJeff.Bonwick@Sun.COM 	mutex_enter(&vd->vdev_probe_lock);
1690*7754SJeff.Bonwick@Sun.COM 	ASSERT(vd->vdev_probe_zio == zio);
1691*7754SJeff.Bonwick@Sun.COM 	vd->vdev_probe_zio = NULL;
1692*7754SJeff.Bonwick@Sun.COM 	mutex_exit(&vd->vdev_probe_lock);
1693*7754SJeff.Bonwick@Sun.COM 
1694*7754SJeff.Bonwick@Sun.COM 	while ((dio = zio->io_delegate_list) != NULL) {
1695*7754SJeff.Bonwick@Sun.COM 		zio->io_delegate_list = dio->io_delegate_next;
1696*7754SJeff.Bonwick@Sun.COM 		dio->io_delegate_next = NULL;
1697*7754SJeff.Bonwick@Sun.COM 		if (!vdev_accessible(vd, dio))
1698*7754SJeff.Bonwick@Sun.COM 			dio->io_error = ENXIO;
1699*7754SJeff.Bonwick@Sun.COM 		zio_execute(dio);
1700*7754SJeff.Bonwick@Sun.COM 	}
1701*7754SJeff.Bonwick@Sun.COM }
1702*7754SJeff.Bonwick@Sun.COM 
1703*7754SJeff.Bonwick@Sun.COM /*
1704*7754SJeff.Bonwick@Sun.COM  * Probe the device to determine whether I/O failure is specific to this
1705*7754SJeff.Bonwick@Sun.COM  * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
1706*7754SJeff.Bonwick@Sun.COM  */
1707*7754SJeff.Bonwick@Sun.COM static int
1708*7754SJeff.Bonwick@Sun.COM zio_vdev_io_probe(zio_t *zio)
1709*7754SJeff.Bonwick@Sun.COM {
1710*7754SJeff.Bonwick@Sun.COM 	vdev_t *vd = zio->io_vd;
1711*7754SJeff.Bonwick@Sun.COM 	zio_t *pio = NULL;
1712*7754SJeff.Bonwick@Sun.COM 	boolean_t created_pio = B_FALSE;
1713*7754SJeff.Bonwick@Sun.COM 
1714*7754SJeff.Bonwick@Sun.COM 	/*
1715*7754SJeff.Bonwick@Sun.COM 	 * Don't probe the probe.
1716*7754SJeff.Bonwick@Sun.COM 	 */
1717*7754SJeff.Bonwick@Sun.COM 	if (zio->io_flags & ZIO_FLAG_PROBE)
1718*7754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_CONTINUE);
1719*7754SJeff.Bonwick@Sun.COM 
1720*7754SJeff.Bonwick@Sun.COM 	/*
1721*7754SJeff.Bonwick@Sun.COM 	 * To prevent 'probe storms' when a device fails, we create
1722*7754SJeff.Bonwick@Sun.COM 	 * just one probe i/o at a time.  All zios that want to probe
1723*7754SJeff.Bonwick@Sun.COM 	 * this vdev will join the probe zio's io_delegate_list.
1724*7754SJeff.Bonwick@Sun.COM 	 */
1725*7754SJeff.Bonwick@Sun.COM 	mutex_enter(&vd->vdev_probe_lock);
1726*7754SJeff.Bonwick@Sun.COM 
1727*7754SJeff.Bonwick@Sun.COM 	if ((pio = vd->vdev_probe_zio) == NULL) {
1728*7754SJeff.Bonwick@Sun.COM 		vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
1729*7754SJeff.Bonwick@Sun.COM 		    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
1730*7754SJeff.Bonwick@Sun.COM 		created_pio = B_TRUE;
1731*7754SJeff.Bonwick@Sun.COM 		vd->vdev_probe_wanted = B_TRUE;
1732*7754SJeff.Bonwick@Sun.COM 		spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
1733*7754SJeff.Bonwick@Sun.COM 	}
1734*7754SJeff.Bonwick@Sun.COM 
1735*7754SJeff.Bonwick@Sun.COM 	zio->io_delegate_next = pio->io_delegate_list;
1736*7754SJeff.Bonwick@Sun.COM 	pio->io_delegate_list = zio;
1737*7754SJeff.Bonwick@Sun.COM 
1738*7754SJeff.Bonwick@Sun.COM 	mutex_exit(&vd->vdev_probe_lock);
1739*7754SJeff.Bonwick@Sun.COM 
1740*7754SJeff.Bonwick@Sun.COM 	if (created_pio) {
1741*7754SJeff.Bonwick@Sun.COM 		zio_nowait(vdev_probe(vd, pio));
1742*7754SJeff.Bonwick@Sun.COM 		zio_nowait(pio);
1743*7754SJeff.Bonwick@Sun.COM 	}
1744*7754SJeff.Bonwick@Sun.COM 
1745*7754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_STOP);
1746*7754SJeff.Bonwick@Sun.COM }
1747*7754SJeff.Bonwick@Sun.COM 
17485530Sbonwick static int
17491775Sbillm zio_vdev_io_start(zio_t *zio)
1750789Sahrens {
1751789Sahrens 	vdev_t *vd = zio->io_vd;
17521775Sbillm 	uint64_t align;
17535329Sgw25295 	spa_t *spa = zio->io_spa;
17545329Sgw25295 
1755*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_error == 0);
1756*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
1757*7754SJeff.Bonwick@Sun.COM 
1758*7754SJeff.Bonwick@Sun.COM 	if (vd == NULL) {
1759*7754SJeff.Bonwick@Sun.COM 		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
1760*7754SJeff.Bonwick@Sun.COM 			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
1761789Sahrens 
1762*7754SJeff.Bonwick@Sun.COM 		/*
1763*7754SJeff.Bonwick@Sun.COM 		 * The mirror_ops handle multiple DVAs in a single BP.
1764*7754SJeff.Bonwick@Sun.COM 		 */
17655530Sbonwick 		return (vdev_mirror_ops.vdev_op_io_start(zio));
1766*7754SJeff.Bonwick@Sun.COM 	}
17671775Sbillm 
1768*7754SJeff.Bonwick@Sun.COM 	align = 1ULL << vd->vdev_top->vdev_ashift;
1769789Sahrens 
17701732Sbonwick 	if (P2PHASE(zio->io_size, align) != 0) {
17711732Sbonwick 		uint64_t asize = P2ROUNDUP(zio->io_size, align);
17721732Sbonwick 		char *abuf = zio_buf_alloc(asize);
1773*7754SJeff.Bonwick@Sun.COM 		ASSERT(vd == vd->vdev_top);
17741732Sbonwick 		if (zio->io_type == ZIO_TYPE_WRITE) {
17751732Sbonwick 			bcopy(zio->io_data, abuf, zio->io_size);
17761732Sbonwick 			bzero(abuf + zio->io_size, asize - zio->io_size);
17771732Sbonwick 		}
1778*7754SJeff.Bonwick@Sun.COM 		zio_push_transform(zio, abuf, asize, asize, zio_subblock);
17791732Sbonwick 	}
17801732Sbonwick 
17811732Sbonwick 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
17821732Sbonwick 	ASSERT(P2PHASE(zio->io_size, align) == 0);
1783789Sahrens 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
1784789Sahrens 
1785*7754SJeff.Bonwick@Sun.COM 	if (vd->vdev_ops->vdev_op_leaf &&
1786*7754SJeff.Bonwick@Sun.COM 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
1787*7754SJeff.Bonwick@Sun.COM 
1788*7754SJeff.Bonwick@Sun.COM 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
1789*7754SJeff.Bonwick@Sun.COM 			return (ZIO_PIPELINE_STOP);
1790*7754SJeff.Bonwick@Sun.COM 
1791*7754SJeff.Bonwick@Sun.COM 		if ((zio = vdev_queue_io(zio)) == NULL)
1792*7754SJeff.Bonwick@Sun.COM 			return (ZIO_PIPELINE_STOP);
1793*7754SJeff.Bonwick@Sun.COM 
1794*7754SJeff.Bonwick@Sun.COM 		if (!vdev_accessible(vd, zio)) {
1795*7754SJeff.Bonwick@Sun.COM 			zio->io_error = ENXIO;
1796*7754SJeff.Bonwick@Sun.COM 			zio_interrupt(zio);
1797*7754SJeff.Bonwick@Sun.COM 			return (ZIO_PIPELINE_STOP);
1798*7754SJeff.Bonwick@Sun.COM 		}
1799*7754SJeff.Bonwick@Sun.COM 
1800*7754SJeff.Bonwick@Sun.COM 	}
1801*7754SJeff.Bonwick@Sun.COM 
18025530Sbonwick 	return (vd->vdev_ops->vdev_op_io_start(zio));
1803789Sahrens }
1804789Sahrens 
18055530Sbonwick static int
1806789Sahrens zio_vdev_io_done(zio_t *zio)
1807789Sahrens {
1808*7754SJeff.Bonwick@Sun.COM 	vdev_t *vd = zio->io_vd;
1809*7754SJeff.Bonwick@Sun.COM 	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
1810*7754SJeff.Bonwick@Sun.COM 	boolean_t unexpected_error = B_FALSE;
18115530Sbonwick 
1812*7754SJeff.Bonwick@Sun.COM 	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
1813*7754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_STOP);
1814*7754SJeff.Bonwick@Sun.COM 
1815*7754SJeff.Bonwick@Sun.COM 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
1816789Sahrens 
1817*7754SJeff.Bonwick@Sun.COM 	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
1818*7754SJeff.Bonwick@Sun.COM 
1819*7754SJeff.Bonwick@Sun.COM 		vdev_queue_io_done(zio);
1820*7754SJeff.Bonwick@Sun.COM 
1821*7754SJeff.Bonwick@Sun.COM 		if (zio->io_type == ZIO_TYPE_WRITE)
1822*7754SJeff.Bonwick@Sun.COM 			vdev_cache_write(zio);
1823*7754SJeff.Bonwick@Sun.COM 
1824*7754SJeff.Bonwick@Sun.COM 		if (zio_injection_enabled && zio->io_error == 0)
1825*7754SJeff.Bonwick@Sun.COM 			zio->io_error = zio_handle_device_injection(vd, EIO);
1826789Sahrens 
1827*7754SJeff.Bonwick@Sun.COM 		if (zio_injection_enabled && zio->io_error == 0)
1828*7754SJeff.Bonwick@Sun.COM 			zio->io_error = zio_handle_label_injection(zio, EIO);
1829*7754SJeff.Bonwick@Sun.COM 
1830*7754SJeff.Bonwick@Sun.COM 		if (zio->io_error) {
1831*7754SJeff.Bonwick@Sun.COM 			if (!vdev_accessible(vd, zio)) {
1832*7754SJeff.Bonwick@Sun.COM 				zio->io_error = ENXIO;
1833*7754SJeff.Bonwick@Sun.COM 			} else {
1834*7754SJeff.Bonwick@Sun.COM 				unexpected_error = B_TRUE;
1835*7754SJeff.Bonwick@Sun.COM 			}
1836*7754SJeff.Bonwick@Sun.COM 		}
18376976Seschrock 	}
1838*7754SJeff.Bonwick@Sun.COM 
1839*7754SJeff.Bonwick@Sun.COM 	ops->vdev_op_io_done(zio);
1840789Sahrens 
1841*7754SJeff.Bonwick@Sun.COM 	if (unexpected_error)
1842*7754SJeff.Bonwick@Sun.COM 		return (zio_vdev_io_probe(zio));
1843*7754SJeff.Bonwick@Sun.COM 
1844*7754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE);
1845789Sahrens }
1846789Sahrens 
/*
 * Pipeline stage: assess the outcome of vdev I/O.  Releases per-vdev
 * state, decides whether a failed I/O should be retried, and normalizes
 * device-failure errors before the pipeline proceeds.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/*
	 * Drop the SCL_ZIO config lock held across a vdev-less I/O;
	 * config writers manage the lock themselves.
	 */
	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	/* Free any vdev-specific data attached to this zio. */
	if (zio->io_vsd != NULL) {
		zio->io_vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 * Only vdev-less (interior) I/Os that haven't opted out and aren't
	 * already retries qualify.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		/* Rewind to just before VDEV_IO_START and reissue async. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = ENXIO;

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf)
		vd->vdev_cant_write = B_TRUE;

	/* On failure, skip the remaining stages except the interlocks. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
1902789Sahrens 
1903789Sahrens void
1904789Sahrens zio_vdev_io_reissue(zio_t *zio)
1905789Sahrens {
1906789Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1907789Sahrens 	ASSERT(zio->io_error == 0);
1908789Sahrens 
1909789Sahrens 	zio->io_stage--;
1910789Sahrens }
1911789Sahrens 
1912789Sahrens void
1913789Sahrens zio_vdev_io_redone(zio_t *zio)
1914789Sahrens {
1915789Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
1916789Sahrens 
1917789Sahrens 	zio->io_stage--;
1918789Sahrens }
1919789Sahrens 
1920789Sahrens void
1921789Sahrens zio_vdev_io_bypass(zio_t *zio)
1922789Sahrens {
1923789Sahrens 	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
1924789Sahrens 	ASSERT(zio->io_error == 0);
1925789Sahrens 
1926789Sahrens 	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
1927789Sahrens 	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1;
1928789Sahrens }
1929789Sahrens 
1930789Sahrens /*
1931789Sahrens  * ==========================================================================
1932789Sahrens  * Generate and verify checksums
1933789Sahrens  * ==========================================================================
1934789Sahrens  */
19355530Sbonwick static int
1936789Sahrens zio_checksum_generate(zio_t *zio)
1937789Sahrens {
1938789Sahrens 	blkptr_t *bp = zio->io_bp;
1939*7754SJeff.Bonwick@Sun.COM 	enum zio_checksum checksum;
1940789Sahrens 
1941*7754SJeff.Bonwick@Sun.COM 	if (bp == NULL) {
1942*7754SJeff.Bonwick@Sun.COM 		/*
1943*7754SJeff.Bonwick@Sun.COM 		 * This is zio_write_phys().
1944*7754SJeff.Bonwick@Sun.COM 		 * We're either generating a label checksum, or none at all.
1945*7754SJeff.Bonwick@Sun.COM 		 */
1946*7754SJeff.Bonwick@Sun.COM 		checksum = zio->io_prop.zp_checksum;
1947789Sahrens 
1948*7754SJeff.Bonwick@Sun.COM 		if (checksum == ZIO_CHECKSUM_OFF)
1949*7754SJeff.Bonwick@Sun.COM 			return (ZIO_PIPELINE_CONTINUE);
1950789Sahrens 
1951*7754SJeff.Bonwick@Sun.COM 		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
1952*7754SJeff.Bonwick@Sun.COM 	} else {
1953*7754SJeff.Bonwick@Sun.COM 		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
1954*7754SJeff.Bonwick@Sun.COM 			ASSERT(!IO_IS_ALLOCATING(zio));
1955*7754SJeff.Bonwick@Sun.COM 			checksum = ZIO_CHECKSUM_GANG_HEADER;
1956*7754SJeff.Bonwick@Sun.COM 		} else {
1957*7754SJeff.Bonwick@Sun.COM 			checksum = BP_GET_CHECKSUM(bp);
1958*7754SJeff.Bonwick@Sun.COM 		}
1959*7754SJeff.Bonwick@Sun.COM 	}
1960789Sahrens 
1961*7754SJeff.Bonwick@Sun.COM 	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
1962789Sahrens 
19635530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1964789Sahrens }
1965789Sahrens 
19665530Sbonwick static int
1967789Sahrens zio_checksum_verify(zio_t *zio)
1968789Sahrens {
1969*7754SJeff.Bonwick@Sun.COM 	blkptr_t *bp = zio->io_bp;
1970*7754SJeff.Bonwick@Sun.COM 	int error;
1971*7754SJeff.Bonwick@Sun.COM 
1972*7754SJeff.Bonwick@Sun.COM 	if (bp == NULL) {
1973*7754SJeff.Bonwick@Sun.COM 		/*
1974*7754SJeff.Bonwick@Sun.COM 		 * This is zio_read_phys().
1975*7754SJeff.Bonwick@Sun.COM 		 * We're either verifying a label checksum, or nothing at all.
1976*7754SJeff.Bonwick@Sun.COM 		 */
1977*7754SJeff.Bonwick@Sun.COM 		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
1978*7754SJeff.Bonwick@Sun.COM 			return (ZIO_PIPELINE_CONTINUE);
1979*7754SJeff.Bonwick@Sun.COM 
1980*7754SJeff.Bonwick@Sun.COM 		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
1981*7754SJeff.Bonwick@Sun.COM 	}
1982*7754SJeff.Bonwick@Sun.COM 
1983*7754SJeff.Bonwick@Sun.COM 	if ((error = zio_checksum_error(zio)) != 0) {
1984*7754SJeff.Bonwick@Sun.COM 		zio->io_error = error;
1985*7754SJeff.Bonwick@Sun.COM 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
19861544Seschrock 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
19871544Seschrock 			    zio->io_spa, zio->io_vd, zio, 0, 0);
1988*7754SJeff.Bonwick@Sun.COM 		}
1989789Sahrens 	}
1990789Sahrens 
19915530Sbonwick 	return (ZIO_PIPELINE_CONTINUE);
1992789Sahrens }
1993789Sahrens 
1994789Sahrens /*
1995789Sahrens  * Called by RAID-Z to ensure we don't compute the checksum twice.
1996789Sahrens  */
1997789Sahrens void
1998789Sahrens zio_checksum_verified(zio_t *zio)
1999789Sahrens {
2000789Sahrens 	zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY);
2001789Sahrens }
2002789Sahrens 
2003789Sahrens /*
2004*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
2005*7754SJeff.Bonwick@Sun.COM  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2006*7754SJeff.Bonwick@Sun.COM  * An error of 0 indictes success.  ENXIO indicates whole-device failure,
2007*7754SJeff.Bonwick@Sun.COM  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
2008*7754SJeff.Bonwick@Sun.COM  * indicate errors that are specific to one I/O, and most likely permanent.
2009*7754SJeff.Bonwick@Sun.COM  * Any other error is presumed to be worse because we weren't expecting it.
2010*7754SJeff.Bonwick@Sun.COM  * ==========================================================================
2011789Sahrens  */
2012*7754SJeff.Bonwick@Sun.COM int
2013*7754SJeff.Bonwick@Sun.COM zio_worst_error(int e1, int e2)
2014789Sahrens {
2015*7754SJeff.Bonwick@Sun.COM 	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2016*7754SJeff.Bonwick@Sun.COM 	int r1, r2;
20171775Sbillm 
2018*7754SJeff.Bonwick@Sun.COM 	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2019*7754SJeff.Bonwick@Sun.COM 		if (e1 == zio_error_rank[r1])
2020*7754SJeff.Bonwick@Sun.COM 			break;
2021*7754SJeff.Bonwick@Sun.COM 
2022*7754SJeff.Bonwick@Sun.COM 	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2023*7754SJeff.Bonwick@Sun.COM 		if (e2 == zio_error_rank[r2])
2024*7754SJeff.Bonwick@Sun.COM 			break;
2025*7754SJeff.Bonwick@Sun.COM 
2026*7754SJeff.Bonwick@Sun.COM 	return (r1 > r2 ? e1 : e2);
2027789Sahrens }
2028789Sahrens 
2029789Sahrens /*
2030789Sahrens  * ==========================================================================
2031*7754SJeff.Bonwick@Sun.COM  * I/O completion
2032789Sahrens  * ==========================================================================
2033789Sahrens  */
/*
 * Pipeline stage: the zio has reached the "ready" state.  Invokes the
 * io_ready callback (for gang blocks, only once all gang children are
 * themselves ready), snapshots the block pointer, and notifies the
 * parent that this child is ready.
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio = zio->io_parent;

	if (zio->io_ready) {
		/* Gang blocks wait for every gang child to be ready first. */
		if (BP_IS_GANG(bp) &&
		    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY))
			return (ZIO_PIPELINE_STOP);

		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	/* Save a copy of the bp; zio_done() verifies it hasn't changed. */
	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	/* On failure, skip the remaining stages except the interlocks. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	if (pio != NULL)
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);

	return (ZIO_PIPELINE_CONTINUE);
}
2063*7754SJeff.Bonwick@Sun.COM 
/*
 * Final pipeline stage.  Collects child errors, sanity-checks the block
 * pointer, updates vdev statistics, posts FMA ereports, and then either
 * (a) arranges for the zio to be reexecuted or suspended, or (b) runs the
 * io_done callback, notifies the parent, and destroys or wakes the zio.
 */
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *pio = zio->io_parent;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/* At this point no children of any type remain outstanding. */
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	/* Sanity-check the bp against the copy saved in zio_ready(). */
	if (bp != NULL) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bp->blk_pad[2] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (pio != NULL && bp == pio->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
	}

	/*
	 * If there were child vdev or gang errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO ||
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);

		/*
		 * Failed allocations reexecute immediately, except for
		 * ENOSPC, which instead suspends the zio.
		 */
		if (IO_IS_ALLOCATING(zio))
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Reads and frees that hit a missing device suspend
		 * unless the pool's failmode says to continue.
		 */
		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    zio->io_error == ENXIO &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/* An I/O that isn't allowed to fail must suspend instead. */
		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		/* Undo any allocation so reexecution starts fresh. */
		if (IO_IS_ALLOCATING(zio))
			zio_dva_unallocate(zio, zio->io_gang_tree, bp);

		zio_gang_tree_free(&zio->io_gang_tree);

		if (pio != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
			(void) taskq_dispatch(
			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
			    (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child == NULL);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	if (zio->io_done)
		zio->io_done(zio);

	zio_gang_tree_free(&zio->io_gang_tree);

	ASSERT(zio->io_delegate_list == NULL);
	ASSERT(zio->io_delegate_next == NULL);

	if (pio != NULL) {
		zio_remove_child(pio, zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		/* Someone is blocked on this zio: wake them instead of freeing. */
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}
2240*7754SJeff.Bonwick@Sun.COM 
/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
/*
 * One handler per pipeline stage, indexed by stage number.
 * NOTE(review): entries appear to be positional -- the order must match
 * the ZIO_STAGE_* enumeration, and the leading NULL presumably
 * corresponds to the initial (open) stage, which has no handler.
 * Confirm against zio_impl.h before reordering.
 */
static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = {
	NULL,
	zio_issue_async,
	zio_read_bp_init,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
2264