xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 4034:b20b176bd1e8)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
51544Seschrock  * Common Development and Distribution License (the "License").
61544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
212082Seschrock 
22789Sahrens /*
233456Sahl  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24789Sahrens  * Use is subject to license terms.
25789Sahrens  */
26789Sahrens 
27789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28789Sahrens 
29789Sahrens #include <sys/zfs_context.h>
30789Sahrens #include <sys/spa.h>
31789Sahrens #include <sys/vdev_impl.h>
32789Sahrens #include <sys/zio.h>
33789Sahrens #include <sys/zio_checksum.h>
34789Sahrens #include <sys/fs/zfs.h>
351544Seschrock #include <sys/fm/fs/zfs.h>
36789Sahrens 
37789Sahrens /*
38789Sahrens  * Virtual device vector for RAID-Z.
392082Seschrock  *
402082Seschrock  * This vdev supports both single and double parity. For single parity, we
412082Seschrock  * use a simple XOR of all the data columns. For double parity, we use both
422082Seschrock  * the simple XOR as well as a technique described in "The mathematics of
432082Seschrock  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
452082Seschrock  * the field are defined as follows:
462082Seschrock  *
472082Seschrock  *   o addition (+) is represented by a bitwise XOR
482082Seschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
492082Seschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
502082Seschrock  *	(A * 2)_7 = A_6
512082Seschrock  *	(A * 2)_6 = A_5
522082Seschrock  *	(A * 2)_5 = A_4
532082Seschrock  *	(A * 2)_4 = A_3 + A_7
542082Seschrock  *	(A * 2)_3 = A_2 + A_7
552082Seschrock  *	(A * 2)_2 = A_1 + A_7
562082Seschrock  *	(A * 2)_1 = A_0
572082Seschrock  *	(A * 2)_0 = A_7
582082Seschrock  *
592082Seschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
602082Seschrock  *
612082Seschrock  * Observe that any number in the field (except for 0) can be expressed as a
622082Seschrock  * power of 2 -- a generator for the field. We store a table of the powers of
632082Seschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
642082Seschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
652082Seschrock  * than field addition). The inverse of a field element A (A^-1) is A^254.
662082Seschrock  *
672082Seschrock  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
682082Seschrock  * can be expressed by field operations:
692082Seschrock  *
702082Seschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
712082Seschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
722082Seschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
732082Seschrock  *
 * See the reconstruction code below for how P and Q can be used individually or
752082Seschrock  * in concert to recover missing data columns.
76789Sahrens  */
77789Sahrens 
/*
 * State for one column of a RAID-Z I/O: the child it targets, the extent on
 * that child, the buffer, and the outcome of the attempt.
 */
typedef struct raidz_col {
	/* child device index for I/O */
	uint64_t rc_devidx;
	/* device offset */
	uint64_t rc_offset;
	/* I/O size */
	uint64_t rc_size;
	/* I/O data */
	void *rc_data;
	/* I/O error for this device */
	int rc_error;
	/* Did we attempt this I/O column? */
	uint8_t rc_tried;
	/* Did we skip this I/O column? */
	uint8_t rc_skipped;
} raidz_col_t;
87789Sahrens 
88789Sahrens typedef struct raidz_map {
892082Seschrock 	uint64_t rm_cols;		/* Column count */
902082Seschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
912082Seschrock 	uint64_t rm_asize;		/* Actual total I/O size */
922082Seschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
932082Seschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
942082Seschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
952082Seschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
96789Sahrens } raidz_map_t;
97789Sahrens 
/* Indices of the P and Q parity columns within a raidz_map_t's rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count accepted by vdev_raidz_open(). */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a field element by 2 in GF(2^8): shift left by one and, if the
 * high bit was set, fold it back in by XORing with 0x1d.
 *
 * The shift is performed on a promoted int, so the bit shifted out of the
 * byte would otherwise survive as bit 8 of the result. Masking with 0xff
 * keeps the result a valid field element even when it is not immediately
 * stored in a uint8_t. Note that 'a' is evaluated twice.
 */
#define	VDEV_RAIDZ_MUL_2(a)	\
	((((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) & 0xff)
1042082Seschrock 
1052082Seschrock /*
1062082Seschrock  * These two tables represent powers and logs of 2 in the Galois field defined
1072082Seschrock  * above. These values were computed by repeatedly multiplying by 2 as above.
1082082Seschrock  */
/*
 * vdev_raidz_pow2[i] is 2^i in the field. Entry 255 repeats entry 0 (0x01).
 * The comment at the start of each row is the index of its first entry.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	/* 0x00 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	/* 0x08 */ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	/* 0x10 */ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	/* 0x18 */ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	/* 0x20 */ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	/* 0x28 */ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	/* 0x30 */ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	/* 0x38 */ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	/* 0x40 */ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	/* 0x48 */ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	/* 0x50 */ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	/* 0x58 */ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	/* 0x60 */ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	/* 0x68 */ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	/* 0x70 */ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	/* 0x78 */ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	/* 0x80 */ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	/* 0x88 */ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	/* 0x90 */ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	/* 0x98 */ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	/* 0xa0 */ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	/* 0xa8 */ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	/* 0xb0 */ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	/* 0xb8 */ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	/* 0xc0 */ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	/* 0xc8 */ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	/* 0xd0 */ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	/* 0xd8 */ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	/* 0xe0 */ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	/* 0xe8 */ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	/* 0xf0 */ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	/* 0xf8 */ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[v] is the exponent i (0..254) for which 2^i == v in the
 * field. Zero has no logarithm; its slot holds 0 and must not be relied on.
 * The comment at the start of each row is the index of its first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	/* 0x00 */ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	/* 0x08 */ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	/* 0x10 */ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	/* 0x18 */ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	/* 0x20 */ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	/* 0x28 */ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	/* 0x30 */ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	/* 0x38 */ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	/* 0x40 */ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	/* 0x48 */ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	/* 0x50 */ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	/* 0x58 */ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	/* 0x60 */ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	/* 0x68 */ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	/* 0x70 */ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	/* 0x78 */ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	/* 0x80 */ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	/* 0x88 */ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	/* 0x90 */ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	/* 0x98 */ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	/* 0xa0 */ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	/* 0xa8 */ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	/* 0xb0 */ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	/* 0xb8 */ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	/* 0xc0 */ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	/* 0xc8 */ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	/* 0xd0 */ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	/* 0xd8 */ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	/* 0xe0 */ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	/* 0xe8 */ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	/* 0xf0 */ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	/* 0xf8 */ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
1772082Seschrock 
1782082Seschrock /*
1792082Seschrock  * Multiply a given number by 2 raised to the given power.
1802082Seschrock  */
1812082Seschrock static uint8_t
1822082Seschrock vdev_raidz_exp2(uint_t a, int exp)
1832082Seschrock {
1842082Seschrock 	if (a == 0)
1852082Seschrock 		return (0);
1862082Seschrock 
1872082Seschrock 	ASSERT(exp >= 0);
1882082Seschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
1892082Seschrock 
1902082Seschrock 	exp += vdev_raidz_log2[a];
1912082Seschrock 	if (exp > 255)
1922082Seschrock 		exp -= 255;
1932082Seschrock 
1942082Seschrock 	return (vdev_raidz_pow2[exp]);
1952082Seschrock }
1962082Seschrock 
197789Sahrens static raidz_map_t *
1982082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
1992082Seschrock     uint64_t nparity)
200789Sahrens {
201789Sahrens 	raidz_map_t *rm;
202789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
203789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
204789Sahrens 	uint64_t f = b % dcols;
205789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
2062082Seschrock 	uint64_t q, r, c, bc, col, acols, coff, devidx;
207789Sahrens 
2082082Seschrock 	q = s / (dcols - nparity);
2092082Seschrock 	r = s - q * (dcols - nparity);
2102082Seschrock 	bc = (r == 0 ? 0 : r + nparity);
211789Sahrens 
212789Sahrens 	acols = (q == 0 ? bc : dcols);
213789Sahrens 
214789Sahrens 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215789Sahrens 
216789Sahrens 	rm->rm_cols = acols;
217789Sahrens 	rm->rm_bigcols = bc;
218789Sahrens 	rm->rm_asize = 0;
2192082Seschrock 	rm->rm_missingdata = 0;
2202082Seschrock 	rm->rm_missingparity = 0;
2212082Seschrock 	rm->rm_firstdatacol = nparity;
222789Sahrens 
223789Sahrens 	for (c = 0; c < acols; c++) {
224789Sahrens 		col = f + c;
225789Sahrens 		coff = o;
226789Sahrens 		if (col >= dcols) {
227789Sahrens 			col -= dcols;
228789Sahrens 			coff += 1ULL << unit_shift;
229789Sahrens 		}
2302082Seschrock 		rm->rm_col[c].rc_devidx = col;
231789Sahrens 		rm->rm_col[c].rc_offset = coff;
232789Sahrens 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233789Sahrens 		rm->rm_col[c].rc_data = NULL;
234789Sahrens 		rm->rm_col[c].rc_error = 0;
235789Sahrens 		rm->rm_col[c].rc_tried = 0;
236789Sahrens 		rm->rm_col[c].rc_skipped = 0;
237789Sahrens 		rm->rm_asize += rm->rm_col[c].rc_size;
238789Sahrens 	}
239789Sahrens 
2402082Seschrock 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241789Sahrens 
242789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
243789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244789Sahrens 
245789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
246789Sahrens 
247789Sahrens 	for (c = c + 1; c < acols; c++)
248789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249789Sahrens 		    rm->rm_col[c - 1].rc_size;
250789Sahrens 
2511133Seschrock 	/*
2522082Seschrock 	 * If all data stored spans all columns, there's a danger that parity
2532082Seschrock 	 * will always be on the same device and, since parity isn't read
2542082Seschrock 	 * during normal operation, that that device's I/O bandwidth won't be
2552082Seschrock 	 * used effectively. We therefore switch the parity every 1MB.
2562082Seschrock 	 *
2572082Seschrock 	 * ... at least that was, ostensibly, the theory. As a practical
2582082Seschrock 	 * matter unless we juggle the parity between all devices evenly, we
2592082Seschrock 	 * won't see any benefit. Further, occasional writes that aren't a
2602082Seschrock 	 * multiple of the LCM of the number of children and the minimum
2612082Seschrock 	 * stripe width are sufficient to avoid pessimal behavior.
2622082Seschrock 	 * Unfortunately, this decision created an implicit on-disk format
2633456Sahl 	 * requirement that we need to support for all eternity, but only
2643456Sahl 	 * for single-parity RAID-Z.
2651133Seschrock 	 */
2661133Seschrock 	ASSERT(rm->rm_cols >= 2);
2671133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268789Sahrens 
2692082Seschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
2702082Seschrock 		devidx = rm->rm_col[0].rc_devidx;
2711133Seschrock 		o = rm->rm_col[0].rc_offset;
2722082Seschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
2731133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
2742082Seschrock 		rm->rm_col[1].rc_devidx = devidx;
2751133Seschrock 		rm->rm_col[1].rc_offset = o;
276789Sahrens 	}
277789Sahrens 
278789Sahrens 	zio->io_vsd = rm;
279789Sahrens 	return (rm);
280789Sahrens }
281789Sahrens 
282789Sahrens static void
283789Sahrens vdev_raidz_map_free(zio_t *zio)
284789Sahrens {
285789Sahrens 	raidz_map_t *rm = zio->io_vsd;
286789Sahrens 	int c;
287789Sahrens 
288789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
289789Sahrens 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290789Sahrens 
291789Sahrens 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292789Sahrens 	zio->io_vsd = NULL;
293789Sahrens }
294789Sahrens 
295789Sahrens static void
2962082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
2972082Seschrock {
2982082Seschrock 	uint64_t *p, *src, pcount, ccount, i;
2992082Seschrock 	int c;
3002082Seschrock 
3012082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
3022082Seschrock 
3032082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
3042082Seschrock 		src = rm->rm_col[c].rc_data;
3052082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3062082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
3072082Seschrock 
3082082Seschrock 		if (c == rm->rm_firstdatacol) {
3092082Seschrock 			ASSERT(ccount == pcount);
3102082Seschrock 			for (i = 0; i < ccount; i++, p++, src++) {
3112082Seschrock 				*p = *src;
3122082Seschrock 			}
3132082Seschrock 		} else {
3142082Seschrock 			ASSERT(ccount <= pcount);
3152082Seschrock 			for (i = 0; i < ccount; i++, p++, src++) {
3162082Seschrock 				*p ^= *src;
3172082Seschrock 			}
3182082Seschrock 		}
3192082Seschrock 	}
3202082Seschrock }
3212082Seschrock 
3222082Seschrock static void
3232082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324789Sahrens {
3252082Seschrock 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
3262082Seschrock 	int c;
3272082Seschrock 
3282082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
3292082Seschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
3302082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
3312082Seschrock 
3322082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
3332082Seschrock 		src = rm->rm_col[c].rc_data;
3342082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3352082Seschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
3362082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
3372082Seschrock 
3382082Seschrock 		if (c == rm->rm_firstdatacol) {
3392082Seschrock 			ASSERT(ccount == pcount || ccount == 0);
3402082Seschrock 			for (i = 0; i < ccount; i++, p++, q++, src++) {
3412082Seschrock 				*q = *src;
3422082Seschrock 				*p = *src;
3432082Seschrock 			}
3442082Seschrock 			for (; i < pcount; i++, p++, q++, src++) {
3452082Seschrock 				*q = 0;
3462082Seschrock 				*p = 0;
3472082Seschrock 			}
3482082Seschrock 		} else {
3492082Seschrock 			ASSERT(ccount <= pcount);
350789Sahrens 
3512082Seschrock 			/*
3522082Seschrock 			 * Rather than multiplying each byte individually (as
3532082Seschrock 			 * described above), we are able to handle 8 at once
3542082Seschrock 			 * by generating a mask based on the high bit in each
3552082Seschrock 			 * byte and using that to conditionally XOR in 0x1d.
3562082Seschrock 			 */
3572082Seschrock 			for (i = 0; i < ccount; i++, p++, q++, src++) {
3582082Seschrock 				mask = *q & 0x8080808080808080ULL;
3592082Seschrock 				mask = (mask << 1) - (mask >> 7);
3602082Seschrock 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
3612082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
3622082Seschrock 				*q ^= *src;
3632082Seschrock 				*p ^= *src;
3642082Seschrock 			}
3652082Seschrock 
3662082Seschrock 			/*
3672082Seschrock 			 * Treat short columns as though they are full of 0s.
3682082Seschrock 			 */
3692082Seschrock 			for (; i < pcount; i++, q++) {
3702082Seschrock 				mask = *q & 0x8080808080808080ULL;
3712082Seschrock 				mask = (mask << 1) - (mask >> 7);
3722082Seschrock 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
3732082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
3742082Seschrock 			}
3752082Seschrock 		}
3762082Seschrock 	}
3772082Seschrock }
3782082Seschrock 
3792082Seschrock static void
3802082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
3812082Seschrock {
3822082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
3832082Seschrock 	int c;
3842082Seschrock 
3852082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
3862082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
3872082Seschrock 	ASSERT(xcount > 0);
3882082Seschrock 
3892082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3902082Seschrock 	dst = rm->rm_col[x].rc_data;
3912082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
3922082Seschrock 		*dst = *src;
3932082Seschrock 	}
3942082Seschrock 
3952082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396789Sahrens 		src = rm->rm_col[c].rc_data;
397789Sahrens 		dst = rm->rm_col[x].rc_data;
3982082Seschrock 
3992082Seschrock 		if (c == x)
4002082Seschrock 			continue;
4012082Seschrock 
4022082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
4032082Seschrock 		count = MIN(ccount, xcount);
4042082Seschrock 
4052082Seschrock 		for (i = 0; i < count; i++, dst++, src++) {
4062082Seschrock 			*dst ^= *src;
407789Sahrens 		}
408789Sahrens 	}
409789Sahrens }
410789Sahrens 
4112082Seschrock static void
4122082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
4132082Seschrock {
4142082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
4152082Seschrock 	uint8_t *b;
4162082Seschrock 	int c, j, exp;
4172082Seschrock 
4182082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
4192082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
4202082Seschrock 
4212082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
4222082Seschrock 		src = rm->rm_col[c].rc_data;
4232082Seschrock 		dst = rm->rm_col[x].rc_data;
4242082Seschrock 
4252082Seschrock 		if (c == x)
4262082Seschrock 			ccount = 0;
4272082Seschrock 		else
4282082Seschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
4292082Seschrock 
4302082Seschrock 		count = MIN(ccount, xcount);
4312082Seschrock 
4322082Seschrock 		if (c == rm->rm_firstdatacol) {
4332082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
4342082Seschrock 				*dst = *src;
4352082Seschrock 			}
4362082Seschrock 			for (; i < xcount; i++, dst++) {
4372082Seschrock 				*dst = 0;
4382082Seschrock 			}
4392082Seschrock 
4402082Seschrock 		} else {
4412082Seschrock 			/*
4422082Seschrock 			 * For an explanation of this, see the comment in
4432082Seschrock 			 * vdev_raidz_generate_parity_pq() above.
4442082Seschrock 			 */
4452082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
4462082Seschrock 				mask = *dst & 0x8080808080808080ULL;
4472082Seschrock 				mask = (mask << 1) - (mask >> 7);
4482082Seschrock 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
4492082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
4502082Seschrock 				*dst ^= *src;
4512082Seschrock 			}
4522082Seschrock 
4532082Seschrock 			for (; i < xcount; i++, dst++) {
4542082Seschrock 				mask = *dst & 0x8080808080808080ULL;
4552082Seschrock 				mask = (mask << 1) - (mask >> 7);
4562082Seschrock 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
4572082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
4582082Seschrock 			}
4592082Seschrock 		}
4602082Seschrock 	}
4612082Seschrock 
4622082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
4632082Seschrock 	dst = rm->rm_col[x].rc_data;
4642082Seschrock 	exp = 255 - (rm->rm_cols - 1 - x);
4652082Seschrock 
4662082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
4672082Seschrock 		*dst ^= *src;
4682082Seschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
4692082Seschrock 			*b = vdev_raidz_exp2(*b, exp);
4702082Seschrock 		}
4712082Seschrock 	}
4722082Seschrock }
4732082Seschrock 
4742082Seschrock static void
4752082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
4762082Seschrock {
4772082Seschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
4782082Seschrock 	void *pdata, *qdata;
4792082Seschrock 	uint64_t xsize, ysize, i;
4802082Seschrock 
4812082Seschrock 	ASSERT(x < y);
4822082Seschrock 	ASSERT(x >= rm->rm_firstdatacol);
4832082Seschrock 	ASSERT(y < rm->rm_cols);
4842082Seschrock 
4852082Seschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
4862082Seschrock 
4872082Seschrock 	/*
4882082Seschrock 	 * Move the parity data aside -- we're going to compute parity as
4892082Seschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
4902082Seschrock 	 * reuse the parity generation mechanism without trashing the actual
4912082Seschrock 	 * parity so we make those columns appear to be full of zeros by
4922082Seschrock 	 * setting their lengths to zero.
4932082Seschrock 	 */
4942082Seschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
4952082Seschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
4962082Seschrock 	xsize = rm->rm_col[x].rc_size;
4972082Seschrock 	ysize = rm->rm_col[y].rc_size;
4982082Seschrock 
4992082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
5002082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
5012082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
5022082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
5032082Seschrock 	rm->rm_col[x].rc_size = 0;
5042082Seschrock 	rm->rm_col[y].rc_size = 0;
5052082Seschrock 
5062082Seschrock 	vdev_raidz_generate_parity_pq(rm);
5072082Seschrock 
5082082Seschrock 	rm->rm_col[x].rc_size = xsize;
5092082Seschrock 	rm->rm_col[y].rc_size = ysize;
5102082Seschrock 
5112082Seschrock 	p = pdata;
5122082Seschrock 	q = qdata;
5132082Seschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5142082Seschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
5152082Seschrock 	xd = rm->rm_col[x].rc_data;
5162082Seschrock 	yd = rm->rm_col[y].rc_data;
5172082Seschrock 
5182082Seschrock 	/*
5192082Seschrock 	 * We now have:
5202082Seschrock 	 *	Pxy = P + D_x + D_y
5212082Seschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
5222082Seschrock 	 *
5232082Seschrock 	 * We can then solve for D_x:
5242082Seschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
5252082Seschrock 	 * where
5262082Seschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
5272082Seschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
5282082Seschrock 	 *
5292082Seschrock 	 * With D_x in hand, we can easily solve for D_y:
5302082Seschrock 	 *	D_y = P + Pxy + D_x
5312082Seschrock 	 */
5322082Seschrock 
5332082Seschrock 	a = vdev_raidz_pow2[255 + x - y];
5342082Seschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
5352082Seschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
5362082Seschrock 
5372082Seschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
5382082Seschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
5392082Seschrock 
5402082Seschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
5412082Seschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
5422082Seschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
5432082Seschrock 
5442082Seschrock 		if (i < ysize)
5452082Seschrock 			*yd = *p ^ *pxy ^ *xd;
5462082Seschrock 	}
5472082Seschrock 
5482082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
5492082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
5502082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
5512082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
5522082Seschrock 
5532082Seschrock 	/*
5542082Seschrock 	 * Restore the saved parity data.
5552082Seschrock 	 */
5562082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
5572082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
5582082Seschrock }
5592082Seschrock 
5602082Seschrock 
561789Sahrens static int
562789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563789Sahrens {
564789Sahrens 	vdev_t *cvd;
5652082Seschrock 	uint64_t nparity = vd->vdev_nparity;
566789Sahrens 	int c, error;
567789Sahrens 	int lasterror = 0;
568789Sahrens 	int numerrors = 0;
569789Sahrens 
5702082Seschrock 	ASSERT(nparity > 0);
5712082Seschrock 
5722082Seschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
5732082Seschrock 	    vd->vdev_children < nparity + 1) {
574789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575789Sahrens 		return (EINVAL);
576789Sahrens 	}
577789Sahrens 
578789Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
579789Sahrens 		cvd = vd->vdev_child[c];
580789Sahrens 
581789Sahrens 		if ((error = vdev_open(cvd)) != 0) {
582789Sahrens 			lasterror = error;
583789Sahrens 			numerrors++;
584789Sahrens 			continue;
585789Sahrens 		}
586789Sahrens 
587789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
5881732Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
589789Sahrens 	}
590789Sahrens 
591789Sahrens 	*asize *= vd->vdev_children;
592789Sahrens 
5932082Seschrock 	if (numerrors > nparity) {
594789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595789Sahrens 		return (lasterror);
596789Sahrens 	}
597789Sahrens 
598789Sahrens 	return (0);
599789Sahrens }
600789Sahrens 
601789Sahrens static void
602789Sahrens vdev_raidz_close(vdev_t *vd)
603789Sahrens {
604789Sahrens 	int c;
605789Sahrens 
606789Sahrens 	for (c = 0; c < vd->vdev_children; c++)
607789Sahrens 		vdev_close(vd->vdev_child[c]);
608789Sahrens }
609789Sahrens 
610789Sahrens static uint64_t
611789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612789Sahrens {
613789Sahrens 	uint64_t asize;
6141732Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
615789Sahrens 	uint64_t cols = vd->vdev_children;
6162082Seschrock 	uint64_t nparity = vd->vdev_nparity;
617789Sahrens 
6181732Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
6192082Seschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
6202082Seschrock 	asize = roundup(asize, nparity + 1) << ashift;
621789Sahrens 
622789Sahrens 	return (asize);
623789Sahrens }
624789Sahrens 
625789Sahrens static void
626789Sahrens vdev_raidz_child_done(zio_t *zio)
627789Sahrens {
628789Sahrens 	raidz_col_t *rc = zio->io_private;
629789Sahrens 
630789Sahrens 	rc->rc_error = zio->io_error;
631789Sahrens 	rc->rc_tried = 1;
632789Sahrens 	rc->rc_skipped = 0;
633789Sahrens }
634789Sahrens 
635789Sahrens static void
636789Sahrens vdev_raidz_repair_done(zio_t *zio)
637789Sahrens {
6381732Sbonwick 	ASSERT(zio->io_private == zio->io_parent);
6391732Sbonwick 	vdev_raidz_map_free(zio->io_private);
640789Sahrens }
641789Sahrens 
642789Sahrens static void
643789Sahrens vdev_raidz_io_start(zio_t *zio)
644789Sahrens {
645789Sahrens 	vdev_t *vd = zio->io_vd;
6461732Sbonwick 	vdev_t *tvd = vd->vdev_top;
647789Sahrens 	vdev_t *cvd;
648789Sahrens 	blkptr_t *bp = zio->io_bp;
649789Sahrens 	raidz_map_t *rm;
650789Sahrens 	raidz_col_t *rc;
651789Sahrens 	int c;
652789Sahrens 
6532082Seschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
6542082Seschrock 	    vd->vdev_nparity);
655789Sahrens 
6561775Sbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657789Sahrens 
658789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
659789Sahrens 		/*
6602082Seschrock 		 * Generate RAID parity in the first virtual columns.
661789Sahrens 		 */
6622082Seschrock 		if (rm->rm_firstdatacol == 1)
6632082Seschrock 			vdev_raidz_generate_parity_p(rm);
6642082Seschrock 		else
6652082Seschrock 			vdev_raidz_generate_parity_pq(rm);
666789Sahrens 
667789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
668789Sahrens 			rc = &rm->rm_col[c];
6692082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
670789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
672789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673789Sahrens 			    vdev_raidz_child_done, rc));
674789Sahrens 		}
675789Sahrens 		zio_wait_children_done(zio);
676789Sahrens 		return;
677789Sahrens 	}
678789Sahrens 
679789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
680789Sahrens 
6812082Seschrock 	/*
6822082Seschrock 	 * Iterate over the columns in reverse order so that we hit the parity
6832082Seschrock 	 * last -- any errors along the way will force us to read the parity
6842082Seschrock 	 * data.
6852082Seschrock 	 */
686789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
687789Sahrens 		rc = &rm->rm_col[c];
6882082Seschrock 		cvd = vd->vdev_child[rc->rc_devidx];
689789Sahrens 		if (vdev_is_dead(cvd)) {
6902082Seschrock 			if (c >= rm->rm_firstdatacol)
6912082Seschrock 				rm->rm_missingdata++;
6922082Seschrock 			else
6932082Seschrock 				rm->rm_missingparity++;
694789Sahrens 			rc->rc_error = ENXIO;
695789Sahrens 			rc->rc_tried = 1;	/* don't even try */
696789Sahrens 			rc->rc_skipped = 1;
697789Sahrens 			continue;
698789Sahrens 		}
699789Sahrens 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
7002082Seschrock 			if (c >= rm->rm_firstdatacol)
7012082Seschrock 				rm->rm_missingdata++;
7022082Seschrock 			else
7032082Seschrock 				rm->rm_missingparity++;
704789Sahrens 			rc->rc_error = ESTALE;
705789Sahrens 			rc->rc_skipped = 1;
706789Sahrens 			continue;
707789Sahrens 		}
7082082Seschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709789Sahrens 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
710789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
712789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713789Sahrens 			    vdev_raidz_child_done, rc));
714789Sahrens 		}
715789Sahrens 	}
716789Sahrens 
717789Sahrens 	zio_wait_children_done(zio);
718789Sahrens }
719789Sahrens 
7201544Seschrock /*
7211544Seschrock  * Report a checksum error for a child of a RAID-Z device.
7221544Seschrock  */
7231544Seschrock static void
7241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
7251544Seschrock {
7262082Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
7271544Seschrock 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
7281544Seschrock 	    vdev_description(vd));
7291544Seschrock 
7301544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
7311544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
7321544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
7331544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
7341544Seschrock 	}
7351544Seschrock 
7361544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
7371544Seschrock 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
7381544Seschrock 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
7391544Seschrock }
7401544Seschrock 
7412082Seschrock /*
7422082Seschrock  * Generate the parity from the data columns. If we tried and were able to
7432082Seschrock  * read the parity without error, verify that the generated parity matches the
7442082Seschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
7452082Seschrock  * number such failures.
7462082Seschrock  */
7472082Seschrock static int
7482082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
7492082Seschrock {
7502082Seschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
7512082Seschrock 	int c, ret = 0;
7522082Seschrock 	raidz_col_t *rc;
7532082Seschrock 
7542082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
7552082Seschrock 		rc = &rm->rm_col[c];
7562082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
7572082Seschrock 			continue;
7582082Seschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
7592082Seschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
7602082Seschrock 	}
7612082Seschrock 
7622082Seschrock 	if (rm->rm_firstdatacol == 1)
7632082Seschrock 		vdev_raidz_generate_parity_p(rm);
7642082Seschrock 	else
7652082Seschrock 		vdev_raidz_generate_parity_pq(rm);
7662082Seschrock 
7672082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
7682082Seschrock 		rc = &rm->rm_col[c];
7692082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
7702082Seschrock 			continue;
7712082Seschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
7722082Seschrock 			raidz_checksum_error(zio, rc);
7732082Seschrock 			rc->rc_error = ECKSUM;
7742082Seschrock 			ret++;
7752082Seschrock 		}
7762082Seschrock 		zio_buf_free(orig[c], rc->rc_size);
7772082Seschrock 	}
7782082Seschrock 
7792082Seschrock 	return (ret);
7802082Seschrock }
7812082Seschrock 
/*
 * Running totals of reads in vdev_raidz_io_done() whose data passed its
 * checksum only after reconstruction: using P alone, using Q alone, and
 * using P and Q together. Each is bumped with atomic_inc_64().
 */
7822082Seschrock static uint64_t raidz_corrected_p;
7832082Seschrock static uint64_t raidz_corrected_q;
7842082Seschrock static uint64_t raidz_corrected_pq;
7851544Seschrock 
/*
 * I/O completion handler for a RAID-Z vdev.
 *
 * The per-zio error state is reset and then rebuilt from the column map:
 * every column with rc_error set is counted as a parity or data error
 * (columns below rm_firstdatacol are parity), errors on columns that were
 * not deliberately skipped are counted as unexpected, and parity columns
 * that were never tried are counted separately.
 *
 * Writes: if no more columns failed than there are parity columns, and the
 * write is not failfast, the error is cleared. The map is freed and the zio
 * advances to its next stage.
 *
 * Reads: the three phases described in the body are attempted in turn --
 * use/reconstruct from what was read, reissue reads for every column not
 * yet tried (this function is then expected to run again once those
 * complete), and finally combinatorial reconstruction. If nothing yields
 * data that passes the checksum, ECKSUM is returned and (for
 * non-speculative I/O) an ereport is posted for every child.
 *
 * On exit through 'done', if the read succeeded, the pool is writable, and
 * either something unexpected happened or this is a resilver, every column
 * that still carries an error is rewritten from the good data in hand.
 */
786789Sahrens static void
787789Sahrens vdev_raidz_io_done(zio_t *zio)
788789Sahrens {
789789Sahrens 	vdev_t *vd = zio->io_vd;
790789Sahrens 	vdev_t *cvd;
791789Sahrens 	raidz_map_t *rm = zio->io_vsd;
7922082Seschrock 	raidz_col_t *rc, *rc1;
793789Sahrens 	int unexpected_errors = 0;
7942082Seschrock 	int parity_errors = 0;
7953456Sahl 	int parity_untried = 0;
7962082Seschrock 	int data_errors = 0;
7972082Seschrock 	int n, c, c1;
798789Sahrens 
7991775Sbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
800789Sahrens 
801789Sahrens 	zio->io_error = 0;
802789Sahrens 	zio->io_numerrors = 0;
803789Sahrens 
8042082Seschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
8052082Seschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
8062082Seschrock 
807789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
808789Sahrens 		rc = &rm->rm_col[c];
809789Sahrens 
810789Sahrens 		/*
811789Sahrens 		 * We preserve any EIOs because those may be worth retrying;
812789Sahrens 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
813789Sahrens 		 */
814789Sahrens 		if (rc->rc_error) {
815789Sahrens 			if (zio->io_error != EIO)
816789Sahrens 				zio->io_error = rc->rc_error;
8172082Seschrock 
8182082Seschrock 			if (c < rm->rm_firstdatacol)
8192082Seschrock 				parity_errors++;
8202082Seschrock 			else
8212082Seschrock 				data_errors++;
8222082Seschrock 
823789Sahrens 			if (!rc->rc_skipped)
824789Sahrens 				unexpected_errors++;
8252082Seschrock 
826789Sahrens 			zio->io_numerrors++;
8273456Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
8283456Sahl 			parity_untried++;
829789Sahrens 		}
830789Sahrens 	}
831789Sahrens 
832789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
833789Sahrens 		/*
834789Sahrens 		 * If this is not a failfast write, and we were able to
835789Sahrens 		 * write enough columns to reconstruct the data, good enough.
836789Sahrens 		 */
837789Sahrens 		/* XXPOLICY */
838789Sahrens 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
839789Sahrens 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
840789Sahrens 			zio->io_error = 0;
841789Sahrens 
842789Sahrens 		vdev_raidz_map_free(zio);
843789Sahrens 		zio_next_stage(zio);
844789Sahrens 		return;
845789Sahrens 	}
846789Sahrens 
847789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
8482082Seschrock 	/*
8492082Seschrock 	 * There are three potential phases for a read:
8502082Seschrock 	 *	1. produce valid data from the columns read
8512082Seschrock 	 *	2. read all disks and try again
8522082Seschrock 	 *	3. perform combinatorial reconstruction
8532082Seschrock 	 *
8542082Seschrock 	 * Each phase is progressively both more expensive and less likely to
8552082Seschrock 	 * occur. If we encounter more errors than we can repair or all phases
8562082Seschrock 	 * fail, we have no choice but to return an error.
8572082Seschrock 	 */
858789Sahrens 
859789Sahrens 	/*
8602082Seschrock 	 * If the number of errors we saw was correctable -- less than or equal
8613456Sahl 	 * to the number of parity disks read -- attempt to produce data that
8623456Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
8633456Sahl 	 * any errors.
864789Sahrens 	 */
8653456Sahl 	if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
8662082Seschrock 		switch (data_errors) {
8672082Seschrock 		case 0:
8682082Seschrock 			if (zio_checksum_error(zio) == 0) {
8692082Seschrock 				zio->io_error = 0;
870*4034Sahl 
871*4034Sahl 				/*
872*4034Sahl 				 * If we read parity information (unnecessarily
873*4034Sahl 				 * as it happens since no reconstruction was
874*4034Sahl 				 * needed) regenerate and verify the parity.
875*4034Sahl 				 * We also regenerate parity when resilvering
876*4034Sahl 				 * so we can write it out to the failed device
877*4034Sahl 				 * later.
878*4034Sahl 				 */
8793456Sahl 				if (parity_errors + parity_untried <
880*4034Sahl 				    rm->rm_firstdatacol ||
881*4034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
8823456Sahl 					n = raidz_parity_verify(zio, rm);
8833456Sahl 					unexpected_errors += n;
8843456Sahl 					ASSERT(parity_errors + n <=
8853456Sahl 					    rm->rm_firstdatacol);
8863456Sahl 				}
8872082Seschrock 				goto done;
8882082Seschrock 			}
8892082Seschrock 			break;
8902082Seschrock 
8912082Seschrock 		case 1:
8923456Sahl 			/*
8933456Sahl 			 * We either attempt to read all the parity columns or
8943456Sahl 			 * none of them. If we didn't try to read parity, we
8953456Sahl 			 * wouldn't be here in the correctable case. There must
8963456Sahl 			 * also have been fewer parity errors than parity
8973456Sahl 			 * columns or, again, we wouldn't be in this code path.
8983456Sahl 			 */
8993456Sahl 			ASSERT(parity_untried == 0);
9002082Seschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
9012082Seschrock 
9022082Seschrock 			/*
9032082Seschrock 			 * Find the column that reported the error.
9042082Seschrock 			 */
9052082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
9062082Seschrock 				rc = &rm->rm_col[c];
9072082Seschrock 				if (rc->rc_error != 0)
9082082Seschrock 					break;
9092082Seschrock 			}
9102082Seschrock 			ASSERT(c != rm->rm_cols);
9112082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
9122082Seschrock 			    rc->rc_error == ESTALE);
9132082Seschrock 
			/* Prefer P when it was read cleanly; otherwise use Q. */
9142082Seschrock 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
9152082Seschrock 				vdev_raidz_reconstruct_p(rm, c);
9162082Seschrock 			} else {
9172082Seschrock 				ASSERT(rm->rm_firstdatacol > 1);
9182082Seschrock 				vdev_raidz_reconstruct_q(rm, c);
9192082Seschrock 			}
9202082Seschrock 
9212082Seschrock 			if (zio_checksum_error(zio) == 0) {
9222082Seschrock 				zio->io_error = 0;
9232082Seschrock 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
9242082Seschrock 					atomic_inc_64(&raidz_corrected_p);
9252082Seschrock 				else
9262082Seschrock 					atomic_inc_64(&raidz_corrected_q);
927789Sahrens 
9282082Seschrock 				/*
9293456Sahl 				 * If there's more than one parity disk that
9303456Sahl 				 * was successfully read, confirm that the
9313456Sahl 				 * other parity disk produced the correct data.
9323456Sahl 				 * This routine is suboptimal in that it
9333456Sahl 				 * regenerates both the parity we wish to test
9343456Sahl 				 * as well as the parity we just used to
9353456Sahl 				 * perform the reconstruction, but this should
9363456Sahl 				 * be a relatively uncommon case, and can be
9373456Sahl 				 * optimized if it becomes a problem.
938*4034Sahl 				 * We also regenerate parity when resilvering
939*4034Sahl 				 * so we can write it out to the failed device
940*4034Sahl 				 * later.
9412082Seschrock 				 */
942*4034Sahl 				if (parity_errors < rm->rm_firstdatacol - 1 ||
943*4034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
9442082Seschrock 					n = raidz_parity_verify(zio, rm);
9452082Seschrock 					unexpected_errors += n;
9462082Seschrock 					ASSERT(parity_errors + n <=
9472082Seschrock 					    rm->rm_firstdatacol);
9482082Seschrock 				}
9492082Seschrock 
9502082Seschrock 				goto done;
9512082Seschrock 			}
9522082Seschrock 			break;
9532082Seschrock 
9542082Seschrock 		case 2:
9552082Seschrock 			/*
9563456Sahl 			 * Two data column errors require double parity.
9573456Sahl 			 */
9583456Sahl 			ASSERT(rm->rm_firstdatacol == 2);
9593456Sahl 
9603456Sahl 			/*
9612082Seschrock 			 * Find the two columns that reported errors.
9622082Seschrock 			 */
9632082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
9642082Seschrock 				rc = &rm->rm_col[c];
9652082Seschrock 				if (rc->rc_error != 0)
9662082Seschrock 					break;
967789Sahrens 			}
9682082Seschrock 			ASSERT(c != rm->rm_cols);
9692082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
9702082Seschrock 			    rc->rc_error == ESTALE);
9712082Seschrock 
			/* c1 remembers the first bad column; c resumes just past it. */
9722082Seschrock 			for (c1 = c++; c < rm->rm_cols; c++) {
9732082Seschrock 				rc = &rm->rm_col[c];
9742082Seschrock 				if (rc->rc_error != 0)
9752082Seschrock 					break;
9762082Seschrock 			}
9772082Seschrock 			ASSERT(c != rm->rm_cols);
9782082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
9792082Seschrock 			    rc->rc_error == ESTALE);
980789Sahrens 
9812082Seschrock 			vdev_raidz_reconstruct_pq(rm, c1, c);
9822082Seschrock 
9832082Seschrock 			if (zio_checksum_error(zio) == 0) {
9842082Seschrock 				zio->io_error = 0;
9852082Seschrock 				atomic_inc_64(&raidz_corrected_pq);
9862082Seschrock 
9872082Seschrock 				goto done;
9882082Seschrock 			}
9892082Seschrock 			break;
9902082Seschrock 
9912082Seschrock 		default:
9922082Seschrock 			ASSERT(rm->rm_firstdatacol <= 2);
9932082Seschrock 			ASSERT(0);
994789Sahrens 		}
995789Sahrens 	}
996789Sahrens 
997789Sahrens 	/*
9982082Seschrock 	 * This isn't a typical situation -- either we got a read error or
9992082Seschrock 	 * a child silently returned bad data. Read every block so we can
10002082Seschrock 	 * try again with as much data and parity as we can track down. If
10012082Seschrock 	 * we've already been through once before, all children will be marked
10022082Seschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
1003789Sahrens 	 */
	/* From here on, any eventual success must trigger the repair at 'done'. */
1004789Sahrens 	unexpected_errors = 1;
10052082Seschrock 	rm->rm_missingdata = 0;
10062082Seschrock 	rm->rm_missingparity = 0;
1007789Sahrens 
10082082Seschrock 	for (c = 0; c < rm->rm_cols; c++) {
10092082Seschrock 		if (rm->rm_col[c].rc_tried)
10102082Seschrock 			continue;
1011789Sahrens 
1012789Sahrens 		zio->io_error = 0;
1013789Sahrens 		zio_vdev_io_redone(zio);
		/*
		 * Issue a read for this and every later untried column. The
		 * 'continue' below jumps to the do/while condition, so c
		 * still advances past columns that were already tried.
		 */
10142082Seschrock 		do {
1015789Sahrens 			rc = &rm->rm_col[c];
1016789Sahrens 			if (rc->rc_tried)
1017789Sahrens 				continue;
1018789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
10192082Seschrock 			    vd->vdev_child[rc->rc_devidx],
1020789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1021789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1022789Sahrens 			    vdev_raidz_child_done, rc));
10232082Seschrock 		} while (++c < rm->rm_cols);
10242082Seschrock 		dprintf("rereading\n");
1025789Sahrens 		zio_wait_children_done(zio);
1026789Sahrens 		return;
1027789Sahrens 	}
1028789Sahrens 
1029789Sahrens 	/*
10302082Seschrock 	 * At this point we've attempted to reconstruct the data given the
10312082Seschrock 	 * errors we detected, and we've attempted to read all columns. There
10322082Seschrock 	 * must, therefore, be one or more additional problems -- silent errors
10332082Seschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
10342082Seschrock 	 * in absent data. Before we attempt combinatorial reconstruction make
10352082Seschrock 	 * sure we have a chance of coming up with the right answer.
1036789Sahrens 	 */
10372082Seschrock 	if (zio->io_numerrors >= rm->rm_firstdatacol) {
1038789Sahrens 		ASSERT(zio->io_error != 0);
1039789Sahrens 		goto done;
1040789Sahrens 	}
1041789Sahrens 
10422082Seschrock 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
10432082Seschrock 		/*
10442082Seschrock 		 * Attempt to reconstruct the data from parity P.
10452082Seschrock 		 */
10462082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
10472082Seschrock 			void *orig;
10482082Seschrock 			rc = &rm->rm_col[c];
10492082Seschrock 
			/* Save the column so a failed guess can be undone. */
10502082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
10512082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
10522082Seschrock 			vdev_raidz_reconstruct_p(rm, c);
10532082Seschrock 
10542082Seschrock 			if (zio_checksum_error(zio) == 0) {
10552082Seschrock 				zio_buf_free(orig, rc->rc_size);
10562082Seschrock 				zio->io_error = 0;
10572082Seschrock 				atomic_inc_64(&raidz_corrected_p);
10582082Seschrock 
10592082Seschrock 				/*
10602082Seschrock 				 * If this child didn't know that it returned
10612082Seschrock 				 * bad data, inform it.
10622082Seschrock 				 */
10632082Seschrock 				if (rc->rc_tried && rc->rc_error == 0)
10642082Seschrock 					raidz_checksum_error(zio, rc);
				/* Mark the column so the repair loop rewrites it. */
10652082Seschrock 				rc->rc_error = ECKSUM;
10662082Seschrock 				goto done;
10672082Seschrock 			}
10682082Seschrock 
10692082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
10702082Seschrock 			zio_buf_free(orig, rc->rc_size);
10712082Seschrock 		}
10722082Seschrock 	}
10732082Seschrock 
10742082Seschrock 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
10752082Seschrock 		/*
10762082Seschrock 		 * Attempt to reconstruct the data from parity Q.
10772082Seschrock 		 */
10782082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
10792082Seschrock 			void *orig;
10802082Seschrock 			rc = &rm->rm_col[c];
10812082Seschrock 
10822082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
10832082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
10842082Seschrock 			vdev_raidz_reconstruct_q(rm, c);
10852082Seschrock 
10862082Seschrock 			if (zio_checksum_error(zio) == 0) {
10872082Seschrock 				zio_buf_free(orig, rc->rc_size);
1088789Sahrens 				zio->io_error = 0;
10892082Seschrock 				atomic_inc_64(&raidz_corrected_q);
10902082Seschrock 
10912082Seschrock 				/*
10922082Seschrock 				 * If this child didn't know that it returned
10932082Seschrock 				 * bad data, inform it.
10942082Seschrock 				 */
10952082Seschrock 				if (rc->rc_tried && rc->rc_error == 0)
10962082Seschrock 					raidz_checksum_error(zio, rc);
10972082Seschrock 				rc->rc_error = ECKSUM;
10982082Seschrock 				goto done;
10992082Seschrock 			}
11002082Seschrock 
11012082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
11022082Seschrock 			zio_buf_free(orig, rc->rc_size);
11032082Seschrock 		}
11042082Seschrock 	}
11052082Seschrock 
11062082Seschrock 	if (rm->rm_firstdatacol > 1 &&
11072082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
11082082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
11092082Seschrock 		/*
11102082Seschrock 		 * Attempt to reconstruct the data from both P and Q.
11112082Seschrock 		 */
		/* Try every unordered pair (c, c1) of data columns, c < c1. */
11122082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
11132082Seschrock 			void *orig, *orig1;
11142082Seschrock 			rc = &rm->rm_col[c];
11152082Seschrock 
11162082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
11172082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
11182082Seschrock 
11192082Seschrock 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
11202082Seschrock 				rc1 = &rm->rm_col[c1];
11212082Seschrock 
11222082Seschrock 				orig1 = zio_buf_alloc(rc1->rc_size);
11232082Seschrock 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
11242082Seschrock 
11252082Seschrock 				vdev_raidz_reconstruct_pq(rm, c, c1);
11262082Seschrock 
11272082Seschrock 				if (zio_checksum_error(zio) == 0) {
11282082Seschrock 					zio_buf_free(orig, rc->rc_size);
11292082Seschrock 					zio_buf_free(orig1, rc1->rc_size);
11302082Seschrock 					zio->io_error = 0;
11312082Seschrock 					atomic_inc_64(&raidz_corrected_pq);
11322082Seschrock 
11332082Seschrock 					/*
11342082Seschrock 					 * If these children didn't know they
11352082Seschrock 					 * returned bad data, inform them.
11362082Seschrock 					 */
11372082Seschrock 					if (rc->rc_tried && rc->rc_error == 0)
11382082Seschrock 						raidz_checksum_error(zio, rc);
11392082Seschrock 					if (rc1->rc_tried && rc1->rc_error == 0)
11402082Seschrock 						raidz_checksum_error(zio, rc1);
11412082Seschrock 
11422082Seschrock 					rc->rc_error = ECKSUM;
11432082Seschrock 					rc1->rc_error = ECKSUM;
11442082Seschrock 
11452082Seschrock 					goto done;
11462082Seschrock 				}
11472082Seschrock 
11482082Seschrock 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
11492082Seschrock 				zio_buf_free(orig1, rc1->rc_size);
11502082Seschrock 			}
11512082Seschrock 
11522082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
11532082Seschrock 			zio_buf_free(orig, rc->rc_size);
1154789Sahrens 		}
1155789Sahrens 	}
1156789Sahrens 
1157789Sahrens 	/*
11582082Seschrock 	 * All combinations failed to checksum. Generate checksum ereports for
11592082Seschrock 	 * all children.
1160789Sahrens 	 */
1161789Sahrens 	zio->io_error = ECKSUM;
11621544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
11631544Seschrock 		for (c = 0; c < rm->rm_cols; c++) {
11641544Seschrock 			rc = &rm->rm_col[c];
11651544Seschrock 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
11662082Seschrock 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
11671544Seschrock 			    rc->rc_offset, rc->rc_size);
11681544Seschrock 		}
11691544Seschrock 	}
1170789Sahrens 
1171789Sahrens done:
1172789Sahrens 	zio_checksum_verified(zio);
1173789Sahrens 
1174789Sahrens 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1175789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
11761732Sbonwick 		zio_t *rio;
11771732Sbonwick 
1178789Sahrens 		/*
1179789Sahrens 		 * Use the good data we have in hand to repair damaged children.
11801732Sbonwick 		 *
11811732Sbonwick 		 * We issue all repair I/Os as children of 'rio' to arrange
11821732Sbonwick 		 * that vdev_raidz_map_free(zio) will be invoked after all
11831732Sbonwick 		 * repairs complete, but before we advance to the next stage.
1184789Sahrens 		 */
11851732Sbonwick 		rio = zio_null(zio, zio->io_spa,
11861732Sbonwick 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
11871732Sbonwick 
1188789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1189789Sahrens 			rc = &rm->rm_col[c];
11902082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1191789Sahrens 
			/* Only columns still flagged with an error get rewritten. */
11921732Sbonwick 			if (rc->rc_error == 0)
11931732Sbonwick 				continue;
11941732Sbonwick 
11951732Sbonwick 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
11961732Sbonwick 			    vdev_description(vd),
11971732Sbonwick 			    vdev_description(cvd),
11981732Sbonwick 			    zio->io_offset, rc->rc_error);
1199789Sahrens 
12001732Sbonwick 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
12011732Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
12021732Sbonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
12032082Seschrock 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
12042082Seschrock 			    ZIO_FLAG_CANFAIL, NULL, NULL));
12051732Sbonwick 		}
1206789Sahrens 
12071732Sbonwick 		zio_nowait(rio);
12081732Sbonwick 		zio_wait_children_done(zio);
12091732Sbonwick 		return;
1210789Sahrens 	}
1211789Sahrens 
1212789Sahrens 	vdev_raidz_map_free(zio);
1213789Sahrens 	zio_next_stage(zio);
1214789Sahrens }
1215789Sahrens 
1216789Sahrens static void
1217789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1218789Sahrens {
12192082Seschrock 	if (faulted > vd->vdev_nparity)
12201544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
12211544Seschrock 		    VDEV_AUX_NO_REPLICAS);
1222789Sahrens 	else if (degraded + faulted != 0)
12231544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1224789Sahrens 	else
12251544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1226789Sahrens }
1227789Sahrens 
/*
 * Operations vector for the RAID-Z vdev type. The initializer is positional,
 * so the entries below must stay in the same order as the members of
 * vdev_ops_t (declared outside this file -- presumably in sys/vdev_impl.h;
 * confirm before reordering).
 */
1228789Sahrens vdev_ops_t vdev_raidz_ops = {
1229789Sahrens 	vdev_raidz_open,
1230789Sahrens 	vdev_raidz_close,
1231789Sahrens 	vdev_raidz_asize,
1232789Sahrens 	vdev_raidz_io_start,
1233789Sahrens 	vdev_raidz_io_done,
1234789Sahrens 	vdev_raidz_state_change,
1235789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1236789Sahrens 	B_FALSE			/* not a leaf vdev */
1237789Sahrens };
1238