xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 9846:6527c7b4a92e)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
26789Sahrens 
27789Sahrens #include <sys/zfs_context.h>
28789Sahrens #include <sys/spa.h>
29789Sahrens #include <sys/vdev_impl.h>
30789Sahrens #include <sys/zio.h>
31789Sahrens #include <sys/zio_checksum.h>
32789Sahrens #include <sys/fs/zfs.h>
331544Seschrock #include <sys/fm/fs/zfs.h>
34789Sahrens 
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports both single and double parity. For single parity, we
 * use a simple XOR of all the data columns. For double parity, we use both
 * the simple XOR as well as a technique described in "The mathematics of
 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
 */
75789Sahrens 
typedef struct raidz_col raidz_col_t;

/*
 * One column of a RAID-Z I/O: the piece of the logical I/O that is sent to
 * a single child device.
 */
struct raidz_col {
	uint64_t rc_devidx;		/* index of the child vdev to use */
	uint64_t rc_offset;		/* offset on that child */
	uint64_t rc_size;		/* number of bytes in this column */
	void *rc_data;			/* buffer holding the column's data */
	int rc_error;			/* error recorded for this column */
	uint8_t rc_tried;		/* nonzero once the I/O was attempted */
	uint8_t rc_skipped;		/* nonzero if the I/O was skipped */
};
85789Sahrens 
86789Sahrens typedef struct raidz_map {
872082Seschrock 	uint64_t rm_cols;		/* Column count */
882082Seschrock 	uint64_t rm_bigcols;		/* Number of oversized columns */
892082Seschrock 	uint64_t rm_asize;		/* Actual total I/O size */
902082Seschrock 	uint64_t rm_missingdata;	/* Count of missing data devices */
912082Seschrock 	uint64_t rm_missingparity;	/* Count of missing parity devices */
922082Seschrock 	uint64_t rm_firstdatacol;	/* First data column/parity count */
932082Seschrock 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
94789Sahrens } raidz_map_t;
95789Sahrens 
/* Indices of the parity columns within raidz_map_t's rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count this implementation accepts. */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a single field element by 2: shift left and fold in 0x1d when
 * the high bit was set.  The result is not truncated to 8 bits here, and
 * the argument is evaluated twice.
 */
#define	VDEV_RAIDZ_MUL_2(a)	((((a) & 0x80) ? 0x1d : 0) ^ ((a) << 1))
1022082Seschrock 
/*
 * Powers of 2 in the Galois field described at the top of this file:
 * vdev_raidz_pow2[i] is 2^i, obtained by starting from 1 and repeatedly
 * multiplying by 2.  The trailing comment on each row gives the exponent of
 * the row's first entry.  Note that 2^255 wraps back around to 1.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,	/* 0x00 */
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,	/* 0x08 */
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,	/* 0x10 */
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,	/* 0x18 */
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,	/* 0x20 */
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,	/* 0x28 */
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,	/* 0x30 */
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,	/* 0x38 */
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,	/* 0x40 */
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,	/* 0x48 */
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,	/* 0x50 */
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,	/* 0x58 */
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,	/* 0x60 */
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,	/* 0x68 */
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,	/* 0x70 */
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,	/* 0x78 */
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,	/* 0x80 */
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,	/* 0x88 */
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,	/* 0x90 */
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,	/* 0x98 */
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,	/* 0xa0 */
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,	/* 0xa8 */
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,	/* 0xb0 */
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,	/* 0xb8 */
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,	/* 0xc0 */
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,	/* 0xc8 */
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,	/* 0xd0 */
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,	/* 0xd8 */
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,	/* 0xe0 */
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,	/* 0xe8 */
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,	/* 0xf0 */
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01	/* 0xf8 */
};
/*
 * Base-2 logarithms in the Galois field described at the top of this file:
 * for a nonzero element a, vdev_raidz_log2[a] is the exponent e (0..254)
 * for which 2^e == a.  Zero has no logarithm; its slot simply holds 0.  The
 * trailing comment on each row gives the element value of the row's first
 * entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,	/* 0x00 */
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,	/* 0x08 */
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,	/* 0x10 */
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,	/* 0x18 */
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,	/* 0x20 */
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,	/* 0x28 */
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,	/* 0x30 */
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,	/* 0x38 */
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,	/* 0x40 */
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,	/* 0x48 */
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,	/* 0x50 */
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,	/* 0x58 */
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,	/* 0x60 */
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,	/* 0x68 */
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,	/* 0x70 */
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,	/* 0x78 */
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,	/* 0x80 */
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,	/* 0x88 */
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,	/* 0x90 */
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,	/* 0x98 */
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,	/* 0xa0 */
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,	/* 0xa8 */
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,	/* 0xb0 */
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,	/* 0xb8 */
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,	/* 0xc0 */
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,	/* 0xc8 */
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,	/* 0xd0 */
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,	/* 0xd8 */
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,	/* 0xe0 */
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,	/* 0xe8 */
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,	/* 0xf0 */
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,	/* 0xf8 */
};
1752082Seschrock 
1762082Seschrock /*
1772082Seschrock  * Multiply a given number by 2 raised to the given power.
1782082Seschrock  */
1792082Seschrock static uint8_t
1802082Seschrock vdev_raidz_exp2(uint_t a, int exp)
1812082Seschrock {
1822082Seschrock 	if (a == 0)
1832082Seschrock 		return (0);
1842082Seschrock 
1852082Seschrock 	ASSERT(exp >= 0);
1862082Seschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
1872082Seschrock 
1882082Seschrock 	exp += vdev_raidz_log2[a];
1892082Seschrock 	if (exp > 255)
1902082Seschrock 		exp -= 255;
1912082Seschrock 
1922082Seschrock 	return (vdev_raidz_pow2[exp]);
1932082Seschrock }
1942082Seschrock 
1957754SJeff.Bonwick@Sun.COM static void
1967754SJeff.Bonwick@Sun.COM vdev_raidz_map_free(zio_t *zio)
1977754SJeff.Bonwick@Sun.COM {
1987754SJeff.Bonwick@Sun.COM 	raidz_map_t *rm = zio->io_vsd;
1997754SJeff.Bonwick@Sun.COM 	int c;
2007754SJeff.Bonwick@Sun.COM 
2017754SJeff.Bonwick@Sun.COM 	for (c = 0; c < rm->rm_firstdatacol; c++)
2027754SJeff.Bonwick@Sun.COM 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
2037754SJeff.Bonwick@Sun.COM 
2047754SJeff.Bonwick@Sun.COM 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
2057754SJeff.Bonwick@Sun.COM }
2067754SJeff.Bonwick@Sun.COM 
207789Sahrens static raidz_map_t *
2082082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
2092082Seschrock     uint64_t nparity)
210789Sahrens {
211789Sahrens 	raidz_map_t *rm;
212789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
213789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
214789Sahrens 	uint64_t f = b % dcols;
215789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
2162082Seschrock 	uint64_t q, r, c, bc, col, acols, coff, devidx;
217789Sahrens 
2182082Seschrock 	q = s / (dcols - nparity);
2192082Seschrock 	r = s - q * (dcols - nparity);
2202082Seschrock 	bc = (r == 0 ? 0 : r + nparity);
221789Sahrens 
222789Sahrens 	acols = (q == 0 ? bc : dcols);
223789Sahrens 
224789Sahrens 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
225789Sahrens 
226789Sahrens 	rm->rm_cols = acols;
227789Sahrens 	rm->rm_bigcols = bc;
228789Sahrens 	rm->rm_asize = 0;
2292082Seschrock 	rm->rm_missingdata = 0;
2302082Seschrock 	rm->rm_missingparity = 0;
2312082Seschrock 	rm->rm_firstdatacol = nparity;
232789Sahrens 
233789Sahrens 	for (c = 0; c < acols; c++) {
234789Sahrens 		col = f + c;
235789Sahrens 		coff = o;
236789Sahrens 		if (col >= dcols) {
237789Sahrens 			col -= dcols;
238789Sahrens 			coff += 1ULL << unit_shift;
239789Sahrens 		}
2402082Seschrock 		rm->rm_col[c].rc_devidx = col;
241789Sahrens 		rm->rm_col[c].rc_offset = coff;
242789Sahrens 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
243789Sahrens 		rm->rm_col[c].rc_data = NULL;
244789Sahrens 		rm->rm_col[c].rc_error = 0;
245789Sahrens 		rm->rm_col[c].rc_tried = 0;
246789Sahrens 		rm->rm_col[c].rc_skipped = 0;
247789Sahrens 		rm->rm_asize += rm->rm_col[c].rc_size;
248789Sahrens 	}
249789Sahrens 
2502082Seschrock 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
251789Sahrens 
252789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
253789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
254789Sahrens 
255789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
256789Sahrens 
257789Sahrens 	for (c = c + 1; c < acols; c++)
258789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
259789Sahrens 		    rm->rm_col[c - 1].rc_size;
260789Sahrens 
2611133Seschrock 	/*
2622082Seschrock 	 * If all data stored spans all columns, there's a danger that parity
2632082Seschrock 	 * will always be on the same device and, since parity isn't read
2642082Seschrock 	 * during normal operation, that that device's I/O bandwidth won't be
2652082Seschrock 	 * used effectively. We therefore switch the parity every 1MB.
2662082Seschrock 	 *
2672082Seschrock 	 * ... at least that was, ostensibly, the theory. As a practical
2682082Seschrock 	 * matter unless we juggle the parity between all devices evenly, we
2692082Seschrock 	 * won't see any benefit. Further, occasional writes that aren't a
2702082Seschrock 	 * multiple of the LCM of the number of children and the minimum
2712082Seschrock 	 * stripe width are sufficient to avoid pessimal behavior.
2722082Seschrock 	 * Unfortunately, this decision created an implicit on-disk format
2733456Sahl 	 * requirement that we need to support for all eternity, but only
2743456Sahl 	 * for single-parity RAID-Z.
2751133Seschrock 	 */
2761133Seschrock 	ASSERT(rm->rm_cols >= 2);
2771133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
278789Sahrens 
2792082Seschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
2802082Seschrock 		devidx = rm->rm_col[0].rc_devidx;
2811133Seschrock 		o = rm->rm_col[0].rc_offset;
2822082Seschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
2831133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
2842082Seschrock 		rm->rm_col[1].rc_devidx = devidx;
2851133Seschrock 		rm->rm_col[1].rc_offset = o;
286789Sahrens 	}
287789Sahrens 
288789Sahrens 	zio->io_vsd = rm;
2897754SJeff.Bonwick@Sun.COM 	zio->io_vsd_free = vdev_raidz_map_free;
290789Sahrens 	return (rm);
291789Sahrens }
292789Sahrens 
293789Sahrens static void
2942082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
2952082Seschrock {
2962082Seschrock 	uint64_t *p, *src, pcount, ccount, i;
2972082Seschrock 	int c;
2982082Seschrock 
2992082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
3002082Seschrock 
3012082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
3022082Seschrock 		src = rm->rm_col[c].rc_data;
3032082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3042082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
3052082Seschrock 
3062082Seschrock 		if (c == rm->rm_firstdatacol) {
3072082Seschrock 			ASSERT(ccount == pcount);
3082082Seschrock 			for (i = 0; i < ccount; i++, p++, src++) {
3092082Seschrock 				*p = *src;
3102082Seschrock 			}
3112082Seschrock 		} else {
3122082Seschrock 			ASSERT(ccount <= pcount);
3132082Seschrock 			for (i = 0; i < ccount; i++, p++, src++) {
3142082Seschrock 				*p ^= *src;
3152082Seschrock 			}
3162082Seschrock 		}
3172082Seschrock 	}
3182082Seschrock }
3192082Seschrock 
3202082Seschrock static void
3212082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
322789Sahrens {
3232082Seschrock 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
3242082Seschrock 	int c;
3252082Seschrock 
3262082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
3272082Seschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
3282082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
3292082Seschrock 
3302082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
3312082Seschrock 		src = rm->rm_col[c].rc_data;
3322082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3332082Seschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
3342082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
3352082Seschrock 
3362082Seschrock 		if (c == rm->rm_firstdatacol) {
3372082Seschrock 			ASSERT(ccount == pcount || ccount == 0);
3382082Seschrock 			for (i = 0; i < ccount; i++, p++, q++, src++) {
3392082Seschrock 				*q = *src;
3402082Seschrock 				*p = *src;
3412082Seschrock 			}
3422082Seschrock 			for (; i < pcount; i++, p++, q++, src++) {
3432082Seschrock 				*q = 0;
3442082Seschrock 				*p = 0;
3452082Seschrock 			}
3462082Seschrock 		} else {
3472082Seschrock 			ASSERT(ccount <= pcount);
348789Sahrens 
3492082Seschrock 			/*
3502082Seschrock 			 * Rather than multiplying each byte individually (as
3512082Seschrock 			 * described above), we are able to handle 8 at once
3522082Seschrock 			 * by generating a mask based on the high bit in each
3532082Seschrock 			 * byte and using that to conditionally XOR in 0x1d.
3542082Seschrock 			 */
3552082Seschrock 			for (i = 0; i < ccount; i++, p++, q++, src++) {
3562082Seschrock 				mask = *q & 0x8080808080808080ULL;
3572082Seschrock 				mask = (mask << 1) - (mask >> 7);
3582082Seschrock 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
3592082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
3602082Seschrock 				*q ^= *src;
3612082Seschrock 				*p ^= *src;
3622082Seschrock 			}
3632082Seschrock 
3642082Seschrock 			/*
3652082Seschrock 			 * Treat short columns as though they are full of 0s.
3662082Seschrock 			 */
3672082Seschrock 			for (; i < pcount; i++, q++) {
3682082Seschrock 				mask = *q & 0x8080808080808080ULL;
3692082Seschrock 				mask = (mask << 1) - (mask >> 7);
3702082Seschrock 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
3712082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
3722082Seschrock 			}
3732082Seschrock 		}
3742082Seschrock 	}
3752082Seschrock }
3762082Seschrock 
3772082Seschrock static void
3782082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
3792082Seschrock {
3802082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
3812082Seschrock 	int c;
3822082Seschrock 
3832082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
3842082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
3852082Seschrock 	ASSERT(xcount > 0);
3862082Seschrock 
3872082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
3882082Seschrock 	dst = rm->rm_col[x].rc_data;
3892082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
3902082Seschrock 		*dst = *src;
3912082Seschrock 	}
3922082Seschrock 
3932082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
394789Sahrens 		src = rm->rm_col[c].rc_data;
395789Sahrens 		dst = rm->rm_col[x].rc_data;
3962082Seschrock 
3972082Seschrock 		if (c == x)
3982082Seschrock 			continue;
3992082Seschrock 
4002082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
4012082Seschrock 		count = MIN(ccount, xcount);
4022082Seschrock 
4032082Seschrock 		for (i = 0; i < count; i++, dst++, src++) {
4042082Seschrock 			*dst ^= *src;
405789Sahrens 		}
406789Sahrens 	}
407789Sahrens }
408789Sahrens 
4092082Seschrock static void
4102082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
4112082Seschrock {
4122082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
4132082Seschrock 	uint8_t *b;
4142082Seschrock 	int c, j, exp;
4152082Seschrock 
4162082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
4172082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
4182082Seschrock 
4192082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
4202082Seschrock 		src = rm->rm_col[c].rc_data;
4212082Seschrock 		dst = rm->rm_col[x].rc_data;
4222082Seschrock 
4232082Seschrock 		if (c == x)
4242082Seschrock 			ccount = 0;
4252082Seschrock 		else
4262082Seschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
4272082Seschrock 
4282082Seschrock 		count = MIN(ccount, xcount);
4292082Seschrock 
4302082Seschrock 		if (c == rm->rm_firstdatacol) {
4312082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
4322082Seschrock 				*dst = *src;
4332082Seschrock 			}
4342082Seschrock 			for (; i < xcount; i++, dst++) {
4352082Seschrock 				*dst = 0;
4362082Seschrock 			}
4372082Seschrock 
4382082Seschrock 		} else {
4392082Seschrock 			/*
4402082Seschrock 			 * For an explanation of this, see the comment in
4412082Seschrock 			 * vdev_raidz_generate_parity_pq() above.
4422082Seschrock 			 */
4432082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
4442082Seschrock 				mask = *dst & 0x8080808080808080ULL;
4452082Seschrock 				mask = (mask << 1) - (mask >> 7);
4462082Seschrock 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
4472082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
4482082Seschrock 				*dst ^= *src;
4492082Seschrock 			}
4502082Seschrock 
4512082Seschrock 			for (; i < xcount; i++, dst++) {
4522082Seschrock 				mask = *dst & 0x8080808080808080ULL;
4532082Seschrock 				mask = (mask << 1) - (mask >> 7);
4542082Seschrock 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
4552082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
4562082Seschrock 			}
4572082Seschrock 		}
4582082Seschrock 	}
4592082Seschrock 
4602082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
4612082Seschrock 	dst = rm->rm_col[x].rc_data;
4622082Seschrock 	exp = 255 - (rm->rm_cols - 1 - x);
4632082Seschrock 
4642082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
4652082Seschrock 		*dst ^= *src;
4662082Seschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
4672082Seschrock 			*b = vdev_raidz_exp2(*b, exp);
4682082Seschrock 		}
4692082Seschrock 	}
4702082Seschrock }
4712082Seschrock 
4722082Seschrock static void
4732082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
4742082Seschrock {
4752082Seschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
4762082Seschrock 	void *pdata, *qdata;
4772082Seschrock 	uint64_t xsize, ysize, i;
4782082Seschrock 
4792082Seschrock 	ASSERT(x < y);
4802082Seschrock 	ASSERT(x >= rm->rm_firstdatacol);
4812082Seschrock 	ASSERT(y < rm->rm_cols);
4822082Seschrock 
4832082Seschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
4842082Seschrock 
4852082Seschrock 	/*
4862082Seschrock 	 * Move the parity data aside -- we're going to compute parity as
4872082Seschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
4882082Seschrock 	 * reuse the parity generation mechanism without trashing the actual
4892082Seschrock 	 * parity so we make those columns appear to be full of zeros by
4902082Seschrock 	 * setting their lengths to zero.
4912082Seschrock 	 */
4922082Seschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
4932082Seschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
4942082Seschrock 	xsize = rm->rm_col[x].rc_size;
4952082Seschrock 	ysize = rm->rm_col[y].rc_size;
4962082Seschrock 
4972082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
4982082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
4992082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
5002082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
5012082Seschrock 	rm->rm_col[x].rc_size = 0;
5022082Seschrock 	rm->rm_col[y].rc_size = 0;
5032082Seschrock 
5042082Seschrock 	vdev_raidz_generate_parity_pq(rm);
5052082Seschrock 
5062082Seschrock 	rm->rm_col[x].rc_size = xsize;
5072082Seschrock 	rm->rm_col[y].rc_size = ysize;
5082082Seschrock 
5092082Seschrock 	p = pdata;
5102082Seschrock 	q = qdata;
5112082Seschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
5122082Seschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
5132082Seschrock 	xd = rm->rm_col[x].rc_data;
5142082Seschrock 	yd = rm->rm_col[y].rc_data;
5152082Seschrock 
5162082Seschrock 	/*
5172082Seschrock 	 * We now have:
5182082Seschrock 	 *	Pxy = P + D_x + D_y
5192082Seschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
5202082Seschrock 	 *
5212082Seschrock 	 * We can then solve for D_x:
5222082Seschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
5232082Seschrock 	 * where
5242082Seschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
5252082Seschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
5262082Seschrock 	 *
5272082Seschrock 	 * With D_x in hand, we can easily solve for D_y:
5282082Seschrock 	 *	D_y = P + Pxy + D_x
5292082Seschrock 	 */
5302082Seschrock 
5312082Seschrock 	a = vdev_raidz_pow2[255 + x - y];
5322082Seschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
5332082Seschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
5342082Seschrock 
5352082Seschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
5362082Seschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
5372082Seschrock 
5382082Seschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
5392082Seschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
5402082Seschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
5412082Seschrock 
5422082Seschrock 		if (i < ysize)
5432082Seschrock 			*yd = *p ^ *pxy ^ *xd;
5442082Seschrock 	}
5452082Seschrock 
5462082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
5472082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
5482082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
5492082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
5502082Seschrock 
5512082Seschrock 	/*
5522082Seschrock 	 * Restore the saved parity data.
5532082Seschrock 	 */
5542082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
5552082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
5562082Seschrock }
5572082Seschrock 
5582082Seschrock 
559789Sahrens static int
560789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
561789Sahrens {
5622082Seschrock 	uint64_t nparity = vd->vdev_nparity;
563789Sahrens 	int lasterror = 0;
564789Sahrens 	int numerrors = 0;
565789Sahrens 
5662082Seschrock 	ASSERT(nparity > 0);
5672082Seschrock 
5682082Seschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
5692082Seschrock 	    vd->vdev_children < nparity + 1) {
570789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
571789Sahrens 		return (EINVAL);
572789Sahrens 	}
573789Sahrens 
574*9846SEric.Taylor@Sun.COM 	vdev_open_children(vd);
575789Sahrens 
576*9846SEric.Taylor@Sun.COM 	for (int c = 0; c < vd->vdev_children; c++) {
577*9846SEric.Taylor@Sun.COM 		vdev_t *cvd = vd->vdev_child[c];
578*9846SEric.Taylor@Sun.COM 
579*9846SEric.Taylor@Sun.COM 		if (cvd->vdev_open_error) {
580*9846SEric.Taylor@Sun.COM 			lasterror = cvd->vdev_open_error;
581789Sahrens 			numerrors++;
582789Sahrens 			continue;
583789Sahrens 		}
584789Sahrens 
585789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
5861732Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
587789Sahrens 	}
588789Sahrens 
589789Sahrens 	*asize *= vd->vdev_children;
590789Sahrens 
5912082Seschrock 	if (numerrors > nparity) {
592789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
593789Sahrens 		return (lasterror);
594789Sahrens 	}
595789Sahrens 
596789Sahrens 	return (0);
597789Sahrens }
598789Sahrens 
599789Sahrens static void
600789Sahrens vdev_raidz_close(vdev_t *vd)
601789Sahrens {
602*9846SEric.Taylor@Sun.COM 	for (int c = 0; c < vd->vdev_children; c++)
603789Sahrens 		vdev_close(vd->vdev_child[c]);
604789Sahrens }
605789Sahrens 
606789Sahrens static uint64_t
607789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
608789Sahrens {
609789Sahrens 	uint64_t asize;
6101732Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
611789Sahrens 	uint64_t cols = vd->vdev_children;
6122082Seschrock 	uint64_t nparity = vd->vdev_nparity;
613789Sahrens 
6141732Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
6152082Seschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
6162082Seschrock 	asize = roundup(asize, nparity + 1) << ashift;
617789Sahrens 
618789Sahrens 	return (asize);
619789Sahrens }
620789Sahrens 
621789Sahrens static void
622789Sahrens vdev_raidz_child_done(zio_t *zio)
623789Sahrens {
624789Sahrens 	raidz_col_t *rc = zio->io_private;
625789Sahrens 
626789Sahrens 	rc->rc_error = zio->io_error;
627789Sahrens 	rc->rc_tried = 1;
628789Sahrens 	rc->rc_skipped = 0;
629789Sahrens }
630789Sahrens 
6315530Sbonwick static int
632789Sahrens vdev_raidz_io_start(zio_t *zio)
633789Sahrens {
634789Sahrens 	vdev_t *vd = zio->io_vd;
6351732Sbonwick 	vdev_t *tvd = vd->vdev_top;
636789Sahrens 	vdev_t *cvd;
637789Sahrens 	blkptr_t *bp = zio->io_bp;
638789Sahrens 	raidz_map_t *rm;
639789Sahrens 	raidz_col_t *rc;
640789Sahrens 	int c;
641789Sahrens 
6422082Seschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
6432082Seschrock 	    vd->vdev_nparity);
644789Sahrens 
6451775Sbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
646789Sahrens 
647789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
648789Sahrens 		/*
6492082Seschrock 		 * Generate RAID parity in the first virtual columns.
650789Sahrens 		 */
6512082Seschrock 		if (rm->rm_firstdatacol == 1)
6522082Seschrock 			vdev_raidz_generate_parity_p(rm);
6532082Seschrock 		else
6542082Seschrock 			vdev_raidz_generate_parity_pq(rm);
655789Sahrens 
656789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
657789Sahrens 			rc = &rm->rm_col[c];
6582082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
659789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
660789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
6617754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
662789Sahrens 			    vdev_raidz_child_done, rc));
663789Sahrens 		}
6645530Sbonwick 
6657754SJeff.Bonwick@Sun.COM 		return (ZIO_PIPELINE_CONTINUE);
666789Sahrens 	}
667789Sahrens 
668789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
669789Sahrens 
6702082Seschrock 	/*
6712082Seschrock 	 * Iterate over the columns in reverse order so that we hit the parity
6722082Seschrock 	 * last -- any errors along the way will force us to read the parity
6732082Seschrock 	 * data.
6742082Seschrock 	 */
675789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
676789Sahrens 		rc = &rm->rm_col[c];
6772082Seschrock 		cvd = vd->vdev_child[rc->rc_devidx];
6785329Sgw25295 		if (!vdev_readable(cvd)) {
6792082Seschrock 			if (c >= rm->rm_firstdatacol)
6802082Seschrock 				rm->rm_missingdata++;
6812082Seschrock 			else
6822082Seschrock 				rm->rm_missingparity++;
683789Sahrens 			rc->rc_error = ENXIO;
684789Sahrens 			rc->rc_tried = 1;	/* don't even try */
685789Sahrens 			rc->rc_skipped = 1;
686789Sahrens 			continue;
687789Sahrens 		}
6888241SJeff.Bonwick@Sun.COM 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
6892082Seschrock 			if (c >= rm->rm_firstdatacol)
6902082Seschrock 				rm->rm_missingdata++;
6912082Seschrock 			else
6922082Seschrock 				rm->rm_missingparity++;
693789Sahrens 			rc->rc_error = ESTALE;
694789Sahrens 			rc->rc_skipped = 1;
695789Sahrens 			continue;
696789Sahrens 		}
6972082Seschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
6989434SMark.Musante@Sun.COM 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
699789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
700789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
7017754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
702789Sahrens 			    vdev_raidz_child_done, rc));
703789Sahrens 		}
704789Sahrens 	}
705789Sahrens 
7067754SJeff.Bonwick@Sun.COM 	return (ZIO_PIPELINE_CONTINUE);
707789Sahrens }
708789Sahrens 
7091544Seschrock /*
7101544Seschrock  * Report a checksum error for a child of a RAID-Z device.
7111544Seschrock  */
7121544Seschrock static void
7131544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
7141544Seschrock {
7152082Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
7161544Seschrock 
7171544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
7181544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
7191544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
7201544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
7211544Seschrock 	}
7221544Seschrock 
7231544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
7241544Seschrock 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
7251544Seschrock 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
7261544Seschrock }
7271544Seschrock 
7282082Seschrock /*
7292082Seschrock  * Generate the parity from the data columns. If we tried and were able to
7302082Seschrock  * read the parity without error, verify that the generated parity matches the
7312082Seschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
7322082Seschrock  * number such failures.
7332082Seschrock  */
7342082Seschrock static int
7352082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
7362082Seschrock {
7372082Seschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
7382082Seschrock 	int c, ret = 0;
7392082Seschrock 	raidz_col_t *rc;
7402082Seschrock 
7412082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
7422082Seschrock 		rc = &rm->rm_col[c];
7432082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
7442082Seschrock 			continue;
7452082Seschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
7462082Seschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
7472082Seschrock 	}
7482082Seschrock 
7492082Seschrock 	if (rm->rm_firstdatacol == 1)
7502082Seschrock 		vdev_raidz_generate_parity_p(rm);
7512082Seschrock 	else
7522082Seschrock 		vdev_raidz_generate_parity_pq(rm);
7532082Seschrock 
7542082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
7552082Seschrock 		rc = &rm->rm_col[c];
7562082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
7572082Seschrock 			continue;
7582082Seschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
7592082Seschrock 			raidz_checksum_error(zio, rc);
7602082Seschrock 			rc->rc_error = ECKSUM;
7612082Seschrock 			ret++;
7622082Seschrock 		}
7632082Seschrock 		zio_buf_free(orig[c], rc->rc_size);
7642082Seschrock 	}
7652082Seschrock 
7662082Seschrock 	return (ret);
7672082Seschrock }
7682082Seschrock 
7692082Seschrock static uint64_t raidz_corrected_p;
7702082Seschrock static uint64_t raidz_corrected_q;
7712082Seschrock static uint64_t raidz_corrected_pq;
7721544Seschrock 
7735530Sbonwick static int
7747754SJeff.Bonwick@Sun.COM vdev_raidz_worst_error(raidz_map_t *rm)
7757754SJeff.Bonwick@Sun.COM {
7767754SJeff.Bonwick@Sun.COM 	int error = 0;
7777754SJeff.Bonwick@Sun.COM 
7787754SJeff.Bonwick@Sun.COM 	for (int c = 0; c < rm->rm_cols; c++)
7797754SJeff.Bonwick@Sun.COM 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
7807754SJeff.Bonwick@Sun.COM 
7817754SJeff.Bonwick@Sun.COM 	return (error);
7827754SJeff.Bonwick@Sun.COM }
7837754SJeff.Bonwick@Sun.COM 
7847754SJeff.Bonwick@Sun.COM static void
785789Sahrens vdev_raidz_io_done(zio_t *zio)
786789Sahrens {
787789Sahrens 	vdev_t *vd = zio->io_vd;
788789Sahrens 	vdev_t *cvd;
789789Sahrens 	raidz_map_t *rm = zio->io_vsd;
7902082Seschrock 	raidz_col_t *rc, *rc1;
791789Sahrens 	int unexpected_errors = 0;
7922082Seschrock 	int parity_errors = 0;
7933456Sahl 	int parity_untried = 0;
7942082Seschrock 	int data_errors = 0;
7957754SJeff.Bonwick@Sun.COM 	int total_errors = 0;
7962082Seschrock 	int n, c, c1;
797789Sahrens 
7981775Sbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
799789Sahrens 
8002082Seschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
8012082Seschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
8022082Seschrock 
803789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
804789Sahrens 		rc = &rm->rm_col[c];
805789Sahrens 
806789Sahrens 		if (rc->rc_error) {
8077754SJeff.Bonwick@Sun.COM 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
8082082Seschrock 
8092082Seschrock 			if (c < rm->rm_firstdatacol)
8102082Seschrock 				parity_errors++;
8112082Seschrock 			else
8122082Seschrock 				data_errors++;
8132082Seschrock 
814789Sahrens 			if (!rc->rc_skipped)
815789Sahrens 				unexpected_errors++;
8162082Seschrock 
8177754SJeff.Bonwick@Sun.COM 			total_errors++;
8183456Sahl 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
8193456Sahl 			parity_untried++;
820789Sahrens 		}
821789Sahrens 	}
822789Sahrens 
823789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
824789Sahrens 		/*
8257754SJeff.Bonwick@Sun.COM 		 * XXX -- for now, treat partial writes as a success.
8267754SJeff.Bonwick@Sun.COM 		 * (If we couldn't write enough columns to reconstruct
8277754SJeff.Bonwick@Sun.COM 		 * the data, the I/O failed.  Otherwise, good enough.)
8287754SJeff.Bonwick@Sun.COM 		 *
8297754SJeff.Bonwick@Sun.COM 		 * Now that we support write reallocation, it would be better
8307754SJeff.Bonwick@Sun.COM 		 * to treat partial failure as real failure unless there are
8317754SJeff.Bonwick@Sun.COM 		 * no non-degraded top-level vdevs left, and not update DTLs
8327754SJeff.Bonwick@Sun.COM 		 * if we intend to reallocate.
833789Sahrens 		 */
834789Sahrens 		/* XXPOLICY */
8357754SJeff.Bonwick@Sun.COM 		if (total_errors > rm->rm_firstdatacol)
8367754SJeff.Bonwick@Sun.COM 			zio->io_error = vdev_raidz_worst_error(rm);
837789Sahrens 
8387754SJeff.Bonwick@Sun.COM 		return;
839789Sahrens 	}
840789Sahrens 
841789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
8422082Seschrock 	/*
8432082Seschrock 	 * There are three potential phases for a read:
8442082Seschrock 	 *	1. produce valid data from the columns read
8452082Seschrock 	 *	2. read all disks and try again
8462082Seschrock 	 *	3. perform combinatorial reconstruction
8472082Seschrock 	 *
8482082Seschrock 	 * Each phase is progressively both more expensive and less likely to
8492082Seschrock 	 * occur. If we encounter more errors than we can repair or all phases
8502082Seschrock 	 * fail, we have no choice but to return an error.
8512082Seschrock 	 */
852789Sahrens 
853789Sahrens 	/*
8542082Seschrock 	 * If the number of errors we saw was correctable -- less than or equal
8553456Sahl 	 * to the number of parity disks read -- attempt to produce data that
8563456Sahl 	 * has a valid checksum. Naturally, this case applies in the absence of
8573456Sahl 	 * any errors.
858789Sahrens 	 */
8597754SJeff.Bonwick@Sun.COM 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
8602082Seschrock 		switch (data_errors) {
8612082Seschrock 		case 0:
8622082Seschrock 			if (zio_checksum_error(zio) == 0) {
8634034Sahl 				/*
8644034Sahl 				 * If we read parity information (unnecessarily
8654034Sahl 				 * as it happens since no reconstruction was
8664034Sahl 				 * needed) regenerate and verify the parity.
8674034Sahl 				 * We also regenerate parity when resilvering
8684034Sahl 				 * so we can write it out to the failed device
8694034Sahl 				 * later.
8704034Sahl 				 */
8713456Sahl 				if (parity_errors + parity_untried <
8724034Sahl 				    rm->rm_firstdatacol ||
8734034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
8743456Sahl 					n = raidz_parity_verify(zio, rm);
8753456Sahl 					unexpected_errors += n;
8763456Sahl 					ASSERT(parity_errors + n <=
8773456Sahl 					    rm->rm_firstdatacol);
8783456Sahl 				}
8792082Seschrock 				goto done;
8802082Seschrock 			}
8812082Seschrock 			break;
8822082Seschrock 
8832082Seschrock 		case 1:
8843456Sahl 			/*
8853456Sahl 			 * We either attempt to read all the parity columns or
8863456Sahl 			 * none of them. If we didn't try to read parity, we
8873456Sahl 			 * wouldn't be here in the correctable case. There must
8883456Sahl 			 * also have been fewer parity errors than parity
8893456Sahl 			 * columns or, again, we wouldn't be in this code path.
8903456Sahl 			 */
8913456Sahl 			ASSERT(parity_untried == 0);
8922082Seschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
8932082Seschrock 
8942082Seschrock 			/*
8952082Seschrock 			 * Find the column that reported the error.
8962082Seschrock 			 */
8972082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
8982082Seschrock 				rc = &rm->rm_col[c];
8992082Seschrock 				if (rc->rc_error != 0)
9002082Seschrock 					break;
9012082Seschrock 			}
9022082Seschrock 			ASSERT(c != rm->rm_cols);
9032082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
9042082Seschrock 			    rc->rc_error == ESTALE);
9052082Seschrock 
9062082Seschrock 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
9072082Seschrock 				vdev_raidz_reconstruct_p(rm, c);
9082082Seschrock 			} else {
9092082Seschrock 				ASSERT(rm->rm_firstdatacol > 1);
9102082Seschrock 				vdev_raidz_reconstruct_q(rm, c);
9112082Seschrock 			}
9122082Seschrock 
9132082Seschrock 			if (zio_checksum_error(zio) == 0) {
9142082Seschrock 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
9152082Seschrock 					atomic_inc_64(&raidz_corrected_p);
9162082Seschrock 				else
9172082Seschrock 					atomic_inc_64(&raidz_corrected_q);
918789Sahrens 
9192082Seschrock 				/*
9203456Sahl 				 * If there's more than one parity disk that
9213456Sahl 				 * was successfully read, confirm that the
9223456Sahl 				 * other parity disk produced the correct data.
9233456Sahl 				 * This routine is suboptimal in that it
9243456Sahl 				 * regenerates both the parity we wish to test
9253456Sahl 				 * as well as the parity we just used to
9263456Sahl 				 * perform the reconstruction, but this should
9273456Sahl 				 * be a relatively uncommon case, and can be
9283456Sahl 				 * optimized if it becomes a problem.
9294034Sahl 				 * We also regenerate parity when resilvering
9304034Sahl 				 * so we can write it out to the failed device
9314034Sahl 				 * later.
9322082Seschrock 				 */
9334034Sahl 				if (parity_errors < rm->rm_firstdatacol - 1 ||
9344034Sahl 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
9352082Seschrock 					n = raidz_parity_verify(zio, rm);
9362082Seschrock 					unexpected_errors += n;
9372082Seschrock 					ASSERT(parity_errors + n <=
9382082Seschrock 					    rm->rm_firstdatacol);
9392082Seschrock 				}
9402082Seschrock 
9412082Seschrock 				goto done;
9422082Seschrock 			}
9432082Seschrock 			break;
9442082Seschrock 
9452082Seschrock 		case 2:
9462082Seschrock 			/*
9473456Sahl 			 * Two data column errors require double parity.
9483456Sahl 			 */
9493456Sahl 			ASSERT(rm->rm_firstdatacol == 2);
9503456Sahl 
9513456Sahl 			/*
9522082Seschrock 			 * Find the two columns that reported errors.
9532082Seschrock 			 */
9542082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
9552082Seschrock 				rc = &rm->rm_col[c];
9562082Seschrock 				if (rc->rc_error != 0)
9572082Seschrock 					break;
958789Sahrens 			}
9592082Seschrock 			ASSERT(c != rm->rm_cols);
9602082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
9612082Seschrock 			    rc->rc_error == ESTALE);
9622082Seschrock 
9632082Seschrock 			for (c1 = c++; c < rm->rm_cols; c++) {
9642082Seschrock 				rc = &rm->rm_col[c];
9652082Seschrock 				if (rc->rc_error != 0)
9662082Seschrock 					break;
9672082Seschrock 			}
9682082Seschrock 			ASSERT(c != rm->rm_cols);
9692082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
9702082Seschrock 			    rc->rc_error == ESTALE);
971789Sahrens 
9722082Seschrock 			vdev_raidz_reconstruct_pq(rm, c1, c);
9732082Seschrock 
9742082Seschrock 			if (zio_checksum_error(zio) == 0) {
9752082Seschrock 				atomic_inc_64(&raidz_corrected_pq);
9762082Seschrock 				goto done;
9772082Seschrock 			}
9782082Seschrock 			break;
9792082Seschrock 
9802082Seschrock 		default:
9812082Seschrock 			ASSERT(rm->rm_firstdatacol <= 2);
9822082Seschrock 			ASSERT(0);
983789Sahrens 		}
984789Sahrens 	}
985789Sahrens 
986789Sahrens 	/*
9872082Seschrock 	 * This isn't a typical situation -- either we got a read error or
9882082Seschrock 	 * a child silently returned bad data. Read every block so we can
9892082Seschrock 	 * try again with as much data and parity as we can track down. If
9902082Seschrock 	 * we've already been through once before, all children will be marked
9912082Seschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
992789Sahrens 	 */
993789Sahrens 	unexpected_errors = 1;
9942082Seschrock 	rm->rm_missingdata = 0;
9952082Seschrock 	rm->rm_missingparity = 0;
996789Sahrens 
9972082Seschrock 	for (c = 0; c < rm->rm_cols; c++) {
9982082Seschrock 		if (rm->rm_col[c].rc_tried)
9992082Seschrock 			continue;
1000789Sahrens 
1001789Sahrens 		zio_vdev_io_redone(zio);
10022082Seschrock 		do {
1003789Sahrens 			rc = &rm->rm_col[c];
1004789Sahrens 			if (rc->rc_tried)
1005789Sahrens 				continue;
1006789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
10072082Seschrock 			    vd->vdev_child[rc->rc_devidx],
1008789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
10097754SJeff.Bonwick@Sun.COM 			    zio->io_type, zio->io_priority, 0,
1010789Sahrens 			    vdev_raidz_child_done, rc));
10112082Seschrock 		} while (++c < rm->rm_cols);
10125530Sbonwick 
10137754SJeff.Bonwick@Sun.COM 		return;
1014789Sahrens 	}
1015789Sahrens 
1016789Sahrens 	/*
10172082Seschrock 	 * At this point we've attempted to reconstruct the data given the
10182082Seschrock 	 * errors we detected, and we've attempted to read all columns. There
10192082Seschrock 	 * must, therefore, be one or more additional problems -- silent errors
10202082Seschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
10212082Seschrock 	 * in absent data. Before we attempt combinatorial reconstruction make
10222082Seschrock 	 * sure we have a chance of coming up with the right answer.
1023789Sahrens 	 */
10247754SJeff.Bonwick@Sun.COM 	if (total_errors >= rm->rm_firstdatacol) {
10257754SJeff.Bonwick@Sun.COM 		zio->io_error = vdev_raidz_worst_error(rm);
10267754SJeff.Bonwick@Sun.COM 		/*
10277754SJeff.Bonwick@Sun.COM 		 * If there were exactly as many device errors as parity
10287754SJeff.Bonwick@Sun.COM 		 * columns, yet we couldn't reconstruct the data, then at
10297754SJeff.Bonwick@Sun.COM 		 * least one device must have returned bad data silently.
10307754SJeff.Bonwick@Sun.COM 		 */
10317754SJeff.Bonwick@Sun.COM 		if (total_errors == rm->rm_firstdatacol)
10327754SJeff.Bonwick@Sun.COM 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1033789Sahrens 		goto done;
1034789Sahrens 	}
1035789Sahrens 
10362082Seschrock 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
10372082Seschrock 		/*
10382082Seschrock 		 * Attempt to reconstruct the data from parity P.
10392082Seschrock 		 */
10402082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
10412082Seschrock 			void *orig;
10422082Seschrock 			rc = &rm->rm_col[c];
10432082Seschrock 
10442082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
10452082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
10462082Seschrock 			vdev_raidz_reconstruct_p(rm, c);
10472082Seschrock 
10482082Seschrock 			if (zio_checksum_error(zio) == 0) {
10492082Seschrock 				zio_buf_free(orig, rc->rc_size);
10502082Seschrock 				atomic_inc_64(&raidz_corrected_p);
10512082Seschrock 
10522082Seschrock 				/*
10532082Seschrock 				 * If this child didn't know that it returned
10542082Seschrock 				 * bad data, inform it.
10552082Seschrock 				 */
10562082Seschrock 				if (rc->rc_tried && rc->rc_error == 0)
10572082Seschrock 					raidz_checksum_error(zio, rc);
10582082Seschrock 				rc->rc_error = ECKSUM;
10592082Seschrock 				goto done;
10602082Seschrock 			}
10612082Seschrock 
10622082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
10632082Seschrock 			zio_buf_free(orig, rc->rc_size);
10642082Seschrock 		}
10652082Seschrock 	}
10662082Seschrock 
10672082Seschrock 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
10682082Seschrock 		/*
10692082Seschrock 		 * Attempt to reconstruct the data from parity Q.
10702082Seschrock 		 */
10712082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
10722082Seschrock 			void *orig;
10732082Seschrock 			rc = &rm->rm_col[c];
10742082Seschrock 
10752082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
10762082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
10772082Seschrock 			vdev_raidz_reconstruct_q(rm, c);
10782082Seschrock 
10792082Seschrock 			if (zio_checksum_error(zio) == 0) {
10802082Seschrock 				zio_buf_free(orig, rc->rc_size);
10812082Seschrock 				atomic_inc_64(&raidz_corrected_q);
10822082Seschrock 
10832082Seschrock 				/*
10842082Seschrock 				 * If this child didn't know that it returned
10852082Seschrock 				 * bad data, inform it.
10862082Seschrock 				 */
10872082Seschrock 				if (rc->rc_tried && rc->rc_error == 0)
10882082Seschrock 					raidz_checksum_error(zio, rc);
10892082Seschrock 				rc->rc_error = ECKSUM;
10902082Seschrock 				goto done;
10912082Seschrock 			}
10922082Seschrock 
10932082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
10942082Seschrock 			zio_buf_free(orig, rc->rc_size);
10952082Seschrock 		}
10962082Seschrock 	}
10972082Seschrock 
10982082Seschrock 	if (rm->rm_firstdatacol > 1 &&
10992082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
11002082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
11012082Seschrock 		/*
11022082Seschrock 		 * Attempt to reconstruct the data from both P and Q.
11032082Seschrock 		 */
11042082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
11052082Seschrock 			void *orig, *orig1;
11062082Seschrock 			rc = &rm->rm_col[c];
11072082Seschrock 
11082082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
11092082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
11102082Seschrock 
11112082Seschrock 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
11122082Seschrock 				rc1 = &rm->rm_col[c1];
11132082Seschrock 
11142082Seschrock 				orig1 = zio_buf_alloc(rc1->rc_size);
11152082Seschrock 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
11162082Seschrock 
11172082Seschrock 				vdev_raidz_reconstruct_pq(rm, c, c1);
11182082Seschrock 
11192082Seschrock 				if (zio_checksum_error(zio) == 0) {
11202082Seschrock 					zio_buf_free(orig, rc->rc_size);
11212082Seschrock 					zio_buf_free(orig1, rc1->rc_size);
11222082Seschrock 					atomic_inc_64(&raidz_corrected_pq);
11232082Seschrock 
11242082Seschrock 					/*
11252082Seschrock 					 * If these children didn't know they
11262082Seschrock 					 * returned bad data, inform them.
11272082Seschrock 					 */
11282082Seschrock 					if (rc->rc_tried && rc->rc_error == 0)
11292082Seschrock 						raidz_checksum_error(zio, rc);
11302082Seschrock 					if (rc1->rc_tried && rc1->rc_error == 0)
11312082Seschrock 						raidz_checksum_error(zio, rc1);
11322082Seschrock 
11332082Seschrock 					rc->rc_error = ECKSUM;
11342082Seschrock 					rc1->rc_error = ECKSUM;
11352082Seschrock 
11362082Seschrock 					goto done;
11372082Seschrock 				}
11382082Seschrock 
11392082Seschrock 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
11402082Seschrock 				zio_buf_free(orig1, rc1->rc_size);
11412082Seschrock 			}
11422082Seschrock 
11432082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
11442082Seschrock 			zio_buf_free(orig, rc->rc_size);
1145789Sahrens 		}
1146789Sahrens 	}
1147789Sahrens 
1148789Sahrens 	/*
11492082Seschrock 	 * All combinations failed to checksum. Generate checksum ereports for
11502082Seschrock 	 * all children.
1151789Sahrens 	 */
1152789Sahrens 	zio->io_error = ECKSUM;
11537754SJeff.Bonwick@Sun.COM 
11541544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
11551544Seschrock 		for (c = 0; c < rm->rm_cols; c++) {
11561544Seschrock 			rc = &rm->rm_col[c];
11571544Seschrock 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
11582082Seschrock 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
11591544Seschrock 			    rc->rc_offset, rc->rc_size);
11601544Seschrock 		}
11611544Seschrock 	}
1162789Sahrens 
1163789Sahrens done:
1164789Sahrens 	zio_checksum_verified(zio);
1165789Sahrens 
11668241SJeff.Bonwick@Sun.COM 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1167789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1168789Sahrens 		/*
1169789Sahrens 		 * Use the good data we have in hand to repair damaged children.
1170789Sahrens 		 */
1171789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1172789Sahrens 			rc = &rm->rm_col[c];
11732082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1174789Sahrens 
11751732Sbonwick 			if (rc->rc_error == 0)
11761732Sbonwick 				continue;
11771732Sbonwick 
11787754SJeff.Bonwick@Sun.COM 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
11791732Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
11801732Sbonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
11818241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
11828241SJeff.Bonwick@Sun.COM 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
11831732Sbonwick 		}
1184789Sahrens 	}
1185789Sahrens }
1186789Sahrens 
1187789Sahrens static void
1188789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1189789Sahrens {
11902082Seschrock 	if (faulted > vd->vdev_nparity)
11911544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
11921544Seschrock 		    VDEV_AUX_NO_REPLICAS);
1193789Sahrens 	else if (degraded + faulted != 0)
11941544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1195789Sahrens 	else
11961544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1197789Sahrens }
1198789Sahrens 
1199789Sahrens vdev_ops_t vdev_raidz_ops = {
1200789Sahrens 	vdev_raidz_open,
1201789Sahrens 	vdev_raidz_close,
1202789Sahrens 	vdev_raidz_asize,
1203789Sahrens 	vdev_raidz_io_start,
1204789Sahrens 	vdev_raidz_io_done,
1205789Sahrens 	vdev_raidz_state_change,
1206789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1207789Sahrens 	B_FALSE			/* not a leaf vdev */
1208789Sahrens };
1209