xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 2082:76b439ec3ac1)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
51544Seschrock  * Common Development and Distribution License (the "License").
61544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
21*2082Seschrock 
22789Sahrens /*
231544Seschrock  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24789Sahrens  * Use is subject to license terms.
25789Sahrens  */
26789Sahrens 
27789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
28789Sahrens 
29789Sahrens #include <sys/zfs_context.h>
30789Sahrens #include <sys/spa.h>
31789Sahrens #include <sys/vdev_impl.h>
32789Sahrens #include <sys/zio.h>
33789Sahrens #include <sys/zio_checksum.h>
34789Sahrens #include <sys/fs/zfs.h>
351544Seschrock #include <sys/fm/fs/zfs.h>
36789Sahrens 
37789Sahrens /*
38789Sahrens  * Virtual device vector for RAID-Z.
39*2082Seschrock  *
40*2082Seschrock  * This vdev supports both single and double parity. For single parity, we
41*2082Seschrock  * use a simple XOR of all the data columns. For double parity, we use both
42*2082Seschrock  * the simple XOR as well as a technique described in "The mathematics of
43*2082Seschrock  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
45*2082Seschrock  * the field are defined as follows:
46*2082Seschrock  *
47*2082Seschrock  *   o addition (+) is represented by a bitwise XOR
48*2082Seschrock  *   o subtraction (-) is therefore identical to addition: A + B = A - B
49*2082Seschrock  *   o multiplication of A by 2 is defined by the following bitwise expression:
50*2082Seschrock  *	(A * 2)_7 = A_6
51*2082Seschrock  *	(A * 2)_6 = A_5
52*2082Seschrock  *	(A * 2)_5 = A_4
53*2082Seschrock  *	(A * 2)_4 = A_3 + A_7
54*2082Seschrock  *	(A * 2)_3 = A_2 + A_7
55*2082Seschrock  *	(A * 2)_2 = A_1 + A_7
56*2082Seschrock  *	(A * 2)_1 = A_0
57*2082Seschrock  *	(A * 2)_0 = A_7
58*2082Seschrock  *
59*2082Seschrock  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60*2082Seschrock  *
61*2082Seschrock  * Observe that any number in the field (except for 0) can be expressed as a
62*2082Seschrock  * power of 2 -- a generator for the field. We store a table of the powers of
63*2082Seschrock  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64*2082Seschrock  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65*2082Seschrock  * than field addition). The inverse of a field element A (A^-1) is A^254.
66*2082Seschrock  *
67*2082Seschrock  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68*2082Seschrock  * can be expressed by field operations:
69*2082Seschrock  *
70*2082Seschrock  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
71*2082Seschrock  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72*2082Seschrock  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73*2082Seschrock  *
 * See the reconstruction code below for how P and Q can be used individually
 * or
75*2082Seschrock  * in concert to recover missing data columns.
76789Sahrens  */
77789Sahrens 
/*
 * Per-column state for a single RAID-Z I/O. One of these exists for every
 * column (parity and data) touched by the I/O; it records which child device
 * the column lands on, where, and what happened when the child I/O was
 * issued.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
87789Sahrens 
/*
 * Mapping of one logical I/O onto the columns of a RAID-Z vdev. The parity
 * columns come first (indices 0 .. rm_firstdatacol - 1), followed by the data
 * columns.
 *
 * rm_col is declared with a single element but the structure is allocated
 * with room for rm_cols entries (see the offsetof() sizing in
 * vdev_raidz_map_alloc() and vdev_raidz_map_free()).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Column count */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
97789Sahrens 
/*
 * Indices of the P (XOR) and Q (Galois field) parity columns within
 * raidz_map_t's rm_col array.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/*
 * Largest parity count this implementation supports; vdev_raidz_open()
 * rejects anything larger.
 */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a single field element by 2 (see the block comment at the top of
 * the file). Note that the shift is not masked here, so for inputs with the
 * high bit set the expression has bits above bit 7 set; the result is only a
 * valid field element once truncated to 8 bits (e.g. by storing it in a
 * uint8_t).
 */
#define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104*2082Seschrock 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2^i in the field. The final entry (index 255) holds
 * 0x01, the same value as index 0, so an exponent of 255 can be looked up
 * directly without first being reduced.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[v] is the exponent e for which 2^e == v in the field, i.e.
 * the inverse of the vdev_raidz_pow2 table. Zero has no logarithm; index 0
 * merely holds a placeholder of 0x00 (the same value stored for index 1), so
 * callers must handle a zero operand themselves, as vdev_raidz_exp2() does.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
177*2082Seschrock 
178*2082Seschrock /*
179*2082Seschrock  * Multiply a given number by 2 raised to the given power.
180*2082Seschrock  */
181*2082Seschrock static uint8_t
182*2082Seschrock vdev_raidz_exp2(uint_t a, int exp)
183*2082Seschrock {
184*2082Seschrock 	if (a == 0)
185*2082Seschrock 		return (0);
186*2082Seschrock 
187*2082Seschrock 	ASSERT(exp >= 0);
188*2082Seschrock 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189*2082Seschrock 
190*2082Seschrock 	exp += vdev_raidz_log2[a];
191*2082Seschrock 	if (exp > 255)
192*2082Seschrock 		exp -= 255;
193*2082Seschrock 
194*2082Seschrock 	return (vdev_raidz_pow2[exp]);
195*2082Seschrock }
196*2082Seschrock 
197789Sahrens static raidz_map_t *
198*2082Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199*2082Seschrock     uint64_t nparity)
200789Sahrens {
201789Sahrens 	raidz_map_t *rm;
202789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
203789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
204789Sahrens 	uint64_t f = b % dcols;
205789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
206*2082Seschrock 	uint64_t q, r, c, bc, col, acols, coff, devidx;
207789Sahrens 
208*2082Seschrock 	q = s / (dcols - nparity);
209*2082Seschrock 	r = s - q * (dcols - nparity);
210*2082Seschrock 	bc = (r == 0 ? 0 : r + nparity);
211789Sahrens 
212789Sahrens 	acols = (q == 0 ? bc : dcols);
213789Sahrens 
214789Sahrens 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215789Sahrens 
216789Sahrens 	rm->rm_cols = acols;
217789Sahrens 	rm->rm_bigcols = bc;
218789Sahrens 	rm->rm_asize = 0;
219*2082Seschrock 	rm->rm_missingdata = 0;
220*2082Seschrock 	rm->rm_missingparity = 0;
221*2082Seschrock 	rm->rm_firstdatacol = nparity;
222789Sahrens 
223789Sahrens 	for (c = 0; c < acols; c++) {
224789Sahrens 		col = f + c;
225789Sahrens 		coff = o;
226789Sahrens 		if (col >= dcols) {
227789Sahrens 			col -= dcols;
228789Sahrens 			coff += 1ULL << unit_shift;
229789Sahrens 		}
230*2082Seschrock 		rm->rm_col[c].rc_devidx = col;
231789Sahrens 		rm->rm_col[c].rc_offset = coff;
232789Sahrens 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233789Sahrens 		rm->rm_col[c].rc_data = NULL;
234789Sahrens 		rm->rm_col[c].rc_error = 0;
235789Sahrens 		rm->rm_col[c].rc_tried = 0;
236789Sahrens 		rm->rm_col[c].rc_skipped = 0;
237789Sahrens 		rm->rm_asize += rm->rm_col[c].rc_size;
238789Sahrens 	}
239789Sahrens 
240*2082Seschrock 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241789Sahrens 
242789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
243789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244789Sahrens 
245789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
246789Sahrens 
247789Sahrens 	for (c = c + 1; c < acols; c++)
248789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249789Sahrens 		    rm->rm_col[c - 1].rc_size;
250789Sahrens 
2511133Seschrock 	/*
252*2082Seschrock 	 * If all data stored spans all columns, there's a danger that parity
253*2082Seschrock 	 * will always be on the same device and, since parity isn't read
254*2082Seschrock 	 * during normal operation, that that device's I/O bandwidth won't be
255*2082Seschrock 	 * used effectively. We therefore switch the parity every 1MB.
256*2082Seschrock 	 *
257*2082Seschrock 	 * ... at least that was, ostensibly, the theory. As a practical
258*2082Seschrock 	 * matter unless we juggle the parity between all devices evenly, we
259*2082Seschrock 	 * won't see any benefit. Further, occasional writes that aren't a
260*2082Seschrock 	 * multiple of the LCM of the number of children and the minimum
261*2082Seschrock 	 * stripe width are sufficient to avoid pessimal behavior.
262*2082Seschrock 	 * Unfortunately, this decision created an implicit on-disk format
263*2082Seschrock 	 * requirement that we need to support for all eternity (but only for
264*2082Seschrock 	 * RAID-Z with one parity device).
2651133Seschrock 	 */
2661133Seschrock 	ASSERT(rm->rm_cols >= 2);
2671133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268789Sahrens 
269*2082Seschrock 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270*2082Seschrock 		devidx = rm->rm_col[0].rc_devidx;
2711133Seschrock 		o = rm->rm_col[0].rc_offset;
272*2082Seschrock 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
2731133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274*2082Seschrock 		rm->rm_col[1].rc_devidx = devidx;
2751133Seschrock 		rm->rm_col[1].rc_offset = o;
276789Sahrens 	}
277789Sahrens 
278789Sahrens 	zio->io_vsd = rm;
279789Sahrens 	return (rm);
280789Sahrens }
281789Sahrens 
282789Sahrens static void
283789Sahrens vdev_raidz_map_free(zio_t *zio)
284789Sahrens {
285789Sahrens 	raidz_map_t *rm = zio->io_vsd;
286789Sahrens 	int c;
287789Sahrens 
288789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
289789Sahrens 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290789Sahrens 
291789Sahrens 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292789Sahrens 	zio->io_vsd = NULL;
293789Sahrens }
294789Sahrens 
295789Sahrens static void
296*2082Seschrock vdev_raidz_generate_parity_p(raidz_map_t *rm)
297*2082Seschrock {
298*2082Seschrock 	uint64_t *p, *src, pcount, ccount, i;
299*2082Seschrock 	int c;
300*2082Seschrock 
301*2082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302*2082Seschrock 
303*2082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304*2082Seschrock 		src = rm->rm_col[c].rc_data;
305*2082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306*2082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307*2082Seschrock 
308*2082Seschrock 		if (c == rm->rm_firstdatacol) {
309*2082Seschrock 			ASSERT(ccount == pcount);
310*2082Seschrock 			for (i = 0; i < ccount; i++, p++, src++) {
311*2082Seschrock 				*p = *src;
312*2082Seschrock 			}
313*2082Seschrock 		} else {
314*2082Seschrock 			ASSERT(ccount <= pcount);
315*2082Seschrock 			for (i = 0; i < ccount; i++, p++, src++) {
316*2082Seschrock 				*p ^= *src;
317*2082Seschrock 			}
318*2082Seschrock 		}
319*2082Seschrock 	}
320*2082Seschrock }
321*2082Seschrock 
322*2082Seschrock static void
323*2082Seschrock vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324789Sahrens {
325*2082Seschrock 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
326*2082Seschrock 	int c;
327*2082Seschrock 
328*2082Seschrock 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329*2082Seschrock 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330*2082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331*2082Seschrock 
332*2082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333*2082Seschrock 		src = rm->rm_col[c].rc_data;
334*2082Seschrock 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335*2082Seschrock 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336*2082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337*2082Seschrock 
338*2082Seschrock 		if (c == rm->rm_firstdatacol) {
339*2082Seschrock 			ASSERT(ccount == pcount || ccount == 0);
340*2082Seschrock 			for (i = 0; i < ccount; i++, p++, q++, src++) {
341*2082Seschrock 				*q = *src;
342*2082Seschrock 				*p = *src;
343*2082Seschrock 			}
344*2082Seschrock 			for (; i < pcount; i++, p++, q++, src++) {
345*2082Seschrock 				*q = 0;
346*2082Seschrock 				*p = 0;
347*2082Seschrock 			}
348*2082Seschrock 		} else {
349*2082Seschrock 			ASSERT(ccount <= pcount);
350789Sahrens 
351*2082Seschrock 			/*
352*2082Seschrock 			 * Rather than multiplying each byte individually (as
353*2082Seschrock 			 * described above), we are able to handle 8 at once
354*2082Seschrock 			 * by generating a mask based on the high bit in each
355*2082Seschrock 			 * byte and using that to conditionally XOR in 0x1d.
356*2082Seschrock 			 */
357*2082Seschrock 			for (i = 0; i < ccount; i++, p++, q++, src++) {
358*2082Seschrock 				mask = *q & 0x8080808080808080ULL;
359*2082Seschrock 				mask = (mask << 1) - (mask >> 7);
360*2082Seschrock 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361*2082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
362*2082Seschrock 				*q ^= *src;
363*2082Seschrock 				*p ^= *src;
364*2082Seschrock 			}
365*2082Seschrock 
366*2082Seschrock 			/*
367*2082Seschrock 			 * Treat short columns as though they are full of 0s.
368*2082Seschrock 			 */
369*2082Seschrock 			for (; i < pcount; i++, q++) {
370*2082Seschrock 				mask = *q & 0x8080808080808080ULL;
371*2082Seschrock 				mask = (mask << 1) - (mask >> 7);
372*2082Seschrock 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373*2082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
374*2082Seschrock 			}
375*2082Seschrock 		}
376*2082Seschrock 	}
377*2082Seschrock }
378*2082Seschrock 
379*2082Seschrock static void
380*2082Seschrock vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381*2082Seschrock {
382*2082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, i;
383*2082Seschrock 	int c;
384*2082Seschrock 
385*2082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386*2082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387*2082Seschrock 	ASSERT(xcount > 0);
388*2082Seschrock 
389*2082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390*2082Seschrock 	dst = rm->rm_col[x].rc_data;
391*2082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
392*2082Seschrock 		*dst = *src;
393*2082Seschrock 	}
394*2082Seschrock 
395*2082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396789Sahrens 		src = rm->rm_col[c].rc_data;
397789Sahrens 		dst = rm->rm_col[x].rc_data;
398*2082Seschrock 
399*2082Seschrock 		if (c == x)
400*2082Seschrock 			continue;
401*2082Seschrock 
402*2082Seschrock 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403*2082Seschrock 		count = MIN(ccount, xcount);
404*2082Seschrock 
405*2082Seschrock 		for (i = 0; i < count; i++, dst++, src++) {
406*2082Seschrock 			*dst ^= *src;
407789Sahrens 		}
408789Sahrens 	}
409789Sahrens }
410789Sahrens 
411*2082Seschrock static void
412*2082Seschrock vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413*2082Seschrock {
414*2082Seschrock 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
415*2082Seschrock 	uint8_t *b;
416*2082Seschrock 	int c, j, exp;
417*2082Seschrock 
418*2082Seschrock 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419*2082Seschrock 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420*2082Seschrock 
421*2082Seschrock 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422*2082Seschrock 		src = rm->rm_col[c].rc_data;
423*2082Seschrock 		dst = rm->rm_col[x].rc_data;
424*2082Seschrock 
425*2082Seschrock 		if (c == x)
426*2082Seschrock 			ccount = 0;
427*2082Seschrock 		else
428*2082Seschrock 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429*2082Seschrock 
430*2082Seschrock 		count = MIN(ccount, xcount);
431*2082Seschrock 
432*2082Seschrock 		if (c == rm->rm_firstdatacol) {
433*2082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
434*2082Seschrock 				*dst = *src;
435*2082Seschrock 			}
436*2082Seschrock 			for (; i < xcount; i++, dst++) {
437*2082Seschrock 				*dst = 0;
438*2082Seschrock 			}
439*2082Seschrock 
440*2082Seschrock 		} else {
441*2082Seschrock 			/*
442*2082Seschrock 			 * For an explanation of this, see the comment in
443*2082Seschrock 			 * vdev_raidz_generate_parity_pq() above.
444*2082Seschrock 			 */
445*2082Seschrock 			for (i = 0; i < count; i++, dst++, src++) {
446*2082Seschrock 				mask = *dst & 0x8080808080808080ULL;
447*2082Seschrock 				mask = (mask << 1) - (mask >> 7);
448*2082Seschrock 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449*2082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
450*2082Seschrock 				*dst ^= *src;
451*2082Seschrock 			}
452*2082Seschrock 
453*2082Seschrock 			for (; i < xcount; i++, dst++) {
454*2082Seschrock 				mask = *dst & 0x8080808080808080ULL;
455*2082Seschrock 				mask = (mask << 1) - (mask >> 7);
456*2082Seschrock 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457*2082Seschrock 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
458*2082Seschrock 			}
459*2082Seschrock 		}
460*2082Seschrock 	}
461*2082Seschrock 
462*2082Seschrock 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463*2082Seschrock 	dst = rm->rm_col[x].rc_data;
464*2082Seschrock 	exp = 255 - (rm->rm_cols - 1 - x);
465*2082Seschrock 
466*2082Seschrock 	for (i = 0; i < xcount; i++, dst++, src++) {
467*2082Seschrock 		*dst ^= *src;
468*2082Seschrock 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469*2082Seschrock 			*b = vdev_raidz_exp2(*b, exp);
470*2082Seschrock 		}
471*2082Seschrock 	}
472*2082Seschrock }
473*2082Seschrock 
474*2082Seschrock static void
475*2082Seschrock vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476*2082Seschrock {
477*2082Seschrock 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478*2082Seschrock 	void *pdata, *qdata;
479*2082Seschrock 	uint64_t xsize, ysize, i;
480*2082Seschrock 
481*2082Seschrock 	ASSERT(x < y);
482*2082Seschrock 	ASSERT(x >= rm->rm_firstdatacol);
483*2082Seschrock 	ASSERT(y < rm->rm_cols);
484*2082Seschrock 
485*2082Seschrock 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486*2082Seschrock 
487*2082Seschrock 	/*
488*2082Seschrock 	 * Move the parity data aside -- we're going to compute parity as
489*2082Seschrock 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490*2082Seschrock 	 * reuse the parity generation mechanism without trashing the actual
491*2082Seschrock 	 * parity so we make those columns appear to be full of zeros by
492*2082Seschrock 	 * setting their lengths to zero.
493*2082Seschrock 	 */
494*2082Seschrock 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495*2082Seschrock 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496*2082Seschrock 	xsize = rm->rm_col[x].rc_size;
497*2082Seschrock 	ysize = rm->rm_col[y].rc_size;
498*2082Seschrock 
499*2082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
500*2082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501*2082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502*2082Seschrock 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503*2082Seschrock 	rm->rm_col[x].rc_size = 0;
504*2082Seschrock 	rm->rm_col[y].rc_size = 0;
505*2082Seschrock 
506*2082Seschrock 	vdev_raidz_generate_parity_pq(rm);
507*2082Seschrock 
508*2082Seschrock 	rm->rm_col[x].rc_size = xsize;
509*2082Seschrock 	rm->rm_col[y].rc_size = ysize;
510*2082Seschrock 
511*2082Seschrock 	p = pdata;
512*2082Seschrock 	q = qdata;
513*2082Seschrock 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514*2082Seschrock 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515*2082Seschrock 	xd = rm->rm_col[x].rc_data;
516*2082Seschrock 	yd = rm->rm_col[y].rc_data;
517*2082Seschrock 
518*2082Seschrock 	/*
519*2082Seschrock 	 * We now have:
520*2082Seschrock 	 *	Pxy = P + D_x + D_y
521*2082Seschrock 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522*2082Seschrock 	 *
523*2082Seschrock 	 * We can then solve for D_x:
524*2082Seschrock 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
525*2082Seschrock 	 * where
526*2082Seschrock 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
527*2082Seschrock 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
528*2082Seschrock 	 *
529*2082Seschrock 	 * With D_x in hand, we can easily solve for D_y:
530*2082Seschrock 	 *	D_y = P + Pxy + D_x
531*2082Seschrock 	 */
532*2082Seschrock 
533*2082Seschrock 	a = vdev_raidz_pow2[255 + x - y];
534*2082Seschrock 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535*2082Seschrock 	tmp = 255 - vdev_raidz_log2[a ^ 1];
536*2082Seschrock 
537*2082Seschrock 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538*2082Seschrock 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539*2082Seschrock 
540*2082Seschrock 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541*2082Seschrock 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542*2082Seschrock 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
543*2082Seschrock 
544*2082Seschrock 		if (i < ysize)
545*2082Seschrock 			*yd = *p ^ *pxy ^ *xd;
546*2082Seschrock 	}
547*2082Seschrock 
548*2082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549*2082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
550*2082Seschrock 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551*2082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552*2082Seschrock 
553*2082Seschrock 	/*
554*2082Seschrock 	 * Restore the saved parity data.
555*2082Seschrock 	 */
556*2082Seschrock 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557*2082Seschrock 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558*2082Seschrock }
559*2082Seschrock 
560*2082Seschrock 
561789Sahrens static int
562789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563789Sahrens {
564789Sahrens 	vdev_t *cvd;
565*2082Seschrock 	uint64_t nparity = vd->vdev_nparity;
566789Sahrens 	int c, error;
567789Sahrens 	int lasterror = 0;
568789Sahrens 	int numerrors = 0;
569789Sahrens 
570*2082Seschrock 	ASSERT(nparity > 0);
571*2082Seschrock 
572*2082Seschrock 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
573*2082Seschrock 	    vd->vdev_children < nparity + 1) {
574789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575789Sahrens 		return (EINVAL);
576789Sahrens 	}
577789Sahrens 
578789Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
579789Sahrens 		cvd = vd->vdev_child[c];
580789Sahrens 
581789Sahrens 		if ((error = vdev_open(cvd)) != 0) {
582789Sahrens 			lasterror = error;
583789Sahrens 			numerrors++;
584789Sahrens 			continue;
585789Sahrens 		}
586789Sahrens 
587789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
5881732Sbonwick 		*ashift = MAX(*ashift, cvd->vdev_ashift);
589789Sahrens 	}
590789Sahrens 
591789Sahrens 	*asize *= vd->vdev_children;
592789Sahrens 
593*2082Seschrock 	if (numerrors > nparity) {
594789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595789Sahrens 		return (lasterror);
596789Sahrens 	}
597789Sahrens 
598789Sahrens 	return (0);
599789Sahrens }
600789Sahrens 
601789Sahrens static void
602789Sahrens vdev_raidz_close(vdev_t *vd)
603789Sahrens {
604789Sahrens 	int c;
605789Sahrens 
606789Sahrens 	for (c = 0; c < vd->vdev_children; c++)
607789Sahrens 		vdev_close(vd->vdev_child[c]);
608789Sahrens }
609789Sahrens 
610789Sahrens static uint64_t
611789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612789Sahrens {
613789Sahrens 	uint64_t asize;
6141732Sbonwick 	uint64_t ashift = vd->vdev_top->vdev_ashift;
615789Sahrens 	uint64_t cols = vd->vdev_children;
616*2082Seschrock 	uint64_t nparity = vd->vdev_nparity;
617789Sahrens 
6181732Sbonwick 	asize = ((psize - 1) >> ashift) + 1;
619*2082Seschrock 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620*2082Seschrock 	asize = roundup(asize, nparity + 1) << ashift;
621789Sahrens 
622789Sahrens 	return (asize);
623789Sahrens }
624789Sahrens 
625789Sahrens static void
626789Sahrens vdev_raidz_child_done(zio_t *zio)
627789Sahrens {
628789Sahrens 	raidz_col_t *rc = zio->io_private;
629789Sahrens 
630789Sahrens 	rc->rc_error = zio->io_error;
631789Sahrens 	rc->rc_tried = 1;
632789Sahrens 	rc->rc_skipped = 0;
633789Sahrens }
634789Sahrens 
635789Sahrens static void
636789Sahrens vdev_raidz_repair_done(zio_t *zio)
637789Sahrens {
6381732Sbonwick 	ASSERT(zio->io_private == zio->io_parent);
6391732Sbonwick 	vdev_raidz_map_free(zio->io_private);
640789Sahrens }
641789Sahrens 
642789Sahrens static void
643789Sahrens vdev_raidz_io_start(zio_t *zio)
644789Sahrens {
645789Sahrens 	vdev_t *vd = zio->io_vd;
6461732Sbonwick 	vdev_t *tvd = vd->vdev_top;
647789Sahrens 	vdev_t *cvd;
648789Sahrens 	blkptr_t *bp = zio->io_bp;
649789Sahrens 	raidz_map_t *rm;
650789Sahrens 	raidz_col_t *rc;
651789Sahrens 	int c;
652789Sahrens 
653*2082Seschrock 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654*2082Seschrock 	    vd->vdev_nparity);
655789Sahrens 
6561775Sbillm 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657789Sahrens 
658789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
659789Sahrens 		/*
660*2082Seschrock 		 * Generate RAID parity in the first virtual columns.
661789Sahrens 		 */
662*2082Seschrock 		if (rm->rm_firstdatacol == 1)
663*2082Seschrock 			vdev_raidz_generate_parity_p(rm);
664*2082Seschrock 		else
665*2082Seschrock 			vdev_raidz_generate_parity_pq(rm);
666789Sahrens 
667789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
668789Sahrens 			rc = &rm->rm_col[c];
669*2082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
670789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
672789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673789Sahrens 			    vdev_raidz_child_done, rc));
674789Sahrens 		}
675789Sahrens 		zio_wait_children_done(zio);
676789Sahrens 		return;
677789Sahrens 	}
678789Sahrens 
679789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
680789Sahrens 
681*2082Seschrock 	/*
682*2082Seschrock 	 * Iterate over the columns in reverse order so that we hit the parity
683*2082Seschrock 	 * last -- any errors along the way will force us to read the parity
684*2082Seschrock 	 * data.
685*2082Seschrock 	 */
686789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
687789Sahrens 		rc = &rm->rm_col[c];
688*2082Seschrock 		cvd = vd->vdev_child[rc->rc_devidx];
689789Sahrens 		if (vdev_is_dead(cvd)) {
690*2082Seschrock 			if (c >= rm->rm_firstdatacol)
691*2082Seschrock 				rm->rm_missingdata++;
692*2082Seschrock 			else
693*2082Seschrock 				rm->rm_missingparity++;
694789Sahrens 			rc->rc_error = ENXIO;
695789Sahrens 			rc->rc_tried = 1;	/* don't even try */
696789Sahrens 			rc->rc_skipped = 1;
697789Sahrens 			continue;
698789Sahrens 		}
699789Sahrens 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700*2082Seschrock 			if (c >= rm->rm_firstdatacol)
701*2082Seschrock 				rm->rm_missingdata++;
702*2082Seschrock 			else
703*2082Seschrock 				rm->rm_missingparity++;
704789Sahrens 			rc->rc_error = ESTALE;
705789Sahrens 			rc->rc_skipped = 1;
706789Sahrens 			continue;
707789Sahrens 		}
708*2082Seschrock 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709789Sahrens 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
710789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
712789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713789Sahrens 			    vdev_raidz_child_done, rc));
714789Sahrens 		}
715789Sahrens 	}
716789Sahrens 
717789Sahrens 	zio_wait_children_done(zio);
718789Sahrens }
719789Sahrens 
7201544Seschrock /*
7211544Seschrock  * Report a checksum error for a child of a RAID-Z device.
7221544Seschrock  */
7231544Seschrock static void
7241544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
7251544Seschrock {
726*2082Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
7271544Seschrock 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
7281544Seschrock 	    vdev_description(vd));
7291544Seschrock 
7301544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
7311544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
7321544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
7331544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
7341544Seschrock 	}
7351544Seschrock 
7361544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
7371544Seschrock 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
7381544Seschrock 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
7391544Seschrock }
7401544Seschrock 
741*2082Seschrock /*
742*2082Seschrock  * Generate the parity from the data columns. If we tried and were able to
743*2082Seschrock  * read the parity without error, verify that the generated parity matches the
744*2082Seschrock  * data we read. If it doesn't, we fire off a checksum error. Return the
745*2082Seschrock  * number such failures.
746*2082Seschrock  */
747*2082Seschrock static int
748*2082Seschrock raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749*2082Seschrock {
750*2082Seschrock 	void *orig[VDEV_RAIDZ_MAXPARITY];
751*2082Seschrock 	int c, ret = 0;
752*2082Seschrock 	raidz_col_t *rc;
753*2082Seschrock 
754*2082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
755*2082Seschrock 		rc = &rm->rm_col[c];
756*2082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
757*2082Seschrock 			continue;
758*2082Seschrock 		orig[c] = zio_buf_alloc(rc->rc_size);
759*2082Seschrock 		bcopy(rc->rc_data, orig[c], rc->rc_size);
760*2082Seschrock 	}
761*2082Seschrock 
762*2082Seschrock 	if (rm->rm_firstdatacol == 1)
763*2082Seschrock 		vdev_raidz_generate_parity_p(rm);
764*2082Seschrock 	else
765*2082Seschrock 		vdev_raidz_generate_parity_pq(rm);
766*2082Seschrock 
767*2082Seschrock 	for (c = 0; c < rm->rm_firstdatacol; c++) {
768*2082Seschrock 		rc = &rm->rm_col[c];
769*2082Seschrock 		if (!rc->rc_tried || rc->rc_error != 0)
770*2082Seschrock 			continue;
771*2082Seschrock 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772*2082Seschrock 			raidz_checksum_error(zio, rc);
773*2082Seschrock 			rc->rc_error = ECKSUM;
774*2082Seschrock 			ret++;
775*2082Seschrock 		}
776*2082Seschrock 		zio_buf_free(orig[c], rc->rc_size);
777*2082Seschrock 	}
778*2082Seschrock 
779*2082Seschrock 	return (ret);
780*2082Seschrock }
781*2082Seschrock 
782*2082Seschrock static uint64_t raidz_corrected_p;
783*2082Seschrock static uint64_t raidz_corrected_q;
784*2082Seschrock static uint64_t raidz_corrected_pq;
7851544Seschrock 
786789Sahrens static void
787789Sahrens vdev_raidz_io_done(zio_t *zio)
788789Sahrens {
789789Sahrens 	vdev_t *vd = zio->io_vd;
790789Sahrens 	vdev_t *cvd;
791789Sahrens 	raidz_map_t *rm = zio->io_vsd;
792*2082Seschrock 	raidz_col_t *rc, *rc1;
793789Sahrens 	int unexpected_errors = 0;
794*2082Seschrock 	int parity_errors = 0;
795*2082Seschrock 	int data_errors = 0;
796*2082Seschrock 	int n, c, c1;
797789Sahrens 
7981775Sbillm 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
799789Sahrens 
800789Sahrens 	zio->io_error = 0;
801789Sahrens 	zio->io_numerrors = 0;
802789Sahrens 
803*2082Seschrock 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
804*2082Seschrock 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
805*2082Seschrock 
806789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
807789Sahrens 		rc = &rm->rm_col[c];
808789Sahrens 
809789Sahrens 		/*
810789Sahrens 		 * We preserve any EIOs because those may be worth retrying;
811789Sahrens 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
812789Sahrens 		 */
813789Sahrens 		if (rc->rc_error) {
814789Sahrens 			if (zio->io_error != EIO)
815789Sahrens 				zio->io_error = rc->rc_error;
816*2082Seschrock 
817*2082Seschrock 			if (c < rm->rm_firstdatacol)
818*2082Seschrock 				parity_errors++;
819*2082Seschrock 			else
820*2082Seschrock 				data_errors++;
821*2082Seschrock 
822789Sahrens 			if (!rc->rc_skipped)
823789Sahrens 				unexpected_errors++;
824*2082Seschrock 
825789Sahrens 			zio->io_numerrors++;
826789Sahrens 		}
827789Sahrens 	}
828789Sahrens 
829789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
830789Sahrens 		/*
831789Sahrens 		 * If this is not a failfast write, and we were able to
832789Sahrens 		 * write enough columns to reconstruct the data, good enough.
833789Sahrens 		 */
834789Sahrens 		/* XXPOLICY */
835789Sahrens 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
836789Sahrens 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
837789Sahrens 			zio->io_error = 0;
838789Sahrens 
839789Sahrens 		vdev_raidz_map_free(zio);
840789Sahrens 		zio_next_stage(zio);
841789Sahrens 		return;
842789Sahrens 	}
843789Sahrens 
844789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
845*2082Seschrock 	/*
846*2082Seschrock 	 * There are three potential phases for a read:
847*2082Seschrock 	 *	1. produce valid data from the columns read
848*2082Seschrock 	 *	2. read all disks and try again
849*2082Seschrock 	 *	3. perform combinatorial reconstruction
850*2082Seschrock 	 *
851*2082Seschrock 	 * Each phase is progressively both more expensive and less likely to
852*2082Seschrock 	 * occur. If we encounter more errors than we can repair or all phases
853*2082Seschrock 	 * fail, we have no choice but to return an error.
854*2082Seschrock 	 */
855789Sahrens 
856789Sahrens 	/*
857*2082Seschrock 	 * If the number of errors we saw was correctable -- less than or equal
858*2082Seschrock 	 * to the number of parity disks -- attempt to produce data that has a
859*2082Seschrock 	 * valid checksum. Naturally, zero errors falls into this case.
860789Sahrens 	 */
861*2082Seschrock 	if (zio->io_numerrors <= rm->rm_firstdatacol) {
862*2082Seschrock 		switch (data_errors) {
863*2082Seschrock 		case 0:
864*2082Seschrock 			if (zio_checksum_error(zio) == 0) {
865*2082Seschrock 				zio->io_error = 0;
866*2082Seschrock 				n = raidz_parity_verify(zio, rm);
867*2082Seschrock 				unexpected_errors += n;
868*2082Seschrock 				ASSERT(parity_errors + n <=
869*2082Seschrock 				    rm->rm_firstdatacol);
870*2082Seschrock 				goto done;
871*2082Seschrock 			}
872*2082Seschrock 			break;
873*2082Seschrock 
874*2082Seschrock 		case 1:
875*2082Seschrock 			ASSERT(parity_errors < rm->rm_firstdatacol);
876*2082Seschrock 
877*2082Seschrock 			/*
878*2082Seschrock 			 * Find the column that reported the error.
879*2082Seschrock 			 */
880*2082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
881*2082Seschrock 				rc = &rm->rm_col[c];
882*2082Seschrock 				if (rc->rc_error != 0)
883*2082Seschrock 					break;
884*2082Seschrock 			}
885*2082Seschrock 			ASSERT(c != rm->rm_cols);
886*2082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
887*2082Seschrock 			    rc->rc_error == ESTALE);
888*2082Seschrock 
889*2082Seschrock 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
890*2082Seschrock 				vdev_raidz_reconstruct_p(rm, c);
891*2082Seschrock 			} else {
892*2082Seschrock 				ASSERT(rm->rm_firstdatacol > 1);
893*2082Seschrock 				vdev_raidz_reconstruct_q(rm, c);
894*2082Seschrock 			}
895*2082Seschrock 
896*2082Seschrock 			if (zio_checksum_error(zio) == 0) {
897*2082Seschrock 				zio->io_error = 0;
898*2082Seschrock 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
899*2082Seschrock 					atomic_inc_64(&raidz_corrected_p);
900*2082Seschrock 				else
901*2082Seschrock 					atomic_inc_64(&raidz_corrected_q);
902789Sahrens 
903*2082Seschrock 				/*
904*2082Seschrock 				 * If there's more than one parity disk,
905*2082Seschrock 				 * confirm that the parity disk not used above
906*2082Seschrock 				 * has the correct data.
907*2082Seschrock 				 */
908*2082Seschrock 				if (rm->rm_firstdatacol > 1) {
909*2082Seschrock 					n = raidz_parity_verify(zio, rm);
910*2082Seschrock 					unexpected_errors += n;
911*2082Seschrock 					ASSERT(parity_errors + n <=
912*2082Seschrock 					    rm->rm_firstdatacol);
913*2082Seschrock 				}
914*2082Seschrock 
915*2082Seschrock 				goto done;
916*2082Seschrock 			}
917*2082Seschrock 			break;
918*2082Seschrock 
919*2082Seschrock 		case 2:
920*2082Seschrock 			/*
921*2082Seschrock 			 * Find the two columns that reported errors.
922*2082Seschrock 			 */
923*2082Seschrock 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
924*2082Seschrock 				rc = &rm->rm_col[c];
925*2082Seschrock 				if (rc->rc_error != 0)
926*2082Seschrock 					break;
927789Sahrens 			}
928*2082Seschrock 			ASSERT(c != rm->rm_cols);
929*2082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
930*2082Seschrock 			    rc->rc_error == ESTALE);
931*2082Seschrock 
932*2082Seschrock 			for (c1 = c++; c < rm->rm_cols; c++) {
933*2082Seschrock 				rc = &rm->rm_col[c];
934*2082Seschrock 				if (rc->rc_error != 0)
935*2082Seschrock 					break;
936*2082Seschrock 			}
937*2082Seschrock 			ASSERT(c != rm->rm_cols);
938*2082Seschrock 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
939*2082Seschrock 			    rc->rc_error == ESTALE);
940789Sahrens 
941*2082Seschrock 			vdev_raidz_reconstruct_pq(rm, c1, c);
942*2082Seschrock 
943*2082Seschrock 			if (zio_checksum_error(zio) == 0) {
944*2082Seschrock 				zio->io_error = 0;
945*2082Seschrock 				atomic_inc_64(&raidz_corrected_pq);
946*2082Seschrock 
947*2082Seschrock 				goto done;
948*2082Seschrock 			}
949*2082Seschrock 			break;
950*2082Seschrock 
951*2082Seschrock 		default:
952*2082Seschrock 			ASSERT(rm->rm_firstdatacol <= 2);
953*2082Seschrock 			ASSERT(0);
954789Sahrens 		}
955789Sahrens 	}
956789Sahrens 
957789Sahrens 	/*
958*2082Seschrock 	 * This isn't a typical situation -- either we got a read error or
959*2082Seschrock 	 * a child silently returned bad data. Read every block so we can
960*2082Seschrock 	 * try again with as much data and parity as we can track down. If
961*2082Seschrock 	 * we've already been through once before, all children will be marked
962*2082Seschrock 	 * as tried so we'll proceed to combinatorial reconstruction.
963789Sahrens 	 */
964789Sahrens 	unexpected_errors = 1;
965*2082Seschrock 	rm->rm_missingdata = 0;
966*2082Seschrock 	rm->rm_missingparity = 0;
967789Sahrens 
968*2082Seschrock 	for (c = 0; c < rm->rm_cols; c++) {
969*2082Seschrock 		if (rm->rm_col[c].rc_tried)
970*2082Seschrock 			continue;
971789Sahrens 
972789Sahrens 		zio->io_error = 0;
973789Sahrens 		zio_vdev_io_redone(zio);
974*2082Seschrock 		do {
975789Sahrens 			rc = &rm->rm_col[c];
976789Sahrens 			if (rc->rc_tried)
977789Sahrens 				continue;
978789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
979*2082Seschrock 			    vd->vdev_child[rc->rc_devidx],
980789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
981789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
982789Sahrens 			    vdev_raidz_child_done, rc));
983*2082Seschrock 		} while (++c < rm->rm_cols);
984*2082Seschrock 		dprintf("rereading\n");
985789Sahrens 		zio_wait_children_done(zio);
986789Sahrens 		return;
987789Sahrens 	}
988789Sahrens 
989789Sahrens 	/*
990*2082Seschrock 	 * At this point we've attempted to reconstruct the data given the
991*2082Seschrock 	 * errors we detected, and we've attempted to read all columns. There
992*2082Seschrock 	 * must, therefore, be one or more additional problems -- silent errors
993*2082Seschrock 	 * resulting in invalid data rather than explicit I/O errors resulting
994*2082Seschrock 	 * in absent data. Before we attempt combinatorial reconstruction make
995*2082Seschrock 	 * sure we have a chance of coming up with the right answer.
996789Sahrens 	 */
997*2082Seschrock 	if (zio->io_numerrors >= rm->rm_firstdatacol) {
998789Sahrens 		ASSERT(zio->io_error != 0);
999789Sahrens 		goto done;
1000789Sahrens 	}
1001789Sahrens 
1002*2082Seschrock 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1003*2082Seschrock 		/*
1004*2082Seschrock 		 * Attempt to reconstruct the data from parity P.
1005*2082Seschrock 		 */
1006*2082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1007*2082Seschrock 			void *orig;
1008*2082Seschrock 			rc = &rm->rm_col[c];
1009*2082Seschrock 
1010*2082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
1011*2082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
1012*2082Seschrock 			vdev_raidz_reconstruct_p(rm, c);
1013*2082Seschrock 
1014*2082Seschrock 			if (zio_checksum_error(zio) == 0) {
1015*2082Seschrock 				zio_buf_free(orig, rc->rc_size);
1016*2082Seschrock 				zio->io_error = 0;
1017*2082Seschrock 				atomic_inc_64(&raidz_corrected_p);
1018*2082Seschrock 
1019*2082Seschrock 				/*
1020*2082Seschrock 				 * If this child didn't know that it returned
1021*2082Seschrock 				 * bad data, inform it.
1022*2082Seschrock 				 */
1023*2082Seschrock 				if (rc->rc_tried && rc->rc_error == 0)
1024*2082Seschrock 					raidz_checksum_error(zio, rc);
1025*2082Seschrock 				rc->rc_error = ECKSUM;
1026*2082Seschrock 				goto done;
1027*2082Seschrock 			}
1028*2082Seschrock 
1029*2082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
1030*2082Seschrock 			zio_buf_free(orig, rc->rc_size);
1031*2082Seschrock 		}
1032*2082Seschrock 	}
1033*2082Seschrock 
1034*2082Seschrock 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1035*2082Seschrock 		/*
1036*2082Seschrock 		 * Attempt to reconstruct the data from parity Q.
1037*2082Seschrock 		 */
1038*2082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1039*2082Seschrock 			void *orig;
1040*2082Seschrock 			rc = &rm->rm_col[c];
1041*2082Seschrock 
1042*2082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
1043*2082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
1044*2082Seschrock 			vdev_raidz_reconstruct_q(rm, c);
1045*2082Seschrock 
1046*2082Seschrock 			if (zio_checksum_error(zio) == 0) {
1047*2082Seschrock 				zio_buf_free(orig, rc->rc_size);
1048789Sahrens 				zio->io_error = 0;
1049*2082Seschrock 				atomic_inc_64(&raidz_corrected_q);
1050*2082Seschrock 
1051*2082Seschrock 				/*
1052*2082Seschrock 				 * If this child didn't know that it returned
1053*2082Seschrock 				 * bad data, inform it.
1054*2082Seschrock 				 */
1055*2082Seschrock 				if (rc->rc_tried && rc->rc_error == 0)
1056*2082Seschrock 					raidz_checksum_error(zio, rc);
1057*2082Seschrock 				rc->rc_error = ECKSUM;
1058*2082Seschrock 				goto done;
1059*2082Seschrock 			}
1060*2082Seschrock 
1061*2082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
1062*2082Seschrock 			zio_buf_free(orig, rc->rc_size);
1063*2082Seschrock 		}
1064*2082Seschrock 	}
1065*2082Seschrock 
1066*2082Seschrock 	if (rm->rm_firstdatacol > 1 &&
1067*2082Seschrock 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1068*2082Seschrock 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1069*2082Seschrock 		/*
1070*2082Seschrock 		 * Attempt to reconstruct the data from both P and Q.
1071*2082Seschrock 		 */
1072*2082Seschrock 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1073*2082Seschrock 			void *orig, *orig1;
1074*2082Seschrock 			rc = &rm->rm_col[c];
1075*2082Seschrock 
1076*2082Seschrock 			orig = zio_buf_alloc(rc->rc_size);
1077*2082Seschrock 			bcopy(rc->rc_data, orig, rc->rc_size);
1078*2082Seschrock 
1079*2082Seschrock 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1080*2082Seschrock 				rc1 = &rm->rm_col[c1];
1081*2082Seschrock 
1082*2082Seschrock 				orig1 = zio_buf_alloc(rc1->rc_size);
1083*2082Seschrock 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1084*2082Seschrock 
1085*2082Seschrock 				vdev_raidz_reconstruct_pq(rm, c, c1);
1086*2082Seschrock 
1087*2082Seschrock 				if (zio_checksum_error(zio) == 0) {
1088*2082Seschrock 					zio_buf_free(orig, rc->rc_size);
1089*2082Seschrock 					zio_buf_free(orig1, rc1->rc_size);
1090*2082Seschrock 					zio->io_error = 0;
1091*2082Seschrock 					atomic_inc_64(&raidz_corrected_pq);
1092*2082Seschrock 
1093*2082Seschrock 					/*
1094*2082Seschrock 					 * If these children didn't know they
1095*2082Seschrock 					 * returned bad data, inform them.
1096*2082Seschrock 					 */
1097*2082Seschrock 					if (rc->rc_tried && rc->rc_error == 0)
1098*2082Seschrock 						raidz_checksum_error(zio, rc);
1099*2082Seschrock 					if (rc1->rc_tried && rc1->rc_error == 0)
1100*2082Seschrock 						raidz_checksum_error(zio, rc1);
1101*2082Seschrock 
1102*2082Seschrock 					rc->rc_error = ECKSUM;
1103*2082Seschrock 					rc1->rc_error = ECKSUM;
1104*2082Seschrock 
1105*2082Seschrock 					goto done;
1106*2082Seschrock 				}
1107*2082Seschrock 
1108*2082Seschrock 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1109*2082Seschrock 				zio_buf_free(orig1, rc1->rc_size);
1110*2082Seschrock 			}
1111*2082Seschrock 
1112*2082Seschrock 			bcopy(orig, rc->rc_data, rc->rc_size);
1113*2082Seschrock 			zio_buf_free(orig, rc->rc_size);
1114789Sahrens 		}
1115789Sahrens 	}
1116789Sahrens 
1117789Sahrens 	/*
1118*2082Seschrock 	 * All combinations failed to checksum. Generate checksum ereports for
1119*2082Seschrock 	 * all children.
1120789Sahrens 	 */
1121789Sahrens 	zio->io_error = ECKSUM;
11221544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
11231544Seschrock 		for (c = 0; c < rm->rm_cols; c++) {
11241544Seschrock 			rc = &rm->rm_col[c];
11251544Seschrock 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1126*2082Seschrock 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
11271544Seschrock 			    rc->rc_offset, rc->rc_size);
11281544Seschrock 		}
11291544Seschrock 	}
1130789Sahrens 
1131789Sahrens done:
1132789Sahrens 	zio_checksum_verified(zio);
1133789Sahrens 
1134789Sahrens 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1135789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
11361732Sbonwick 		zio_t *rio;
11371732Sbonwick 
1138789Sahrens 		/*
1139789Sahrens 		 * Use the good data we have in hand to repair damaged children.
11401732Sbonwick 		 *
11411732Sbonwick 		 * We issue all repair I/Os as children of 'rio' to arrange
11421732Sbonwick 		 * that vdev_raidz_map_free(zio) will be invoked after all
11431732Sbonwick 		 * repairs complete, but before we advance to the next stage.
1144789Sahrens 		 */
11451732Sbonwick 		rio = zio_null(zio, zio->io_spa,
11461732Sbonwick 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
11471732Sbonwick 
1148789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
1149789Sahrens 			rc = &rm->rm_col[c];
1150*2082Seschrock 			cvd = vd->vdev_child[rc->rc_devidx];
1151789Sahrens 
11521732Sbonwick 			if (rc->rc_error == 0)
11531732Sbonwick 				continue;
11541732Sbonwick 
11551732Sbonwick 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
11561732Sbonwick 			    vdev_description(vd),
11571732Sbonwick 			    vdev_description(cvd),
11581732Sbonwick 			    zio->io_offset, rc->rc_error);
1159789Sahrens 
11601732Sbonwick 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
11611732Sbonwick 			    rc->rc_offset, rc->rc_data, rc->rc_size,
11621732Sbonwick 			    ZIO_TYPE_WRITE, zio->io_priority,
1163*2082Seschrock 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1164*2082Seschrock 			    ZIO_FLAG_CANFAIL, NULL, NULL));
11651732Sbonwick 		}
1166789Sahrens 
11671732Sbonwick 		zio_nowait(rio);
11681732Sbonwick 		zio_wait_children_done(zio);
11691732Sbonwick 		return;
1170789Sahrens 	}
1171789Sahrens 
1172789Sahrens 	vdev_raidz_map_free(zio);
1173789Sahrens 	zio_next_stage(zio);
1174789Sahrens }
1175789Sahrens 
1176789Sahrens static void
1177789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1178789Sahrens {
1179*2082Seschrock 	if (faulted > vd->vdev_nparity)
11801544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
11811544Seschrock 		    VDEV_AUX_NO_REPLICAS);
1182789Sahrens 	else if (degraded + faulted != 0)
11831544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1184789Sahrens 	else
11851544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1186789Sahrens }
1187789Sahrens 
1188789Sahrens vdev_ops_t vdev_raidz_ops = {
1189789Sahrens 	vdev_raidz_open,
1190789Sahrens 	vdev_raidz_close,
1191789Sahrens 	vdev_raidz_asize,
1192789Sahrens 	vdev_raidz_io_start,
1193789Sahrens 	vdev_raidz_io_done,
1194789Sahrens 	vdev_raidz_state_change,
1195789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1196789Sahrens 	B_FALSE			/* not a leaf vdev */
1197789Sahrens };
1198