xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 789:b348f31ed315)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 */

/*
 * We currently allow up to two-way replication (i.e. single-fault
 * reconstruction) models in RAID-Z vdevs.  The blocks in such vdevs
 * must all be multiples of two times the leaf vdev blocksize.
 */
#define	VDEV_RAIDZ_ALIGN	2ULL
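/*
 * With 512-byte leaf sectors (unit_shift of 9), this alignment means every
 * allocation is rounded up to a multiple of 1K; see the P2ROUNDUP() of
 * rm_asize in vdev_raidz_map_alloc() below.
 */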

typedef struct raidz_col {
	uint64_t	rc_col;
	uint64_t	rc_offset;
	uint64_t	rc_size;
	void		*rc_data;
	int		rc_error;
	short		rc_tried;
	short		rc_skipped;
} raidz_col_t;

typedef struct raidz_map {
	uint64_t	rm_cols;
	uint64_t	rm_bigcols;
	uint64_t	rm_asize;
	int		rm_missing_child;
	int		rm_type;
	int		rm_firstdatacol;
	raidz_col_t	rm_col[1];
} raidz_map_t;

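/*
 * Each zio is mapped onto the raidz vdev as a raidz_map_t: one raidz_col_t
 * per child column touched, giving that column's child index (rc_col),
 * offset on that child (rc_offset), size (rc_size), and data buffer
 * (rc_data).  rm_firstdatacol is the number of leading parity columns
 * (0 for RAIDZ_SINGLE, 1 for RAIDZ_PARITY), rm_bigcols is the number of
 * columns that carry one extra sector when the I/O size doesn't divide
 * evenly, and rm_asize is the total size allocated on the raidz vdev.
 *
 * RAIDZ_SINGLE simply stripes the data across the children with no parity;
 * RAIDZ_PARITY reserves the first column of each map for XOR parity,
 * allowing single-fault reconstruction.
 */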
#define	RAIDZ_SINGLE	0
#define	RAIDZ_PARITY	1

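/*
 * Divide the zio into unit_shift-sized sectors and deal them out round-robin
 * across the dcols children, starting at the column and per-child offset
 * implied by the zio's device offset.  For example, a 3K write (six 512-byte
 * sectors) at offset 0 of a 4-wide RAIDZ_PARITY vdev with 512-byte sectors
 * maps to four columns of two sectors each:
 *
 *	column 0: 1K parity (allocated here, filled in by
 *	    vdev_raidz_reconstruct() at write time)
 *	column 1: first 1K of the zio's data
 *	column 2: next 1K
 *	column 3: last 1K
 *
 * so rm_asize is 4K.  When the sector count doesn't divide evenly, the
 * first rm_bigcols columns each carry one extra sector.
 */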
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
	int raid_type)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, coff;
	int firstdatacol;

	switch (raid_type) {
	case RAIDZ_SINGLE:
		q = s / dcols;
		r = s - q * dcols;
		bc = r;
		firstdatacol = 0;
		break;
	case RAIDZ_PARITY:
		q = s / (dcols - 1);
		r = s - q * (dcols - 1);
		bc = r + !!r;
		firstdatacol = 1;
		break;
	}

	acols = (q == 0 ? bc : dcols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_bigcols = bc;
	rm->rm_asize = 0;
	rm->rm_missing_child = -1;
	rm->rm_type = raid_type;
	rm->rm_firstdatacol = firstdatacol;

	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_col = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;
		rm->rm_asize += rm->rm_col[c].rc_size;
	}

	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	if (raid_type == RAIDZ_PARITY) {
		/*
		 * To prevent hot parity disks, switch the parity and data
		 * columns every 1MB.
		 */
		ASSERT(rm->rm_cols >= 2);
		ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

		if (zio->io_offset & (1ULL << 20)) {
			col = rm->rm_col[0].rc_col;
			o = rm->rm_col[0].rc_offset;
			rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
			rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
			rm->rm_col[1].rc_col = col;
			rm->rm_col[1].rc_offset = o;
		}
	}

	zio->io_vsd = rm;
	return (rm);
}

static void
vdev_raidz_map_free(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++)
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
	zio->io_vsd = NULL;
}

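/*
 * XOR-based parity: the parity column is the XOR of all data columns, so any
 * single missing column x (parity or data) can be regenerated by XORing
 * together all of the remaining columns.  For example, with parity
 * P = D1 ^ D2 ^ D3, a missing D2 is recovered as P ^ D1 ^ D3.  This same
 * routine therefore serves both to generate parity at write time (x == 0)
 * and to reconstruct a missing column on read.
 */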
static void
vdev_raidz_reconstruct(raidz_map_t *rm, int x)
{
	uint64_t *dst, *src, count, xsize, csize;
	int i, c;

	for (c = 0; c < rm->rm_cols; c++) {
		if (c == x)
			continue;
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;
		csize = rm->rm_col[c].rc_size;
		xsize = rm->rm_col[x].rc_size;
		count = MIN(csize, xsize) / sizeof (uint64_t);
		if (c == !x) {
			/*
			 * The initial copy happens at either c == 0 or c == 1.
			 * Both of these columns are 'big' columns, so we'll
			 * definitely initialize all of column x.
			 */
			ASSERT3U(xsize, <=, csize);
			for (i = 0; i < count; i++)
				*dst++ = *src++;
		} else {
			for (i = 0; i < count; i++)
				*dst++ ^= *src++;
		}
	}
}

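/*
 * Open all children.  The usable size of each column is clamped to the
 * smallest child's asize, so the raidz vdev reports (smallest child asize) *
 * (number of children); a single failed child is tolerated at open time,
 * since one fault is still reconstructable.
 */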
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	int c, error;
	int lasterror = 0;
	int numerrors = 0;

	/*
	 * XXX -- minimum children should be raid-type-specific
	 */
	if (vd->vdev_children < 2) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if ((error = vdev_open(cvd)) != 0) {
			lasterror = error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = cvd->vdev_ashift;
	}

	*asize *= vd->vdev_children;

	if (numerrors > 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

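/*
 * Convert a logical (psize) to an allocated (asize) size, adding one parity
 * sector for every (children - 1) data sectors and rounding up to the
 * 2-sector allocation alignment.  For example, on a 4-wide vdev with
 * 512-byte sectors, a 3K block (6 sectors) needs 2 parity sectors, for an
 * asize of 4K, matching the map laid out by vdev_raidz_map_alloc() above.
 */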
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t cols = vd->vdev_children;

	/*
	 * These calculations assume RAIDZ_PARITY.
	 */
	asize = psize >> vd->vdev_ashift;
	asize += (asize + cols - 2) / (cols - 1);
	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;

	return (asize);
}

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_repair_done(zio_t *zio)
{
	zio_buf_free(zio->io_data, zio->io_size);
}

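/*
 * Start the child I/Os for a raidz zio.  Writes first fill in the parity
 * column and then issue a child write for every column.  Reads issue child
 * reads for the data columns only, skipping the parity column unless a
 * child is already known to be dead or stale (in which case
 * rm_missing_child records it) or the zio is a scrub;
 * vdev_raidz_io_done() decides whether reconstruction is needed.
 */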
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c;

	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children,
	    RAIDZ_PARITY);

	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
		ASSERT3U(rm->rm_asize, ==,
		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
	} else {
		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {

		/*
		 * Generate RAID parity in virtual column 0.
		 */
		vdev_raidz_reconstruct(rm, 0);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_col];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
		zio_wait_children_done(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_col];
		if (vdev_is_dead(cvd)) {
			rm->rm_missing_child = c;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			rm->rm_missing_child = c;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_wait_children_done(zio);
}

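/*
 * I/O completion for a raidz zio.  Writes succeed (unless failfast) as long
 * as no more columns failed than we have parity.  For reads the recovery
 * ladder is: if every child read succeeded and the data checksums, just
 * verify (and if necessary flag for rewrite) any parity we read; if exactly
 * the one expected child was missing, reconstruct it from parity; otherwise
 * read any columns not yet tried, then reconstruct around the failed column,
 * and as a last resort try reconstructing each column in turn until the
 * checksum passes.  If the pool is writable, any column found to be damaged
 * is rewritten from the known-good data before the map is freed.
 */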
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	blkptr_t *bp = zio->io_bp;
	int unexpected_errors = 0;
	int c;

	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */

	zio->io_error = 0;
	zio->io_numerrors = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		/*
		 * We preserve any EIOs because those may be worth retrying;
		 * whereas ECKSUM and ENXIO are more likely to be persistent.
		 */
		if (rc->rc_error) {
			if (zio->io_error != EIO)
				zio->io_error = rc->rc_error;
			if (!rc->rc_skipped)
				unexpected_errors++;
			zio->io_numerrors++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * If this is not a failfast write, and we were able to
		 * write enough columns to reconstruct the data, good enough.
		 */
		/* XXPOLICY */
		if (zio->io_numerrors <= rm->rm_firstdatacol &&
		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
			zio->io_error = 0;

		vdev_raidz_map_free(zio);
		zio_next_stage(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If there were no I/O errors, and the data checksums correctly,
	 * the read is complete.
	 */
	/* XXPOLICY */
	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
		ASSERT(unexpected_errors == 0);
		ASSERT(zio->io_error == 0);

		/*
		 * We know the data's good.  If we read the parity,
		 * verify that it's good as well.  If not, fix it.
		 */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			void *orig;
			rc = &rm->rm_col[c];
			if (!rc->rc_tried)
				continue;
			orig = zio_buf_alloc(rc->rc_size);
			bcopy(rc->rc_data, orig, rc->rc_size);
			vdev_raidz_reconstruct(rm, c);
			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
				vdev_checksum_error(zio,
				    vd->vdev_child[rc->rc_col]);
				rc->rc_error = ECKSUM;
				unexpected_errors++;
			}
			zio_buf_free(orig, rc->rc_size);
		}
		goto done;
	}

	/*
	 * If there was exactly one I/O error, it's the one we expected,
	 * and the reconstructed data checksums, the read is complete.
	 * This happens when one child is offline and vdev_fault_assess()
	 * knows it, or when one child has stale data and the DTL knows it.
	 */
	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
		rc = &rm->rm_col[c];
		ASSERT(unexpected_errors == 0);
		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
		vdev_raidz_reconstruct(rm, c);
		if (zio_checksum_error(zio) == 0) {
			zio->io_error = 0;
			goto done;
		}
	}

	/*
	 * This isn't a typical error -- either we got a read error or
	 * more than one child claimed a problem.  Read every block we
	 * haven't already so we can try combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missing_child = -1;

	for (c = 0; c < rm->rm_cols; c++)
		if (!rm->rm_col[c].rc_tried)
			break;

	if (c != rm->rm_cols) {
		zio->io_error = 0;
		zio_vdev_io_redone(zio);
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_col],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
		zio_wait_children_done(zio);
		return;
	}

	/*
	 * If there were more errors than parity disks, give up.
	 */
	if (zio->io_numerrors > rm->rm_firstdatacol) {
		ASSERT(zio->io_error != 0);
		goto done;
	}

	/*
	 * The number of I/O errors is correctable.  Correct them here.
	 */
	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];
		ASSERT(rc->rc_tried);
		if (rc->rc_error) {
			vdev_raidz_reconstruct(rm, c);
			if (zio_checksum_error(zio) == 0)
				zio->io_error = 0;
			else
				zio->io_error = rc->rc_error;
			goto done;
		}
	}

	/*
	 * There were no I/O errors, but the data doesn't checksum.
	 * Try all permutations to see if we can find one that does.
	 */
	ASSERT(zio->io_numerrors == 0);
	for (c = 0; c < rm->rm_cols; c++) {
		void *orig;
		rc = &rm->rm_col[c];

		orig = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig, rc->rc_size);
		vdev_raidz_reconstruct(rm, c);

		if (zio_checksum_error(zio) == 0) {
			zio_buf_free(orig, rc->rc_size);
			zio->io_error = 0;
			/*
			 * If this child didn't know that it returned bad data,
			 * inform it.
			 */
			if (rc->rc_tried && rc->rc_error == 0)
				vdev_checksum_error(zio,
				    vd->vdev_child[rc->rc_col]);
			rc->rc_error = ECKSUM;
			goto done;
		}

		bcopy(orig, rc->rc_data, rc->rc_size);
		zio_buf_free(orig, rc->rc_size);
	}

	/*
	 * All combinations failed to checksum.
	 */
	zio->io_error = ECKSUM;

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_col];

			if (rc->rc_error) {
				/*
				 * Make a copy of the data because we're
				 * going to free the RAID-Z map below.
				 */
				void *data = zio_buf_alloc(rc->rc_size);
				bcopy(rc->rc_data, data, rc->rc_size);

				dprintf("%s resilvered %s @ 0x%llx error %d\n",
				    vdev_description(vd),
				    vdev_description(cvd),
				    zio->io_offset, rc->rc_error);

				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset, data, rc->rc_size,
				    ZIO_TYPE_WRITE, zio->io_priority,
				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
				    ZIO_FLAG_DONT_PROPAGATE,
				    vdev_raidz_repair_done, NULL));
			}
		}
	}

	vdev_raidz_map_free(zio);
	zio_next_stage(zio);
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > 1)
		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};