xref: /onnv-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 1544:938876158511)
1789Sahrens /*
2789Sahrens  * CDDL HEADER START
3789Sahrens  *
4789Sahrens  * The contents of this file are subject to the terms of the
5*1544Seschrock  * Common Development and Distribution License (the "License").
6*1544Seschrock  * You may not use this file except in compliance with the License.
7789Sahrens  *
8789Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9789Sahrens  * or http://www.opensolaris.org/os/licensing.
10789Sahrens  * See the License for the specific language governing permissions
11789Sahrens  * and limitations under the License.
12789Sahrens  *
13789Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14789Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15789Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16789Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17789Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18789Sahrens  *
19789Sahrens  * CDDL HEADER END
20789Sahrens  */
21789Sahrens /*
22*1544Seschrock  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23789Sahrens  * Use is subject to license terms.
24789Sahrens  */
25789Sahrens 
26789Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27789Sahrens 
28789Sahrens #include <sys/zfs_context.h>
29789Sahrens #include <sys/spa.h>
30789Sahrens #include <sys/vdev_impl.h>
31789Sahrens #include <sys/zio.h>
32789Sahrens #include <sys/zio_checksum.h>
33789Sahrens #include <sys/fs/zfs.h>
34*1544Seschrock #include <sys/fm/fs/zfs.h>
35789Sahrens 
36789Sahrens /*
37789Sahrens  * Virtual device vector for RAID-Z.
38789Sahrens  */
39789Sahrens 
/*
 * RAID-Z allocation granularity, expressed in leaf-vdev units.
 *
 * Only single-parity RAID-Z is implemented here, i.e. the loss of one
 * column can be reconstructed.  Allocated sizes in a RAID-Z vdev are
 * rounded up to a multiple of VDEV_RAIDZ_ALIGN units, where one unit is
 * (1 << ashift) bytes.
 */
#define	VDEV_RAIDZ_ALIGN	2ULL
46789Sahrens 
/*
 * Per-column bookkeeping for a single RAID-Z I/O.  One of these exists
 * for every child the I/O touches (parity first, then data).
 */
typedef struct raidz_col {
	uint64_t	rc_col;		/* index into parent's vdev_child[] */
	uint64_t	rc_offset;	/* offset handed to the child I/O */
	uint64_t	rc_size;	/* length of this column in bytes */
	void		*rc_data;	/* buffer backing this column */
	int		rc_error;	/* error recorded for this column */
	short		rc_tried;	/* child I/O done, or child known dead */
	short		rc_skipped;	/* io_start chose not to issue I/O */
} raidz_col_t;
56789Sahrens 
57789Sahrens typedef struct raidz_map {
58789Sahrens 	uint64_t	rm_cols;
59789Sahrens 	uint64_t	rm_bigcols;
60789Sahrens 	uint64_t	rm_asize;
61789Sahrens 	int		rm_missing_child;
62789Sahrens 	int		rm_firstdatacol;
63789Sahrens 	raidz_col_t	rm_col[1];
64789Sahrens } raidz_map_t;
65789Sahrens 
66789Sahrens static raidz_map_t *
671133Seschrock vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols)
68789Sahrens {
69789Sahrens 	raidz_map_t *rm;
70789Sahrens 	uint64_t b = zio->io_offset >> unit_shift;
71789Sahrens 	uint64_t s = zio->io_size >> unit_shift;
72789Sahrens 	uint64_t f = b % dcols;
73789Sahrens 	uint64_t o = (b / dcols) << unit_shift;
74789Sahrens 	uint64_t q, r, c, bc, col, acols, coff;
75789Sahrens 	int firstdatacol;
76789Sahrens 
771133Seschrock 	q = s / (dcols - 1);
781133Seschrock 	r = s - q * (dcols - 1);
791133Seschrock 	bc = r + !!r;
801133Seschrock 	firstdatacol = 1;
81789Sahrens 
82789Sahrens 	acols = (q == 0 ? bc : dcols);
83789Sahrens 
84789Sahrens 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
85789Sahrens 
86789Sahrens 	rm->rm_cols = acols;
87789Sahrens 	rm->rm_bigcols = bc;
88789Sahrens 	rm->rm_asize = 0;
89789Sahrens 	rm->rm_missing_child = -1;
90789Sahrens 	rm->rm_firstdatacol = firstdatacol;
91789Sahrens 
92789Sahrens 	for (c = 0; c < acols; c++) {
93789Sahrens 		col = f + c;
94789Sahrens 		coff = o;
95789Sahrens 		if (col >= dcols) {
96789Sahrens 			col -= dcols;
97789Sahrens 			coff += 1ULL << unit_shift;
98789Sahrens 		}
99789Sahrens 		rm->rm_col[c].rc_col = col;
100789Sahrens 		rm->rm_col[c].rc_offset = coff;
101789Sahrens 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
102789Sahrens 		rm->rm_col[c].rc_data = NULL;
103789Sahrens 		rm->rm_col[c].rc_error = 0;
104789Sahrens 		rm->rm_col[c].rc_tried = 0;
105789Sahrens 		rm->rm_col[c].rc_skipped = 0;
106789Sahrens 		rm->rm_asize += rm->rm_col[c].rc_size;
107789Sahrens 	}
108789Sahrens 
109789Sahrens 	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);
110789Sahrens 
111789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
112789Sahrens 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
113789Sahrens 
114789Sahrens 	rm->rm_col[c].rc_data = zio->io_data;
115789Sahrens 
116789Sahrens 	for (c = c + 1; c < acols; c++)
117789Sahrens 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
118789Sahrens 		    rm->rm_col[c - 1].rc_size;
119789Sahrens 
1201133Seschrock 	/*
1211133Seschrock 	 * To prevent hot parity disks, switch the parity and data
1221133Seschrock 	 * columns every 1MB.
1231133Seschrock 	 */
1241133Seschrock 	ASSERT(rm->rm_cols >= 2);
1251133Seschrock 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
126789Sahrens 
1271133Seschrock 	if (zio->io_offset & (1ULL << 20)) {
1281133Seschrock 		col = rm->rm_col[0].rc_col;
1291133Seschrock 		o = rm->rm_col[0].rc_offset;
1301133Seschrock 		rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
1311133Seschrock 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1321133Seschrock 		rm->rm_col[1].rc_col = col;
1331133Seschrock 		rm->rm_col[1].rc_offset = o;
134789Sahrens 	}
135789Sahrens 
136789Sahrens 	zio->io_vsd = rm;
137789Sahrens 	return (rm);
138789Sahrens }
139789Sahrens 
140789Sahrens static void
141789Sahrens vdev_raidz_map_free(zio_t *zio)
142789Sahrens {
143789Sahrens 	raidz_map_t *rm = zio->io_vsd;
144789Sahrens 	int c;
145789Sahrens 
146789Sahrens 	for (c = 0; c < rm->rm_firstdatacol; c++)
147789Sahrens 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
148789Sahrens 
149789Sahrens 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
150789Sahrens 	zio->io_vsd = NULL;
151789Sahrens }
152789Sahrens 
153789Sahrens static void
154789Sahrens vdev_raidz_reconstruct(raidz_map_t *rm, int x)
155789Sahrens {
156789Sahrens 	uint64_t *dst, *src, count, xsize, csize;
157789Sahrens 	int i, c;
158789Sahrens 
159789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
160789Sahrens 		if (c == x)
161789Sahrens 			continue;
162789Sahrens 		src = rm->rm_col[c].rc_data;
163789Sahrens 		dst = rm->rm_col[x].rc_data;
164789Sahrens 		csize = rm->rm_col[c].rc_size;
165789Sahrens 		xsize = rm->rm_col[x].rc_size;
166789Sahrens 		count = MIN(csize, xsize) / sizeof (uint64_t);
167789Sahrens 		if (c == !x) {
168789Sahrens 			/*
169789Sahrens 			 * The initial copy happens at either c == 0 or c == 1.
170789Sahrens 			 * Both of these columns are 'big' columns, so we'll
171789Sahrens 			 * definitely initialize all of column x.
172789Sahrens 			 */
173789Sahrens 			ASSERT3U(xsize, <=, csize);
174789Sahrens 			for (i = 0; i < count; i++)
175789Sahrens 				*dst++ = *src++;
176789Sahrens 		} else {
177789Sahrens 			for (i = 0; i < count; i++)
178789Sahrens 				*dst++ ^= *src++;
179789Sahrens 		}
180789Sahrens 	}
181789Sahrens }
182789Sahrens 
183789Sahrens static int
184789Sahrens vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
185789Sahrens {
186789Sahrens 	vdev_t *cvd;
187789Sahrens 	int c, error;
188789Sahrens 	int lasterror = 0;
189789Sahrens 	int numerrors = 0;
190789Sahrens 
191789Sahrens 	/*
192789Sahrens 	 * XXX -- minimum children should be raid-type-specific
193789Sahrens 	 */
194789Sahrens 	if (vd->vdev_children < 2) {
195789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
196789Sahrens 		return (EINVAL);
197789Sahrens 	}
198789Sahrens 
199789Sahrens 	for (c = 0; c < vd->vdev_children; c++) {
200789Sahrens 		cvd = vd->vdev_child[c];
201789Sahrens 
202789Sahrens 		if ((error = vdev_open(cvd)) != 0) {
203789Sahrens 			lasterror = error;
204789Sahrens 			numerrors++;
205789Sahrens 			continue;
206789Sahrens 		}
207789Sahrens 
208789Sahrens 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
209789Sahrens 		*ashift = cvd->vdev_ashift;
210789Sahrens 	}
211789Sahrens 
212789Sahrens 	*asize *= vd->vdev_children;
213789Sahrens 
214789Sahrens 	if (numerrors > 1) {
215789Sahrens 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
216789Sahrens 		return (lasterror);
217789Sahrens 	}
218789Sahrens 
219789Sahrens 	return (0);
220789Sahrens }
221789Sahrens 
222789Sahrens static void
223789Sahrens vdev_raidz_close(vdev_t *vd)
224789Sahrens {
225789Sahrens 	int c;
226789Sahrens 
227789Sahrens 	for (c = 0; c < vd->vdev_children; c++)
228789Sahrens 		vdev_close(vd->vdev_child[c]);
229789Sahrens }
230789Sahrens 
231789Sahrens static uint64_t
232789Sahrens vdev_raidz_asize(vdev_t *vd, uint64_t psize)
233789Sahrens {
234789Sahrens 	uint64_t asize;
235789Sahrens 	uint64_t cols = vd->vdev_children;
236789Sahrens 
237789Sahrens 	asize = psize >> vd->vdev_ashift;
238789Sahrens 	asize += (asize + cols - 2) / (cols - 1);
239789Sahrens 	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;
240789Sahrens 
241789Sahrens 	return (asize);
242789Sahrens }
243789Sahrens 
244789Sahrens static void
245789Sahrens vdev_raidz_child_done(zio_t *zio)
246789Sahrens {
247789Sahrens 	raidz_col_t *rc = zio->io_private;
248789Sahrens 
249789Sahrens 	rc->rc_error = zio->io_error;
250789Sahrens 	rc->rc_tried = 1;
251789Sahrens 	rc->rc_skipped = 0;
252789Sahrens }
253789Sahrens 
254789Sahrens static void
255789Sahrens vdev_raidz_repair_done(zio_t *zio)
256789Sahrens {
257789Sahrens 	zio_buf_free(zio->io_data, zio->io_size);
258789Sahrens }
259789Sahrens 
260789Sahrens static void
261789Sahrens vdev_raidz_io_start(zio_t *zio)
262789Sahrens {
263789Sahrens 	vdev_t *vd = zio->io_vd;
264789Sahrens 	vdev_t *cvd;
265789Sahrens 	blkptr_t *bp = zio->io_bp;
266789Sahrens 	raidz_map_t *rm;
267789Sahrens 	raidz_col_t *rc;
268789Sahrens 	int c;
269789Sahrens 
2701133Seschrock 	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children);
271789Sahrens 
272789Sahrens 	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
273789Sahrens 		ASSERT3U(rm->rm_asize, ==,
274789Sahrens 		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
275789Sahrens 		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
276789Sahrens 	} else {
277789Sahrens 		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
278789Sahrens 		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
279789Sahrens 	}
280789Sahrens 
281789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
282789Sahrens 
283789Sahrens 		/*
284789Sahrens 		 * Generate RAID parity in virtual column 0.
285789Sahrens 		 */
286789Sahrens 		vdev_raidz_reconstruct(rm, 0);
287789Sahrens 
288789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
289789Sahrens 			rc = &rm->rm_col[c];
290789Sahrens 			cvd = vd->vdev_child[rc->rc_col];
291789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
292789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
293789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
294789Sahrens 			    vdev_raidz_child_done, rc));
295789Sahrens 		}
296789Sahrens 		zio_wait_children_done(zio);
297789Sahrens 		return;
298789Sahrens 	}
299789Sahrens 
300789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
301789Sahrens 
302789Sahrens 	for (c = rm->rm_cols - 1; c >= 0; c--) {
303789Sahrens 		rc = &rm->rm_col[c];
304789Sahrens 		cvd = vd->vdev_child[rc->rc_col];
305789Sahrens 		if (vdev_is_dead(cvd)) {
306789Sahrens 			rm->rm_missing_child = c;
307789Sahrens 			rc->rc_error = ENXIO;
308789Sahrens 			rc->rc_tried = 1;	/* don't even try */
309789Sahrens 			rc->rc_skipped = 1;
310789Sahrens 			continue;
311789Sahrens 		}
312789Sahrens 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
313789Sahrens 			rm->rm_missing_child = c;
314789Sahrens 			rc->rc_error = ESTALE;
315789Sahrens 			rc->rc_skipped = 1;
316789Sahrens 			continue;
317789Sahrens 		}
318789Sahrens 		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
319789Sahrens 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
320789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
321789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
322789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
323789Sahrens 			    vdev_raidz_child_done, rc));
324789Sahrens 		}
325789Sahrens 	}
326789Sahrens 
327789Sahrens 	zio_wait_children_done(zio);
328789Sahrens }
329789Sahrens 
330*1544Seschrock /*
331*1544Seschrock  * Report a checksum error for a child of a RAID-Z device.
332*1544Seschrock  */
333*1544Seschrock static void
334*1544Seschrock raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
335*1544Seschrock {
336*1544Seschrock 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col];
337*1544Seschrock 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
338*1544Seschrock 	    vdev_description(vd));
339*1544Seschrock 
340*1544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
341*1544Seschrock 		mutex_enter(&vd->vdev_stat_lock);
342*1544Seschrock 		vd->vdev_stat.vs_checksum_errors++;
343*1544Seschrock 		mutex_exit(&vd->vdev_stat_lock);
344*1544Seschrock 	}
345*1544Seschrock 
346*1544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
347*1544Seschrock 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
348*1544Seschrock 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
349*1544Seschrock }
350*1544Seschrock 
351*1544Seschrock 
352789Sahrens static void
353789Sahrens vdev_raidz_io_done(zio_t *zio)
354789Sahrens {
355789Sahrens 	vdev_t *vd = zio->io_vd;
356789Sahrens 	vdev_t *cvd;
357789Sahrens 	raidz_map_t *rm = zio->io_vsd;
358789Sahrens 	raidz_col_t *rc;
359789Sahrens 	blkptr_t *bp = zio->io_bp;
360789Sahrens 	int unexpected_errors = 0;
361789Sahrens 	int c;
362789Sahrens 
363789Sahrens 	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
364789Sahrens 
365789Sahrens 	zio->io_error = 0;
366789Sahrens 	zio->io_numerrors = 0;
367789Sahrens 
368789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
369789Sahrens 		rc = &rm->rm_col[c];
370789Sahrens 
371789Sahrens 		/*
372789Sahrens 		 * We preserve any EIOs because those may be worth retrying;
373789Sahrens 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
374789Sahrens 		 */
375789Sahrens 		if (rc->rc_error) {
376789Sahrens 			if (zio->io_error != EIO)
377789Sahrens 				zio->io_error = rc->rc_error;
378789Sahrens 			if (!rc->rc_skipped)
379789Sahrens 				unexpected_errors++;
380789Sahrens 			zio->io_numerrors++;
381789Sahrens 		}
382789Sahrens 	}
383789Sahrens 
384789Sahrens 	if (zio->io_type == ZIO_TYPE_WRITE) {
385789Sahrens 		/*
386789Sahrens 		 * If this is not a failfast write, and we were able to
387789Sahrens 		 * write enough columns to reconstruct the data, good enough.
388789Sahrens 		 */
389789Sahrens 		/* XXPOLICY */
390789Sahrens 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
391789Sahrens 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
392789Sahrens 			zio->io_error = 0;
393789Sahrens 
394789Sahrens 		vdev_raidz_map_free(zio);
395789Sahrens 		zio_next_stage(zio);
396789Sahrens 		return;
397789Sahrens 	}
398789Sahrens 
399789Sahrens 	ASSERT(zio->io_type == ZIO_TYPE_READ);
400789Sahrens 
401789Sahrens 	/*
402789Sahrens 	 * If there were no I/O errors, and the data checksums correctly,
403789Sahrens 	 * the read is complete.
404789Sahrens 	 */
405789Sahrens 	/* XXPOLICY */
406789Sahrens 	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
407789Sahrens 		ASSERT(unexpected_errors == 0);
408789Sahrens 		ASSERT(zio->io_error == 0);
409789Sahrens 
410789Sahrens 		/*
411789Sahrens 		 * We know the data's good.  If we read the parity,
412789Sahrens 		 * verify that it's good as well.  If not, fix it.
413789Sahrens 		 */
414789Sahrens 		for (c = 0; c < rm->rm_firstdatacol; c++) {
415789Sahrens 			void *orig;
416789Sahrens 			rc = &rm->rm_col[c];
417789Sahrens 			if (!rc->rc_tried)
418789Sahrens 				continue;
419789Sahrens 			orig = zio_buf_alloc(rc->rc_size);
420789Sahrens 			bcopy(rc->rc_data, orig, rc->rc_size);
421789Sahrens 			vdev_raidz_reconstruct(rm, c);
422789Sahrens 			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
423*1544Seschrock 				raidz_checksum_error(zio, rc);
424789Sahrens 				rc->rc_error = ECKSUM;
425789Sahrens 				unexpected_errors++;
426789Sahrens 			}
427789Sahrens 			zio_buf_free(orig, rc->rc_size);
428789Sahrens 		}
429789Sahrens 		goto done;
430789Sahrens 	}
431789Sahrens 
432789Sahrens 	/*
433789Sahrens 	 * If there was exactly one I/O error, it's the one we expected,
434789Sahrens 	 * and the reconstructed data checksums, the read is complete.
435789Sahrens 	 * This happens when one child is offline and vdev_fault_assess()
436789Sahrens 	 * knows it, or when one child has stale data and the DTL knows it.
437789Sahrens 	 */
438789Sahrens 	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
439789Sahrens 		rc = &rm->rm_col[c];
440789Sahrens 		ASSERT(unexpected_errors == 0);
441789Sahrens 		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
442789Sahrens 		vdev_raidz_reconstruct(rm, c);
443789Sahrens 		if (zio_checksum_error(zio) == 0) {
444789Sahrens 			zio->io_error = 0;
445789Sahrens 			goto done;
446789Sahrens 		}
447789Sahrens 	}
448789Sahrens 
449789Sahrens 	/*
450789Sahrens 	 * This isn't a typical error -- either we got a read error or
451789Sahrens 	 * more than one child claimed a problem.  Read every block we
452789Sahrens 	 * haven't already so we can try combinatorial reconstruction.
453789Sahrens 	 */
454789Sahrens 	unexpected_errors = 1;
455789Sahrens 	rm->rm_missing_child = -1;
456789Sahrens 
457789Sahrens 	for (c = 0; c < rm->rm_cols; c++)
458789Sahrens 		if (!rm->rm_col[c].rc_tried)
459789Sahrens 			break;
460789Sahrens 
461789Sahrens 	if (c != rm->rm_cols) {
462789Sahrens 		zio->io_error = 0;
463789Sahrens 		zio_vdev_io_redone(zio);
464789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
465789Sahrens 			rc = &rm->rm_col[c];
466789Sahrens 			if (rc->rc_tried)
467789Sahrens 				continue;
468789Sahrens 			zio_nowait(zio_vdev_child_io(zio, NULL,
469789Sahrens 			    vd->vdev_child[rc->rc_col],
470789Sahrens 			    rc->rc_offset, rc->rc_data, rc->rc_size,
471789Sahrens 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
472789Sahrens 			    vdev_raidz_child_done, rc));
473789Sahrens 		}
474789Sahrens 		zio_wait_children_done(zio);
475789Sahrens 		return;
476789Sahrens 	}
477789Sahrens 
478789Sahrens 	/*
479789Sahrens 	 * If there were more errors than parity disks, give up.
480789Sahrens 	 */
481789Sahrens 	if (zio->io_numerrors > rm->rm_firstdatacol) {
482789Sahrens 		ASSERT(zio->io_error != 0);
483789Sahrens 		goto done;
484789Sahrens 	}
485789Sahrens 
486789Sahrens 	/*
487789Sahrens 	 * The number of I/O errors is correctable.  Correct them here.
488789Sahrens 	 */
489789Sahrens 	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
490789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
491789Sahrens 		rc = &rm->rm_col[c];
492789Sahrens 		ASSERT(rc->rc_tried);
493789Sahrens 		if (rc->rc_error) {
494789Sahrens 			vdev_raidz_reconstruct(rm, c);
495789Sahrens 			if (zio_checksum_error(zio) == 0)
496789Sahrens 				zio->io_error = 0;
497789Sahrens 			else
498789Sahrens 				zio->io_error = rc->rc_error;
499789Sahrens 			goto done;
500789Sahrens 		}
501789Sahrens 	}
502789Sahrens 
503789Sahrens 	/*
504789Sahrens 	 * There were no I/O errors, but the data doesn't checksum.
505789Sahrens 	 * Try all permutations to see if we can find one that does.
506789Sahrens 	 */
507789Sahrens 	ASSERT(zio->io_numerrors == 0);
508789Sahrens 	for (c = 0; c < rm->rm_cols; c++) {
509789Sahrens 		void *orig;
510789Sahrens 		rc = &rm->rm_col[c];
511789Sahrens 
512789Sahrens 		orig = zio_buf_alloc(rc->rc_size);
513789Sahrens 		bcopy(rc->rc_data, orig, rc->rc_size);
514789Sahrens 		vdev_raidz_reconstruct(rm, c);
515789Sahrens 
516789Sahrens 		if (zio_checksum_error(zio) == 0) {
517789Sahrens 			zio_buf_free(orig, rc->rc_size);
518789Sahrens 			zio->io_error = 0;
519789Sahrens 			/*
520789Sahrens 			 * If this child didn't know that it returned bad data,
521789Sahrens 			 * inform it.
522789Sahrens 			 */
523789Sahrens 			if (rc->rc_tried && rc->rc_error == 0)
524*1544Seschrock 				raidz_checksum_error(zio, rc);
525789Sahrens 			rc->rc_error = ECKSUM;
526789Sahrens 			goto done;
527789Sahrens 		}
528789Sahrens 
529789Sahrens 		bcopy(orig, rc->rc_data, rc->rc_size);
530789Sahrens 		zio_buf_free(orig, rc->rc_size);
531789Sahrens 	}
532789Sahrens 
533789Sahrens 	/*
534*1544Seschrock 	 * All combinations failed to checksum.  Generate checksum ereports for
535*1544Seschrock 	 * every one.
536789Sahrens 	 */
537789Sahrens 	zio->io_error = ECKSUM;
538*1544Seschrock 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
539*1544Seschrock 		for (c = 0; c < rm->rm_cols; c++) {
540*1544Seschrock 			rc = &rm->rm_col[c];
541*1544Seschrock 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
542*1544Seschrock 			    zio->io_spa, vd->vdev_child[rc->rc_col], zio,
543*1544Seschrock 			    rc->rc_offset, rc->rc_size);
544*1544Seschrock 		}
545*1544Seschrock 	}
546789Sahrens 
547789Sahrens done:
548789Sahrens 	zio_checksum_verified(zio);
549789Sahrens 
550789Sahrens 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
551789Sahrens 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
552789Sahrens 		/*
553789Sahrens 		 * Use the good data we have in hand to repair damaged children.
554789Sahrens 		 */
555789Sahrens 		for (c = 0; c < rm->rm_cols; c++) {
556789Sahrens 			rc = &rm->rm_col[c];
557789Sahrens 			cvd = vd->vdev_child[rc->rc_col];
558789Sahrens 
559789Sahrens 			if (rc->rc_error) {
560789Sahrens 				/*
561789Sahrens 				 * Make a copy of the data because we're
562789Sahrens 				 * going to free the RAID-Z map below.
563789Sahrens 				 */
564789Sahrens 				void *data = zio_buf_alloc(rc->rc_size);
565789Sahrens 				bcopy(rc->rc_data, data, rc->rc_size);
566789Sahrens 
567789Sahrens 				dprintf("%s resilvered %s @ 0x%llx error %d\n",
568789Sahrens 				    vdev_description(vd),
569789Sahrens 				    vdev_description(cvd),
570789Sahrens 				    zio->io_offset, rc->rc_error);
571789Sahrens 
572789Sahrens 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
573789Sahrens 				    rc->rc_offset, data, rc->rc_size,
574789Sahrens 				    ZIO_TYPE_WRITE, zio->io_priority,
575789Sahrens 				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
576789Sahrens 				    ZIO_FLAG_DONT_PROPAGATE,
577789Sahrens 				    vdev_raidz_repair_done, NULL));
578789Sahrens 			}
579789Sahrens 		}
580789Sahrens 	}
581789Sahrens 
582789Sahrens 	vdev_raidz_map_free(zio);
583789Sahrens 	zio_next_stage(zio);
584789Sahrens }
585789Sahrens 
586789Sahrens static void
587789Sahrens vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
588789Sahrens {
589789Sahrens 	if (faulted > 1)
590*1544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
591*1544Seschrock 		    VDEV_AUX_NO_REPLICAS);
592789Sahrens 	else if (degraded + faulted != 0)
593*1544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
594789Sahrens 	else
595*1544Seschrock 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
596789Sahrens }
597789Sahrens 
598789Sahrens vdev_ops_t vdev_raidz_ops = {
599789Sahrens 	vdev_raidz_open,
600789Sahrens 	vdev_raidz_close,
601789Sahrens 	vdev_raidz_asize,
602789Sahrens 	vdev_raidz_io_start,
603789Sahrens 	vdev_raidz_io_done,
604789Sahrens 	vdev_raidz_state_change,
605789Sahrens 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
606789Sahrens 	B_FALSE			/* not a leaf vdev */
607789Sahrens };
608