xref: /netbsd-src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c (revision d4edddf34ac1d753afcf91453a2f2da66a6ea281)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #ifndef __FreeBSD__
33 #include <sys/vdev_disk.h>
34 #endif
35 #include <sys/vdev_file.h>
36 #include <sys/vdev_raidz.h>
37 #include <sys/zio.h>
38 #include <sys/zio_checksum.h>
39 #include <sys/fs/zfs.h>
40 #include <sys/fm/fs/zfs.h>
41 #ifdef __FreeBSD__
42 #include <sys/bio.h>
43 #endif
44 
45 /*
46  * Virtual device vector for RAID-Z.
47  *
48  * This vdev supports single, double, and triple parity. For single parity,
49  * we use a simple XOR of all the data columns. For double or triple parity,
50  * we use a special case of Reed-Solomon coding. This extends the
51  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
52  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
53  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
54  * former is also based. The latter is designed to provide higher performance
55  * for writes.
56  *
57  * Note that the Plank paper claimed to support arbitrary N+M, but was then
58  * amended six years later identifying a critical flaw that invalidates its
59  * claims. Nevertheless, the technique can be adapted to work for up to
60  * triple parity. For additional parity, the amendment "Note: Correction to
61  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
62  * is viable, but the additional complexity means that write performance will
63  * suffer.
64  *
65  * All of the methods above operate on a Galois field, defined over the
66  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
67  * can be expressed with a single byte. Briefly, the operations on the
68  * field are defined as follows:
69  *
70  *   o addition (+) is represented by a bitwise XOR
71  *   o subtraction (-) is therefore identical to addition: A + B = A - B
72  *   o multiplication of A by 2 is defined by the following bitwise expression:
73  *
74  *	(A * 2)_7 = A_6
75  *	(A * 2)_6 = A_5
76  *	(A * 2)_5 = A_4
77  *	(A * 2)_4 = A_3 + A_7
78  *	(A * 2)_3 = A_2 + A_7
79  *	(A * 2)_2 = A_1 + A_7
80  *	(A * 2)_1 = A_0
81  *	(A * 2)_0 = A_7
82  *
83  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
84  * As an aside, this multiplication is derived from the error correcting
85  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86  *
87  * Observe that any number in the field (except for 0) can be expressed as a
88  * power of 2 -- a generator for the field. We store a table of the powers of
89  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
90  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
91  * than field addition). The inverse of a field element A (A^-1) is therefore
92  * A ^ (255 - 1) = A^254.
93  *
94  * The up-to-three parity columns, P, Q, R over several data columns,
95  * D_0, ... D_n-1, can be expressed by field operations:
96  *
97  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
98  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
99  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
100  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
101  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102  *
103  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
104  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
105  * independent coefficients. (There are no additional coefficients that have
106  * this property which is why the uncorrected Plank method breaks down.)
107  *
108  * See the reconstruction code below for how P, Q and R can used individually
109  * or in concert to recover missing data columns.
110  */
111 
/*
 * Per-column state for one RAID-Z row: each column maps to one I/O
 * against a single child vdev.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
122 
/*
 * Describes how one logical I/O is spread across the columns (child
 * vdevs) of a RAID-Z group; built by vdev_raidz_map_alloc().  The
 * trailing rm_col[] array is sized at allocation time via offsetof(),
 * so it must remain the last member.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
139 
/* Indexes of the parity columns; parity always precedes the data columns. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element by 2 (or 4) in GF(2^8) defined by the
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (see the block comment
 * at the top of this file).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
146 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The macros expand to multiple statements, so they are wrapped in
 * do { } while (0) to form a single statement that composes safely with
 * unbraced if/else bodies (the bare-brace form would not).
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* mask gets 0xff in each byte whose top bit was set, else 0x00 */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* shift each byte left (dropping the carried-out top bits), then \
	 * reduce the overflowed bytes with the polynomial 0x1d */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
166 
/*
 * Translate an offset within the usable vdev region to an absolute
 * device offset by skipping over the front labels.
 */
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;
173 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] == 2^i.
 * The multiplicative group has order 255, so the table wraps around --
 * the final entry (2^255) equals 2^0 == 0x01.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above: vdev_raidz_log2[a] is the
 * exponent e with 2^e == a.  Entry 0 is a placeholder -- zero has no
 * logarithm, and callers (see vdev_raidz_exp2()) handle a == 0 specially.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
244 
245 static void vdev_raidz_generate_parity(raidz_map_t *rm);
246 
247 /*
248  * Multiply a given number by 2 raised to the given power.
249  */
250 static uint8_t
vdev_raidz_exp2(uint_t a,int exp)251 vdev_raidz_exp2(uint_t a, int exp)
252 {
253 	if (a == 0)
254 		return (0);
255 
256 	ASSERT(exp >= 0);
257 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258 
259 	exp += vdev_raidz_log2[a];
260 	if (exp > 255)
261 		exp -= 255;
262 
263 	return (vdev_raidz_pow2[exp]);
264 }
265 
266 static void
vdev_raidz_map_free(raidz_map_t * rm)267 vdev_raidz_map_free(raidz_map_t *rm)
268 {
269 	int c;
270 	size_t size;
271 
272 	for (c = 0; c < rm->rm_firstdatacol; c++) {
273 		if (rm->rm_col[c].rc_data != NULL)
274 			zio_buf_free(rm->rm_col[c].rc_data,
275 			    rm->rm_col[c].rc_size);
276 
277 		if (rm->rm_col[c].rc_gdata != NULL)
278 			zio_buf_free(rm->rm_col[c].rc_gdata,
279 			    rm->rm_col[c].rc_size);
280 	}
281 
282 	size = 0;
283 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
284 		size += rm->rm_col[c].rc_size;
285 
286 	if (rm->rm_datacopy != NULL)
287 		zio_buf_free(rm->rm_datacopy, size);
288 
289 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290 }
291 
/*
 * zio_vsd_ops_t free callback, invoked when the owning ZIO is done with
 * the map.  The map may still be referenced by outstanding checksum
 * reports (rm_reports); in that case only mark it freed and let the
 * last report reference release it (see vdev_raidz_cksum_free()).
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
303 
/*ARGSUSED*/
/*
 * Checksum-report free callback: drop one report reference on the map.
 * If the owning ZIO has already let go (rm_freed set), the last report
 * reference is responsible for actually freeing the map.
 */
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
315 
/*
 * Checksum-report finish callback.  Compare the data we actually read
 * for column c (the "bad" copy) against the known-good block contents
 * supplied by the caller, so the ereport can carry both versions.  For
 * a parity column the expected contents must first be regenerated from
 * good_data; for a data column we simply index into good_data.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column this report refers to */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* No good copy available: finish the report without a comparison. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/*
			 * Point the data columns back into rm_datacopy,
			 * where vdev_raidz_cksum_report() left them.
			 */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
388 
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column being reported */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* An earlier report already copied the data aside; nothing to do. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/* Copy each data column and repoint it into the aside buffer. */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
441 
/*
 * VSD ops attached to RAID-Z ZIOs.  Initializers are positional and
 * must match the member order of the zio_vsd_ops_t declaration
 * (free callback first, then the checksum-report callback).
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
446 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Parameters:
 *	data		caller's I/O buffer (unused when dofree is set)
 *	size		logical I/O size in bytes
 *	offset		byte offset of the I/O within the RAID-Z vdev
 *	dofree		when set, leave all column data pointers NULL
 *			(presumably a free/TRIM-style op that moves no
 *			data -- confirm against callers)
 *	unit_shift	log2 of the vdev's minimum sector size
 *	dcols		number of columns (child vdevs) to spread across
 *	nparity		number of parity columns (1-3)
 */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Assign each column its child device, offset and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around to the next row on this child. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	if (!dofree) {
		/* Parity columns get buffers of their own... */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			rm->rm_col[c].rc_data =
			    zio_buf_alloc(rm->rm_col[c].rc_size);
		}

		/*
		 * ... while the data columns carve up the caller's buffer
		 * (here c == rm_firstdatacol, the first data column).
		 */
		rm->rm_col[c].rc_data = data;

		for (c = c + 1; c < acols; c++) {
			rm->rm_col[c].rc_data =
			    (char *)rm->rm_col[c - 1].rc_data +
			    rm->rm_col[c - 1].rc_size;
		}
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
599 
600 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)601 vdev_raidz_generate_parity_p(raidz_map_t *rm)
602 {
603 	uint64_t *p, *src, pcount, ccount, i;
604 	int c;
605 
606 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607 
608 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
609 		src = rm->rm_col[c].rc_data;
610 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
611 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
612 
613 		if (c == rm->rm_firstdatacol) {
614 			ASSERT(ccount == pcount);
615 			for (i = 0; i < ccount; i++, src++, p++) {
616 				*p = *src;
617 			}
618 		} else {
619 			ASSERT(ccount <= pcount);
620 			for (i = 0; i < ccount; i++, src++, p++) {
621 				*p ^= *src;
622 			}
623 		}
624 	}
625 }
626 
/*
 * Compute the P (XOR) and Q (generator 2) parity columns in one pass.
 * Q is evaluated by Horner's scheme: before each data column is added
 * (XORed) in, the running Q is multiplied by 2 in GF(2^8), eight field
 * elements at a time via VDEV_RAIDZ_64MUL_2.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* Initialize P and Q from the first column. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
678 
/*
 * Compute the P (XOR), Q (generator 2) and R (generator 4) parity
 * columns in one pass.  Q and R are evaluated by Horner's scheme: the
 * running value is multiplied by its generator (2 or 4) before each
 * data column is XORed in, eight field elements per 64-bit word.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* Initialize P, Q and R from the first column. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
739 
740 /*
741  * Generate RAID parity in the first virtual columns according to the number of
742  * parity columns available.
743  */
744 static void
vdev_raidz_generate_parity(raidz_map_t * rm)745 vdev_raidz_generate_parity(raidz_map_t *rm)
746 {
747 	switch (rm->rm_firstdatacol) {
748 	case 1:
749 		vdev_raidz_generate_parity_p(rm);
750 		break;
751 	case 2:
752 		vdev_raidz_generate_parity_pq(rm);
753 		break;
754 	case 3:
755 		vdev_raidz_generate_parity_pqr(rm);
756 		break;
757 	default:
758 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
759 	}
760 }
761 
762 static int
vdev_raidz_reconstruct_p(raidz_map_t * rm,int * tgts,int ntgts)763 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
764 {
765 	uint64_t *dst, *src, xcount, ccount, count, i;
766 	int x = tgts[0];
767 	int c;
768 
769 	ASSERT(ntgts == 1);
770 	ASSERT(x >= rm->rm_firstdatacol);
771 	ASSERT(x < rm->rm_cols);
772 
773 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
774 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
775 	ASSERT(xcount > 0);
776 
777 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
778 	dst = rm->rm_col[x].rc_data;
779 	for (i = 0; i < xcount; i++, dst++, src++) {
780 		*dst = *src;
781 	}
782 
783 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
784 		src = rm->rm_col[c].rc_data;
785 		dst = rm->rm_col[x].rc_data;
786 
787 		if (c == x)
788 			continue;
789 
790 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
791 		count = MIN(ccount, xcount);
792 
793 		for (i = 0; i < count; i++, dst++, src++) {
794 			*dst ^= *src;
795 		}
796 	}
797 
798 	return (1 << VDEV_RAIDZ_P);
799 }
800 
/*
 * Reconstruct a single missing data column x = tgts[0] using only the
 * Q parity.  First recompute a partial Q over the surviving data
 * columns (the missing column contributes as zeros) in the target
 * buffer; adding the stored Q then leaves 2^(ndevs - 1 - x) * D_x,
 * which is divided out element by element.  Returns the parity mask.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Horner's scheme, as in vdev_raidz_generate_parity_pq(). */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;	/* treat the target column as zeros */
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * Add the stored Q, then multiply each byte by the inverse of
	 * 2^(ndevs - 1 - x), i.e. 2^(255 - (ndevs - 1 - x)).
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
858 
/*
 * Reconstruct two missing data columns x < y (tgts[0], tgts[1]) using
 * both the P and Q parity.  We compute Pxy/Qxy -- the parity as though
 * columns x and y were zero -- and solve the resulting 2x2 linear
 * system in GF(2^8) (see derivation below).  Returns the parity mask.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* a = 2^(x - y) (mod 255 exponent); b = 2^(ndevs - 1 - x). */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	/* tmp = log of (2^(x - y) + 1)^-1; a ^ 1 is the field sum a + 1. */
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be a shorter column than x. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
949 
950 /* BEGIN CSTYLED */
951 /*
952  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
954  * the contents of the data and parity disks. This can be expressed with
955  * vectors for the original data (D) and the actual data (d) and parity (p)
956  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
957  *
958  *            __   __                     __     __
959  *            |     |         __     __   |  p_0  |
960  *            |  V  |         |  D_0  |   | p_m-1 |
961  *            |     |    x    |   :   | = |  d_0  |
962  *            |  I  |         | D_n-1 |   |   :   |
963  *            |     |         ~~     ~~   | d_n-1 |
964  *            ~~   ~~                     ~~     ~~
965  *
966  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
968  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
969  * computation as well as linear separability.
970  *
971  *      __               __               __     __
972  *      |   1   ..  1 1 1 |               |  p_0  |
973  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
974  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
975  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
976  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
977  *      |   :       : : : |   |   :   |   |  d_2  |
978  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
979  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
980  *      |   0   ..  0 0 1 |               | d_n-1 |
981  *      ~~               ~~               ~~     ~~
982  *
983  * Note that I, V, d, and p are known. To compute D, we must invert the
984  * matrix and use the known data and parity values to reconstruct the unknown
985  * data values. We begin by removing the rows in V|I and d|p that correspond
986  * to failed or missing columns; we then make V|I square (n x n) and d|p
987  * sized n by removing rows corresponding to unused parity from the bottom up
988  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
989  * using Gauss-Jordan elimination. In the example below we use m=3 parity
990  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
991  *           __                               __
992  *           |  1   1   1   1   1   1   1   1  |
993  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
994  *           |  19 205 116  29  64  16  4   1  |      / /
995  *           |  1   0   0   0   0   0   0   0  |     / /
996  *           |  0   1   0   0   0   0   0   0  | <--' /
997  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
998  *           |  0   0   0   1   0   0   0   0  |
999  *           |  0   0   0   0   1   0   0   0  |
1000  *           |  0   0   0   0   0   1   0   0  |
1001  *           |  0   0   0   0   0   0   1   0  |
1002  *           |  0   0   0   0   0   0   0   1  |
1003  *           ~~                               ~~
1004  *           __                               __
1005  *           |  1   1   1   1   1   1   1   1  |
1006  *           |  19 205 116  29  64  16  4   1  |
1007  *           |  1   0   0   0   0   0   0   0  |
1008  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1009  *           |  0   0   0   0   1   0   0   0  |
1010  *           |  0   0   0   0   0   1   0   0  |
1011  *           |  0   0   0   0   0   0   1   0  |
1012  *           |  0   0   0   0   0   0   0   1  |
1013  *           ~~                               ~~
1014  *
1015  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1016  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1017  * matrix is not singular.
1018  * __                                                                 __
1019  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1020  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1021  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1022  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1023  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1024  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1025  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1026  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1027  * ~~                                                                 ~~
1028  * __                                                                 __
1029  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1030  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1031  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1032  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1033  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1034  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1035  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1036  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1037  * ~~                                                                 ~~
1038  * __                                                                 __
1039  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1040  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1041  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1042  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1043  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1044  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1045  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1046  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1047  * ~~                                                                 ~~
1048  * __                                                                 __
1049  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1050  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1051  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1052  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1053  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1054  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1055  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1056  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1057  * ~~                                                                 ~~
1058  * __                                                                 __
1059  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1060  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1061  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1062  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1063  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1064  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1065  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1066  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1067  * ~~                                                                 ~~
1068  * __                                                                 __
1069  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1070  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1071  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1072  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1073  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1074  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1075  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1076  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1077  * ~~                                                                 ~~
1078  *                   __                               __
1079  *                   |  0   0   1   0   0   0   0   0  |
1080  *                   | 167 100  5   41 159 169 217 208 |
1081  *                   | 166 100  4   40 158 168 216 209 |
1082  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1083  *                   |  0   0   0   0   1   0   0   0  |
1084  *                   |  0   0   0   0   0   1   0   0  |
1085  *                   |  0   0   0   0   0   0   1   0  |
1086  *                   |  0   0   0   0   0   0   0   1  |
1087  *                   ~~                               ~~
1088  *
1089  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1090  * of the missing data.
1091  *
1092  * As is apparent from the example above, the only non-trivial rows in the
1093  * inverse matrix correspond to the data disks that we're trying to
1094  * reconstruct. Indeed, those are the only rows we need as the others would
1095  * only be useful for reconstructing data known or assumed to be valid. For
1096  * that reason, we only build the coefficients in the rows that correspond to
1097  * targeted columns.
1098  */
1099 /* END CSTYLED */
1100 
1101 static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)1102 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1103     uint8_t **rows)
1104 {
1105 	int i, j;
1106 	int pow;
1107 
1108 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1109 
1110 	/*
1111 	 * Fill in the missing rows of interest.
1112 	 */
1113 	for (i = 0; i < nmap; i++) {
1114 		ASSERT3S(0, <=, map[i]);
1115 		ASSERT3S(map[i], <=, 2);
1116 
1117 		pow = map[i] * n;
1118 		if (pow > 255)
1119 			pow -= 255;
1120 		ASSERT(pow <= 255);
1121 
1122 		for (j = 0; j < n; j++) {
1123 			pow -= map[i];
1124 			if (pow < 0)
1125 				pow += 255;
1126 			rows[i][j] = vdev_raidz_pow2[pow];
1127 		}
1128 	}
1129 }
1130 
/*
 * Invert (in place, via Gauss-Jordan elimination) the submatrix of (V|I)'
 * whose rows correspond to the missing data columns.  On entry, rows[]
 * holds the coefficients produced by vdev_raidz_matrix_init(); on return,
 * invrows[] holds the rows of the inverse needed for reconstruction.
 * used[] lists the surviving columns backing each matrix column: the first
 * nmissing entries are parity columns, the remainder are data columns.
 * All arithmetic is in GF(2^8), where addition/subtraction is XOR.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 * Start from the identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.  The
	 * trivial (identity) rows correspond to surviving data columns, so
	 * this amounts to moving those coefficients into invrows and zeroing
	 * them in rows.  (In GF(2^8), subtraction is XOR, and x - x = 0.)
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		/* All pivot columns to our left must already be zero. */
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  The multiplicative
		 * inverse of x is x^(255 - log2(x)) since the group order of
		 * GF(2^8)* is 255.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this pivot column from every other row of
		 * interest, applying the same operation to the inverse.
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix (i.e. elimination completed cleanly).
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1224 
/*
 * Multiply the inverted matrix rows (invrows) by the surviving columns
 * (identified by used[]) to regenerate the contents of the missing data
 * columns (identified by missing[], relative to the first data column).
 * All arithmetic is in GF(2^8): multiplication is done by adding discrete
 * logs modulo 255, and accumulation across columns is XOR.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Precompute the log of every inverse-matrix coefficient so that the
	 * per-byte multiply in the hot loop below is a table lookup plus an
	 * add modulo 255 rather than a full GF multiply.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	/* Carve the single allocation into one log row per missing column. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Accumulate each surviving column's contribution into the columns
	 * being rebuilt.
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		/* Locate the destination buffers (the missing columns). */
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log2 is only defined for non-zero source bytes. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Trailing columns may be shorter. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes the
				 * destination; subsequent ones XOR into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1300 
1301 static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1302 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1303 {
1304 	int n, i, c, t, tt;
1305 	int nmissing_rows;
1306 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1307 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1308 
1309 	uint8_t *p, *pp;
1310 	size_t psize;
1311 
1312 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1313 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1314 	uint8_t *used;
1315 
1316 	int code = 0;
1317 
1318 
1319 	n = rm->rm_cols - rm->rm_firstdatacol;
1320 
1321 	/*
1322 	 * Figure out which data columns are missing.
1323 	 */
1324 	nmissing_rows = 0;
1325 	for (t = 0; t < ntgts; t++) {
1326 		if (tgts[t] >= rm->rm_firstdatacol) {
1327 			missing_rows[nmissing_rows++] =
1328 			    tgts[t] - rm->rm_firstdatacol;
1329 		}
1330 	}
1331 
1332 	/*
1333 	 * Figure out which parity columns to use to help generate the missing
1334 	 * data columns.
1335 	 */
1336 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1337 		ASSERT(tt < ntgts);
1338 		ASSERT(c < rm->rm_firstdatacol);
1339 
1340 		/*
1341 		 * Skip any targeted parity columns.
1342 		 */
1343 		if (c == tgts[tt]) {
1344 			tt++;
1345 			continue;
1346 		}
1347 
1348 		code |= 1 << c;
1349 
1350 		parity_map[i] = c;
1351 		i++;
1352 	}
1353 
1354 	ASSERT(code != 0);
1355 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1356 
1357 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1358 	    nmissing_rows * n + sizeof (used[0]) * n;
1359 	p = kmem_alloc(psize, KM_SLEEP);
1360 
1361 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1362 		rows[i] = pp;
1363 		pp += n;
1364 		invrows[i] = pp;
1365 		pp += n;
1366 	}
1367 	used = pp;
1368 
1369 	for (i = 0; i < nmissing_rows; i++) {
1370 		used[i] = parity_map[i];
1371 	}
1372 
1373 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1374 		if (tt < nmissing_rows &&
1375 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1376 			tt++;
1377 			continue;
1378 		}
1379 
1380 		ASSERT3S(i, <, n);
1381 		used[i] = c;
1382 		i++;
1383 	}
1384 
1385 	/*
1386 	 * Initialize the interesting rows of the matrix.
1387 	 */
1388 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1389 
1390 	/*
1391 	 * Invert the matrix.
1392 	 */
1393 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1394 	    invrows, used);
1395 
1396 	/*
1397 	 * Reconstruct the missing data using the generated matrix.
1398 	 */
1399 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1400 	    invrows, used);
1401 
1402 	kmem_free(p, psize);
1403 
1404 	return (code);
1405 }
1406 
1407 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1408 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1409 {
1410 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1411 	int ntgts;
1412 	int i, c;
1413 	int code;
1414 	int nbadparity, nbaddata;
1415 	int parity_valid[VDEV_RAIDZ_MAXPARITY] = {0};
1416 
1417 	/*
1418 	 * The tgts list must already be sorted.
1419 	 */
1420 	for (i = 1; i < nt; i++) {
1421 		ASSERT(t[i] > t[i - 1]);
1422 	}
1423 
1424 	nbadparity = rm->rm_firstdatacol;
1425 	nbaddata = rm->rm_cols - nbadparity;
1426 	ntgts = 0;
1427 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1428 		if (c < rm->rm_firstdatacol)
1429 			parity_valid[c] = B_FALSE;
1430 
1431 		if (i < nt && c == t[i]) {
1432 			tgts[ntgts++] = c;
1433 			i++;
1434 		} else if (rm->rm_col[c].rc_error != 0) {
1435 			tgts[ntgts++] = c;
1436 		} else if (c >= rm->rm_firstdatacol) {
1437 			nbaddata--;
1438 		} else {
1439 			parity_valid[c] = B_TRUE;
1440 			nbadparity--;
1441 		}
1442 	}
1443 
1444 	ASSERT(ntgts >= nt);
1445 	ASSERT(nbaddata >= 0);
1446 	ASSERT(nbaddata + nbadparity == ntgts);
1447 
1448 	dt = &tgts[nbadparity];
1449 
1450 	/*
1451 	 * See if we can use any of our optimized reconstruction routines.
1452 	 */
1453 	if (!vdev_raidz_default_to_general) {
1454 		switch (nbaddata) {
1455 		case 1:
1456 			if (parity_valid[VDEV_RAIDZ_P])
1457 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1458 
1459 			ASSERT(rm->rm_firstdatacol > 1);
1460 
1461 			if (parity_valid[VDEV_RAIDZ_Q])
1462 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1463 
1464 			ASSERT(rm->rm_firstdatacol > 2);
1465 			break;
1466 
1467 		case 2:
1468 			ASSERT(rm->rm_firstdatacol > 1);
1469 
1470 			if (parity_valid[VDEV_RAIDZ_P] &&
1471 			    parity_valid[VDEV_RAIDZ_Q])
1472 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1473 
1474 			ASSERT(rm->rm_firstdatacol > 2);
1475 
1476 			break;
1477 		}
1478 	}
1479 
1480 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1481 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1482 	ASSERT(code > 0);
1483 	return (code);
1484 }
1485 
1486 static int
vdev_raidz_open(vdev_t * vd,uint64_t * asize,uint64_t * max_asize,uint64_t * logical_ashift,uint64_t * physical_ashift)1487 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1488     uint64_t *logical_ashift, uint64_t *physical_ashift)
1489 {
1490 	vdev_t *cvd;
1491 	uint64_t nparity = vd->vdev_nparity;
1492 	int c;
1493 	int lasterror = 0;
1494 	int numerrors = 0;
1495 
1496 	ASSERT(nparity > 0);
1497 
1498 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1499 	    vd->vdev_children < nparity + 1) {
1500 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1501 		return (SET_ERROR(EINVAL));
1502 	}
1503 
1504 	vdev_open_children(vd);
1505 
1506 	for (c = 0; c < vd->vdev_children; c++) {
1507 		cvd = vd->vdev_child[c];
1508 
1509 		if (cvd->vdev_open_error != 0) {
1510 			lasterror = cvd->vdev_open_error;
1511 			numerrors++;
1512 			continue;
1513 		}
1514 
1515 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1516 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1517 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1518 		*physical_ashift = MAX(*physical_ashift,
1519 		    cvd->vdev_physical_ashift);
1520 	}
1521 
1522 	*asize *= vd->vdev_children;
1523 	*max_asize *= vd->vdev_children;
1524 
1525 	if (numerrors > nparity) {
1526 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1527 		return (lasterror);
1528 	}
1529 
1530 	return (0);
1531 }
1532 
/*
 * Close a RAID-Z vdev by closing each of its child vdevs.
 */
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
1541 
1542 #ifdef illumos
1543 /*
1544  * Handle a read or write I/O to a RAID-Z dump device.
1545  *
1546  * The dump device is in a unique situation compared to other ZFS datasets:
1547  * writing to this device should be as simple and fast as possible.  In
1548  * addition, durability matters much less since the dump will be extracted
1549  * once the machine reboots.  For that reason, this function eschews parity for
1550  * performance and simplicity.  The dump device uses the checksum setting
1551  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1552  * dataset.
1553  *
1554  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1555  * 128 KB will not fill an entire block; in addition, they may not be properly
1556  * aligned.  In that case, this function uses the preallocated 128 KB block and
1557  * omits reading or writing any "empty" portions of that block, as opposed to
1558  * allocating a fresh appropriately-sized block.
1559  *
1560  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1561  *
1562  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1563  *
1564  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1565  * allocated which spans all five child vdevs.  8 KB of data would be written to
1566  * each of four vdevs, with the fifth containing the parity bits.
1567  *
1568  *       parity    data     data     data     data
1569  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1570  *         ^        ^        ^        ^        ^
1571  *         |        |        |        |        |
1572  *   8 KB parity    ------8 KB data blocks------
1573  *
1574  * However, when writing to the dump device, the behavior is different:
1575  *
1576  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1577  *
1578  * Unlike the normal RAID-Z case in which the block is allocated based on the
1579  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1580  * I/O size is less than 128 KB, only the actual portions of data are written.
1581  * In this example the data is written to the third data vdev since that vdev
1582  * contains the offset [64 KB, 96 KB).
1583  *
1584  *       parity    data     data     data     data
1585  *     |        |        |        |   XX   |        |
1586  *                                    ^
1587  *                                    |
1588  *                             32 KB data block
1589  *
1590  * As a result, an individual I/O may not span all child vdevs; moreover, a
1591  * small I/O may only operate on a single child vdev.
1592  *
1593  * Note that since there are no parity bits calculated or written, this format
1594  * remains the same no matter how many parity bits are used in a normal RAID-Z
1595  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1596  * would look like:
1597  *
1598  *       parity   parity   parity    data     data     data     data
1599  *     |        |        |        |        |        |   XX   |        |
1600  *                                                      ^
1601  *                                                      |
1602  *                                               32 KB data block
1603  */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? BIO_READ : BIO_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block.  Note that this block starts
	 * from the "original" offset, this is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	/*
	 * Note that the loop increment reads rc->rc_size; this is safe
	 * because the increment only runs after the body has set rc on the
	 * first iteration.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		/* Clip the I/O to the portion that overlaps this column. */
		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)rc->rc_data) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
#endif	/* _KERNEL */

	return (err);
}
1688 #endif
1689 
1690 static uint64_t
vdev_raidz_asize(vdev_t * vd,uint64_t psize)1691 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1692 {
1693 	uint64_t asize;
1694 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1695 	uint64_t cols = vd->vdev_children;
1696 	uint64_t nparity = vd->vdev_nparity;
1697 
1698 	asize = ((psize - 1) >> ashift) + 1;
1699 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1700 	asize = roundup(asize, nparity + 1) << ashift;
1701 
1702 	return (asize);
1703 }
1704 
/*
 * Completion callback for child I/Os: record the child's error status in
 * its RAID-Z column so the parent zio can decide whether reconstruction
 * is necessary.
 */
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;	/* an I/O to this column has been attempted */
	rc->rc_skipped = 0;	/* it actually ran; it was not skipped */
}
1714 
1715 /*
1716  * Start an IO operation on a RAIDZ VDev
1717  *
1718  * Outline:
1719  * - For write operations:
1720  *   1. Generate the parity data
1721  *   2. Create child zio write operations to each column's vdev, for both
1722  *      data and parity.
1723  *   3. If the column skips any sectors for padding, create optional dummy
1724  *      write zio children for those areas to improve aggregation continuity.
1725  * - For read operations:
1726  *   1. Create child zio read operations to each data column's vdev to read
1727  *      the range of data required for zio.
1728  *   2. If this is a scrub or resilver operation, or if any of the data
1729  *      vdevs have had errors, then create zio read operations to the parity
1730  *      columns' VDevs as well.
1731  */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Lay out this I/O across the columns of the RAID-Z group. */
	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
	    zio->io_type == ZIO_TYPE_FREE,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	/* The map is freed via vdev_raidz_vsd_ops when the zio completes. */
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_FREE) {
		/* Forward the free to every column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		zio_execute(zio);
		return;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/* Compute parity before issuing any child writes. */
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  The skip index wraps around to
		 * column 0 when it reaches rm_scols.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Child is gone: mark the column without any I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Child is missing data for this txg. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read all data columns; read parity too if data is already
		 * known missing or if this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1838 
1839 
1840 /*
1841  * Report a checksum error for a child of a RAID-Z device.
1842  */
1843 static void
raidz_checksum_error(zio_t * zio,raidz_col_t * rc,void * bad_data)1844 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1845 {
1846 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1847 
1848 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1849 		zio_bad_cksum_t zbc;
1850 		raidz_map_t *rm = zio->io_vsd;
1851 
1852 		mutex_enter(&vd->vdev_stat_lock);
1853 		vd->vdev_stat.vs_checksum_errors++;
1854 		mutex_exit(&vd->vdev_stat_lock);
1855 
1856 		zbc.zbc_has_cksum = 0;
1857 		zbc.zbc_injected = rm->rm_ecksuminjected;
1858 
1859 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1860 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1861 		    &zbc);
1862 	}
1863 }
1864 
1865 /*
1866  * We keep track of whether or not there were any injected errors, so that
1867  * any ereports we generate can note it.
1868  */
1869 static int
raidz_checksum_verify(zio_t * zio)1870 raidz_checksum_verify(zio_t *zio)
1871 {
1872 	zio_bad_cksum_t zbc;
1873 	raidz_map_t *rm = zio->io_vsd;
1874 
1875 	int ret = zio_checksum_error(zio, &zbc);
1876 	if (ret != 0 && zbc.zbc_injected != 0)
1877 		rm->rm_ecksuminjected = 1;
1878 
1879 	return (ret);
1880 }
1881 
1882 /*
1883  * Generate the parity from the data columns. If we tried and were able to
1884  * read the parity without error, verify that the generated parity matches the
1885  * data we read. If it doesn't, we fire off a checksum error. Return the
1886  * number such failures.
1887  */
1888 static int
raidz_parity_verify(zio_t * zio,raidz_map_t * rm)1889 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1890 {
1891 	void *orig[VDEV_RAIDZ_MAXPARITY];
1892 	int c, ret = 0;
1893 	raidz_col_t *rc;
1894 
1895 	blkptr_t *bp = zio->io_bp;
1896 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1897 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1898 
1899 	if (checksum == ZIO_CHECKSUM_NOPARITY)
1900 		return (ret);
1901 
1902 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1903 		rc = &rm->rm_col[c];
1904 		if (!rc->rc_tried || rc->rc_error != 0)
1905 			continue;
1906 		orig[c] = zio_buf_alloc(rc->rc_size);
1907 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1908 	}
1909 
1910 	vdev_raidz_generate_parity(rm);
1911 
1912 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1913 		rc = &rm->rm_col[c];
1914 		if (!rc->rc_tried || rc->rc_error != 0)
1915 			continue;
1916 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1917 			raidz_checksum_error(zio, rc, orig[c]);
1918 			rc->rc_error = SET_ERROR(ECKSUM);
1919 			ret++;
1920 		}
1921 		zio_buf_free(orig[c], rc->rc_size);
1922 	}
1923 
1924 	return (ret);
1925 }
1926 
1927 /*
1928  * Keep statistics on all the ways that we used parity to correct data.
1929  */
/* Indexed by the code returned from vdev_raidz_reconstruct(). */
1930 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1931 
1932 static int
vdev_raidz_worst_error(raidz_map_t * rm)1933 vdev_raidz_worst_error(raidz_map_t *rm)
1934 {
1935 	int error = 0;
1936 
1937 	for (int c = 0; c < rm->rm_cols; c++)
1938 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1939 
1940 	return (error);
1941 }
1942 
1943 /*
1944  * Iterate over all combinations of bad data and attempt a reconstruction.
1945  * Note that the algorithm below is non-optimal because it doesn't take into
1946  * account how reconstruction is actually performed. For example, with
1947  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1948  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1949  * cases we'd only use parity information in column 0.
1950  */
/*
 * Returns the reconstruction code used by the first combination that
 * yields a valid checksum, or 0 if no combination of up to
 * (rm_firstdatacol - total_errors) additional targets succeeds.
 */
1951 static int
vdev_raidz_combrec(zio_t * zio,int total_errors,int data_errors)1952 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1953 {
1954 	raidz_map_t *rm = zio->io_vsd;
1955 	raidz_col_t *rc;
1956 	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tgts == &tstore[1] so that both tgts[-1] and tgts[n] are valid. */
1957 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1958 	int *tgts = &tstore[1];
1959 	int current, next, i, c, n;
1960 	int code, ret = 0;
1961 
1962 	ASSERT(total_errors < rm->rm_firstdatacol);
1963 
1964 	/*
1965 	 * This simplifies one edge condition.
1966 	 */
1967 	tgts[-1] = -1;
1968 
	/* n is how many additional (silently bad) columns we hypothesize. */
1969 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1970 		/*
1971 		 * Initialize the targets array by finding the first n columns
1972 		 * that contain no error.
1973 		 *
1974 		 * If there were no data errors, we need to ensure that we're
1975 		 * always explicitly attempting to reconstruct at least one
1976 		 * data column. To do this, we simply push the highest target
1977 		 * up into the data columns.
1978 		 */
1979 		for (c = 0, i = 0; i < n; i++) {
1980 			if (i == n - 1 && data_errors == 0 &&
1981 			    c < rm->rm_firstdatacol) {
1982 				c = rm->rm_firstdatacol;
1983 			}
1984 
1985 			while (rm->rm_col[c].rc_error != 0) {
1986 				c++;
1987 				ASSERT3S(c, <, rm->rm_cols);
1988 			}
1989 
1990 			tgts[i] = c++;
1991 		}
1992 
1993 		/*
1994 		 * Setting tgts[n] simplifies the other edge condition.
1995 		 */
1996 		tgts[n] = rm->rm_cols;
1997 
1998 		/*
1999 		 * These buffers were allocated in previous iterations.
2000 		 */
2001 		for (i = 0; i < n - 1; i++) {
2002 			ASSERT(orig[i] != NULL);
2003 		}
2004 
		/* One new save-buffer per round, sized by column 0. */
2005 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2006 
2007 		current = 0;
2008 		next = tgts[current];
2009 
2010 		while (current != n) {
2011 			tgts[current] = next;
2012 			current = 0;
2013 
2014 			/*
2015 			 * Save off the original data that we're going to
2016 			 * attempt to reconstruct.
2017 			 */
2018 			for (i = 0; i < n; i++) {
2019 				ASSERT(orig[i] != NULL);
2020 				c = tgts[i];
2021 				ASSERT3S(c, >=, 0);
2022 				ASSERT3S(c, <, rm->rm_cols);
2023 				rc = &rm->rm_col[c];
2024 				bcopy(rc->rc_data, orig[i], rc->rc_size);
2025 			}
2026 
2027 			/*
2028 			 * Attempt a reconstruction and exit the outer loop on
2029 			 * success.
2030 			 */
2031 			code = vdev_raidz_reconstruct(rm, tgts, n);
2032 			if (raidz_checksum_verify(zio) == 0) {
2033 				atomic_inc_64(&raidz_corrected[code]);
2034 
				/*
				 * Each reconstructed column that had been
				 * read "successfully" was silently bad;
				 * report it and mark it ECKSUM.
				 */
2035 				for (i = 0; i < n; i++) {
2036 					c = tgts[i];
2037 					rc = &rm->rm_col[c];
2038 					ASSERT(rc->rc_error == 0);
2039 					if (rc->rc_tried)
2040 						raidz_checksum_error(zio, rc,
2041 						    orig[i]);
2042 					rc->rc_error = SET_ERROR(ECKSUM);
2043 				}
2044 
2045 				ret = code;
2046 				goto done;
2047 			}
2048 
2049 			/*
2050 			 * Restore the original data.
2051 			 */
2052 			for (i = 0; i < n; i++) {
2053 				c = tgts[i];
2054 				rc = &rm->rm_col[c];
2055 				bcopy(orig[i], rc->rc_data, rc->rc_size);
2056 			}
2057 
2058 			do {
2059 				/*
2060 				 * Find the next valid column after the current
2061 				 * position..
2062 				 */
2063 				for (next = tgts[current] + 1;
2064 				    next < rm->rm_cols &&
2065 				    rm->rm_col[next].rc_error != 0; next++)
2066 					continue;
2067 
2068 				ASSERT(next <= tgts[current + 1]);
2069 
2070 				/*
2071 				 * If that spot is available, we're done here.
2072 				 */
2073 				if (next != tgts[current + 1])
2074 					break;
2075 
2076 				/*
2077 				 * Otherwise, find the next valid column after
2078 				 * the previous position.
2079 				 */
2080 				for (c = tgts[current - 1] + 1;
2081 				    rm->rm_col[c].rc_error != 0; c++)
2082 					continue;
2083 
2084 				tgts[current] = c;
2085 				current++;
2086 
2087 			} while (current != n);
2088 		}
2089 	}
	/*
	 * All combinations failed.  The for loop bumped n once past the last
	 * round; back it up so the cleanup below frees exactly the n buffers
	 * that were allocated (ret is still 0 here).
	 */
2090 	n--;
2091 done:
2092 	for (i = 0; i < n; i++) {
2093 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2094 	}
2095 
2096 	return (ret);
2097 }
2098 
2099 /*
2100  * Complete an IO operation on a RAIDZ VDev
2101  *
2102  * Outline:
2103  * - For write operations:
2104  *   1. Check for errors on the child IOs.
2105  *   2. Return, setting an error code if too few child VDevs were written
2106  *      to reconstruct the data later.  Note that partial writes are
2107  *      considered successful if they can be reconstructed at all.
2108  * - For read operations:
2109  *   1. Check for errors on the child IOs.
2110  *   2. If data errors occurred:
2111  *      a. Try to reassemble the data from the parity available.
2112  *      b. If we haven't yet read the parity drives, read them now.
2113  *      c. If all parity drives have been read but the data still doesn't
2114  *         reassemble with a correct checksum, then try combinatorial
2115  *         reconstruction.
2116  *      d. If that doesn't work, return an error.
2117  *   3. If there were unexpected errors or this is a resilver operation,
2118  *      rewrite the vdevs that had errors.
2119  */
2120 static void
vdev_raidz_io_done(zio_t * zio)2121 vdev_raidz_io_done(zio_t *zio)
2122 {
2123 	vdev_t *vd = zio->io_vd;
2124 	vdev_t *cvd;
2125 	raidz_map_t *rm = zio->io_vsd;
2126 	raidz_col_t *rc;
2127 	int unexpected_errors = 0;
2128 	int parity_errors = 0;
2129 	int parity_untried = 0;
2130 	int data_errors = 0;
2131 	int total_errors = 0;
2132 	int n, c;
2133 	int tgts[VDEV_RAIDZ_MAXPARITY];
2134 	int code;
2135 
2136 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2137 
2138 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2139 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2140 
	/*
	 * Classify each column's outcome: parity vs. data errors, errors we
	 * didn't predict in io_start (rc_skipped unset), and parity columns
	 * that were never read at all.
	 */
2141 	for (c = 0; c < rm->rm_cols; c++) {
2142 		rc = &rm->rm_col[c];
2143 
2144 		if (rc->rc_error) {
2145 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2146 
2147 			if (c < rm->rm_firstdatacol)
2148 				parity_errors++;
2149 			else
2150 				data_errors++;
2151 
2152 			if (!rc->rc_skipped)
2153 				unexpected_errors++;
2154 
2155 			total_errors++;
2156 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2157 			parity_untried++;
2158 		}
2159 	}
2160 
2161 	if (zio->io_type == ZIO_TYPE_WRITE) {
2162 		/*
2163 		 * XXX -- for now, treat partial writes as a success.
2164 		 * (If we couldn't write enough columns to reconstruct
2165 		 * the data, the I/O failed.  Otherwise, good enough.)
2166 		 *
2167 		 * Now that we support write reallocation, it would be better
2168 		 * to treat partial failure as real failure unless there are
2169 		 * no non-degraded top-level vdevs left, and not update DTLs
2170 		 * if we intend to reallocate.
2171 		 */
2172 		/* XXPOLICY */
2173 		if (total_errors > rm->rm_firstdatacol)
2174 			zio->io_error = vdev_raidz_worst_error(rm);
2175 
2176 		return;
2177 	} else if (zio->io_type == ZIO_TYPE_FREE) {
		/* Frees are best-effort; child errors are not propagated. */
2178 		return;
2179 	}
2180 
2181 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2182 	/*
2183 	 * There are three potential phases for a read:
2184 	 *	1. produce valid data from the columns read
2185 	 *	2. read all disks and try again
2186 	 *	3. perform combinatorial reconstruction
2187 	 *
2188 	 * Each phase is progressively both more expensive and less likely to
2189 	 * occur. If we encounter more errors than we can repair or all phases
2190 	 * fail, we have no choice but to return an error.
2191 	 */
2192 
2193 	/*
2194 	 * If the number of errors we saw was correctable -- less than or equal
2195 	 * to the number of parity disks read -- attempt to produce data that
2196 	 * has a valid checksum. Naturally, this case applies in the absence of
2197 	 * any errors.
2198 	 */
2199 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2200 		if (data_errors == 0) {
2201 			if (raidz_checksum_verify(zio) == 0) {
2202 				/*
2203 				 * If we read parity information (unnecessarily
2204 				 * as it happens since no reconstruction was
2205 				 * needed) regenerate and verify the parity.
2206 				 * We also regenerate parity when resilvering
2207 				 * so we can write it out to the failed device
2208 				 * later.
2209 				 */
2210 				if (parity_errors + parity_untried <
2211 				    rm->rm_firstdatacol ||
2212 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2213 					n = raidz_parity_verify(zio, rm);
2214 					unexpected_errors += n;
2215 					ASSERT(parity_errors + n <=
2216 					    rm->rm_firstdatacol);
2217 				}
2218 				goto done;
2219 			}
2220 		} else {
2221 			/*
2222 			 * We either attempt to read all the parity columns or
2223 			 * none of them. If we didn't try to read parity, we
2224 			 * wouldn't be here in the correctable case. There must
2225 			 * also have been fewer parity errors than parity
2226 			 * columns or, again, we wouldn't be in this code path.
2227 			 */
2228 			ASSERT(parity_untried == 0);
2229 			ASSERT(parity_errors < rm->rm_firstdatacol);
2230 
2231 			/*
2232 			 * Identify the data columns that reported an error.
2233 			 */
2234 			n = 0;
2235 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2236 				rc = &rm->rm_col[c];
2237 				if (rc->rc_error != 0) {
2238 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2239 					tgts[n++] = c;
2240 				}
2241 			}
2242 
2243 			ASSERT(rm->rm_firstdatacol >= n);
2244 
2245 			code = vdev_raidz_reconstruct(rm, tgts, n);
2246 
2247 			if (raidz_checksum_verify(zio) == 0) {
2248 				atomic_inc_64(&raidz_corrected[code]);
2249 
2250 				/*
2251 				 * If we read more parity disks than were used
2252 				 * for reconstruction, confirm that the other
2253 				 * parity disks produced correct data. This
2254 				 * routine is suboptimal in that it regenerates
2255 				 * the parity that we already used in addition
2256 				 * to the parity that we're attempting to
2257 				 * verify, but this should be a relatively
2258 				 * uncommon case, and can be optimized if it
2259 				 * becomes a problem. Note that we regenerate
2260 				 * parity when resilvering so we can write it
2261 				 * out to failed devices later.
2262 				 */
2263 				if (parity_errors < rm->rm_firstdatacol - n ||
2264 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2265 					n = raidz_parity_verify(zio, rm);
2266 					unexpected_errors += n;
2267 					ASSERT(parity_errors + n <=
2268 					    rm->rm_firstdatacol);
2269 				}
2270 
2271 				goto done;
2272 			}
2273 		}
2274 	}
2275 
2276 	/*
2277 	 * This isn't a typical situation -- either we got a read error or
2278 	 * a child silently returned bad data. Read every block so we can
2279 	 * try again with as much data and parity as we can track down. If
2280 	 * we've already been through once before, all children will be marked
2281 	 * as tried so we'll proceed to combinatorial reconstruction.
2282 	 */
2283 	unexpected_errors = 1;
2284 	rm->rm_missingdata = 0;
2285 	rm->rm_missingparity = 0;
2286 
2287 	for (c = 0; c < rm->rm_cols; c++) {
2288 		if (rm->rm_col[c].rc_tried)
2289 			continue;
2290 
		/*
		 * At least one column is untried: re-arm the zio and issue
		 * reads for this and every remaining untried column, then
		 * return and wait for io_done to run again.
		 */
2291 		zio_vdev_io_redone(zio);
2292 		do {
2293 			rc = &rm->rm_col[c];
2294 			if (rc->rc_tried)
2295 				continue;
2296 			zio_nowait(zio_vdev_child_io(zio, NULL,
2297 			    vd->vdev_child[rc->rc_devidx],
2298 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2299 			    zio->io_type, zio->io_priority, 0,
2300 			    vdev_raidz_child_done, rc));
2301 		} while (++c < rm->rm_cols);
2302 
2303 		return;
2304 	}
2305 
2306 	/*
2307 	 * At this point we've attempted to reconstruct the data given the
2308 	 * errors we detected, and we've attempted to read all columns. There
2309 	 * must, therefore, be one or more additional problems -- silent errors
2310 	 * resulting in invalid data rather than explicit I/O errors resulting
2311 	 * in absent data. We check if there is enough additional data to
2312 	 * possibly reconstruct the data and then perform combinatorial
2313 	 * reconstruction over all possible combinations. If that fails,
2314 	 * we're cooked.
2315 	 */
2316 	if (total_errors > rm->rm_firstdatacol) {
2317 		zio->io_error = vdev_raidz_worst_error(rm);
2318 
2319 	} else if (total_errors < rm->rm_firstdatacol &&
2320 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2321 		/*
2322 		 * If we didn't use all the available parity for the
2323 		 * combinatorial reconstruction, verify that the remaining
2324 		 * parity is correct.
2325 		 */
2326 		if (code != (1 << rm->rm_firstdatacol) - 1)
2327 			(void) raidz_parity_verify(zio, rm);
2328 	} else {
2329 		/*
2330 		 * We're here because either:
2331 		 *
2332 		 *	total_errors == rm_first_datacol, or
2333 		 *	vdev_raidz_combrec() failed
2334 		 *
2335 		 * In either case, there is enough bad data to prevent
2336 		 * reconstruction.
2337 		 *
2338 		 * Start checksum ereports for all children which haven't
2339 		 * failed, and the IO wasn't speculative.
2340 		 */
2341 		zio->io_error = SET_ERROR(ECKSUM);
2342 
2343 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2344 			for (c = 0; c < rm->rm_cols; c++) {
2345 				rc = &rm->rm_col[c];
2346 				if (rc->rc_error == 0) {
2347 					zio_bad_cksum_t zbc;
2348 					zbc.zbc_has_cksum = 0;
2349 					zbc.zbc_injected =
2350 					    rm->rm_ecksuminjected;
2351 
2352 					zfs_ereport_start_checksum(
2353 					    zio->io_spa,
2354 					    vd->vdev_child[rc->rc_devidx],
2355 					    zio, rc->rc_offset, rc->rc_size,
2356 					    (void *)(uintptr_t)c, &zbc);
2357 				}
2358 			}
2359 		}
2360 	}
2361 
2362 done:
2363 	zio_checksum_verified(zio);
2364 
2365 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2366 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2367 		/*
2368 		 * Use the good data we have in hand to repair damaged children.
2369 		 */
2370 		for (c = 0; c < rm->rm_cols; c++) {
2371 			rc = &rm->rm_col[c];
2372 			cvd = vd->vdev_child[rc->rc_devidx];
2373 
2374 			if (rc->rc_error == 0)
2375 				continue;
2376 
2377 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2378 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2379 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2380 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2381 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2382 		}
2383 	}
2384 }
2385 
2386 static void
vdev_raidz_state_change(vdev_t * vd,int faulted,int degraded)2387 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2388 {
2389 	if (faulted > vd->vdev_nparity)
2390 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2391 		    VDEV_AUX_NO_REPLICAS);
2392 	else if (degraded + faulted != 0)
2393 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2394 	else
2395 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2396 }
2397 
/*
 * Operation table wiring the RAID-Z implementation into the generic
 * vdev framework (positional initializer; order must match vdev_ops_t).
 */
2398 vdev_ops_t vdev_raidz_ops = {
2399 	vdev_raidz_open,	/* open */
2400 	vdev_raidz_close,	/* close */
2401 	vdev_raidz_asize,	/* asize */
2402 	vdev_raidz_io_start,	/* io_start */
2403 	vdev_raidz_io_done,	/* io_done */
2404 	vdev_raidz_state_change,	/* state_change */
	/*
	 * NOTE(review): the two NULL slots are optional hooks not used by
	 * raidz (presumably hold/rele) -- confirm against vdev_ops_t.
	 */
2405 	NULL,
2406 	NULL,
2407 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2408 	B_FALSE			/* not a leaf vdev */
2409 };
2410